#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Mirror a source tree into a destination directory, re-encoding text files.

Every file under the source root is either copied verbatim (skipped
extensions/names, empty files, files whose encoding cannot be detected or
decoded, and - unless ``convert_utf`` is set - files already in a UTF
encoding) or decoded with the encoding reported by ``chardet`` and written
to the destination in the target encoding.
"""

__author__ = ''

import argparse
import logging
import os
import shutil
import sys
import textwrap
import time

try:
    import chardet
except ImportError:
    # Keep the module importable (copying/cleaning helpers do not need
    # chardet); convert_file() raises a clear ImportError when detection
    # is actually required.
    chardet = None

# Default configuration will take effect when corresponding input args are missing.
# Feel free to change this for your convenience.
DEFAULT_CONF = {
    'add_BOM': False,
    'convert_UTF': False,
    # Minimum confidence threshold: only encodings reported by chardet with a
    # confidence at or above this level are accepted.
    # See https://github.com/x1angli/convert2utf/issues/4 for further details
    'confi_thres': 0.8,
}

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
log = logging.getLogger(__name__)


def _is_same_or_ancestor(ancestor, path):
    """Return True when *ancestor* is *path* itself or one of its parent directories."""
    ancestor = os.path.normcase(os.path.abspath(ancestor))
    path = os.path.normcase(os.path.abspath(path))
    if ancestor == path:
        return True
    return path.startswith(ancestor.rstrip(os.sep) + os.sep)


class Arguments:
    """Plain settings holder consumed by ``Convert2Utf8``."""

    def __init__(self):
        # Directory the converted/copied tree is written to.
        self.dst_dir = ""
        # Source directory (or single file) to process.
        self.root = ""
        # Directory names that are neither walked nor copied.
        self.skip_dirs = ['.git', '.vs', 'out', 'build']
        # File names that are copied verbatim instead of converted.
        self.skip_files = [".git", ".vs"]
        # Lower-case extensions (without the dot) that are copied verbatim.
        self.skip_exts = ["exe", "xml", "vcxproj", "rc", "rc2"]
        # When False, files already detected as UTF-* are copied, not re-encoded.
        self.convert_utf = False
        # Encoding of the written files; 'utf-8-sig' would emit UTF-8 with a BOM.
        self.target_encoding = 'utf-8'


class Convert2Utf8:
    """Copy or re-encode files from ``args.root`` into ``args.dst_dir``."""

    def __init__(self, args):
        self.args = args

    def _dst_path(self, filename):
        """Map a source file path to its location under the destination dir.

        The path is taken relative to ``args.root``.  When the root is the
        file itself (single-file mode) the file's base name is used, so the
        result is always a file path inside ``args.dst_dir``.
        """
        rel = os.path.relpath(filename, self.args.root)
        if rel == os.curdir:
            rel = os.path.basename(filename)
        return os.path.join(self.args.dst_dir, rel)

    def copy_file(self, filename):
        """Copy *filename* unchanged to its mirrored destination path.

        Missing destination directories are created.  Only the file content
        is copied (``shutil.copyfile``), not its metadata.
        """
        dst_filename = self._dst_path(filename)
        parent = os.path.dirname(dst_filename)
        if parent:
            os.makedirs(parent, exist_ok=True)
        shutil.copyfile(filename, dst_filename)
        log.info("copy file from %s to %s", filename, dst_filename)

    def walk_dir(self, dirname):
        """Recursively process every entry of *dirname*.

        Directories named in ``skip_dirs`` are ignored entirely; files whose
        extension is in ``skip_exts`` or whose name is in ``skip_files`` are
        copied verbatim; every other file goes through ``convert_file``.
        An I/O error or Ctrl+C during a conversion terminates the program.
        """
        for entry in os.listdir(dirname):
            fullname = os.path.join(dirname, entry)

            if os.path.isdir(fullname):
                if entry in self.args.skip_dirs:
                    log.info("Skipped dir %s", fullname)
                else:
                    self.walk_dir(fullname)
                continue

            # strip() guards against stray whitespace/newlines in the name,
            # lower() makes the extension match case-insensitive.
            extension = os.path.splitext(entry)[1][1:].strip().lower()
            if extension in self.args.skip_exts or entry in self.args.skip_files:
                log.info("Skipped file %s", fullname)
                self.copy_file(fullname)
                continue

            try:
                self.convert_file(fullname)
            except IOError:
                log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
                # sys.exit (not the interactive-only exit()) with a failure status.
                sys.exit(1)
            except KeyboardInterrupt:
                log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
                sys.exit(1)

    def convert_file(self, filename):
        """Re-encode *filename* into the destination tree.

        The file is copied verbatim instead when it is empty, when chardet
        cannot detect its encoding with enough confidence, when it is
        already UTF-* and ``convert_utf`` is off, or when decoding fails.

        Raises:
            ImportError: chardet is not installed and detection is needed.
            OSError: the source cannot be read or the target cannot be written.
        """
        with open(filename, 'rb') as f:  # read under the binary mode
            bytedata = f.read()

        if not bytedata:
            log.info("Skipped empty file %s", filename)
            self.copy_file(filename)
            return

        if chardet is None:
            raise ImportError("The 'chardet' package is required to detect file encodings")

        chr_res = chardet.detect(bytedata)
        if chr_res['encoding'] is None or chr_res['confidence'] < DEFAULT_CONF['confi_thres']:
            log.warning("Ignoring %s, since its encoding is unable to detect.", filename)
            self.copy_file(filename)
            return

        src_enc = chr_res['encoding'].lower()
        log.debug("Scanned %s, whose encoding is %s ", filename, src_enc)

        if (not self.args.convert_utf) and src_enc.startswith('utf'):
            log.info("Skipped %s, whose encoding is %s", filename, src_enc)
            self.copy_file(filename)
            return

        # chardet reports every GB-based encoding as 'gb2312', and decoding
        # then fails on characters outside that set.  gb18030 is a larger
        # character set than gb2312, so decode with it to be more tolerant.
        if src_enc == 'gb2312':
            src_enc = 'gb18030'

        try:
            strdata = bytedata.decode(src_enc)
        except (UnicodeDecodeError, LookupError) as e:
            # LookupError: chardet reported an encoding Python has no codec for.
            log.error("Unicode error for file %s", filename)
            log.error("%s", e)
            self.copy_file(filename)
            return

        # preserving file time information (modification time and access time)
        src_stat = os.stat(filename)

        tgt_enc = self.args.target_encoding
        # Encode before opening the target so a failed encode cannot leave a
        # truncated output file behind.
        encoded = strdata.encode(tgt_enc)

        dst_filename = self._dst_path(filename)
        parent = os.path.dirname(dst_filename)
        if parent:
            os.makedirs(parent, exist_ok=True)

        log.debug("Writing the file: %s in %s", dst_filename, tgt_enc)
        with open(dst_filename, 'wb') as t:  # write under the binary mode
            t.write(encoded)
        log.info("Converted the file: %s from %s to %s", filename, src_enc, tgt_enc)

        # setting the new file's access/modification time to the old file's
        os.utime(dst_filename, times=(src_stat.st_atime, src_stat.st_mtime))

    def run(self):
        """Process ``args.root`` (a directory or a single file) into ``args.dst_dir``.

        An existing destination directory is emptied first.  To avoid wiping
        the input, nothing is done when the destination is the source itself
        or one of its parent directories.
        """
        root = self.args.root
        if not os.path.exists(root):
            log.error("The file specified %s is neither a directory nor a regular file", root)
            return

        dst = self.args.dst_dir
        if _is_same_or_ancestor(dst, root):
            log.error("Refusing to run: destination %s is or contains the source %s, "
                      "so cleaning it would delete the source", dst, root)
            return

        if not os.path.isdir(dst):
            log.info("make dst directory")
            os.makedirs(dst)
        else:
            clean_backups(dst)

        log.info("Start working now!")

        if os.path.isdir(root):
            log.info("The root is: %s. ", root)
            self.walk_dir(root)
        else:
            log.info("Wow, only a single file will be processed: %s", root)
            self.convert_file(root)

        log.info("Finished all.")


def remove_cursive(dirname):
    """Delete everything below *dirname*, keeping *dirname* itself.

    The tree is walked bottom-up so each directory is already empty when it
    is removed.  Symbolic links to directories are unlinked, never followed.
    """
    for root, dirs, files in os.walk(dirname, topdown=False):
        for name in files:
            fullname = os.path.join(root, name)
            os.remove(fullname)
            log.info("Removed the file: %s", fullname)
        for name in dirs:
            fullname = os.path.join(root, name)
            if os.path.islink(fullname):
                os.remove(fullname)
            else:
                os.rmdir(fullname)
            log.info("Removed the dir: %s", fullname)


def clean_backups(dirname):
    """Empty the directory *dirname*; log an error if it is not a directory."""
    if not os.path.isdir(dirname):
        log.error("The file specified %s is not a directory ", dirname)
        return

    log.info("Removing all newly-created files under %s", dirname)
    remove_cursive(dirname)


def cli(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
            Without any arguments the historical default paths are used.
    """
    defaults = Arguments()
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent("""\
            Mirror ROOT into the destination directory, converting text
            files to the target encoding and copying everything else.
            WARNING: existing content of the destination directory is deleted.
            """),
    )
    parser.add_argument('root', nargs='?', default="D:/GitReposity/wmpplayer",
                        help="source directory or single file (default: %(default)s)")
    parser.add_argument('-o', '--dst-dir', default="D:/Temp/utf8",
                        help="destination directory (default: %(default)s)")
    parser.add_argument('-t', '--target-encoding', default=defaults.target_encoding,
                        help="encoding of the written files (default: %(default)s)")
    parser.add_argument('--convert-utf', action='store_true', default=defaults.convert_utf,
                        help="also re-encode files that are already in a UTF encoding")
    opts = parser.parse_args(argv)

    args = Arguments()
    args.dst_dir = opts.dst_dir
    args.root = opts.root
    args.target_encoding = opts.target_encoding
    args.convert_utf = opts.convert_utf

    cvt2utf8 = Convert2Utf8(args)
    cvt2utf8.run()


if __name__ == '__main__':
    cli()