|
@@ -0,0 +1,221 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+
|
|
|
+__author__ = ''
|
|
|
+
|
|
|
+import logging, os, argparse, textwrap
|
|
|
+import time
|
|
|
+import chardet
|
|
|
+import shutil
|
|
|
+
|
|
|
# Fallback settings, meant to apply whenever the matching input argument is
# not supplied. Adjust the values below to suit your own workflow.
# In the code visible in this file only 'confi_thres' is actually read.
DEFAULT_CONF = dict(
    add_BOM=False,
    convert_UTF=False,
    confi_thres=0.8,
)
|
|
|
+
|
|
|
# We have to set a minimum confidence threshold. Only encoding-detection results
# returned by chardet whose confidence is above that threshold are accepted.
# See https://github.com/x1angli/convert2utf/issues/4 for further details.
|
|
|
+
|
|
|
+
|
|
|
+ #for root, dirs, files in os.walk(dirname):
|
|
|
+ # for name in files:
|
|
|
+ # extension = os.path.splitext(name)[1][1:].strip().lower()
|
|
|
+ # fullname = os.path.join(root, name)
|
|
|
+ # # On linux there is a newline at the end which will cause the match to fail, so we just 'strip()' the '\n'
|
|
|
+ # # Also, add 'lower()' to ensure matching
|
|
|
+ # if (extension in self.args.skip_exts or name in self.args.skip_files):
|
|
|
+ # log.info("Skipped file %s", fullname)
|
|
|
+ # self.copy_file(fullname)
|
|
|
+ # continue
|
|
|
+
|
|
|
+ # try:
|
|
|
+ # self.convert_file(fullname)
|
|
|
+ # except IOError:
|
|
|
+ # log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
|
|
|
+ # except KeyboardInterrupt:
|
|
|
+ # log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
|
|
|
+ # exit()
|
|
|
+ # for name in dirs:
|
|
|
+ # fullname = os.path.join(root, name)
|
|
|
+ # if name in self.args.skip_dirs:
|
|
|
+ # log.info("Skipped dir %s", fullname)
|
|
|
+ # else:
|
|
|
+ # None
|
|
|
+ # # self.walk_dir(fullname)
|
|
|
+
|
|
|
+ #, filename='D:\\gbk2utf8.txt'
|
|
|
# Configure the root logger once: INFO and above, rendered as "LEVEL:message".
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s:%(message)s',
)
# Module-level logger used by every function and class in this file.
log = logging.getLogger(__name__)
|
|
|
+
|
|
|
class Arguments:
    """Settings object consumed by ``Convert2Utf8``.

    ``Arguments()`` with no parameters yields the same defaults as before;
    every setting may also be passed to the constructor or assigned on the
    instance afterwards.

    Attributes:
        root: Source file or directory to process.
        dst_dir: Directory that receives the converted/copied files.
        skip_dirs: Directory names that are neither descended into nor copied.
        skip_files: File names that are copied without conversion.
        skip_exts: Lower-case extensions (without the dot) whose files are
            copied without conversion.
        convert_utf: When False, files already detected as UTF-* are copied
            instead of being re-encoded.
        target_encoding: Codec name used when writing converted files.
    """

    def __init__(self, root="", dst_dir="", *, skip_dirs=None, skip_files=None,
                 skip_exts=None, convert_utf=False, target_encoding='utf-8'):
        self.dst_dir = dst_dir
        self.root = root
        # Each instance gets its own list, so callers can append safely.
        self.skip_dirs = (list(skip_dirs) if skip_dirs is not None
                          else ['.git', '.vs', 'out', 'build'])
        self.skip_files = (list(skip_files) if skip_files is not None
                           else [".git", ".vs"])
        self.skip_exts = (list(skip_exts) if skip_exts is not None
                          else ["exe", "xml", "vcxproj", "rc", "rc2"])
        self.convert_utf = convert_utf
        # Alternative value: 'utf-8-sig' (Python's UTF-8 codec that writes a BOM).
        self.target_encoding = target_encoding
|
|
|
+
|
|
|
class Convert2Utf8:
    """Re-encode a file, or every file of a directory tree, into
    ``args.target_encoding``.

    Output is written under ``args.dst_dir``, mirroring the layout found below
    ``args.root``. Files that are skipped, empty, of uncertain encoding, or
    undecodable are copied across unchanged instead of being converted.
    """

    def __init__(self, args):
        """Keep a reference to the settings object.

        ``args`` must expose ``root``, ``dst_dir``, ``skip_dirs``,
        ``skip_files``, ``skip_exts``, ``convert_utf`` and ``target_encoding``.
        """
        self.args = args

    @staticmethod
    def _contains(parent, child):
        """Return True when ``child`` is ``parent`` itself or lies inside it."""
        parent = os.path.normcase(os.path.realpath(parent))
        child = os.path.normcase(os.path.realpath(child))
        return child == parent or child.startswith(parent.rstrip(os.sep) + os.sep)

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will hold ``path`` if it does not exist."""
        parent = os.path.dirname(path)
        if parent:
            # exist_ok avoids the check-then-create race of exists()+makedirs().
            os.makedirs(parent, exist_ok=True)

    def _dst_path(self, filename):
        """Map a source ``filename`` to its location under ``args.dst_dir``."""
        root = self.args.root
        if os.path.isdir(root):
            # relpath/join stay correct when root carries a trailing
            # separator, which plain slicing + concatenation did not.
            relative = os.path.relpath(filename, root)
        else:
            # root is the single file being processed; slicing root off its
            # own path would leave nothing, so keep the bare file name.
            relative = os.path.basename(filename)
        return os.path.join(self.args.dst_dir, relative)

    def copy_file(self, filename):
        """Copy ``filename`` byte-for-byte to its mirrored destination path."""
        dst_filename = self._dst_path(filename)
        self._ensure_parent_dir(dst_filename)
        shutil.copyfile(filename, dst_filename)
        log.info("copy file from %s to %s", filename, dst_filename)

    def walk_dir(self, dirname):
        """Process every entry of ``dirname``, recursing into sub-directories.

        Directories named in ``args.skip_dirs`` are ignored entirely. Files
        matching ``args.skip_exts`` / ``args.skip_files`` are copied as-is;
        all other files go through ``convert_file``.

        Raises:
            SystemExit: when a file cannot be read or written, or when the
                user interrupts the run.
        """
        for entry in os.listdir(dirname):
            fullname = os.path.join(dirname, entry)

            if os.path.isdir(fullname):
                if entry in self.args.skip_dirs:
                    log.info("Skipped dir %s", fullname)
                else:
                    self.walk_dir(fullname)
                continue

            # Extension without the dot, lower-cased so matching is
            # case-insensitive.
            extension = os.path.splitext(entry)[1][1:].strip().lower()
            if extension in self.args.skip_exts or entry in self.args.skip_files:
                log.info("Skipped file %s", fullname)
                self.copy_file(fullname)
                continue

            try:
                self.convert_file(fullname)
            except OSError:  # IOError is an alias of OSError on Python 3
                log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
                # Non-zero status so callers/scripts can see the failure.
                raise SystemExit(1)
            except KeyboardInterrupt:
                log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
                # 128 + SIGINT: the conventional status for an interrupted run.
                raise SystemExit(130)

    def convert_file(self, filename):
        """Convert one file to ``args.target_encoding`` in the destination tree.

        The file is copied unchanged instead when it is empty, when chardet
        cannot name an encoding with at least ``DEFAULT_CONF['confi_thres']``
        confidence, when it is already UTF-* and ``args.convert_utf`` is
        False, or when decoding with the detected encoding fails. The access
        and modification times of the source are applied to a converted file.
        """
        with open(filename, 'rb') as f:  # raw bytes: the encoding is not known yet
            bytedata = f.read()

        if not bytedata:
            log.info("Skipped empty file %s", filename)
            self.copy_file(filename)
            return

        chr_res = chardet.detect(bytedata)
        if chr_res['encoding'] is None or chr_res['confidence'] < DEFAULT_CONF['confi_thres']:
            log.warning("Ignoring %s, since its encoding is unable to detect.", filename)
            self.copy_file(filename)
            return

        src_enc = chr_res['encoding'].lower()
        log.debug("Scanned %s, whose encoding is %s ", filename, src_enc)

        # NOTE: plain-ASCII files are deliberately not skipped; they are
        # re-encoded like any other non-UTF file.
        if not self.args.convert_utf and src_enc.startswith('utf'):
            log.info("Skipped %s, whose encoding is %s", filename, src_enc)
            self.copy_file(filename)
            return

        # chardet reports every GB-family encoding as 'gb2312', and decoding
        # then fails on characters outside that set. gb18030 is a larger
        # character set than gb2312, so decode with it to be more tolerant.
        if src_enc == 'gb2312':
            src_enc = 'gb18030'

        try:
            strdata = bytedata.decode(src_enc)
        except (UnicodeDecodeError, LookupError) as e:
            # LookupError: the detected name has no codec in this Python.
            log.error("Unicode error for file %s", filename)
            log.error("%s", e)
            self.copy_file(filename)
            return

        # Captured so the original timestamps can be applied to the new file.
        src_stat = os.stat(filename)

        tgt_enc = self.args.target_encoding
        # Encode before opening the destination so a failure here cannot
        # leave a truncated output file behind.
        encoded = strdata.encode(tgt_enc)

        dst_filename = self._dst_path(filename)
        self._ensure_parent_dir(dst_filename)

        log.debug("Writing the file: %s in %s", dst_filename, tgt_enc)
        with open(dst_filename, 'wb') as t:  # bytes are already encoded
            t.write(encoded)
        log.info("Converted the file: %s from %s to %s", filename, src_enc, tgt_enc)

        # Preserve access and *modification* time (st_ctime is not the mtime).
        os.utime(dst_filename, times=(src_stat.st_atime, src_stat.st_mtime))

    def run(self):
        """Convert ``args.root`` (a file or a directory) into ``args.dst_dir``.

        A missing destination directory is created; an existing one is emptied
        first via ``clean_backups``. Nothing is done when ``root`` does not
        exist or when emptying the destination would delete the source.
        """
        root = self.args.root
        if not os.path.exists(root):
            log.error("The file specified %s is neither a directory nor a regular file", root)
            return

        dst = self.args.dst_dir
        if self._contains(dst, root):
            # The destination is emptied before use; doing that here would
            # delete the very files that are about to be converted.
            log.error("The source %s lies inside the destination %s; refusing to clean it", root, dst)
            return

        if not os.path.isdir(dst):
            log.info("make dst directory")
            os.makedirs(dst)
        else:
            clean_backups(dst)

        log.info("Start working now!")

        if os.path.isdir(root):
            log.info("The root is: %s. ", root)
            self.walk_dir(root)
        else:
            log.info("Wow, only a single file will be processed: %s", root)
            self.convert_file(root)

        log.info("Finished all.")
|
|
|
+
|
|
|
def remove_cursive(dirname):
    """Delete everything beneath ``dirname`` while keeping ``dirname`` itself.

    Each removed file and directory is reported at INFO level.

    Raises:
        OSError: if a file or directory cannot be removed.
    """
    # A single bottom-up walk visits a directory's contents before the
    # directory itself, so every sub-directory is already empty when it is
    # removed and no manual recursion is needed.
    for root, dirs, files in os.walk(dirname, topdown=False):
        for name in files:
            fullname = os.path.join(root, name)
            os.remove(fullname)
            log.info("Removed the file: %s", fullname)
        for name in dirs:
            fullname = os.path.join(root, name)
            if os.path.islink(fullname):
                # os.walk lists a symlinked directory but does not descend
                # into it; remove the link itself and leave its target alone.
                os.remove(fullname)
            else:
                os.rmdir(fullname)
            log.info("Removed the dir: %s", fullname)
|
|
|
+
|
|
|
def clean_backups(dirname):
    """Empty the directory ``dirname`` of all files and sub-directories.

    When ``dirname`` is not a directory an error is logged and nothing is
    removed.
    """
    if os.path.isdir(dirname):
        log.info("Removing all newly-created files under %s", dirname)
        remove_cursive(dirname)
    else:
        log.error("The file specified %s is not a directory ", dirname)
|
|
|
+
|
|
|
def cli(argv=None):
    """Command-line entry point: parse the arguments and run the conversion.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
            With no arguments the previously hard-coded source and
            destination paths are used, so a bare ``cli()`` behaves as before.
    """
    parser = argparse.ArgumentParser(
        description="Convert text files to UTF-8, writing the results into a "
                    "separate destination directory.")
    parser.add_argument(
        'root', nargs='?', default="D:/GitReposity/wmpplayer",
        help="source file or directory (default: %(default)s)")
    parser.add_argument(
        '-o', '--dst-dir', dest='dst_dir', default="D:/Temp/utf8",
        help="destination directory; existing contents are deleted first "
             "(default: %(default)s)")
    options = parser.parse_args(argv)

    args = Arguments()
    args.dst_dir = options.dst_dir
    args.root = options.root
    cvt2utf8 = Convert2Utf8(args)
    cvt2utf8.run()
|
|
|
+
|
|
|
+
|
|
|
# Script entry point: start the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    cli()
|