hace 1 año · a4d6797288
--- a/addin/GBK2Utf8.py
+++ b/addin/GBK2Utf8.py
@@ -0,0 +1,221 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+ 
			
 
				+__author__ = ''
			
 
				+ 
			
 
				+import logging, os, argparse, textwrap
			
 
				+import time
			
 
				+import chardet
			
 
				+import shutil
			
 
				+ 
			
 
				+# Default configuration will take effect when corresponding input args are missing.
			
 
				+# Feel free to change this for your convenience.
			
 
				+DEFAULT_CONF = {
			
 
				+    'add_BOM'   : False,
			
 
				+    'convert_UTF'   : False,
			
 
				+    'confi_thres' : 0.8,
			
 
				+}
			
 
				+ 
			
 
				+# We have to set a minimum threshold. Only those encoding-detection results returned by 
			
 
				+# chardet that are above that threshold level would be accepted.
			
 
				+# See https://github.com/x1angli/convert2utf/issues/4 for further details
			
 
				+ 
			
 
				+
			
 
				+    #for root, dirs, files in os.walk(dirname):
			
 
				+    #    for name in files:
			
 
				+    #        extension = os.path.splitext(name)[1][1:].strip().lower()
			
 
				+    #        fullname = os.path.join(root, name)
			
 
				+    #        # On linux there is a newline at the end which will cause the match to fail, so we just 'strip()' the '\n'
			
 
				+    #        # Also, add 'lower()' to ensure matching
			
 
				+    #        if (extension in self.args.skip_exts or name in self.args.skip_files):
			
 
				+    #            log.info("Skipped file %s", fullname)
			
 
				+    #            self.copy_file(fullname)
			
 
				+    #            continue
			
 
				+
			
 
				+    #        try:
			
 
				+    #            self.convert_file(fullname)
			
 
				+    #        except IOError:
			
 
				+    #            log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
			
 
				+    #        except KeyboardInterrupt:
			
 
				+    #            log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
			
 
				+    #            exit()
			
 
				+    #    for name in dirs:
			
 
				+    #        fullname = os.path.join(root, name)
			
 
				+    #        if name in self.args.skip_dirs:
			
 
				+    #            log.info("Skipped dir %s", fullname)
			
 
				+    #        else:
			
 
				+    #            None
			
 
				+    #            # self.walk_dir(fullname)
			
 
				+
			
 
				+ #, filename='D:\\gbk2utf8.txt'
			
 
				+logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
			
 
				+log = logging.getLogger(__name__)
			
 
				+ 
			
 
				+class Arguments:
			
 
				+    def __init__(self):
			
 
				+        self.dst_dir = ""
			
 
				+        self.root = ""
			
 
				+        self.skip_dirs = ['.git', '.vs', 'out', 'build']
			
 
				+        self.skip_files = [".git", ".vs"]
			
 
				+        self.skip_exts = ["exe", "xml", "vcxproj", "rc", "rc2"]
			
 
				+        self.convert_utf = False
			
 
				+        """utf-8-sig"""
			
 
				+        self.target_encoding = 'utf-8' 
			
 
				+ 
			
 
				+class Convert2Utf8:
			
 
				+    def __init__(self, args):
			
 
				+        self.args = args
			
 
				+ 
			
 
				+    def copy_file(self, filename):
			
 
				+        dst_dir = self.args.dst_dir
			
 
				+        src_dir_length = len(self.args.root)
			
 
				+        relate_filename = filename[src_dir_length:]
			
 
				+        dst_filename = dst_dir + relate_filename
			
 
				+        (filepath, tempfilename) = os.path.split(dst_filename)
			
 
				+        (filenameonly, extension) = os.path.splitext(tempfilename)
			
 
				+        if not os.path.exists(filepath):
			
 
				+            os.makedirs(filepath)
			
 
				+        shutil.copyfile(filename, dst_filename)
			
 
				+        log.info("copy file from %s to %s", filename, dst_filename)
			
 
				+ 
			
 
				+    def walk_dir(self, dirname):
			
 
				+        filelist = os.listdir(dirname)
			
 
				+        for file in filelist:
			
 
				+            fullname = os.path.join(dirname, file)
			
 
				+            if os.path.isdir(fullname):
			
 
				+                if file in self.args.skip_dirs:
			
 
				+                    log.info("Skipped dir %s", fullname)
			
 
				+                else:
			
 
				+                    self.walk_dir(fullname)
			
 
				+            else:
			
 
				+                extension = os.path.splitext(file)[1][1:].strip().lower()
			
 
				+                if (extension in self.args.skip_exts or file in self.args.skip_files):
			
 
				+                    log.info("Skipped file %s", fullname)
			
 
				+                    self.copy_file(fullname)
			
 
				+                    continue
			
 
				+
			
 
				+                try:
			
 
				+                    self.convert_file(fullname)
			
 
				+                except IOError:
			
 
				+                    log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
			
 
				+                    exit()
			
 
				+                except KeyboardInterrupt:
			
 
				+                    log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
			
 
				+                    exit()
			
 
				+ 
			
 
				+    def convert_file(self, filename):
			
 
				+        with open(filename, 'rb') as f: # read under the binary mode
			
 
				+            bytedata = f.read()
			
 
				+ 
			
 
				+        if len(bytedata) == 0:
			
 
				+            log.info("Skipped empty file %s", filename)
			
 
				+            self.copy_file(filename)
			
 
				+            return
			
 
				+        chr_res = chardet.detect(bytedata)
			
 
				+        if chr_res['encoding'] == None or chr_res['confidence'] < DEFAULT_CONF['confi_thres']:
			
 
				+            log.warning("Ignoring %s, since its encoding is unable to detect.", filename)
			
 
				+            self.copy_file(filename)
			
 
				+            return
			
 
				+        src_enc = chr_res['encoding'].lower()
			
 
				+        log.debug("Scanned %s, whose encoding is %s ", filename, src_enc)
			
 
				+ 
			
 
				+        #if (src_enc == 'ascii'):
			
 
				+        #    log.info("Skipped %s, whose encoding is %s", filename, src_enc)
			
 
				+        #    self.copy_file(filename)
			
 
				+        #    return
			
 
				+ 
			
 
				+        if (not self.args.convert_utf) and src_enc.startswith('utf'):
			
 
				+            log.info("Skipped %s, whose encoding is %s", filename, src_enc)
			
 
				+            self.copy_file(filename)
			
 
				+            return
			
 
				+ 
			
 
				+        # Since chardet only recognized all GB-based target_encoding as 'gb2312', the decoding will fail when the text file
			
 
				+        # contains certain special charaters. To make it more special-character-tolerant, we should
			
 
				+        # upgrade the target_encoding to 'gb18030', which is a character set larger than gb2312.
			
 
				+        if src_enc.lower() == 'gb2312':
			
 
				+            src_enc = 'gb18030'
			
 
				+        try:
			
 
				+            strdata = bytedata.decode(src_enc)
			
 
				+        except UnicodeDecodeError as e:
			
 
				+            log.error("Unicode error for file %s", filename)
			
 
				+            print(e)
			
 
				+            copy_file(filename)
			
 
				+            return
			
 
				+ 
			
 
				+        # preserving file time information (modification time and access time)
			
 
				+        src_stat = os.stat(filename)
			
 
				+ 
			
 
				+        tgt_enc = self.args.target_encoding
			
 
				+        dst_dir = self.args.dst_dir
			
 
				+        src_dir_length = len(self.args.root)
			
 
				+        relate_filename = filename[src_dir_length:]
			
 
				+        dst_filename = dst_dir + relate_filename
			
 
				+
			
 
				+        (filepath, tempfilename) = os.path.split(dst_filename)
			
 
				+        if not os.path.exists(filepath):
			
 
				+            os.makedirs(filepath)
			
 
				+
			
 
				+        log.debug("Writing the file: %s in %s", dst_filename, tgt_enc)
			
 
				+        with open(dst_filename, 'wb') as t: # write under the binary mode
			
 
				+            log.info("wreite")
			
 
				+            t.write(strdata.encode(tgt_enc))
			
 
				+        log.info("Converted the file: %s from %s to %s", filename, src_enc, tgt_enc)
			
 
				+ 
			
 
				+        # setting the new file's time to the old file
			
 
				+        os.utime(dst_filename, times = (src_stat.st_atime, src_stat.st_ctime))
			
 
				+    # end of def convert_file(self, filename)
			
 
				+ 
			
 
				+    def run(self):
			
 
				+        root = self.args.root
			
 
				+        if not os.path.exists(root):
			
 
				+            log.error("The file specified %s is neither a directory nor a regular file", root)
			
 
				+            return
			
 
				+ 
			
 
				+        dst = self.args.dst_dir
			
 
				+        if not os.path.exists(dst) or not os.path.isdir(dst):
			
 
				+            log.info("make dst directory")
			
 
				+            os.makedirs(dst)
			
 
				+        else:
			
 
				+            clean_backups(dst)
			
 
				+
			
 
				+        log.info("Start working now!")
			
 
				+ 
			
 
				+        if os.path.isdir(root):
			
 
				+            log.info("The root is: %s. ", root)
			
 
				+            self.walk_dir(root)
			
 
				+        else:
			
 
				+            log.info("Wow, only a single file will be processed: %s", root)
			
 
				+            self.convert_file(root)
			
 
				+ 
			
 
				+        log.info("Finished all.")
			
 
				+    # end of def run(self, root):
			
 
				+ 
			
 
				+def remove_cursive(dirname):
			
 
				+    for root, dirs, files in os.walk(dirname):
			
 
				+        for name in files:
			
 
				+            fullname = os.path.join(root, name)
			
 
				+            os.remove(fullname)
			
 
				+            log.info("Removed the file: %s", fullname)
			
 
				+        for name in dirs:
			
 
				+            fullname = os.path.join(root, name)
			
 
				+            remove_cursive(fullname)
			
 
				+            os.rmdir(fullname)
			
 
				+            log.info("Removed the dir: %s", fullname)
			
 
				+
			
 
				+def clean_backups(dirname):
			
 
				+    if not os.path.isdir(dirname):
			
 
				+        log.error("The file specified %s is not a directory ", dirname)
			
 
				+        return
			
 
				+    log.info("Removing all newly-created files under %s", dirname)
			
 
				+    remove_cursive(dirname)
			
 
				+
			
 
				+def cli():
			
 
				+    args = Arguments()
			
 
				+    args.dst_dir = "D:/Temp/utf8"
			
 
				+    args.root = "D:/GitReposity/wmpplayer"
			
 
				+    cvt2utf8 = Convert2Utf8(args)
			
 
				+    cvt2utf8.run()
			
 
				+        
			
 
				+ 
			
 
				+if __name__ == '__main__':
			
 
				+    cli()