Browse Source

#IQBX #comment utf8编码转换源码

80374374 1 năm trước cách đây
mục cha
commit
a4d6797288
1 tập tin đã thay đổi với 221 bổ sung và 0 xóa
  1. 221 0
      addin/GBK2Utf8.py

+ 221 - 0
addin/GBK2Utf8.py

@@ -0,0 +1,221 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+ 
+__author__ = ''
+ 
+import logging, os, argparse, textwrap
+import time
+import chardet
+import shutil
+ 
+# Default configuration will take effect when corresponding input args are missing.
+# Feel free to change this for your convenience.
+DEFAULT_CONF = {
+    # NOTE(review): 'add_BOM' and 'convert_UTF' are not read anywhere in the
+    # code shown in this file -- confirm whether they are still needed.
+    'add_BOM': False,
+    'convert_UTF': False,
+    # Minimum chardet confidence; detections below this value are rejected
+    # and the file is copied unchanged instead of being converted.
+    'confi_thres': 0.8,
+}
+ 
+# We have to set a minimum confidence threshold ('confi_thres' above). Only encoding
+# detection results returned by chardet whose confidence reaches that threshold are accepted.
+# See https://github.com/x1angli/convert2utf/issues/4 for further details
+ 
+
+    #for root, dirs, files in os.walk(dirname):
+    #    for name in files:
+    #        extension = os.path.splitext(name)[1][1:].strip().lower()
+    #        fullname = os.path.join(root, name)
+    #        # On linux there is a newline at the end which will cause the match to fail, so we just 'strip()' the '\n'
+    #        # Also, add 'lower()' to ensure matching
+    #        if (extension in self.args.skip_exts or name in self.args.skip_files):
+    #            log.info("Skipped file %s", fullname)
+    #            self.copy_file(fullname)
+    #            continue
+
+    #        try:
+    #            self.convert_file(fullname)
+    #        except IOError:
+    #            log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
+    #        except KeyboardInterrupt:
+    #            log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
+    #            exit()
+    #    for name in dirs:
+    #        fullname = os.path.join(root, name)
+    #        if name in self.args.skip_dirs:
+    #            log.info("Skipped dir %s", fullname)
+    #        else:
+    #            None
+    #            # self.walk_dir(fullname)
+
+# Configure the root logger: INFO level, messages formatted as "LEVEL:message".
+# NOTE(review): the fragment below looks like a leftover optional basicConfig
+# argument for logging to a file -- confirm before re-enabling it.
+#     , filename='D:\\gbk2utf8.txt'
+logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
+# Module-level logger used by every function and class in this file.
+log = logging.getLogger(__name__)
+ 
+class Arguments:
+    """Plain settings holder consumed by Convert2Utf8.
+
+    Every attribute gets a default here; callers overwrite the ones they
+    need (at least ``root`` and ``dst_dir``) after construction.
+    """
+
+    def __init__(self):
+        # Source location and output directory; both start out empty.
+        self.root = ""
+        self.dst_dir = ""
+
+        # Directory names that are logged and not descended into.
+        self.skip_dirs = ['.git', '.vs', 'out', 'build']
+        # File names and (lower-case, dot-less) extensions that are copied
+        # as-is instead of being converted.
+        self.skip_files = [".git", ".vs"]
+        self.skip_exts = ["exe", "xml", "vcxproj", "rc", "rc2"]
+
+        # When False, files already detected as some UTF encoding are copied
+        # unchanged rather than re-encoded.
+        self.convert_utf = False
+
+        # Encoding used for the converted output. The original code noted
+        # "utf-8-sig" here -- presumably as the alternative that writes a
+        # BOM (TODO confirm).
+        self.target_encoding = 'utf-8'
+ 
+class Convert2Utf8:
+    """Re-encode text files from a source tree into a destination directory.
+
+    Files are read from ``args.root`` and written under ``args.dst_dir`` at
+    the same relative path. Files that are skipped, empty, undetectable or
+    undecodable are copied byte-for-byte instead of being converted.
+
+    Args:
+        args: Settings object providing ``root``, ``dst_dir``, ``skip_dirs``,
+            ``skip_files``, ``skip_exts``, ``convert_utf`` and
+            ``target_encoding`` (see ``Arguments``).
+    """
+
+    def __init__(self, args):
+        self.args = args
+
+    def _dst_path(self, filename):
+        """Return the destination path that mirrors source path `filename`."""
+        root = self.args.root
+        if os.path.isdir(root):
+            relative = os.path.relpath(filename, root)
+        else:
+            # root names a single file: there is no tree to mirror, so the
+            # output keeps just the file name inside dst_dir.
+            relative = os.path.basename(filename)
+        return os.path.join(self.args.dst_dir, relative)
+
+    @staticmethod
+    def _ensure_parent_dir(path):
+        """Create the directory that will contain `path` if it is missing."""
+        parent = os.path.dirname(path)
+        if parent:
+            # exist_ok avoids the check-then-create race of a separate exists() test.
+            os.makedirs(parent, exist_ok=True)
+
+    def copy_file(self, filename):
+        """Copy `filename` unchanged to its mirrored location under dst_dir.
+
+        Args:
+            filename: Path of a source file located under ``args.root``.
+        """
+        dst_filename = self._dst_path(filename)
+        self._ensure_parent_dir(dst_filename)
+        shutil.copyfile(filename, dst_filename)
+        log.info("copy file from %s to %s", filename, dst_filename)
+
+    def walk_dir(self, dirname):
+        """Recursively process every entry below `dirname`.
+
+        Directories named in ``args.skip_dirs`` are logged and ignored. Files
+        matching ``args.skip_exts`` / ``args.skip_files`` are copied as-is;
+        all other files go through ``convert_file``.
+
+        Raises:
+            SystemExit: After an I/O error on a file or a keyboard interrupt.
+        """
+        for entry in os.listdir(dirname):
+            fullname = os.path.join(dirname, entry)
+            if os.path.isdir(fullname):
+                if entry in self.args.skip_dirs:
+                    log.info("Skipped dir %s", fullname)
+                else:
+                    self.walk_dir(fullname)
+                continue
+
+            # strip() + lower() so the comparison against skip_exts is tolerant
+            # of stray whitespace and letter case.
+            extension = os.path.splitext(entry)[1][1:].strip().lower()
+            if extension in self.args.skip_exts or entry in self.args.skip_files:
+                log.info("Skipped file %s", fullname)
+                self.copy_file(fullname)
+                continue
+
+            try:
+                self.convert_file(fullname)
+            except IOError:
+                log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
+                # Same effect as the site-provided exit(), without relying on
+                # the interactive helper being installed.
+                raise SystemExit
+            except KeyboardInterrupt:
+                log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
+                raise SystemExit
+
+    def convert_file(self, filename):
+        """Convert one file to ``args.target_encoding`` under dst_dir.
+
+        The file is copied unchanged instead when it is empty, when chardet
+        cannot detect its encoding with at least
+        ``DEFAULT_CONF['confi_thres']`` confidence, when it is already some
+        UTF encoding and ``args.convert_utf`` is false, or when decoding fails.
+
+        Args:
+            filename: Path of the source file.
+
+        Raises:
+            IOError: If the source cannot be read or the target cannot be written.
+        """
+        with open(filename, 'rb') as f:  # read under the binary mode
+            bytedata = f.read()
+
+        if not bytedata:
+            log.info("Skipped empty file %s", filename)
+            self.copy_file(filename)
+            return
+
+        chr_res = chardet.detect(bytedata)
+        if chr_res['encoding'] is None or chr_res['confidence'] < DEFAULT_CONF['confi_thres']:
+            log.warning("Ignoring %s, since its encoding is unable to detect.", filename)
+            self.copy_file(filename)
+            return
+        src_enc = chr_res['encoding'].lower()
+        log.debug("Scanned %s, whose encoding is %s ", filename, src_enc)
+
+        if (not self.args.convert_utf) and src_enc.startswith('utf'):
+            log.info("Skipped %s, whose encoding is %s", filename, src_enc)
+            self.copy_file(filename)
+            return
+
+        # chardet reports every GB-based encoding as 'gb2312', and decoding
+        # then fails on characters outside that set. gb18030 is a larger
+        # character set, so decoding with it is more tolerant.
+        if src_enc == 'gb2312':
+            src_enc = 'gb18030'
+        try:
+            strdata = bytedata.decode(src_enc)
+        except (UnicodeDecodeError, LookupError) as e:
+            # LookupError covers a detected encoding name Python has no codec for.
+            log.error("Unicode error for file %s", filename)
+            log.error("%s", e)
+            self.copy_file(filename)
+            return
+
+        # Remember the source timestamps so they can be applied to the output.
+        src_stat = os.stat(filename)
+
+        tgt_enc = self.args.target_encoding
+        dst_filename = self._dst_path(filename)
+        self._ensure_parent_dir(dst_filename)
+
+        log.debug("Writing the file: %s in %s", dst_filename, tgt_enc)
+        with open(dst_filename, 'wb') as t:  # write under the binary mode
+            t.write(strdata.encode(tgt_enc))
+        log.info("Converted the file: %s from %s to %s", filename, src_enc, tgt_enc)
+
+        # os.utime expects (atime, mtime). st_ctime is the metadata-change
+        # time on Unix and the creation time on Windows, not the
+        # modification time, so st_mtime is the value to carry over.
+        os.utime(dst_filename, times=(src_stat.st_atime, src_stat.st_mtime))
+
+    def run(self):
+        """Prepare the destination directory, then convert ``args.root``.
+
+        An existing destination directory is emptied first via
+        ``clean_backups``. Problems with the configured paths are logged and
+        the method returns without doing any work.
+        """
+        root = self.args.root
+        if not os.path.exists(root):
+            log.error("The file specified %s is neither a directory nor a regular file", root)
+            return
+
+        dst = self.args.dst_dir
+        if not os.path.isdir(dst):
+            if os.path.exists(dst):
+                # makedirs would raise here; report it like the other path errors.
+                log.error("The file specified %s is not a directory ", dst)
+                return
+            log.info("make dst directory")
+            os.makedirs(dst)
+        else:
+            # Emptying dst would destroy the input when the source is dst
+            # itself or lives inside it, so refuse instead of deleting.
+            src_abs = os.path.normcase(os.path.abspath(root))
+            dst_abs = os.path.normcase(os.path.abspath(dst))
+            if src_abs == dst_abs or src_abs.startswith(dst_abs.rstrip(os.sep) + os.sep):
+                log.error("Refusing to clean %s because it contains the source %s", dst, root)
+                return
+            clean_backups(dst)
+
+        log.info("Start working now!")
+
+        if os.path.isdir(root):
+            log.info("The root is: %s. ", root)
+            self.walk_dir(root)
+        else:
+            log.info("Wow, only a single file will be processed: %s", root)
+            self.convert_file(root)
+
+        log.info("Finished all.")
+ 
+def remove_cursive(dirname):
+    """Delete everything below `dirname`, leaving `dirname` itself in place.
+
+    The tree is walked bottom-up, so each directory is already empty when it
+    is removed and nothing is visited twice. Symbolic links are unlinked
+    rather than followed, so their targets are never touched.
+
+    Args:
+        dirname: Directory whose contents are removed.
+
+    Raises:
+        OSError: If a file or directory cannot be removed.
+    """
+    for root, dirs, files in os.walk(dirname, topdown=False):
+        for name in files:
+            fullname = os.path.join(root, name)
+            os.remove(fullname)
+            log.info("Removed the file: %s", fullname)
+        for name in dirs:
+            fullname = os.path.join(root, name)
+            if os.path.islink(fullname):
+                # os.walk lists links to directories here without descending
+                # into them; os.rmdir would fail on the link itself.
+                os.unlink(fullname)
+            else:
+                os.rmdir(fullname)
+            log.info("Removed the dir: %s", fullname)
+
+def clean_backups(dirname):
+    """Empty the directory `dirname`; log an error if it is not a directory.
+
+    Args:
+        dirname: Directory whose contents are handed to ``remove_cursive``.
+    """
+    if os.path.isdir(dirname):
+        log.info("Removing all newly-created files under %s", dirname)
+        remove_cursive(dirname)
+    else:
+        log.error("The file specified %s is not a directory ", dirname)
+
+def cli(argv=None):
+    """Build the settings from command-line style arguments and run the converter.
+
+    Args:
+        argv: Optional list of argument strings: ``[root] [-o DST_DIR]``.
+            ``None`` (the default) means "no arguments", in which case the
+            built-in default paths are used, exactly as before this
+            parameter existed.
+    """
+    parser = argparse.ArgumentParser(
+        description="Convert text files under ROOT to UTF-8 into a separate directory.")
+    parser.add_argument('root', nargs='?', default="D:/GitReposity/wmpplayer",
+                        help="source directory or single file (default: %(default)s)")
+    parser.add_argument('-o', '--dst-dir', dest='dst_dir', default="D:/Temp/utf8",
+                        help="output directory; existing contents are removed "
+                             "(default: %(default)s)")
+    # Never read sys.argv implicitly: a program that imports this module and
+    # calls cli() must not have its own command line parsed here.
+    opts = parser.parse_args([] if argv is None else argv)
+
+    args = Arguments()
+    args.dst_dir = opts.dst_dir
+    args.root = opts.root
+    cvt2utf8 = Convert2Utf8(args)
+    cvt2utf8.run()
+
+
+if __name__ == '__main__':
+    import sys
+    cli(sys.argv[1:])