123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
-
- __author__ = ''
-
- import logging, os, argparse, textwrap
- import time
- import chardet
- import shutil
-
# Fallback settings that apply whenever the corresponding input argument is not given.
# Adjust these values to suit your own workflow.
DEFAULT_CONF = dict(
    add_BOM=False,
    convert_UTF=False,
    confi_thres=0.8,  # minimum chardet confidence for a detected encoding to be trusted
)
-
# We have to set a minimum confidence threshold: an encoding reported by chardet is
# accepted only when its confidence is at or above that threshold.
# See https://github.com/x1angli/convert2utf/issues/4 for further details.
-
- #for root, dirs, files in os.walk(dirname):
- # for name in files:
- # extension = os.path.splitext(name)[1][1:].strip().lower()
- # fullname = os.path.join(root, name)
- # # On linux there is a newline at the end which will cause the match to fail, so we just 'strip()' the '\n'
- # # Also, add 'lower()' to ensure matching
- # if (extension in self.args.skip_exts or name in self.args.skip_files):
- # log.info("Skipped file %s", fullname)
- # self.copy_file(fullname)
- # continue
- # try:
- # self.convert_file(fullname)
- # except IOError:
- # log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
- # except KeyboardInterrupt:
- # log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
- # exit()
- # for name in dirs:
- # fullname = os.path.join(root, name)
- # if name in self.args.skip_dirs:
- # log.info("Skipped dir %s", fullname)
- # else:
- # None
- # # self.walk_dir(fullname)
- #, filename='D:\\gbk2utf8.txt'
# Configure the root logger once: records at INFO and above, printed as "LEVEL:message".
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')
# Module-wide logger used by every class and function in this file.
log = logging.getLogger(__name__)
-
class Arguments:
    """Settings bundle read by ``Convert2Utf8``.

    Every setting can be passed as a keyword argument; calling ``Arguments()``
    with no arguments yields the same values the class has always used, and the
    attributes may still be assigned after construction.

    Attributes:
        root: Source file or directory to convert.
        dst_dir: Directory that receives the converted/copied files.
        skip_dirs: Directory names that are not descended into.
        skip_files: File names that are copied without conversion.
        skip_exts: Lower-case extensions (without the dot) that are copied
            without conversion.
        convert_utf: When False, files already detected as some ``utf*``
            encoding are copied instead of re-encoded.
        target_encoding: Codec name used to encode the output, e.g. ``'utf-8'``
            (``'utf-8-sig'`` additionally writes a BOM).
    """

    def __init__(self, *, root="", dst_dir="", skip_dirs=None, skip_files=None,
                 skip_exts=None, convert_utf=False, target_encoding='utf-8'):
        self.dst_dir = dst_dir
        self.root = root
        # Build fresh lists per instance so mutating one object's skip lists
        # never leaks into another (no shared mutable defaults).
        self.skip_dirs = (['.git', '.vs', 'out', 'build']
                          if skip_dirs is None else list(skip_dirs))
        self.skip_files = [".git", ".vs"] if skip_files is None else list(skip_files)
        self.skip_exts = (["exe", "xml", "vcxproj", "rc", "rc2"]
                          if skip_exts is None else list(skip_exts))
        self.convert_utf = convert_utf
        self.target_encoding = target_encoding
-
class Convert2Utf8:
    """Mirror a source file or directory tree into ``args.dst_dir`` as re-encoded text.

    Each file whose encoding chardet detects with enough confidence is decoded and
    written to the destination in ``args.target_encoding``. Files that are skipped
    by name/extension, empty, undetectable or undecodable are copied unchanged.

    ``args`` is any object exposing ``root``, ``dst_dir``, ``skip_dirs``,
    ``skip_files``, ``skip_exts``, ``convert_utf`` and ``target_encoding``.
    """

    def __init__(self, args):
        self.args = args

    def _dst_path(self, filename):
        """Return where *filename* (located under ``args.root``) lands in ``args.dst_dir``."""
        relative = os.path.relpath(filename, self.args.root)
        if relative == os.curdir:
            # root is the file itself (single-file mode): keep just its name,
            # otherwise the destination would be the output directory itself.
            relative = os.path.basename(filename)
        return os.path.join(self.args.dst_dir, relative)

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain *path* if it does not exist yet."""
        parent = os.path.dirname(path)
        if parent:  # an empty parent means the current directory; nothing to create
            os.makedirs(parent, exist_ok=True)

    def copy_file(self, filename):
        """Copy *filename* byte-for-byte to its mirrored location under ``args.dst_dir``."""
        dst_filename = self._dst_path(filename)
        self._ensure_parent_dir(dst_filename)
        shutil.copyfile(filename, dst_filename)
        log.info("copy file from %s to %s", filename, dst_filename)

    def walk_dir(self, dirname):
        """Recursively process every entry below *dirname*.

        Directories named in ``args.skip_dirs`` are not entered. Files whose name is
        in ``args.skip_files`` or whose extension is in ``args.skip_exts`` are copied
        as-is; every other file goes through ``convert_file``.

        Raises:
            SystemExit: with status 1 when a file cannot be read/written or the
                user interrupts the run.
        """
        for entry in os.listdir(dirname):
            fullname = os.path.join(dirname, entry)
            if os.path.isdir(fullname):
                if entry in self.args.skip_dirs:
                    log.info("Skipped dir %s", fullname)
                else:
                    self.walk_dir(fullname)
                continue

            # Normalise the extension ("File.TXT" -> "txt") so it matches skip_exts.
            extension = os.path.splitext(entry)[1][1:].strip().lower()
            if extension in self.args.skip_exts or entry in self.args.skip_files:
                log.info("Skipped file %s", fullname)
                self.copy_file(fullname)
                continue
            try:
                self.convert_file(fullname)
            except IOError:
                log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
                # SystemExit directly: the `exit` helper only exists when the site
                # module is loaded, and it would report success (status 0).
                raise SystemExit(1)
            except KeyboardInterrupt:
                log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
                raise SystemExit(1)

    def convert_file(self, filename):
        """Re-encode *filename* into ``args.target_encoding`` under ``args.dst_dir``.

        The file is copied unchanged instead when it is empty, when chardet reports
        no encoding or a confidence below ``DEFAULT_CONF['confi_thres']``, when it is
        already some ``utf*`` encoding and ``args.convert_utf`` is false, or when
        decoding with the detected encoding fails. The converted file receives the
        access and modification times of the source.
        """
        with open(filename, 'rb') as f:  # raw bytes: the encoding is not known yet
            bytedata = f.read()

        if not bytedata:
            log.info("Skipped empty file %s", filename)
            self.copy_file(filename)
            return

        chr_res = chardet.detect(bytedata)
        if chr_res['encoding'] is None or chr_res['confidence'] < DEFAULT_CONF['confi_thres']:
            log.warning("Ignoring %s, since its encoding is unable to detect.", filename)
            self.copy_file(filename)
            return
        src_enc = chr_res['encoding'].lower()
        log.debug("Scanned %s, whose encoding is %s ", filename, src_enc)

        # NOTE: ASCII files are not special-cased (an earlier skip was disabled);
        # they fall through and are re-encoded like any other detected encoding.

        if (not self.args.convert_utf) and src_enc.startswith('utf'):
            log.info("Skipped %s, whose encoding is %s", filename, src_enc)
            self.copy_file(filename)
            return

        # chardet reports every GB-family encoding as 'gb2312', so decoding fails when
        # the text contains characters outside that set. Decode with 'gb18030', a
        # larger character set than gb2312, to tolerate those characters.
        if src_enc == 'gb2312':
            src_enc = 'gb18030'
        try:
            strdata = bytedata.decode(src_enc)
        except UnicodeDecodeError as e:
            log.error("Unicode error for file %s: %s", filename, e)
            self.copy_file(filename)
            return

        # Capture the source's timestamps so they can be restored on the output.
        src_stat = os.stat(filename)

        tgt_enc = self.args.target_encoding
        dst_filename = self._dst_path(filename)
        self._ensure_parent_dir(dst_filename)
        log.debug("Writing the file: %s in %s", dst_filename, tgt_enc)
        with open(dst_filename, 'wb') as t:  # binary mode: we write encoded bytes
            t.write(strdata.encode(tgt_enc))
        log.info("Converted the file: %s from %s to %s", filename, src_enc, tgt_enc)

        # os.utime takes (atime, mtime); st_ctime is not the modification time.
        os.utime(dst_filename, times=(src_stat.st_atime, src_stat.st_mtime))

    def run(self):
        """Convert ``args.root`` (a single file or a whole tree) into ``args.dst_dir``.

        Logs an error and returns when ``args.root`` does not exist. The destination
        directory is created when missing; when it already exists its current
        contents are removed first via ``clean_backups``.
        """
        root = self.args.root
        if not os.path.exists(root):
            log.error("The file specified %s is neither a directory nor a regular file", root)
            return

        dst = self.args.dst_dir
        if not os.path.isdir(dst):
            log.info("make dst directory")
            os.makedirs(dst)
        else:
            clean_backups(dst)
        log.info("Start working now!")

        if os.path.isdir(root):
            log.info("The root is: %s. ", root)
            self.walk_dir(root)
        else:
            log.info("Wow, only a single file will be processed: %s", root)
            self.convert_file(root)

        log.info("Finished all.")
-
def remove_cursive(dirname):
    """Delete every file and sub-directory beneath *dirname*, keeping *dirname* itself.

    The tree is walked once, bottom-up, so each directory is already empty by the
    time it is removed. Symbolic links are unlinked rather than followed, so the
    contents of a link's target are never touched.
    """
    for root, dirs, files in os.walk(dirname, topdown=False):
        for name in files:
            fullname = os.path.join(root, name)
            os.remove(fullname)
            log.info("Removed the file: %s", fullname)
        for name in dirs:
            fullname = os.path.join(root, name)
            if os.path.islink(fullname):
                # os.walk lists directory symlinks under `dirs` but does not
                # descend into them; os.rmdir would fail on the link itself.
                os.unlink(fullname)
            else:
                os.rmdir(fullname)
            log.info("Removed the dir: %s", fullname)
def clean_backups(dirname):
    """Empty the directory *dirname*; log an error and do nothing if it is not a directory."""
    if os.path.isdir(dirname):
        log.info("Removing all newly-created files under %s", dirname)
        remove_cursive(dirname)
    else:
        log.error("The file specified %s is not a directory ", dirname)
def cli(root="D:/GitReposity/wmpplayer", dst_dir="D:/Temp/utf8"):
    """Run one conversion of *root* into *dst_dir* using the default ``Arguments``.

    Both parameters default to the paths this script has always used, so calling
    ``cli()`` with no arguments behaves exactly as before.

    Args:
        root: Source file or directory to convert.
        dst_dir: Output directory. If it already exists, ``Convert2Utf8.run``
            removes its current contents before converting.
    """
    args = Arguments()
    args.dst_dir = dst_dir
    args.root = root
    Convert2Utf8(args).run()
-
-
# Start the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    cli()
|