GBK2Utf8.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. __author__ = ''
  4. import logging, os, argparse, textwrap
  5. import time
  6. import chardet
  7. import shutil
  8. # Default configuration will take effect when corresponding input args are missing.
  9. # Feel free to change this for your convenience.
  10. DEFAULT_CONF = {
  11. 'add_BOM' : False,
  12. 'convert_UTF' : False,
  13. 'confi_thres' : 0.8,
  14. }
# We have to set a minimum confidence threshold. Only the encoding guesses returned by
# chardet whose confidence reaches that threshold are accepted.
# See https://github.com/x1angli/convert2utf/issues/4 for further details.
  18. #for root, dirs, files in os.walk(dirname):
  19. # for name in files:
  20. # extension = os.path.splitext(name)[1][1:].strip().lower()
  21. # fullname = os.path.join(root, name)
  22. # # On linux there is a newline at the end which will cause the match to fail, so we just 'strip()' the '\n'
  23. # # Also, add 'lower()' to ensure matching
  24. # if (extension in self.args.skip_exts or name in self.args.skip_files):
  25. # log.info("Skipped file %s", fullname)
  26. # self.copy_file(fullname)
  27. # continue
  28. # try:
  29. # self.convert_file(fullname)
  30. # except IOError:
  31. # log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
  32. # except KeyboardInterrupt:
  33. # log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
  34. # exit()
  35. # for name in dirs:
  36. # fullname = os.path.join(root, name)
  37. # if name in self.args.skip_dirs:
  38. # log.info("Skipped dir %s", fullname)
  39. # else:
  40. # None
  41. # # self.walk_dir(fullname)
  42. #, filename='D:\\gbk2utf8.txt'
  43. logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
  44. log = logging.getLogger(__name__)
  45. class Arguments:
  46. def __init__(self):
  47. self.dst_dir = ""
  48. self.root = ""
  49. self.skip_dirs = ['.git', '.vs', 'out', 'build']
  50. self.skip_files = [".git", ".vs"]
  51. self.skip_exts = ["exe", "xml", "vcxproj", "rc", "rc2"]
  52. self.convert_utf = False
  53. """utf-8-sig"""
  54. self.target_encoding = 'utf-8'
  55. class Convert2Utf8:
  56. def __init__(self, args):
  57. self.args = args
  58. def copy_file(self, filename):
  59. dst_dir = self.args.dst_dir
  60. src_dir_length = len(self.args.root)
  61. relate_filename = filename[src_dir_length:]
  62. dst_filename = dst_dir + relate_filename
  63. (filepath, tempfilename) = os.path.split(dst_filename)
  64. (filenameonly, extension) = os.path.splitext(tempfilename)
  65. if not os.path.exists(filepath):
  66. os.makedirs(filepath)
  67. shutil.copyfile(filename, dst_filename)
  68. log.info("copy file from %s to %s", filename, dst_filename)
  69. def walk_dir(self, dirname):
  70. filelist = os.listdir(dirname)
  71. for file in filelist:
  72. fullname = os.path.join(dirname, file)
  73. if os.path.isdir(fullname):
  74. if file in self.args.skip_dirs:
  75. log.info("Skipped dir %s", fullname)
  76. else:
  77. self.walk_dir(fullname)
  78. else:
  79. extension = os.path.splitext(file)[1][1:].strip().lower()
  80. if (extension in self.args.skip_exts or file in self.args.skip_files):
  81. log.info("Skipped file %s", fullname)
  82. self.copy_file(fullname)
  83. continue
  84. try:
  85. self.convert_file(fullname)
  86. except IOError:
  87. log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
  88. exit()
  89. except KeyboardInterrupt:
  90. log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
  91. exit()
  92. def convert_file(self, filename):
  93. with open(filename, 'rb') as f: # read under the binary mode
  94. bytedata = f.read()
  95. if len(bytedata) == 0:
  96. log.info("Skipped empty file %s", filename)
  97. self.copy_file(filename)
  98. return
  99. chr_res = chardet.detect(bytedata)
  100. if chr_res['encoding'] == None or chr_res['confidence'] < DEFAULT_CONF['confi_thres']:
  101. log.warning("Ignoring %s, since its encoding is unable to detect.", filename)
  102. self.copy_file(filename)
  103. return
  104. src_enc = chr_res['encoding'].lower()
  105. log.debug("Scanned %s, whose encoding is %s ", filename, src_enc)
  106. #if (src_enc == 'ascii'):
  107. # log.info("Skipped %s, whose encoding is %s", filename, src_enc)
  108. # self.copy_file(filename)
  109. # return
  110. if (not self.args.convert_utf) and src_enc.startswith('utf'):
  111. log.info("Skipped %s, whose encoding is %s", filename, src_enc)
  112. self.copy_file(filename)
  113. return
  114. # Since chardet only recognized all GB-based target_encoding as 'gb2312', the decoding will fail when the text file
  115. # contains certain special charaters. To make it more special-character-tolerant, we should
  116. # upgrade the target_encoding to 'gb18030', which is a character set larger than gb2312.
  117. if src_enc.lower() == 'gb2312':
  118. src_enc = 'gb18030'
  119. try:
  120. strdata = bytedata.decode(src_enc)
  121. except UnicodeDecodeError as e:
  122. log.error("Unicode error for file %s", filename)
  123. print(e)
  124. copy_file(filename)
  125. return
  126. # preserving file time information (modification time and access time)
  127. src_stat = os.stat(filename)
  128. tgt_enc = self.args.target_encoding
  129. dst_dir = self.args.dst_dir
  130. src_dir_length = len(self.args.root)
  131. relate_filename = filename[src_dir_length:]
  132. dst_filename = dst_dir + relate_filename
  133. (filepath, tempfilename) = os.path.split(dst_filename)
  134. if not os.path.exists(filepath):
  135. os.makedirs(filepath)
  136. log.debug("Writing the file: %s in %s", dst_filename, tgt_enc)
  137. with open(dst_filename, 'wb') as t: # write under the binary mode
  138. log.info("wreite")
  139. t.write(strdata.encode(tgt_enc))
  140. log.info("Converted the file: %s from %s to %s", filename, src_enc, tgt_enc)
  141. # setting the new file's time to the old file
  142. os.utime(dst_filename, times = (src_stat.st_atime, src_stat.st_ctime))
  143. # end of def convert_file(self, filename)
  144. def run(self):
  145. root = self.args.root
  146. if not os.path.exists(root):
  147. log.error("The file specified %s is neither a directory nor a regular file", root)
  148. return
  149. dst = self.args.dst_dir
  150. if not os.path.exists(dst) or not os.path.isdir(dst):
  151. log.info("make dst directory")
  152. os.makedirs(dst)
  153. else:
  154. clean_backups(dst)
  155. log.info("Start working now!")
  156. if os.path.isdir(root):
  157. log.info("The root is: %s. ", root)
  158. self.walk_dir(root)
  159. else:
  160. log.info("Wow, only a single file will be processed: %s", root)
  161. self.convert_file(root)
  162. log.info("Finished all.")
  163. # end of def run(self, root):
  164. def remove_cursive(dirname):
  165. for root, dirs, files in os.walk(dirname):
  166. for name in files:
  167. fullname = os.path.join(root, name)
  168. os.remove(fullname)
  169. log.info("Removed the file: %s", fullname)
  170. for name in dirs:
  171. fullname = os.path.join(root, name)
  172. remove_cursive(fullname)
  173. os.rmdir(fullname)
  174. log.info("Removed the dir: %s", fullname)
  175. def clean_backups(dirname):
  176. if not os.path.isdir(dirname):
  177. log.error("The file specified %s is not a directory ", dirname)
  178. return
  179. log.info("Removing all newly-created files under %s", dirname)
  180. remove_cursive(dirname)
  181. def cli():
  182. args = Arguments()
  183. args.dst_dir = "D:/Temp/utf8"
  184. args.root = "D:/GitReposity/wmpplayer"
  185. cvt2utf8 = Convert2Utf8(args)
  186. cvt2utf8.run()
# Start the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    cli()