#!/usr/bin/env python
# -*- coding: utf-8 -*-
 
__author__ = ''
 
import logging, os, argparse, textwrap
import time
import chardet
import shutil
 
# Default configuration will take effect when corresponding input args are missing.
# Feel free to change this for your convenience.
DEFAULT_CONF = {
    # NOTE(review): 'add_BOM' and 'convert_UTF' are not read anywhere in the
    # code visible in this file -- presumably leftovers from a CLI-driven
    # version; confirm before relying on them.
    'add_BOM': False,
    'convert_UTF': False,
    # Minimum chardet confidence for a detected encoding to be trusted;
    # files scoring below it are copied unchanged instead of converted.
    'confi_thres': 0.8,
}
 
# We have to set a minimum confidence threshold (the 'confi_thres' entry above).
# Only encoding results returned by chardet whose confidence reaches that
# threshold are accepted; anything below it is treated as "undetectable".
# See https://github.com/x1angli/convert2utf/issues/4 for further details
 

    #for root, dirs, files in os.walk(dirname):
    #    for name in files:
    #        extension = os.path.splitext(name)[1][1:].strip().lower()
    #        fullname = os.path.join(root, name)
    #        # On linux there is a newline at the end which will cause the match to fail, so we just 'strip()' the '\n'
    #        # Also, add 'lower()' to ensure matching
    #        if (extension in self.args.skip_exts or name in self.args.skip_files):
    #            log.info("Skipped file %s", fullname)
    #            self.copy_file(fullname)
    #            continue

    #        try:
    #            self.convert_file(fullname)
    #        except IOError:
    #            log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
    #        except KeyboardInterrupt:
    #            log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
    #            exit()
    #    for name in dirs:
    #        fullname = os.path.join(root, name)
    #        if name in self.args.skip_dirs:
    #            log.info("Skipped dir %s", fullname)
    #        else:
    #            None
    #            # self.walk_dir(fullname)

 #, filename='D:\\gbk2utf8.txt'
# Configure the root logger once: INFO and above, rendered as "LEVEL:message".
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')

# Module-level logger shared by every function and class in this file.
log = logging.getLogger(__name__)
 
class Arguments:
    """Plain settings holder passed to ``Convert2Utf8``.

    Every attribute gets a default here; callers overwrite the ones they
    need (at minimum ``root`` and ``dst_dir``) after construction.
    """

    def __init__(self):
        # Source directory (or single file) to process, and the directory
        # that receives the converted/copied results.
        self.root = ""
        self.dst_dir = ""

        # Directory names that are not descended into.
        self.skip_dirs = ['.git', '.vs', 'out', 'build']
        # File names and lower-case extensions (without the dot) that are
        # copied as-is instead of being converted.
        self.skip_files = ['.git', '.vs']
        self.skip_exts = ['exe', 'xml', 'vcxproj', 'rc', 'rc2']

        # When False, files already detected as some UTF-* encoding are
        # copied unchanged rather than re-encoded.
        self.convert_utf = False

        # Encoding used for written files. The original note here mentioned
        # 'utf-8-sig' as an alternative value (Python's UTF-8 codec that
        # emits a BOM).
        self.target_encoding = 'utf-8'
 
class Convert2Utf8:
    """Mirror a source tree into a destination directory, re-encoding text files.

    Files whose encoding chardet detects with enough confidence are decoded
    and rewritten in ``args.target_encoding`` at the same relative path under
    ``args.dst_dir``. Files that are skipped by name/extension, empty,
    undetectable, undecodable, or (unless ``args.convert_utf`` is set)
    already in a UTF encoding are copied byte-for-byte instead. Files inside
    directories listed in ``args.skip_dirs`` are not mirrored at all.
    """

    def __init__(self, args):
        """Store the settings object.

        Args:
            args: object exposing ``root``, ``dst_dir``, ``skip_dirs``,
                ``skip_files``, ``skip_exts``, ``convert_utf`` and
                ``target_encoding`` (see ``Arguments``).
        """
        self.args = args

    def _dst_path(self, filename):
        """Return the path under ``args.dst_dir`` that mirrors ``filename``."""
        # relpath (instead of slicing off len(root) characters) stays correct
        # when root carries a trailing separator.
        relative = os.path.relpath(filename, self.args.root)
        if relative == os.curdir:
            # root *is* the file being processed (single-file mode); without
            # this the destination would be dst_dir itself, a directory.
            relative = os.path.basename(filename)
        return os.path.join(self.args.dst_dir, relative)

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain ``path`` if it is missing."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def copy_file(self, filename):
        """Copy ``filename`` unchanged to its mirrored location.

        Args:
            filename: path of a file located under ``args.root`` (or equal
                to it in single-file mode).

        Raises:
            OSError: if the destination directory cannot be created or the
                file cannot be copied.
        """
        dst_filename = self._dst_path(filename)
        self._ensure_parent_dir(dst_filename)
        shutil.copyfile(filename, dst_filename)
        log.info("copy file from %s to %s", filename, dst_filename)

    def walk_dir(self, dirname):
        """Recursively convert or copy every file below ``dirname``.

        Directories named in ``args.skip_dirs`` are not entered. Files whose
        name is in ``args.skip_files`` or whose extension is in
        ``args.skip_exts`` are copied verbatim; all others go through
        ``convert_file``.

        Raises:
            SystemExit: with status 1 when a file cannot be read/written or
                the user interrupts the run.
        """
        for name in os.listdir(dirname):
            fullname = os.path.join(dirname, name)

            if os.path.isdir(fullname):
                if name in self.args.skip_dirs:
                    log.info("Skipped dir %s", fullname)
                else:
                    self.walk_dir(fullname)
                continue

            # Extension without the leading dot, lower-cased so that the
            # comparison against skip_exts is case-insensitive.
            extension = os.path.splitext(name)[1][1:].strip().lower()
            if extension in self.args.skip_exts or name in self.args.skip_files:
                log.info("Skipped file %s", fullname)
                self.copy_file(fullname)
                continue

            try:
                self.convert_file(fullname)
            except IOError:
                log.error("Unable to read or write the file: %s. Please check the file's permission.", fullname)
                # Raise SystemExit directly: the exit() builtin is injected
                # by the site module and is not guaranteed to exist, and a
                # failed run should not report success (status 0).
                raise SystemExit(1)
            except KeyboardInterrupt:
                log.warning("Interrupted by keyboard (e.g. Ctrl+C)")
                raise SystemExit(1)

    def convert_file(self, filename):
        """Re-encode ``filename`` into ``args.target_encoding`` under ``dst_dir``.

        The file is copied unchanged instead when it is empty, when chardet
        cannot name an encoding with at least ``DEFAULT_CONF['confi_thres']``
        confidence, when it is already UTF-* and ``args.convert_utf`` is
        False, or when decoding with the detected encoding fails. On a
        successful conversion the source file's access and modification
        times are applied to the new file.

        Raises:
            OSError: if the source cannot be read or the destination cannot
                be written.
        """
        with open(filename, 'rb') as f:  # read under the binary mode
            bytedata = f.read()

        if not bytedata:
            log.info("Skipped empty file %s", filename)
            self.copy_file(filename)
            return

        chr_res = chardet.detect(bytedata)
        if chr_res['encoding'] is None or chr_res['confidence'] < DEFAULT_CONF['confi_thres']:
            log.warning("Ignoring %s, since its encoding is unable to detect.", filename)
            self.copy_file(filename)
            return
        src_enc = chr_res['encoding'].lower()
        log.debug("Scanned %s, whose encoding is %s ", filename, src_enc)

        # ASCII files are deliberately not special-cased: they are decoded
        # and re-encoded like any other non-UTF file.

        if (not self.args.convert_utf) and src_enc.startswith('utf'):
            log.info("Skipped %s, whose encoding is %s", filename, src_enc)
            self.copy_file(filename)
            return

        # chardet reports every GB-based encoding as 'gb2312', so decoding
        # fails when the text contains characters outside that set. Decode
        # with 'gb18030' instead, which is a superset of gb2312.
        if src_enc == 'gb2312':
            src_enc = 'gb18030'
        try:
            strdata = bytedata.decode(src_enc)
        except (UnicodeDecodeError, LookupError) as e:
            # LookupError: chardet named an encoding Python has no codec for.
            log.error("Unicode error for file %s: %s", filename, e)
            self.copy_file(filename)
            return

        # Remember the source timestamps so they can be applied to the copy.
        src_stat = os.stat(filename)

        tgt_enc = self.args.target_encoding
        dst_filename = self._dst_path(filename)
        self._ensure_parent_dir(dst_filename)

        log.debug("Writing the file: %s in %s", dst_filename, tgt_enc)
        with open(dst_filename, 'wb') as t:  # write under the binary mode
            t.write(strdata.encode(tgt_enc))
        log.info("Converted the file: %s from %s to %s", filename, src_enc, tgt_enc)

        # os.utime takes (atime, mtime). st_ctime is the metadata-change or
        # creation time, not the modification time, so st_mtime is used.
        os.utime(dst_filename, times=(src_stat.st_atime, src_stat.st_mtime))

    def run(self):
        """Process ``args.root`` (a directory tree or a single file).

        WARNING: if ``args.dst_dir`` already exists as a directory, its
        entire content is deleted via ``clean_backups`` before any file is
        written; otherwise the directory is created.
        """
        root = self.args.root
        if not os.path.exists(root):
            log.error("The file specified %s is neither a directory nor a regular file", root)
            return

        dst = self.args.dst_dir
        if not os.path.isdir(dst):
            log.info("make dst directory")
            os.makedirs(dst)
        else:
            clean_backups(dst)

        log.info("Start working now!")

        if os.path.isdir(root):
            log.info("The root is: %s. ", root)
            self.walk_dir(root)
        else:
            log.info("Wow, only a single file will be processed: %s", root)
            self.convert_file(root)

        log.info("Finished all.")
 
def remove_cursive(dirname):
    """Delete everything inside ``dirname`` while keeping ``dirname`` itself.

    Args:
        dirname: directory whose files and sub-directories are removed.

    Raises:
        OSError: if a file or directory cannot be removed.
    """
    # A single bottom-up walk reaches each directory only after its content
    # is gone, so no manual recursion is needed and the walk never tries to
    # descend into a directory that was just deleted.
    for root, dirs, files in os.walk(dirname, topdown=False):
        for name in files:
            fullname = os.path.join(root, name)
            os.remove(fullname)
            log.info("Removed the file: %s", fullname)
        for name in dirs:
            fullname = os.path.join(root, name)
            if os.path.islink(fullname):
                # Symlink to a directory: remove only the link and leave the
                # target's content alone (os.rmdir would fail on the link).
                os.remove(fullname)
            else:
                os.rmdir(fullname)
            log.info("Removed the dir: %s", fullname)

def clean_backups(dirname):
    """Empty the directory ``dirname``; log an error if it is not a directory.

    Args:
        dirname: directory whose content is handed to ``remove_cursive``.
    """
    if os.path.isdir(dirname):
        log.info("Removing all newly-created files under %s", dirname)
        remove_cursive(dirname)
    else:
        log.error("The file specified %s is not a directory ", dirname)

def cli():
    """Run one conversion using the hard-coded source and destination paths."""
    settings = Arguments()
    settings.root = "D:/GitReposity/wmpplayer"
    settings.dst_dir = "D:/Temp/utf8"
    Convert2Utf8(settings).run()
        
 
# Start the conversion only when this file is executed as a script, so that
# importing the module has no side effects beyond the logging setup.
if __name__ == '__main__':
    cli()