pcacc
/
myvtm


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606
							#include "precompile.h"
#include "charset.h"
#include <stdlib.h>  
#include <stdio.h>  
#include <fcntl.h>  
#include <string.h>  

#ifdef _WIN32
#include <windows.h>
#else
#include <unistd.h>  
#include <iconv.h>
#include <locale.h>
#include <sys/stat.h>
#include <errno.h>
#endif

#define TAG TOOLKIT_TAG("charaset")


// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

/*
 * DFA state values used by decode() and validate_utf8().
 * UTF8_ACCEPT is the start state: validate_utf8() begins in it and decode()
 * treats it as "the next byte starts a new code point".
 * UTF8_REJECT is the error state: validate_utf8() stops scanning once it is
 * reached.
 */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/*
 * Lookup table for the UTF-8 decoding automaton (see the URL in the
 * copyright notice above for the design).
 *
 * Layout, as used by decode() and validate_utf8():
 *   - entries [0, 255]   : indexed by the input byte, yield its character class;
 *   - entries [256, ...) : indexed by 256 + state * 16 + class, yield the
 *                          next state.
 * The trailing comments on each row name the byte range (first part) or the
 * state rows (second part) that the row covers.
 */
static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/*
 * Feeds one byte into the UTF-8 automaton.
 *
 * state: in/out automaton state; start with UTF8_ACCEPT.
 * codep: in/out code point accumulator.
 * byte:  next input byte; used as an index into the 256-entry class part
 *        of utf8d, so it must be in [0, 255].
 *
 * Returns the new state (also stored in *state).
 */
static uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte)
{
    const uint32_t type = utf8d[byte];

    if (*state == UTF8_ACCEPT) {
        /* First byte of a sequence: keep only the payload bits of the lead byte. */
        *codep = (0xff >> type) & (byte);
    } else {
        /* Continuation byte: append its low six bits to the accumulator. */
        *codep = (*codep << 6) | (byte & 0x3fu);
    }

    *state = utf8d[256 + *state * 16 + type];
    return *state;
}

/*
 * Runs the UTF-8 automaton over the first len bytes of str.
 *
 * Only the state is tracked (no code point is assembled), so this is the
 * state-transition half of decode().
 *
 * Returns the final automaton state: UTF8_ACCEPT when the buffer ends in the
 * start state, UTF8_REJECT as soon as an invalid byte is met, or another
 * intermediate state value when the buffer ends in the middle of a sequence.
 */
uint32_t validate_utf8(char* str, size_t len)
{
    uint32_t state = UTF8_ACCEPT;
    size_t pos = 0;

    while (pos < len) {
        const uint32_t type = utf8d[(uint8_t)str[pos]];

        state = utf8d[256 + (state) * 16 + type];
        if (state == UTF8_REJECT) {
            return state;
        }
        ++pos;
    }
    return state;
}

#ifndef _WIN32
/*
 * Converts inlen bytes of inbuf from from_charset to to_charset using iconv.
 *
 * outbuf is zero-filled over its whole outlen bytes before the conversion,
 * and a terminating NUL is written after the converted data whenever at
 * least one byte of room is left.
 *
 * Returns 0 on success, -1 when an argument is NULL, when the conversion
 * descriptor cannot be opened, or when iconv() reports an error.
 */
int code_convert(char* from_charset, char* to_charset, char* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
    iconv_t cd;
    char* in_cursor = inbuf;    /* iconv advances these cursors, so work on copies */
    char* out_cursor = outbuf;
    size_t in_left = inlen;
    size_t out_left = outlen;
    int rc = 0;

    if (from_charset == NULL || to_charset == NULL || inbuf == NULL || outbuf == NULL)
        return -1;

    /* iconv_open() signals failure with (iconv_t)-1, not with 0. */
    cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t)-1)
        return -1;

    memset(outbuf, 0, outlen);
    if (iconv(cd, &in_cursor, &in_left, &out_cursor, &out_left) == (size_t)-1)
        rc = -1;

    /* Always release the descriptor, including on the error path. */
    iconv_close(cd);

    /* Terminate the output in place when there is room for it. */
    if (out_left > 0)
        *out_cursor = '\0';

    return rc;
}

#endif //NOT _WIN32

#ifdef _WIN32
/*
 * Windows: converts the NUL-terminated UTF-8 string src to the active ANSI
 * code page (CP_ACP) and stores the result in dst.
 *
 * src: NUL-terminated UTF-8 input.
 * dst: output buffer of at least len bytes.
 * len: capacity of dst in bytes, including room for the terminating NUL.
 *
 * dst is left untouched when the input cannot be converted, when memory
 * cannot be allocated, or when the converted string (with its NUL) does not
 * fit in len bytes.
 */
void utf82gbk_2(const char* src, char* dst, int len)
{
    WCHAR* wide = NULL;
    int wide_count;
    int ansi_bytes;

    /* Source length -1 means "NUL-terminated"; the count includes the NUL. */
    wide_count = MultiByteToWideChar(CP_UTF8, 0, src, -1, NULL, 0);
    if (wide_count <= 0) {
        printf("ERROR.");
        return;
    }
    if (dst == NULL || len <= 0)
        return;

    wide = (WCHAR*)malloc((size_t)wide_count * sizeof(WCHAR));
    if (wide == NULL)
        return;

    if (MultiByteToWideChar(CP_UTF8, 0, src, -1, wide, wide_count) > 0) {
        ansi_bytes = WideCharToMultiByte(CP_ACP, 0, wide, -1, NULL, 0, NULL, NULL);
        /*
         * ansi_bytes already counts the terminating NUL, and the call below
         * writes it, so no extra terminator is stored past the string (the
         * old dst[ansi_bytes] = 0 overran dst when len == ansi_bytes).
         */
        if (ansi_bytes > 0 && len >= ansi_bytes)
            WideCharToMultiByte(CP_ACP, 0, wide, -1, dst, ansi_bytes, NULL, NULL);
    }

    free(wide);
}

/*
 * Windows: converts the NUL-terminated string src from the active ANSI code
 * page (CP_ACP) to UTF-8 and stores the result in dst.
 *
 * src: NUL-terminated input in the ANSI code page.
 * dst: output buffer of at least len bytes.
 * len: capacity of dst in bytes, including room for the terminating NUL.
 *
 * dst is left untouched when the input cannot be converted, when memory
 * cannot be allocated, or when the converted string (with its NUL) does not
 * fit in len bytes.
 */
void gbk2utf8_2(const char* src, char* dst, int len)
{
    WCHAR* wide = NULL;
    int wide_count;
    int utf8_bytes;

    /* Source length -1 means "NUL-terminated"; the count includes the NUL. */
    wide_count = MultiByteToWideChar(CP_ACP, 0, src, -1, NULL, 0);
    if (wide_count <= 0) {
        printf("ERROR.");
        return;
    }
    if (dst == NULL || len <= 0)
        return;

    wide = (WCHAR*)malloc((size_t)wide_count * sizeof(WCHAR));
    if (wide == NULL)
        return;

    if (MultiByteToWideChar(CP_ACP, 0, src, -1, wide, wide_count) > 0) {
        utf8_bytes = WideCharToMultiByte(CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
        /*
         * utf8_bytes already counts the terminating NUL, and the call below
         * writes it, so no extra terminator is stored past the string (the
         * old dst[utf8_bytes] = 0 overran dst when len == utf8_bytes).
         */
        if (utf8_bytes > 0 && len >= utf8_bytes)
            WideCharToMultiByte(CP_UTF8, 0, wide, -1, dst, utf8_bytes, NULL, NULL);
    }

    free(wide);
}
#else   //Linux

// starkwong: In iconv implementations, inlen and outlen should be type of size_t not uint, which is different in length on Mac
/*
 * POSIX: converts the NUL-terminated UTF-8 string src to GBK with iconv and
 * writes the result (including the converted NUL) into dst.
 *
 * src: NUL-terminated UTF-8 input.
 * dst: output buffer of at least len bytes.
 * len: capacity of dst in bytes.
 *
 * Nothing is written when an argument is invalid, when memory cannot be
 * allocated, or when the "UTF-8" -> "GBK" converter is unavailable. When
 * iconv() itself fails, a message is printed and dst holds whatever was
 * converted up to that point.
 */
void utf82gbk_2(const char* src, char* dst, int len)
{
    size_t inlen;
    size_t outlen;
    char* inbuf;
    char* inbuf_hold;
    char* outbuf = dst;
    iconv_t cd;

    if (src == NULL || dst == NULL || len <= 0)
        return;

    inlen = strlen(src) + 1;    /* convert the terminator as well */
    outlen = (size_t)len;

    // duanqn: The iconv function in Linux requires non-const char *
    // So we need to copy the source string.
    // The copy is sized by the source length, not by the destination
    // capacity, so it neither over-reads src nor under-allocates inbuf.
    inbuf = (char*)malloc(inlen);
    if (inbuf == NULL)
        return;
    inbuf_hold = inbuf;         /* iconv advances inbuf; keep the address to free */
    memcpy(inbuf, src, inlen);

    cd = iconv_open("GBK", "UTF-8");
    if (cd != (iconv_t)-1) {
        /* iconv() returns (size_t)-1 on error; other values are not failures. */
        if (iconv(cd, &inbuf, &inlen, &outbuf, &outlen) == (size_t)-1) {
            printf("iconv failed err: %s\n", strerror(errno));
        }

        iconv_close(cd);
    }
    free(inbuf_hold);   // Don't pass in inbuf as it may have been modified
}

/*
 * POSIX: converts the NUL-terminated GBK string src to UTF-8 with iconv and
 * writes the result (including the converted NUL) into dst.
 *
 * src: NUL-terminated GBK input.
 * dst: output buffer of at least len bytes; may be the same pointer as src.
 * len: capacity of dst in bytes.
 *
 * Nothing is written when an argument is invalid, when memory cannot be
 * allocated, or when the "GBK" -> "UTF-8" converter is unavailable. When
 * iconv() itself fails, a message is printed and dst holds whatever was
 * converted up to that point.
 */
void gbk2utf8_2(const char* src, char* dst, int len)
{
    size_t inlen;
    size_t capacity;
    size_t outlen;
    char* inbuf;
    char* inbuf_hold;
    char* scratch = NULL;
    char* outbuf = dst;
    iconv_t cd;

    if (src == NULL || dst == NULL || len <= 0)
        return;

    inlen = strlen(src) + 1;    /* convert the terminator as well */
    capacity = (size_t)len;
    outlen = capacity;

    // duanqn: The iconv function in Linux requires non-const char *
    // So we need to copy the source string.
    // The copy is sized by the source length, not by the destination
    // capacity, so it neither over-reads src nor under-allocates inbuf.
    inbuf = (char*)malloc(inlen);
    if (inbuf == NULL)
        return;
    inbuf_hold = inbuf;         /* iconv advances inbuf; keep the address to free */
    memcpy(inbuf, src, inlen);

    // starkwong: if src==dst, the string will become invalid during conversion since UTF-8 is 3 chars in Chinese but GBK is mostly 2 chars
    if (src == dst) {
        scratch = (char*)malloc(capacity);
        if (scratch == NULL) {
            free(inbuf_hold);
            return;
        }
        memset(scratch, 0, capacity);
        outbuf = scratch;
    }

    cd = iconv_open("UTF-8", "GBK");
    if (cd != (iconv_t)-1) {
        /* iconv() returns (size_t)-1 on error; other values are not failures. */
        const int failed = (iconv(cd, &inbuf, &inlen, &outbuf, &outlen) == (size_t)-1);
        if (failed)
            printf("iconv failed err: %s\n", strerror(errno));

        if (scratch != NULL) {
            /* Copy exactly what was produced; never scan past the scratch buffer. */
            const size_t produced = capacity - outlen;
            memcpy(dst, scratch, produced);
            /* A partial conversion has no NUL of its own; add one if it fits. */
            if (failed && produced < capacity)
                dst[produced] = '\0';
        }

        iconv_close(cd);
    }

    /* Released on every path, including when iconv_open() failed. */
    free(scratch);
    free(inbuf_hold);   // Don't pass in inbuf as it may have been modified
}
#endif

/*
 * Checks whether the NUL-terminated string str is well-formed UTF-8 text.
 *
 * Accepted single bytes are TAB (0x09), LF (0x0A), CR (0x0D) and 0x20..0x7E;
 * other ASCII control characters and 0x7F are rejected on purpose.
 * Multi-byte sequences must be non-overlong, must not encode surrogates
 * (U+D800..U+DFFF) and must not exceed U+10FFFF.
 *
 * Returns 1 when str is NULL or well-formed, 0 otherwise.
 */
static int is_utf8(const char* str)
{
    const unsigned char* p = (const unsigned char*)str;

    if (str == NULL)
        return 1;

    while (*p) {
        const unsigned char lead = p[0];
        /* Allowed range of the second byte; narrowed for some lead bytes. */
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;
        int trailing;   /* number of bytes following the lead byte */
        int k;

        /* Printable ASCII plus TAB / LF / CR. */
        if (lead == 0x09 || lead == 0x0A || lead == 0x0D ||
            (lead >= 0x20 && lead <= 0x7E)) {
            p += 1;
            continue;
        }

        if (lead >= 0xC2 && lead <= 0xDF) {
            /* Two-byte sequence; 0xC0 and 0xC1 would be overlong. */
            trailing = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            trailing = 2;
            if (lead == 0xE0)
                second_min = 0xA0;      /* exclude overlong forms */
            else if (lead == 0xED)
                second_max = 0x9F;      /* exclude surrogates */
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            trailing = 3;
            if (lead == 0xF0)
                second_min = 0x90;      /* exclude overlong forms */
            else if (lead == 0xF4)
                second_max = 0x8F;      /* stay at or below U+10FFFF */
        } else {
            return 0;
        }

        if (p[1] < second_min || p[1] > second_max)
            return 0;

        /*
         * Each further byte is inspected only after the previous one was
         * found to be a (non-zero) continuation byte, so the scan never
         * reads past the terminating NUL.
         */
        for (k = 2; k <= trailing; ++k) {
            if (p[k] < 0x80 || p[k] > 0xBF)
                return 0;
        }

        p += trailing + 1;
    }
    return 1;
}

/*
 * Checks whether the NUL-terminated string str is valid UTF-8 by decoding
 * every sequence and inspecting the resulting code point.
 *
 * A sequence is rejected when its lead byte is not a 1..4-byte lead, when a
 * continuation byte is missing, when the code point is above U+10FFFF or in
 * the surrogate range U+D800..U+DFFF, or when it is encoded with more bytes
 * than necessary (overlong).
 *
 * Returns 1 when str is NULL or valid, 0 otherwise.
 */
int is_valid_utf8(const char* str)
{
    /*
     * Smallest code point that legitimately needs a sequence of the given
     * length (index = length in bytes). A sequence of N bytes can never
     * hold a value above the N-byte range, so "too small for its length"
     * is the only possible length mismatch.
     */
    static const unsigned int smallest_for_length[5] = {
        0, 0x0000, 0x0080, 0x0800, 0x10000
    };
    const unsigned char* p = (const unsigned char*)str;

    if (!str) return 1;

    while (*p != 0x00) {
        const unsigned char lead = *p++;
        unsigned int cp;
        int length;
        int k;

        if (lead < 0x80) {
            // U+0000 to U+007F
            cp = lead;
            length = 1;
        } else if ((lead >> 5) == 0x06) {
            // 110xxxxx: U+0080 to U+07FF
            cp = lead & 0x1F;
            length = 2;
        } else if ((lead >> 4) == 0x0E) {
            // 1110xxxx: U+0800 to U+FFFF
            cp = lead & 0x0F;
            length = 3;
        } else if ((lead >> 3) == 0x1E) {
            // 11110xxx: U+10000 to U+10FFFF
            cp = lead & 0x07;
            length = 4;
        } else {
            return 0;
        }

        /* Fold in the continuation bytes, six payload bits each. */
        for (k = 1; k < length; ++k, ++p) {
            if ((*p & 0xC0) != 0x80)
                return 0;
            cp = (cp << 6) | (*p & 0x3Fu);
        }

        if (cp > 0x10FFFF)
            return 0;
        if (cp >= 0xD800 && cp <= 0xDFFF)
            return 0;
        if (cp < smallest_for_length[length])
            return 0;
    }

    return 1;
}


/*
 * Converts a multibyte string to a wide-character string.
 *
 * locale_charset: locale name selecting the multibyte encoding
 *                 (e.g. "zh_CN.utf8"). Only used in the non-MSVC build.
 * inbuf:          NUL-terminated multibyte input.
 * inlen:          input length; in the non-MSVC build a value of 0 makes the
 *                 function return 0 immediately.
 * outbuf/outlen:  output buffer and its capacity in wide characters. Pass
 *                 NULL / 0 to query the required capacity.
 *
 * MSVC build: the conversion always uses CP_ACP. Returns the character count
 * reported by MultiByteToWideChar (terminator included), or 0 on failure.
 * NOTE(review): locale_charset is ignored in this branch, so a caller asking
 * for "zh_CN.utf8" still gets an ANSI-code-page conversion - confirm this is
 * intended.
 *
 * Non-MSVC build: LC_CTYPE is switched to locale_charset for the duration of
 * the call and restored afterwards. Returns the required capacity
 * (terminator included) in query mode, the number of wide characters written
 * otherwise, 0 for empty input and -1 on failure.
 */
int s2w(char* locale_charset, char* inbuf, size_t inlen, wchar_t* outbuf, size_t outlen)
{
#ifdef _MSC_VER
    wchar_t* wstr = NULL;
    int n = MultiByteToWideChar(CP_ACP, 0, inbuf, -1, NULL, 0);
    if (outbuf == NULL || outlen == 0)
        return n;

    if (n > 0) {
        wstr = malloc(sizeof(wchar_t) * (n + 1));
        if (wstr == NULL) {
            return 0;
        }
        memset(wstr, 0, (n + 1) * sizeof(wchar_t));
        MultiByteToWideChar(CP_ACP, 0, inbuf, -1, &wstr[0], n);
        wcscpy_s(outbuf, outlen, wstr);
        free(wstr);
        return n;
    }
    return 0;
#else
    unsigned len = 0;
    const char* current = NULL;
    char* origin = NULL;
    size_t origin_size = 0;

    if (inbuf == NULL || inlen == 0) {
        return 0;
    }

    /*
     * The string returned by setlocale() may be overwritten by the next
     * setlocale() call, so keep a private copy to restore from later.
     */
    current = setlocale(LC_CTYPE, NULL);
    if (current == NULL) {
        return -1;
    }
    origin_size = strlen(current) + 1;
    origin = (char*)malloc(origin_size);
    if (origin == NULL) {
        return -1;
    }
    memcpy(origin, current, origin_size);

    WLog_DBG(TAG, "%s: origin locale: %s, aim locale: %s, data:%s", __FUNCTION__, origin, locale_charset, inbuf);
    if (NULL == setlocale(LC_CTYPE, locale_charset)) {
        WLog_DBG(TAG, "setlocale with \"%s\" failed: %d", locale_charset, errno);
        free(origin);
        return -1;
    }

    /* First pass: count the wide characters needed (terminator excluded). */
    len = mbstowcs(NULL, inbuf, inlen);
    if (len == (unsigned)-1) {
        WLog_DBG(TAG, "mbstowcs failed: %d, %s", errno, strerror(errno));
        goto on_end;
    }
    len += 1;

    if (outbuf == NULL || outlen == 0) {
        WLog_DBG(TAG, "mbstowcs to fetch need capacity: %d", len);
        goto on_end;
    }

    /* Second pass: convert, never writing more than outlen wide characters. */
    wmemset(outbuf, 0, outlen);
    len = mbstowcs(outbuf, inbuf, outlen > len ? len : outlen);
    if (len == (unsigned)-1) {
        WLog_DBG(TAG, "mbstowcs after new malloc failed: %d", errno);
        goto on_end;
    }
    WLog_DBG(TAG, "after mbstowcs returned: %ls(%d)", outbuf, len);

on_end:

    setlocale(LC_CTYPE, origin);
    free(origin);
    return len;

#endif
}
/*
 * Converts a wide-character string to a multibyte string.
 *
 * locale_charset: locale name selecting the multibyte encoding
 *                 (e.g. "zh_CN.gbk"). Only used in the non-MSVC build.
 * inbuf:          NUL-terminated wide-character input.
 * inlen:          input length; in the non-MSVC build a value of 0 makes the
 *                 function return 0 immediately.
 * outbuf/outlen:  output buffer and its capacity in bytes. Pass NULL / 0 to
 *                 query the required capacity.
 *
 * MSVC build: the conversion always uses CP_ACP. Returns the byte count
 * reported by WideCharToMultiByte (terminator included), or 0 on failure.
 * NOTE(review): locale_charset is ignored in this branch - confirm this is
 * intended.
 *
 * Non-MSVC build: LC_CTYPE is switched to locale_charset for the duration of
 * the call and restored afterwards. Returns the required capacity
 * (terminator included) in query mode, the number of bytes written
 * otherwise, 0 for empty input and -1 on failure.
 */
int w2s(char* locale_charset, wchar_t* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
#if defined(_MSC_VER)
    char* str = NULL;
    int n = 0;

    n = WideCharToMultiByte(CP_ACP, 0, inbuf, -1, NULL, 0, NULL, NULL);
    if (outbuf == NULL || outlen == 0)
        return n;

    if (n > 0) {
        str = malloc(sizeof(char) * (n + 1));
        if (str == NULL) {
            return 0;
        }
        memset(str, 0, sizeof(char) * (n + 1));
        WideCharToMultiByte(CP_ACP, 0, inbuf, -1, &str[0], n, NULL, NULL);
        memset(outbuf, 0, sizeof(char) * (outlen));
        strcpy_s(outbuf, outlen, str);
        free(str);
        return n;
    }
    return 0;

#else

    unsigned len = 0;
    const char* current = NULL;
    char* origin = NULL;
    size_t origin_size = 0;

    /* Validate before touching inbuf (it used to be measured first). */
    if (inbuf == NULL || inlen == 0) {
        return 0;
    }

    /*
     * The string returned by setlocale() may be overwritten by the next
     * setlocale() call, so keep a private copy to restore from later.
     */
    current = setlocale(LC_CTYPE, NULL);
    if (current == NULL) {
        return -1;
    }
    origin_size = strlen(current) + 1;
    origin = (char*)malloc(origin_size);
    if (origin == NULL) {
        return -1;
    }
    memcpy(origin, current, origin_size);

    WLog_DBG(TAG, "%s: origin locale: %s, aim locale: %s, data: %ls", __FUNCTION__, origin,  locale_charset, inbuf);
    if (NULL == setlocale(LC_CTYPE, locale_charset)) {
        /* Best effort: the failure is logged and the conversion continues
         * under the current locale. */
        WLog_ERR(TAG, "setlocale with \"%s\" failed: %d", locale_charset, errno);
    }

    /* First pass: count the bytes needed (terminator excluded). */
    len = wcstombs(NULL, inbuf, inlen);
    if (len == (unsigned)-1) {
        WLog_DBG(TAG, "wcstombs failed: %d", errno);
        goto on_end;
    }
    len += 1;

    if (outbuf == NULL || outlen == 0) {
        WLog_DBG(TAG, "wcstombs to fetch need capacity: %d", len);
        goto on_end;
    }

    /* Second pass: convert, never writing more than outlen bytes. */
    memset(outbuf, 0, outlen * sizeof(char));
    len = wcstombs(outbuf, inbuf, len > outlen ? outlen : len);
    if (len == (unsigned)-1) {
        WLog_DBG(TAG, "wcstombs after new malloc failed: %d", errno);
        goto on_end;
    }

    WLog_DBG(TAG, "after wcstombs returned: %s(%d)", outbuf, len);

on_end:

    setlocale(LC_CTYPE, origin);
    free(origin);
    return len;

#endif //_MSC_VER
}

/*
 * Converts UTF-8 text to GBK by going through wide characters:
 * s2w() under "zh_CN.utf8", then w2s() under "zh_CN.gbk".
 *
 * inbuf/inlen:   UTF-8 input and its length, forwarded to s2w().
 * outbuf/outlen: output buffer and its capacity, forwarded to w2s().
 *
 * Returns the value of w2s() on success. When s2w() reports an error or an
 * empty result, that value (<= 0) is returned and w2s() is not called.
 * Returns 0 when the intermediate buffer cannot be allocated.
 */
TOOLKIT_API int toolkit_utf82gbk(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
    //return code_convert("utf-8", "gb2312", inbuf, inlen, outbuf, outlen);

    wchar_t* wide = NULL;
    int n = 0;

    /* Query pass: how many wide characters are needed. */
    n = s2w("zh_CN.utf8", inbuf, inlen, NULL, 0);
    if (n <= 0) { return n; }

    wide = malloc(sizeof(wchar_t) * n);
    if (wide == NULL) {
        return 0;
    }
    wmemset(wide, 0, n);

    n = s2w("zh_CN.utf8", inbuf, inlen, wide, n);
    /* Do not hand a zero or negative count to w2s() as a size_t length. */
    if (n > 0) {
        n = w2s("zh_CN.gbk", wide, n, outbuf, outlen);
    }

    free(wide);
    return n;
}

/*
 * Converts GBK text to UTF-8 by going through wide characters:
 * s2w() under "zh_CN.gbk", then w2s() under "zh_CN.utf8".
 *
 * inbuf/inlen:   GBK input and its length, forwarded to s2w().
 * outbuf/outlen: output buffer and its capacity, forwarded to w2s().
 *
 * Returns the value of w2s() on success. When s2w() reports an error or an
 * empty result, that value (<= 0) is returned and w2s() is not called.
 * Returns 0 when the intermediate buffer cannot be allocated.
 */
TOOLKIT_API int toolkit_gbk2utf8(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
    //return code_convert("gb2312", "utf-8", inbuf, inlen, outbuf, outlen);

    wchar_t* wide = NULL;
    int n = 0;

    /* Query pass: how many wide characters are needed. */
    n = s2w("zh_CN.gbk", inbuf, inlen, NULL, 0);
    if (n <= 0) { return n; }

    wide = malloc(sizeof(wchar_t) * n);
    if (wide == NULL) {
        /* Same out-of-memory result as toolkit_utf82gbk(). */
        return 0;
    }
    wmemset(wide, 0, n);

    n = s2w("zh_CN.gbk", inbuf, inlen, wide, n);
    /* Do not hand a zero or negative count to w2s() as a size_t length. */
    if (n > 0) {
        n = w2s("zh_CN.utf8", wide, n, outbuf, outlen);
    }

    free(wide);
    return n;
}

/*
 * Guesses the encoding of a file from its byte-order mark (BOM).
 *
 * Only the first bytes of the file are inspected:
 *   FF FE     -> unicode
 *   FE FF     -> unicode_with_big_endian
 *   EF BB BF  -> unicode_with_bom
 *   otherwise -> ansi
 * A file shorter than two bytes, or one that cannot be opened, yields
 * unknown.
 *
 * Limitation: a UTF-8 file without a BOM carries no marker, so it is
 * reported as ansi.
 */
TOOLKIT_API char_encoding  detect_file_encoding(const char* file_path)
{
    static const unsigned char bom_utf16_le[] = { 0xFF, 0xFE };        // Unicode file header
    static const unsigned char bom_utf16_be[] = { 0xFE, 0xFF };        // Unicode big endian file header
    static const unsigned char bom_utf8[]     = { 0xEF, 0xBB, 0xBF };  // UTF_8 file header

    FILE* fp = NULL;
    char_encoding result = unknown;
    unsigned char head[3] = { 0, 0, 0 };
    size_t got = 0;

    if (file_path == NULL) {
        return result;
    }

    fp = fopen(file_path, "rb");
    if (NULL == fp) {
        WLog_ERR(TAG, "fopen(%s) failed: %d", file_path, errno);
        return result;
    }

    /* Read three bytes so that the whole UTF-8 BOM can be compared. */
    got = fread(head, 1, sizeof(head), fp);
    if (got >= 2) {
        if (head[0] == bom_utf16_le[0] && head[1] == bom_utf16_le[1])
            result = unicode;
        else if (head[0] == bom_utf16_be[0] && head[1] == bom_utf16_be[1])
            result = unicode_with_big_endian;
        else if (got >= 3 && head[0] == bom_utf8[0] && head[1] == bom_utf8[1] && head[2] == bom_utf8[2])
            result = unicode_with_bom;
        else
            result = ansi;
    }

    WLog_DBG(TAG, "firstchar 0x%X, 0x%X: %d", head[0], head[1], result);
    fclose(fp);
    return result;
}

/*
 * Loads a whole file into memory and checks its content with is_utf8().
 *
 * Returns the result of is_utf8() on the file content (1 = looks like UTF-8,
 * 0 = does not), 1 for an empty file, and -1 when the path is NULL or the
 * file cannot be opened, measured, read or buffered.
 *
 * NOTE(review): is_utf8() works on a NUL-terminated string, so content after
 * the first zero byte in the file is not examined.
 */
TOOLKIT_API int toolkit_detect_utf8_file(const char* file_path)
{
    FILE* f = NULL;
    long file_size = 0;
    char* buf = NULL;
    size_t got = 0;
    int result = 1;

    if (file_path == NULL) {
        return -1;
    }

    f = fopen(file_path, "rb");
    if (NULL == f) {
        return -1;
    }

    /* Measure the file; ftell() returns -1 on failure. */
    if (fseek(f, 0, SEEK_END) != 0 || (file_size = ftell(f)) < 0) {
        fclose(f);
        return -1;
    }
    if (0 == file_size) {
        fclose(f);
        return 1;
    }
    if (fseek(f, 0, SEEK_SET) != 0) {
        fclose(f);
        return -1;
    }

    buf = (char*)malloc((size_t)file_size + 1);
    if (buf == NULL) {
        fclose(f);
        return -1;
    }

    got = fread(buf, 1, (size_t)file_size, f);
    if (got != (size_t)file_size && ferror(f)) {
        free(buf);
        fclose(f);
        return -1;
    }
    /* Terminate after the bytes actually read so is_utf8() stops there. */
    buf[got] = '\0';
    fclose(f);

    result = is_utf8(buf);
    free(buf);

    return result;
}

/*
 * Public entry point for the string check: reports whether the
 * NUL-terminated string str passes is_utf8().
 *
 * Returns whatever is_utf8() returns for str (1 = accepted, including a
 * NULL pointer; 0 = rejected).
 */
TOOLKIT_API int  toolkit_detect_utf8_str(const char* str)
{
    const int verdict = is_utf8(str);

    return verdict;
}