123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606 |
- #include "precompile.h"
- #include "charset.h"
- #include <stdlib.h>
- #include <stdio.h>
- #include <fcntl.h>
- #include <string.h>
- #ifdef _WIN32
- #include <windows.h>
- #else
- #include <unistd.h>
- #include <iconv.h>
- #include <locale.h>
- #include <sys/stat.h>
- #include <errno.h>
- #endif
- #define TAG TOOLKIT_TAG("charaset")
- // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
- // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
/* Automaton states that callers compare against. */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/*
 * Lookup data for the table-driven UTF-8 decoder.
 *
 * Indices 0..255 map an input byte to its character class.  From index 256
 * onward the array holds the transition table, addressed as
 * 256 + state * 16 + class.
 */
static const uint8_t utf8d[] = {
    /* byte -> character class */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 00..1f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 20..3f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 40..5f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 60..7f */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /* 80..9f */
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* a0..bf */
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* c0..df */
    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, /* e0..ef */
    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, /* f0..ff */
    /* (state, class) -> next state */
    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, /* s0..s0 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, /* s1..s2 */
    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, /* s3..s4 */
    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, /* s5..s6 */
    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* s7..s8 */
};

/*
 * Feed one byte to the decoder.
 *
 * state: in/out automaton state; start with UTF8_ACCEPT.
 * codep: in/out code point accumulator; holds a complete code point
 *        whenever the returned state is UTF8_ACCEPT.
 * byte:  next input byte; it indexes the class table directly, so it must be
 *        in the range 0..255.
 *
 * Returns the new state (also stored in *state).
 */
static uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte)
{
    const uint32_t char_class = utf8d[byte];

    if (*state == UTF8_ACCEPT) {
        /* Lead byte: the class doubles as the number of prefix bits to drop. */
        *codep = (0xff >> char_class) & byte;
    } else {
        /* Continuation byte: append its six payload bits. */
        *codep = (*codep << 6) | (byte & 0x3fu);
    }

    *state = utf8d[256 + *state * 16 + char_class];
    return *state;
}
- uint32_t validate_utf8(char* str, size_t len)
- {
- size_t i;
- uint32_t type;
- uint32_t state = UTF8_ACCEPT;
- for (i = 0; i < len; i++) {
- // We don't care about the codepoint, so this is a simplified version of the decode function.
- type = utf8d[(uint8_t)str[i]];
- state = utf8d[256 + (state) * 16 + type];
- if (state == UTF8_REJECT)
- break;
- }
- return state;
- }
- #ifndef _WIN32
/*
 * Convert inlen bytes of inbuf from from_charset to to_charset using iconv.
 *
 * outbuf is zero-filled first, so the result is NUL-terminated whenever the
 * converted text is shorter than outlen.
 *
 * Returns 0 on success and -1 when an argument is NULL, the conversion
 * descriptor cannot be opened, or iconv reports an error (for example the
 * output buffer is too small or the input is invalid).
 */
int code_convert(char* from_charset, char* to_charset, char* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
    iconv_t cd;
    size_t rc;

    if (from_charset == NULL || to_charset == NULL || inbuf == NULL || outbuf == NULL)
        return -1;

    /* iconv_open signals failure with (iconv_t)-1, not 0. */
    cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t)-1)
        return -1;

    memset(outbuf, 0, outlen);

    /* iconv advances inbuf/outbuf and decrements the lengths as it goes. */
    rc = iconv(cd, &inbuf, &inlen, &outbuf, &outlen);

    /* Release the descriptor on every path, including conversion failure. */
    iconv_close(cd);

    if (rc == (size_t)-1)
        return -1;

    /* outbuf now points just past the converted bytes; terminate if room is left. */
    if (outlen > 0)
        *outbuf = '\0';
    return 0;
}
- #endif //NOT _WIN32
- #ifdef _WIN32
- void utf82gbk_2(const char* src, char* dst, int len)
- {
- int ret = 0;
- WCHAR* strA;
- int i = MultiByteToWideChar(CP_UTF8, 0, src, -1, NULL, 0);
- if (i <= 0) {
- printf("ERROR.");
- return;
- }
- strA = (WCHAR*)malloc(i * 2);
- MultiByteToWideChar(CP_UTF8, 0, src, -1, strA, i);
- i = WideCharToMultiByte(CP_ACP, 0, strA, -1, NULL, 0, NULL, NULL);
- if (len >= i) {
- ret = WideCharToMultiByte(CP_ACP, 0, strA, -1, dst, i, NULL, NULL);
- dst[i] = 0;
- }
- if (ret <= 0) {
- free(strA);
- return;
- }
- free(strA);
- }
- void gbk2utf8_2(const char* src, char* dst, int len)
- {
- int ret = 0;
- WCHAR* strA;
- int i = MultiByteToWideChar(CP_ACP, 0, src, -1, NULL, 0);
- if (i <= 0) {
- printf("ERROR.");
- return;
- }
- strA = (WCHAR*)malloc(i * 2);
- MultiByteToWideChar(CP_ACP, 0, src, -1, strA, i);
- i = WideCharToMultiByte(CP_UTF8, 0, strA, -1, NULL, 0, NULL, NULL);
- if (len >= i) {
- ret = WideCharToMultiByte(CP_UTF8, 0, strA, -1, dst, i, NULL, NULL);
- dst[i] = 0;
- }
- if (ret <= 0) {
- free(strA);
- return;
- }
- free(strA);
- }
- #else //Linux
- // starkwong: In iconv implementations, inlen and outlen should be type of size_t not uint, which is different in length on Mac
/*
 * Convert the NUL-terminated UTF-8 string src to GBK with iconv, writing the
 * result (terminator included) into dst.
 *
 * len is the capacity of dst in bytes.  Nothing is done when src or dst is
 * NULL or len is not positive.  A conversion failure is reported on stdout;
 * dst may then hold a partial result.
 */
void utf82gbk_2(const char* src, char* dst, int len)
{
    size_t inlen;
    size_t outlen;
    char* inbuf;
    char* inbuf_hold;
    char* outbuf = dst;
    iconv_t cd;

    if (src == NULL || dst == NULL || len <= 0)
        return;

    inlen = strlen(src) + 1; /* convert the terminator too */
    outlen = (size_t)len;

    /*
     * iconv takes a non-const input pointer, so work on a private copy.  The
     * copy is sized by the source length, not by the output capacity.
     */
    inbuf = (char*)malloc(inlen);
    if (inbuf == NULL)
        return;
    inbuf_hold = inbuf; /* iconv advances inbuf; keep the address for free() */
    memcpy(inbuf, src, inlen);

    cd = iconv_open("GBK", "UTF-8");
    if (cd != (iconv_t)-1) {
        /* Only (size_t)-1 is an error; other values count irreversible conversions. */
        if (iconv(cd, &inbuf, &inlen, &outbuf, &outlen) == (size_t)-1) {
            printf("iconv failed err: %s\n", strerror(errno));
        }
        iconv_close(cd);
    }

    free(inbuf_hold);
}
/*
 * Convert the NUL-terminated GBK string src to UTF-8 with iconv, writing the
 * result (terminator included) into dst.
 *
 * len is the capacity of dst in bytes.  src and dst may be the same buffer.
 * Nothing is done when src or dst is NULL or len is not positive.  A
 * conversion failure is reported on stdout; dst may then hold a partial
 * result.
 */
void gbk2utf8_2(const char* src, char* dst, int len)
{
    size_t capacity;
    size_t inlen;
    size_t outlen;
    char* inbuf;
    char* inbuf_hold;
    char* scratch = NULL;
    char* outbuf = dst;
    iconv_t cd;

    if (src == NULL || dst == NULL || len <= 0)
        return;

    capacity = (size_t)len;
    inlen = strlen(src) + 1; /* convert the terminator too */
    outlen = capacity;

    /*
     * iconv takes a non-const input pointer, so work on a private copy.  The
     * copy is sized by the source length, not by the output capacity.
     */
    inbuf = (char*)malloc(inlen);
    if (inbuf == NULL)
        return;
    inbuf_hold = inbuf; /* iconv advances inbuf; keep the address for free() */
    memcpy(inbuf, src, inlen);

    /*
     * In-place conversion: UTF-8 output is usually longer than its GBK input,
     * so the result is staged in a zeroed scratch buffer and copied back once
     * the conversion has finished.
     */
    if (src == dst) {
        scratch = (char*)calloc(capacity, 1);
        if (scratch == NULL) {
            free(inbuf_hold);
            return;
        }
        outbuf = scratch;
    }

    cd = iconv_open("UTF-8", "GBK");
    if (cd != (iconv_t)-1) {
        /* Only (size_t)-1 is an error; other values count irreversible conversions. */
        if (iconv(cd, &inbuf, &inlen, &outbuf, &outlen) == (size_t)-1)
            printf("iconv failed err: %s\n", strerror(errno));
        iconv_close(cd);

        if (scratch != NULL) {
            const size_t produced = capacity - outlen;

            /* Bounded copy-back; never reads past what iconv actually wrote. */
            memcpy(dst, scratch, produced);
            /* Terminate a partial result when there is room for it. */
            if (produced < capacity && (produced == 0 || scratch[produced - 1] != '\0'))
                dst[produced] = '\0';
        }
    }

    free(scratch); /* released even when iconv_open failed */
    free(inbuf_hold);
}
- #endif
/* True when lo <= b <= hi. */
static int is_utf8_byte_between(unsigned char b, unsigned char lo, unsigned char hi)
{
    return lo <= b && b <= hi;
}

/*
 * Check whether the NUL-terminated string str is well-formed UTF-8.
 *
 * Accepted single bytes are TAB, LF, CR and 0x20..0x7E; every other ASCII
 * control byte is rejected.  Multi-byte sequences must be in shortest form,
 * must not encode surrogates and must not exceed U+10FFFF.
 *
 * Returns 1 when str is NULL, empty or valid, 0 otherwise.
 */
static int is_utf8(const char* str)
{
    const unsigned char* p = (const unsigned char*)str;

    if (str == NULL)
        return 1;

    while (*p) {
        const unsigned char lead = p[0];

        /* One byte: printable ASCII plus TAB / LF / CR. */
        if (lead == 0x09 || lead == 0x0A || lead == 0x0D ||
            is_utf8_byte_between(lead, 0x20, 0x7E)) {
            p += 1;
            continue;
        }

        /* Two bytes; C0 and C1 would be overlong and are not accepted. */
        if (is_utf8_byte_between(lead, 0xC2, 0xDF) &&
            is_utf8_byte_between(p[1], 0x80, 0xBF)) {
            p += 2;
            continue;
        }

        /* Three bytes; E0 and ED narrow the range of the second byte. */
        if (is_utf8_byte_between(lead, 0xE0, 0xEF)) {
            const unsigned char lo = (lead == 0xE0) ? 0xA0 : 0x80; /* no overlongs */
            const unsigned char hi = (lead == 0xED) ? 0x9F : 0xBF; /* no surrogates */

            if (is_utf8_byte_between(p[1], lo, hi) &&
                is_utf8_byte_between(p[2], 0x80, 0xBF)) {
                p += 3;
                continue;
            }
            return 0;
        }

        /* Four bytes; F0 and F4 narrow the range of the second byte. */
        if (is_utf8_byte_between(lead, 0xF0, 0xF4)) {
            const unsigned char lo = (lead == 0xF0) ? 0x90 : 0x80; /* no overlongs */
            const unsigned char hi = (lead == 0xF4) ? 0x8F : 0xBF; /* <= U+10FFFF */

            if (is_utf8_byte_between(p[1], lo, hi) &&
                is_utf8_byte_between(p[2], 0x80, 0xBF) &&
                is_utf8_byte_between(p[3], 0x80, 0xBF)) {
                p += 4;
                continue;
            }
            return 0;
        }

        return 0;
    }
    return 1;
}
/*
 * Check whether the NUL-terminated string str is well-formed UTF-8 by decoding
 * each sequence and validating the resulting code point.
 *
 * A sequence is rejected when a continuation byte is missing, the code point
 * exceeds U+10FFFF, lies in the surrogate range U+D800..U+DFFF, or is not
 * encoded in its shortest form.
 *
 * Returns 1 when str is NULL, empty or valid, 0 otherwise.
 */
int is_valid_utf8(const char* str)
{
    const unsigned char* p = (const unsigned char*)str;

    if (str == NULL)
        return 1;

    while (*p != 0x00) {
        const unsigned char lead = *p++;
        unsigned int cp;
        int seq_len;
        int expected_len;
        int remaining;

        /* Classify the lead byte by its high bits. */
        if (lead < 0x80) {
            cp = lead;
            seq_len = 1;
        } else if ((lead >> 5) == 0x06) {
            cp = lead & 0x1F;
            seq_len = 2;
        } else if ((lead >> 4) == 0x0E) {
            cp = lead & 0x0F;
            seq_len = 3;
        } else if ((lead >> 3) == 0x1E) {
            cp = lead & 0x07;
            seq_len = 4;
        } else {
            return 0;
        }

        /* Fold in the continuation bytes; a NUL fails the 10xxxxxx test. */
        for (remaining = seq_len - 1; remaining > 0; --remaining) {
            const unsigned char tail = *p;

            if ((tail & 0xC0) != 0x80)
                return 0;
            cp = (cp << 6) | (tail & 0x3F);
            ++p;
        }

        if (cp > 0x10FFFF)
            return 0;
        if (cp >= 0xD800 && cp <= 0xDFFF)
            return 0;

        /* Shortest-form rule: the sequence length is fixed by the code point. */
        if (cp <= 0x007F)
            expected_len = 1;
        else if (cp <= 0x07FF)
            expected_len = 2;
        else if (cp <= 0xFFFF)
            expected_len = 3;
        else
            expected_len = 4;

        if (seq_len != expected_len)
            return 0;
    }
    return 1;
}
- int s2w(char* locale_charset, char* inbuf, size_t inlen, wchar_t* outbuf, size_t outlen)
- {
- #ifdef _MSC_VER
- wchar_t* wstr = NULL;
- int n = MultiByteToWideChar(CP_ACP, 0, inbuf, -1, NULL, 0);
- if (outbuf == NULL || outlen == 0)
- return n;
- if (n > 0) {
- wstr = malloc(sizeof(wchar_t) * (n + 1));
- if (wstr == NULL) {
- return 0;
- }
- memset(wstr, 0, (n + 1) * sizeof(wchar_t));
- MultiByteToWideChar(CP_ACP, 0, inbuf, -1, &wstr[0], n);
- wcscpy_s(outbuf, outlen, wstr);
- free(wstr);
- return n;
- }
- return 0;
- #else
- unsigned len = 0;
- if (inbuf == NULL || inlen == 0) {
- return 0;
- }
- const char* origin = setlocale(LC_CTYPE, NULL);
- WLog_DBG(TAG, "%s: origin locale: %s, aim locale: %s, data:%s", __FUNCTION__, origin, locale_charset, inbuf);
- if (NULL == setlocale(LC_CTYPE, locale_charset)) {
- WLog_DBG(TAG, "setlocale with \"%s\" failed: %d", locale_charset, errno);
- return -1;
- }
- len = mbstowcs(NULL, inbuf, inlen);
- if (len == (unsigned)-1) {
- WLog_DBG(TAG, "mbstowcs failed: %d, %s", errno, strerror(errno));
- goto on_end;
- }
- len += 1;
- if (outbuf == NULL || outlen == 0) {
- WLog_DBG(TAG, "mbstowcs to fetch need capacity: %d", len);
- goto on_end;
- }
- wmemset(outbuf, 0, outlen);
- len = mbstowcs(outbuf, inbuf, outlen > len ? len : outlen);
- if (len == (unsigned)-1) {
- WLog_DBG(TAG, "mbstowcs after new malloc failed: %d", errno);
- goto on_end;
- }
- WLog_DBG(TAG, "after mbstowcs returned: %ls(%d)", outbuf, len);
- on_end:
- setlocale(LC_CTYPE, origin);
- return len;
- #endif
- }
- int w2s(char* locale_charset, wchar_t* inbuf, size_t inlen, char* outbuf, size_t outlen)
- {
- #if defined(_MSC_VER)
- char* str = NULL;
- int n = 0;
- n = WideCharToMultiByte(CP_ACP, 0, inbuf, -1, NULL, 0, NULL, NULL);
- if (outbuf == NULL || outlen == 0)
- return n;
- if (n > 0) {
- str = malloc(sizeof(char) * (n + 1));
- if (str == NULL) {
- return 0;
- }
- memset(str, 0, sizeof(char) * (n + 1));
- WideCharToMultiByte(CP_ACP, 0, inbuf, -1, &str[0], n, NULL, NULL);
- memset(outbuf, 0, sizeof(char) * (outlen));
- strcpy_s(outbuf, outlen, str);
- free(str);
- return n;
- }
- return 0;
- #else
- unsigned len;
- char* str = NULL;
- char* origin = NULL;
- len = wcslen(inbuf);
- if (inbuf == NULL || inlen == 0) {
- return 0;
- }
- origin = setlocale(LC_CTYPE, NULL);
- WLog_DBG(TAG, "%s: origin locale: %s, aim locale: %s, data: %ls", __FUNCTION__, origin, locale_charset, inbuf);
- if (NULL == setlocale(LC_CTYPE, locale_charset)) {
- WLog_ERR(TAG, "setlocale with \"%s\" failed: %d", locale_charset, errno);
- }
- len = wcstombs(NULL, inbuf, inlen);
- if (len == (unsigned)-1) {
- WLog_DBG(TAG, "wcstombs failed: %d", errno);
- goto on_end;
- }
- len += 1;
- if (outbuf == NULL || outlen == 0) {
- WLog_DBG(TAG, "wcstombs to fetch need capacity: %d", len);
- goto on_end;
- }
- memset(outbuf, 0, outlen * sizeof(char));
- len = wcstombs(outbuf, inbuf, len > outlen ? outlen : len);
- if (len == (unsigned)-1) {
- WLog_DBG(TAG, "wcstombs after new malloc failed: %d", errno);
- goto on_end;
- }
- WLog_DBG(TAG, "after wcstombs returned: %s(%d)", outbuf, len);
- on_end:
- setlocale(LC_CTYPE, origin);
- return len;
- #endif //_MSC_VER
- }
- TOOLKIT_API int toolkit_utf82gbk(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
- {
- //return code_convert("utf-8", "gb2312", inbuf, inlen, outbuf, outlen);
-
- int in = 0;
- wchar_t* p = NULL;
- int n = 0;
- n = s2w("zh_CN.utf8", inbuf, inlen, NULL, 0);
- if (n <= 0) { return n; }
- p = malloc(sizeof(wchar_t) * n);
- if (p == NULL) {
- return 0;
- }
- wmemset(p, 0, n);
- n = s2w("zh_CN.utf8", inbuf, inlen, p, n);
- n = w2s("zh_CN.gbk", p, n, outbuf, outlen);
- free(p);
- return n;
- }
- TOOLKIT_API int toolkit_gbk2utf8(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
- {
- //return code_convert("gb2312", "utf-8", inbuf, inlen, outbuf, outlen);
- int in = 0;
- wchar_t* p = NULL;
- int n = 0;
- n = s2w("zh_CN.gbk", inbuf, inlen, NULL, 0);
- if (n <= 0) { return n; }
- p = malloc(sizeof(wchar_t) * n);
- wmemset(p, 0, n);
- n = s2w("zh_CN.gbk", inbuf, inlen, p, n);
- n = w2s("zh_CN.utf8", p, n, outbuf, outlen);
- free(p);
- return n;
- }
- /*when the text file is UTF-8 without BOM, the following function will mistakenly regard it as a ANSI file.*/
- TOOLKIT_API char_encoding detect_file_encoding(const char* file_path)
- {
- FILE* fp = NULL;
- long file_size = 0;
- char* buf = NULL;
- char_encoding result = unknown;
- unsigned char two[20];
- unsigned char unic[] = { 0xFF, 0xFE, 0x00 }; // Unicode file header
- unsigned char unic_big[] = { 0xFE, 0xFF,0x00 }; // Unicode big endian file header
- unsigned char utf8[] = { 0xEF, 0xBB, 0xBF }; // UTF_8 file header
- fp = fopen(file_path, "rb");
- if (NULL == fp) {
- WLog_ERR(TAG, "fopen(%s) failed: %d", file_path, errno);
- return result;
- }
- fseek(fp, 0, SEEK_SET);
- memset(two, 0, sizeof(two));
- if (fread(two, 1, 2, fp) != 2)
- goto on_end;
-
- if (two[0] == unic[0] && two[1] == unic[1])
- result = unicode;
- else if (two[0] == unic_big[0] && two[1] == unic_big[1])
- result = unicode_with_big_endian;
- else if (two[0] == utf8[0] && two[1] == utf8[1] /*&& two[2] == utf8[2]*/)
- result = unicode_with_bom;
- else
- result = ansi;
- on_end:
- WLog_DBG(TAG, "firstchar 0x%X, 0x%X: %d", two[0], two[1], result);
- fclose(fp);
- return result;
- }
- TOOLKIT_API int toolkit_detect_utf8_file(const char* file_path)
- {
- FILE* f = NULL;
- long file_size = 0;
- char* buf = NULL;
- int result = 1;
- f = fopen(file_path, "rb");
- if (NULL == f) {
- return -1;
- }
- fseek(f, 0, SEEK_END);
- file_size = ftell(f);
- if (0 == file_size) {
- fclose(f);
- return 1;
- }
- fseek(f, 0, SEEK_SET);
- buf = (char*)malloc((file_size+1) * sizeof(char));
- memset(buf, 0, file_size + 1);
- fread(buf, file_size, 1, f);
- fclose(f);
- result = is_utf8(buf);
- free(buf);
- return result;
- }
- TOOLKIT_API int toolkit_detect_utf8_str(const char* str)
- {
- //if (validate_utf8(str, strlen(str)) == UTF8_ACCEPT) return 1;
- return is_utf8(str);
- }
|