/* charset.c -- character-set helpers: UTF-8 validation and UTF-8 / GBK / wide-character conversion. */
  1. #include "precompile.h"
  2. #include "charset.h"
  3. #include "memutil.h"
  4. #include <stdlib.h>
  5. #include <stdio.h>
  6. #include <fcntl.h>
  7. #include <string.h>
  8. #ifdef _WIN32
  9. #include <windows.h>
  10. #else
  11. #include <unistd.h>
  12. #include <iconv.h>
  13. #include <locale.h>
  14. #include <sys/stat.h>
  15. #include <errno.h>
  16. #endif
  17. #define TAG TOOLKIT_TAG("charaset")
  18. // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
  19. // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
  /* DFA states reported by decode() / validate_utf8(). */
  #define UTF8_ACCEPT 0
  #define UTF8_REJECT 1

  /*
   * Lookup table driving the UTF-8 decoder.
   * Entries 0..255 map an input byte to a character class; the entries from
   * index 256 onwards hold, for each state, 16 next-state values indexed by
   * character class (looked up as utf8d[256 + state * 16 + class]).
   */
  static const uint8_t utf8d[] = {
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
      7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
      8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
      0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
      0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
      0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
      1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
      1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
      1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
  };

  /*
   * Feed one byte into the UTF-8 decoder.
   *
   * state: in/out DFA state; start with UTF8_ACCEPT.
   * codep: in/out code point accumulator.
   * byte:  next input byte (0..255).
   *
   * Returns the new state: UTF8_ACCEPT once a full code point is in *codep,
   * UTF8_REJECT on malformed input, any other value while inside a sequence.
   */
  static uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte)
  {
      const uint32_t cls = utf8d[byte];

      if (*state == UTF8_ACCEPT) {
          /* Lead byte: the class doubles as the shift that strips the length marker bits. */
          *codep = (0xff >> cls) & (byte);
      } else {
          /* Continuation byte: append its low six bits. */
          *codep = (byte & 0x3fu) | (*codep << 6);
      }

      *state = utf8d[256 + *state * 16 + cls];
      return *state;
  }
  47. uint32_t validate_utf8(char* str, size_t len)
  48. {
  49. size_t i;
  50. uint32_t type;
  51. uint32_t state = UTF8_ACCEPT;
  52. for (i = 0; i < len; i++) {
  53. // We don't care about the codepoint, so this is a simplified version of the decode function.
  54. type = utf8d[(uint8_t)str[i]];
  55. state = utf8d[256 + (state) * 16 + type];
  56. if (state == UTF8_REJECT)
  57. break;
  58. }
  59. return state;
  60. }
  61. #ifndef _WIN32
  /*
   * Convert inlen bytes of inbuf from from_charset to to_charset using iconv.
   *
   * outbuf is zero-filled first (outlen bytes), then receives the converted text;
   * a terminating NUL is appended when there is room left after the conversion.
   *
   * Returns 0 on success, -1 when an argument is NULL, the conversion descriptor
   * cannot be opened, or iconv() reports an error.
   */
  int code_convert(char* from_charset, char* to_charset, char* inbuf, size_t inlen, char* outbuf, size_t outlen)
  {
      iconv_t cd;
      int rc = 0;

      if (from_charset == NULL || to_charset == NULL || outbuf == NULL)
          return -1;

      /* iconv_open() signals failure with (iconv_t)-1, not with a null descriptor. */
      cd = iconv_open(to_charset, from_charset);
      if (cd == (iconv_t)-1)
          return -1;

      memset(outbuf, 0, outlen);
      if (iconv(cd, &inbuf, &inlen, &outbuf, &outlen) == (size_t)-1) {
          rc = -1;
      } else if (outlen > 0) {
          /* iconv() advanced outbuf to the first unwritten byte; terminate there. */
          *outbuf = '\0';
      }

      /* Always release the descriptor, including on the error path. */
      iconv_close(cd);
      return rc;
  }
  77. #endif //NOT _WIN32
  78. #ifdef _WIN32
  79. void utf82gbk_2(const char* src, char* dst, int len)
  80. {
  81. int ret = 0;
  82. WCHAR* strA;
  83. int i = MultiByteToWideChar(CP_UTF8, 0, src, -1, NULL, 0);
  84. if (i <= 0) {
  85. printf("ERROR.");
  86. return;
  87. }
  88. strA = (WCHAR*)malloc(i * 2);
  89. MultiByteToWideChar(CP_UTF8, 0, src, -1, strA, i);
  90. i = WideCharToMultiByte(CP_ACP, 0, strA, -1, NULL, 0, NULL, NULL);
  91. if (len >= i) {
  92. ret = WideCharToMultiByte(CP_ACP, 0, strA, -1, dst, i, NULL, NULL);
  93. dst[i] = 0;
  94. }
  95. if (ret <= 0) {
  96. free(strA);
  97. return;
  98. }
  99. free(strA);
  100. }
  101. void gbk2utf8_2(const char* src, char* dst, int len)
  102. {
  103. int ret = 0;
  104. WCHAR* strA;
  105. int i = MultiByteToWideChar(CP_ACP, 0, src, -1, NULL, 0);
  106. if (i <= 0) {
  107. printf("ERROR.");
  108. return;
  109. }
  110. strA = (WCHAR*)malloc(i * 2);
  111. MultiByteToWideChar(CP_ACP, 0, src, -1, strA, i);
  112. i = WideCharToMultiByte(CP_UTF8, 0, strA, -1, NULL, 0, NULL, NULL);
  113. if (len >= i) {
  114. ret = WideCharToMultiByte(CP_UTF8, 0, strA, -1, dst, i, NULL, NULL);
  115. dst[i] = 0;
  116. }
  117. if (ret <= 0) {
  118. free(strA);
  119. return;
  120. }
  121. free(strA);
  122. }
  123. #else //Linux
  124. // starkwong: In iconv implementations, inlen and outlen should be type of size_t not uint, which is different in length on Mac
  /*
   * iconv build: convert the NUL-terminated UTF-8 string src to GBK into dst.
   *
   * len is the capacity of dst in bytes. The terminating NUL of src is converted
   * as well, so dst ends up NUL-terminated when the whole result fits.
   * On an iconv error a message is printed and dst holds whatever was produced
   * up to that point. Nothing is written when the converter cannot be opened.
   */
  void utf82gbk_2(const char* src, char* dst, int len)
  {
      size_t inlen;
      size_t outlen;
      char* incopy;
      char* inptr;
      char* outptr = dst;
      iconv_t cd;

      if (src == NULL || dst == NULL || len <= 0)
          return;

      inlen = strlen(src) + 1;
      outlen = (size_t)len;

      /* iconv() takes a non-const input pointer, so work on a private copy.
       * The copy is sized by the source length, not by the destination capacity. */
      incopy = (char*)malloc(inlen);
      if (incopy == NULL)
          return;
      memcpy(incopy, src, inlen);
      inptr = incopy; /* iconv() advances inptr; incopy keeps the address to free */

      cd = iconv_open("GBK", "UTF-8");
      if (cd != (iconv_t)-1) {
          /* iconv() reports failure as (size_t)-1; other values are not errors. */
          if (iconv(cd, &inptr, &inlen, &outptr, &outlen) == (size_t)-1) {
              printf("iconv failed err: %s\n", strerror(errno));
          }
          iconv_close(cd);
      }
      free(incopy);
  }
  /*
   * iconv build: convert the NUL-terminated GBK string src to UTF-8 into dst.
   *
   * len is the capacity of dst in bytes. src and dst may be the same buffer: in
   * that case the result is built in a scratch buffer and copied back, so dst is
   * always left NUL-terminated within len bytes when there is room.
   * On an iconv error a message is printed. Nothing is written when the
   * converter cannot be opened.
   */
  void gbk2utf8_2(const char* src, char* dst, int len)
  {
      size_t inlen;
      size_t capacity;
      size_t outlen;
      char* incopy;
      char* inptr;
      char* scratch = NULL;
      char* outptr = dst;
      iconv_t cd;

      if (src == NULL || dst == NULL || len <= 0)
          return;

      inlen = strlen(src) + 1;
      capacity = (size_t)len;
      outlen = capacity;

      /* iconv() takes a non-const input pointer, so work on a private copy.
       * The copy is sized by the source length, not by the destination capacity. */
      incopy = (char*)malloc(inlen);
      if (incopy == NULL)
          return;
      memcpy(incopy, src, inlen);
      inptr = incopy; /* iconv() advances inptr; incopy keeps the address to free */

      /* In-place call: UTF-8 output is usually longer than the GBK input, so the
       * result is staged in a zero-filled scratch buffer. */
      if (src == dst) {
          scratch = (char*)calloc(capacity, 1);
          if (scratch == NULL) {
              free(incopy);
              return;
          }
          outptr = scratch;
      }

      cd = iconv_open("UTF-8", "GBK");
      if (cd != (iconv_t)-1) {
          /* iconv() reports failure as (size_t)-1; other values are not errors. */
          if (iconv(cd, &inptr, &inlen, &outptr, &outlen) == (size_t)-1)
              printf("iconv failed err: %s\n", strerror(errno));
          iconv_close(cd);

          if (scratch != NULL) {
              /* Copy back exactly what was produced, never more than len bytes. */
              const size_t produced = capacity - outlen;

              memcpy(dst, scratch, produced);
              if (produced < capacity && (produced == 0 || scratch[produced - 1] != '\0'))
                  dst[produced] = '\0';
          }
      }

      /* Released on every path, including when the converter could not be opened. */
      free(scratch);
      free(incopy);
  }
  181. #endif
  /* Returns non-zero when lo <= b <= hi. */
  static int utf8_byte_between(unsigned char b, unsigned char lo, unsigned char hi)
  {
      return lo <= b && b <= hi;
  }

  /*
   * Strict check that the NUL-terminated string str is well-formed UTF-8 text.
   *
   * Accepts tab, LF, CR and printable ASCII (0x20..0x7E); every other ASCII
   * control byte is rejected. Multi-byte sequences must be shortest-form, must
   * not encode surrogates, and must not exceed U+10FFFF.
   *
   * Returns 1 when str is NULL or entirely valid, 0 at the first offending byte.
   */
  static int is_utf8(const char* str)
  {
      const unsigned char* p = (const unsigned char*)str;

      if (str == NULL)
          return 1;

      for (;;) {
          const unsigned char lead = p[0];
          unsigned char second_lo = 0x80;
          unsigned char second_hi = 0xBF;

          if (lead == 0x00)
              return 1;

          /* ASCII text bytes. */
          if (lead == 0x09 || lead == 0x0A || lead == 0x0D || utf8_byte_between(lead, 0x20, 0x7E)) {
              p += 1;
              continue;
          }

          /* Two bytes; 0xC0 and 0xC1 would be overlong and are not accepted. */
          if (utf8_byte_between(lead, 0xC2, 0xDF)) {
              if (!utf8_byte_between(p[1], 0x80, 0xBF))
                  return 0;
              p += 2;
              continue;
          }

          /* Three bytes; the second byte range depends on the lead byte. */
          if (utf8_byte_between(lead, 0xE0, 0xEF)) {
              if (lead == 0xE0)
                  second_lo = 0xA0; /* excludes overlong forms */
              if (lead == 0xED)
                  second_hi = 0x9F; /* excludes surrogates */
              if (!(utf8_byte_between(p[1], second_lo, second_hi) &&
                    utf8_byte_between(p[2], 0x80, 0xBF)))
                  return 0;
              p += 3;
              continue;
          }

          /* Four bytes; the second byte range depends on the lead byte. */
          if (utf8_byte_between(lead, 0xF0, 0xF4)) {
              if (lead == 0xF0)
                  second_lo = 0x90; /* excludes overlong forms */
              if (lead == 0xF4)
                  second_hi = 0x8F; /* caps at U+10FFFF */
              if (!(utf8_byte_between(p[1], second_lo, second_hi) &&
                    utf8_byte_between(p[2], 0x80, 0xBF) &&
                    utf8_byte_between(p[3], 0x80, 0xBF)))
                  return 0;
              p += 4;
              continue;
          }

          return 0;
      }
  }
  /*
   * Check that the NUL-terminated string str is valid UTF-8 by decoding each
   * sequence and validating the resulting code point.
   *
   * Rejects stray or missing continuation bytes, lead bytes of five or more
   * byte forms, code points above U+10FFFF, surrogates (U+D800..U+DFFF) and
   * overlong encodings. All ASCII bytes, including control bytes, are accepted.
   *
   * Returns 1 when str is NULL or valid, 0 otherwise.
   */
  int is_valid_utf8(const char* str)
  {
      const unsigned char* p = (const unsigned char*)str;

      if (str == NULL)
          return 1;

      while (*p != 0x00) {
          const unsigned char lead = *p++;
          unsigned int cp;
          int width;
          int shortest;
          int remaining;

          /* Classify the lead byte: sequence width and its payload bits. */
          if ((lead & 0x80) == 0x00) {
              width = 1;
              cp = (lead & 0x7F);
          } else if ((lead & 0xE0) == 0xC0) {
              width = 2;
              cp = (lead & 0x1F);
          } else if ((lead & 0xF0) == 0xE0) {
              width = 3;
              cp = (lead & 0x0F);
          } else if ((lead & 0xF8) == 0xF0) {
              width = 4;
              cp = (lead & 0x07);
          } else {
              return 0;
          }

          /* Fold in the continuation bytes; a NUL terminator fails the 10xxxxxx test. */
          for (remaining = width - 1; remaining > 0; --remaining) {
              if ((*p & 0xC0) != 0x80)
                  return 0;
              cp = (cp << 6) | (*p & 0x3F);
              ++p;
          }

          if (cp > 0x10FFFF)
              return 0;
          if (cp >= 0xD800 && cp <= 0xDFFF)
              return 0;

          /* The sequence must use the shortest width able to hold the code point. */
          if (cp <= 0x007F)
              shortest = 1;
          else if (cp <= 0x07FF)
              shortest = 2;
          else if (cp <= 0xFFFF)
              shortest = 3;
          else
              shortest = 4;
          if (width != shortest)
              return 0;
      }
      return 1;
  }
  323. int s2w(char* locale_charset, char* inbuf, size_t inlen, wchar_t* outbuf, size_t outlen)
  324. {
  325. #ifdef _MSC_VER
  326. wchar_t* wstr = NULL;
  327. int n = MultiByteToWideChar(CP_ACP, 0, inbuf, -1, NULL, 0);
  328. if (outbuf == NULL || outlen == 0)
  329. return n;
  330. if (n > 0) {
  331. wstr = malloc(sizeof(wchar_t) * (n + 1));
  332. if (wstr == NULL) {
  333. return 0;
  334. }
  335. memset(wstr, 0, (n + 1) * sizeof(wchar_t));
  336. MultiByteToWideChar(CP_ACP, 0, inbuf, -1, &wstr[0], n);
  337. wcscpy_s(outbuf, outlen, wstr);
  338. free(wstr);
  339. return n;
  340. }
  341. return 0;
  342. #else
  343. unsigned len = 0;
  344. if (inbuf == NULL || inlen == 0) {
  345. return 0;
  346. }
  347. const char* origin = setlocale(LC_CTYPE, NULL);
  348. WLog_DBG(TAG, "%s: origin locale: %s, aim locale: %s, data:%s", __FUNCTION__, origin, locale_charset, inbuf);
  349. if (NULL == setlocale(LC_CTYPE, locale_charset)) {
  350. WLog_DBG(TAG, "setlocale with \"%s\" failed: %d", locale_charset, errno);
  351. return -1;
  352. }
  353. len = mbstowcs(NULL, inbuf, inlen);
  354. if (len == (unsigned)-1) {
  355. WLog_DBG(TAG, "mbstowcs failed: %d, %s", errno, strerror(errno));
  356. goto on_end;
  357. }
  358. len += 1;
  359. if (outbuf == NULL || outlen == 0) {
  360. WLog_DBG(TAG, "mbstowcs to fetch need capacity: %d", len);
  361. goto on_end;
  362. }
  363. wmemset(outbuf, 0, outlen);
  364. len = mbstowcs(outbuf, inbuf, outlen > len ? len : outlen);
  365. if (len == (unsigned)-1) {
  366. WLog_DBG(TAG, "mbstowcs after new malloc failed: %d", errno);
  367. goto on_end;
  368. }
  369. WLog_DBG(TAG, "after mbstowcs returned: %ls(%d)", outbuf, len);
  370. on_end:
  371. setlocale(LC_CTYPE, origin);
  372. return len;
  373. #endif
  374. }
  375. int w2s(char* locale_charset, wchar_t* inbuf, size_t inlen, char* outbuf, size_t outlen)
  376. {
  377. #if defined(_MSC_VER)
  378. char* str = NULL;
  379. int n = 0;
  380. n = WideCharToMultiByte(CP_ACP, 0, inbuf, -1, NULL, 0, NULL, NULL);
  381. if (outbuf == NULL || outlen == 0)
  382. return n;
  383. if (n > 0) {
  384. str = malloc(sizeof(char) * (n + 1));
  385. if (str == NULL) {
  386. return 0;
  387. }
  388. memset(str, 0, sizeof(char) * (n + 1));
  389. WideCharToMultiByte(CP_ACP, 0, inbuf, -1, &str[0], n, NULL, NULL);
  390. memset(outbuf, 0, sizeof(char) * (outlen));
  391. strcpy_s(outbuf, outlen, str);
  392. free(str);
  393. return n;
  394. }
  395. return 0;
  396. #else
  397. unsigned len;
  398. char* str = NULL;
  399. char* origin = NULL;
  400. len = wcslen(inbuf);
  401. if (inbuf == NULL || inlen == 0) {
  402. return 0;
  403. }
  404. origin = setlocale(LC_CTYPE, NULL);
  405. WLog_DBG(TAG, "%s: origin locale: %s, aim locale: %s, data: %ls", __FUNCTION__, origin, locale_charset, inbuf);
  406. if (NULL == setlocale(LC_CTYPE, locale_charset)) {
  407. WLog_ERR(TAG, "setlocale with \"%s\" failed: %d", locale_charset, errno);
  408. }
  409. len = wcstombs(NULL, inbuf, inlen);
  410. if (len == (unsigned)-1) {
  411. WLog_DBG(TAG, "wcstombs failed: %d", errno);
  412. goto on_end;
  413. }
  414. len += 1;
  415. if (outbuf == NULL || outlen == 0) {
  416. WLog_DBG(TAG, "wcstombs to fetch need capacity: %d", len);
  417. goto on_end;
  418. }
  419. memset(outbuf, 0, outlen * sizeof(char));
  420. len = wcstombs(outbuf, inbuf, len > outlen ? outlen : len);
  421. if (len == (unsigned)-1) {
  422. WLog_DBG(TAG, "wcstombs after new malloc failed: %d", errno);
  423. goto on_end;
  424. }
  425. WLog_DBG(TAG, "after wcstombs returned: %s(%d)", outbuf, len);
  426. on_end:
  427. setlocale(LC_CTYPE, origin);
  428. return len;
  429. #endif //_MSC_VER
  430. }
  431. TOOLKIT_API int toolkit_utf82gbk(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
  432. {
  433. //return code_convert("utf-8", "gb2312", inbuf, inlen, outbuf, outlen);
  434. int in = 0;
  435. wchar_t* p = NULL;
  436. int n = 0;
  437. n = s2w("zh_CN.utf8", inbuf, inlen, NULL, 0);
  438. if (n <= 0) { return n; }
  439. p = malloc(sizeof(wchar_t) * n);
  440. if (p == NULL) {
  441. return 0;
  442. }
  443. wmemset(p, 0, n);
  444. n = s2w("zh_CN.utf8", inbuf, inlen, p, n);
  445. n = w2s("zh_CN.gbk", p, n, outbuf, outlen);
  446. free(p);
  447. return n;
  448. }
  449. TOOLKIT_API int toolkit_gbk2utf8(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
  450. {
  451. //return code_convert("gb2312", "utf-8", inbuf, inlen, outbuf, outlen);
  452. int in = 0;
  453. wchar_t* p = NULL;
  454. int n = 0;
  455. n = s2w("zh_CN.gbk", inbuf, inlen, NULL, 0);
  456. if (n <= 0) { return n; }
  457. p = malloc(sizeof(wchar_t) * n);
  458. wmemset(p, 0, n);
  459. n = s2w("zh_CN.gbk", inbuf, inlen, p, n);
  460. n = w2s("zh_CN.utf8", p, n, outbuf, outlen);
  461. free(p);
  462. return n;
  463. }
  464. /*when the text file is UTF-8 without BOM, the following function will mistakenly regard it as a ANSI file.*/
  465. TOOLKIT_API char_encoding detect_file_encoding(const char* file_path)
  466. {
  467. FILE* fp = NULL;
  468. long file_size = 0;
  469. char* buf = NULL;
  470. char_encoding result = unknown;
  471. unsigned char two[20];
  472. unsigned char unic[] = { 0xFF, 0xFE, 0x00 }; // Unicode file header
  473. unsigned char unic_big[] = { 0xFE, 0xFF,0x00 }; // Unicode big endian file header
  474. unsigned char utf8[] = { 0xEF, 0xBB, 0xBF }; // UTF_8 file header
  475. fp = fopen(file_path, "rb");
  476. if (NULL == fp) {
  477. WLog_ERR(TAG, "fopen(%s) failed: %d", file_path, errno);
  478. return result;
  479. }
  480. fseek(fp, 0, SEEK_SET);
  481. memset(two, 0, sizeof(two));
  482. if (fread(two, 1, 2, fp) != 2)
  483. goto on_end;
  484. if (two[0] == unic[0] && two[1] == unic[1])
  485. result = unicode;
  486. else if (two[0] == unic_big[0] && two[1] == unic_big[1])
  487. result = unicode_with_big_endian;
  488. else if (two[0] == utf8[0] && two[1] == utf8[1] /*&& two[2] == utf8[2]*/)
  489. result = unicode_with_bom;
  490. else
  491. result = ansi;
  492. on_end:
  493. WLog_DBG(TAG, "firstchar 0x%X, 0x%X: %d", two[0], two[1], result);
  494. fclose(fp);
  495. return result;
  496. }
  497. TOOLKIT_API int toolkit_detect_utf8_file(const char* file_path)
  498. {
  499. FILE* f = NULL;
  500. long file_size = 0;
  501. char* buf = NULL;
  502. int result = 1;
  503. f = fopen(file_path, "rb");
  504. if (NULL == f) {
  505. return -1;
  506. }
  507. fseek(f, 0, SEEK_END);
  508. file_size = ftell(f);
  509. if (0 == file_size) {
  510. fclose(f);
  511. return 1;
  512. }
  513. fseek(f, 0, SEEK_SET);
  514. buf = (char*)malloc((file_size+1) * sizeof(char));
  515. memset(buf, 0, file_size + 1);
  516. fread(buf, file_size, 1, f);
  517. fclose(f);
  518. result = is_utf8(buf);
  519. free(buf);
  520. return result;
  521. }
  522. TOOLKIT_API int toolkit_detect_utf8_str(const char* str)
  523. {
  524. //if (validate_utf8(str, strlen(str)) == UTF8_ACCEPT) return 1;
  525. return is_utf8(str);
  526. }
  /*
   * Lenient structural UTF-8 check on the NUL-terminated string str.
   *
   * Only the lead-byte / continuation-byte pattern is verified; the legacy
   * 5- and 6-byte forms are accepted, and overlong encodings or surrogates are
   * not detected. Pure ASCII counts as UTF-8.
   *
   * Returns 1 when str is NULL or every sequence is structurally complete,
   * 0 otherwise.
   */
  TOOLKIT_API int toolkit_detect_utf8_str2(const char* str)
  {
      const unsigned char* p;
      unsigned int pending = 0; /* continuation bytes still expected */

      if (str == NULL) {
          return 1;
      }

      for (p = (const unsigned char*)str; *p != '\0'; ++p) {
          const unsigned char chr = *p;

          if (pending > 0) {
              /* Inside a multi-byte sequence: the byte must look like 10xxxxxx. */
              if ((chr & 0xC0) != 0x80) {
                  return 0;
              }
              pending--;
              continue;
          }

          /* ASCII uses seven bits; the top bit is clear (0xxxxxxx). */
          if (chr < 0x80) {
              continue;
          }

          /* Lead byte: derive the number of continuation bytes that must follow. */
          if (chr >= 0xFE) {
              return 0; /* 0xFE and 0xFF never start a sequence */
          } else if (chr >= 0xFC) {
              pending = 5;
          } else if (chr >= 0xF8) {
              pending = 4;
          } else if (chr >= 0xF0) {
              pending = 3;
          } else if (chr >= 0xE0) {
              pending = 2;
          } else if (chr >= 0xC0) {
              pending = 1;
          } else {
              return 0; /* continuation byte without a lead byte */
          }
      }

      /* A sequence cut short by the end of the string breaks the encoding rules. */
      return pending == 0;
  }
  580. size_t toolkit_mbs2wcs(const char* src, wchar_t* dst, size_t dst_size)
  581. {
  582. #ifdef _MSC_VER
  583. wchar_t* wstr = NULL;
  584. int n = MultiByteToWideChar(CP_ACP, 0, src, -1, NULL, 0);
  585. if (n > 0) {
  586. wstr = malloc(sizeof(wchar_t) * (n + 1));
  587. if (wstr == NULL) {
  588. return 0;
  589. }
  590. memset(wstr, 0, (n + 1) * sizeof(wchar_t));
  591. MultiByteToWideChar(CP_ACP, 0, src, -1, &wstr[0], n);
  592. wcscpy(dst, wstr);
  593. FREE(wstr);
  594. return n;
  595. }
  596. return 0;
  597. #else
  598. unsigned len = 0;
  599. wchar_t* p = NULL;
  600. if (strlen(src) == 0) {
  601. return 0;
  602. }
  603. const char* origin = setlocale(LC_CTYPE, NULL);
  604. WLog_DBG(TAG, "%s: origin locale: %s, data:%s", __FUNCTION__, origin, src);
  605. setlocale(LC_CTYPE, "");
  606. len = mbstowcs(NULL, src, 0);
  607. if (len == -1) {
  608. WLog_DBG(TAG, "mbstowcs failed: %d", errno);
  609. goto on_end;
  610. }
  611. len += 1;
  612. if (dst == NULL) {
  613. WLog_DBG(TAG, "mbstowcs to fetch need capacity: %d", len);
  614. goto on_end;
  615. }
  616. p = malloc(sizeof(wchar_t) * len);
  617. wmemset(p, 0, len);
  618. len = mbstowcs(p, src, len);
  619. if (len == -1) {
  620. WLog_DBG(TAG, "mbstowcs after new malloc failed: %d", errno);
  621. goto on_del;
  622. }
  623. wmemset(dst, 0, dst_size);
  624. wcscpy(dst, p);
  625. len = wcslen(dst);
  626. WLog_DBG(TAG, "after mbstowcs returned: %ls(%d)", dst, len);
  627. on_del:
  628. free(p);
  629. on_end:
  630. setlocale(LC_CTYPE, origin);
  631. return len;
  632. #endif
  633. }
  634. size_t toolkit_wcs2mbs(const wchar_t* src, char* dst, size_t dst_size)
  635. {
  636. #if defined(_MSC_VER)
  637. char* str = NULL;
  638. int n = 0;
  639. n = WideCharToMultiByte(CP_ACP, 0, src, -1, NULL, 0, NULL, NULL);
  640. if (n > 0) {
  641. str = malloc(sizeof(char) * (n + 1));
  642. if (str == NULL) {
  643. return 0;
  644. }
  645. memset(str, 0, sizeof(char) * (n + 1));
  646. WideCharToMultiByte(CP_ACP, 0, src, -1, &str[0], n, NULL, NULL);
  647. memset(dst, 0, sizeof(char) * (dst_size));
  648. strcpy(dst, str);
  649. FREE(str);
  650. return n;
  651. }
  652. return 0;
  653. #else
  654. unsigned len;
  655. char* str = NULL;
  656. char* origin = NULL;
  657. len = wcslen(src);
  658. if (len == 0) {
  659. return 0;
  660. }
  661. origin = setlocale(LC_CTYPE, NULL);
  662. WLog_DBG(TAG, "%s: origin locale: %s, data: %ls", __FUNCTION__, origin, src);
  663. setlocale(LC_CTYPE, "");
  664. len = wcstombs(NULL, src, 0);
  665. if (len == -1) {
  666. WLog_DBG(TAG, "wcstombs failed: %d", errno);
  667. goto on_end;
  668. }
  669. len += 1;
  670. if (dst == NULL) {
  671. WLog_DBG(TAG, "wcstombs to fetch need capacity: %d", len);
  672. goto on_end;
  673. }
  674. str = malloc(sizeof(char) * len);
  675. memset(str, 0, len * sizeof(char));
  676. len = wcstombs(str, src, len);
  677. if (len == -1) {
  678. WLog_DBG(TAG, "wcstombs after new malloc failed: %d", errno);
  679. goto on_del;
  680. }
  681. memset(dst, 0, dst_size * sizeof(char));
  682. strcpy(dst, str);
  683. len = strlen(dst);
  684. WLog_DBG(TAG, "after wcstombs returned: %s(%d)", dst, len);
  685. on_del:
  686. free(str);
  687. on_end:
  688. setlocale(LC_CTYPE, origin);
  689. return len;
  690. #endif //_MSC_VER
  691. }
  692. TOOLKIT_API char* ConvertGBKToUtf8(const char* gbk, int* n)
  693. {
  694. #ifdef _MSC_VER
  695. int len = MultiByteToWideChar(CP_ACP, 0, gbk, -1, NULL, 0);
  696. WCHAR* wszGBK = malloc(sizeof(WCHAR) * (len + 1));
  697. memset(wszGBK, 0, len * 2 + 2);
  698. MultiByteToWideChar(CP_ACP, 0, gbk, -1, wszGBK, len);
  699. len = WideCharToMultiByte(CP_UTF8, 0, wszGBK, -1, NULL, 0, NULL, NULL);
  700. char* szUtf8 = malloc(sizeof(char) * (len + 1));
  701. memset(szUtf8, 0, len + 1);
  702. WideCharToMultiByte(CP_UTF8, 0, wszGBK, -1, szUtf8, len, NULL, NULL);
  703. free(wszGBK);
  704. *n = len - 1;
  705. return szUtf8;
  706. #else
  707. return NULL;
  708. #endif
  709. }
  710. TOOLKIT_API char* ConvertUtf8ToGBK(const char* strUtf8)
  711. {
  712. #ifdef _MSC_VER
  713. int len = MultiByteToWideChar(CP_UTF8, 0, strUtf8, -1, NULL, 0);
  714. WCHAR* wszGBK = malloc(sizeof(WCHAR)*(len + 1));
  715. memset(wszGBK, 0, len * 2 + 2);
  716. MultiByteToWideChar(CP_UTF8, 0, strUtf8, -1, wszGBK, len);
  717. len = WideCharToMultiByte(CP_ACP, 0, wszGBK, -1, NULL, 0, NULL, NULL);
  718. char* szGBK = malloc(sizeof(char) * (len + 1));
  719. memset(szGBK, 0, len + 1);
  720. WideCharToMultiByte(CP_ACP, 0, wszGBK, -1, szGBK, len, NULL, NULL);
  721. free(wszGBK);
  722. return szGBK;
  723. #else
  724. return NULL;
  725. #endif
  726. }