charset.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606
  1. #include "precompile.h"
  2. #include "charset.h"
  3. #include <stdlib.h>
  4. #include <stdio.h>
  5. #include <fcntl.h>
  6. #include <string.h>
  7. #ifdef _WIN32
  8. #include <windows.h>
  9. #else
  10. #include <unistd.h>
  11. #include <iconv.h>
  12. #include <locale.h>
  13. #include <sys/stat.h>
  14. #include <errno.h>
  15. #endif
  16. #define TAG TOOLKIT_TAG("charaset")
  17. // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
  18. // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
  // DFA states that callers compare against: ACCEPT means a complete, valid
  // sequence has just been consumed; REJECT means the input is invalid.
  #define UTF8_ACCEPT 0
  #define UTF8_REJECT 1
  // Combined lookup table for the UTF-8 decoder DFA.
  //  - Entries 0..255 map an input byte to its character class.
  //  - Entries from 256 onward are the transition table; it is indexed as
  //    256 + state * 16 + class and yields the next state.
  static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
  };
  37. static uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte)
  38. {
  39. uint32_t type = utf8d[byte];
  40. *codep = (*state != UTF8_ACCEPT) ?
  41. (byte & 0x3fu) | (*codep << 6) :
  42. (0xff >> type) & (byte);
  43. *state = utf8d[256 + *state * 16 + type];
  44. return *state;
  45. }
  46. uint32_t validate_utf8(char* str, size_t len)
  47. {
  48. size_t i;
  49. uint32_t type;
  50. uint32_t state = UTF8_ACCEPT;
  51. for (i = 0; i < len; i++) {
  52. // We don't care about the codepoint, so this is a simplified version of the decode function.
  53. type = utf8d[(uint8_t)str[i]];
  54. state = utf8d[256 + (state) * 16 + type];
  55. if (state == UTF8_REJECT)
  56. break;
  57. }
  58. return state;
  59. }
  60. #ifndef _WIN32
  // Convert inlen bytes of inbuf from from_charset to to_charset with iconv.
  //   outbuf/outlen - destination buffer and its capacity in bytes. The buffer
  //                   is zero-filled before converting, so the result is
  //                   NUL-terminated whenever the output does not fill it.
  // Returns 0 on success, -1 when the conversion descriptor cannot be opened,
  // when outbuf is NULL, or when iconv reports an error.
  int code_convert(char* from_charset, char* to_charset, char* inbuf, size_t inlen, char* outbuf, size_t outlen)
  {
      iconv_t cd;
      char* in_cursor = inbuf;
      char* out_cursor = outbuf;
      size_t in_left = inlen;
      size_t out_left = outlen;
      int rc = 0;

      if (outbuf == NULL)
          return -1;

      // iconv_open signals failure with (iconv_t)-1, never with 0.
      cd = iconv_open(to_charset, from_charset);
      if (cd == (iconv_t)-1)
          return -1;

      memset(outbuf, 0, outlen);
      if (iconv(cd, &in_cursor, &in_left, &out_cursor, &out_left) == (size_t)-1) {
          rc = -1;
      } else if (out_left > 0) {
          // Terminate the output itself; only written when a byte is left.
          *out_cursor = '\0';
      }

      // Close the descriptor on every path so a failed conversion does not leak it.
      iconv_close(cd);
      return rc;
  }
  76. #endif //NOT _WIN32
  77. #ifdef _WIN32
  78. void utf82gbk_2(const char* src, char* dst, int len)
  79. {
  80. int ret = 0;
  81. WCHAR* strA;
  82. int i = MultiByteToWideChar(CP_UTF8, 0, src, -1, NULL, 0);
  83. if (i <= 0) {
  84. printf("ERROR.");
  85. return;
  86. }
  87. strA = (WCHAR*)malloc(i * 2);
  88. MultiByteToWideChar(CP_UTF8, 0, src, -1, strA, i);
  89. i = WideCharToMultiByte(CP_ACP, 0, strA, -1, NULL, 0, NULL, NULL);
  90. if (len >= i) {
  91. ret = WideCharToMultiByte(CP_ACP, 0, strA, -1, dst, i, NULL, NULL);
  92. dst[i] = 0;
  93. }
  94. if (ret <= 0) {
  95. free(strA);
  96. return;
  97. }
  98. free(strA);
  99. }
  100. void gbk2utf8_2(const char* src, char* dst, int len)
  101. {
  102. int ret = 0;
  103. WCHAR* strA;
  104. int i = MultiByteToWideChar(CP_ACP, 0, src, -1, NULL, 0);
  105. if (i <= 0) {
  106. printf("ERROR.");
  107. return;
  108. }
  109. strA = (WCHAR*)malloc(i * 2);
  110. MultiByteToWideChar(CP_ACP, 0, src, -1, strA, i);
  111. i = WideCharToMultiByte(CP_UTF8, 0, strA, -1, NULL, 0, NULL, NULL);
  112. if (len >= i) {
  113. ret = WideCharToMultiByte(CP_UTF8, 0, strA, -1, dst, i, NULL, NULL);
  114. dst[i] = 0;
  115. }
  116. if (ret <= 0) {
  117. free(strA);
  118. return;
  119. }
  120. free(strA);
  121. }
  122. #else //Linux
  123. // starkwong: In iconv implementations, inlen and outlen should be type of size_t not uint, which is different in length on Mac
  // Convert the NUL-terminated UTF-8 string src to GBK and store it in dst.
  //   len - capacity of dst in bytes, terminator included.
  // When the conversion descriptor can be opened, dst is always NUL-terminated
  // on return; if iconv fails part-way (for example because dst is too small)
  // dst holds the characters converted up to that point and the error is
  // printed. dst is left untouched when an argument is NULL, len <= 0, memory
  // cannot be allocated, or iconv_open fails.
  void utf82gbk_2(const char* src, char* dst, int len)
  {
      size_t inlen;
      size_t outlen;
      char* inbuf;
      char* inbuf_hold;
      char* outbuf = dst;
      iconv_t cd;

      if (src == NULL || dst == NULL || len <= 0) {
          return;
      }

      inlen = strlen(src);
      // Keep one byte of dst back for the terminator written below.
      outlen = (size_t)len - 1;

      // iconv wants a non-const input pointer and advances it, so convert from
      // a private copy. The copy is sized by the source, not by dst's capacity.
      inbuf_hold = malloc(inlen + 1);
      if (inbuf_hold == NULL) {
          return;
      }
      memcpy(inbuf_hold, src, inlen + 1);
      inbuf = inbuf_hold;

      cd = iconv_open("GBK", "UTF-8");
      if (cd != (iconv_t)-1) {
          // iconv reports failure as (size_t)-1; a positive result is a count
          // of non-reversible conversions, not an error.
          if (iconv(cd, &inbuf, &inlen, &outbuf, &outlen) == (size_t)-1) {
              printf("iconv failed err: %s\n", strerror(errno));
          }
          *outbuf = '\0';
          iconv_close(cd);
      }
      free(inbuf_hold); // free the saved pointer; iconv has advanced inbuf
  }
  // Convert the NUL-terminated GBK string src to UTF-8 and store it in dst.
  //   len - capacity of dst in bytes, terminator included.
  // src and dst may be the same buffer: the input is copied before any output
  // is written, so growing from 2-byte GBK to 3-byte UTF-8 cannot clobber
  // unread input.
  // When the conversion descriptor can be opened, dst is always NUL-terminated
  // on return; if iconv fails part-way (for example because dst is too small)
  // dst holds the characters converted up to that point and the error is
  // printed. dst is left untouched when an argument is NULL, len <= 0, memory
  // cannot be allocated, or iconv_open fails.
  void gbk2utf8_2(const char* src, char* dst, int len)
  {
      size_t inlen;
      size_t outlen;
      char* inbuf;
      char* inbuf_hold;
      char* outbuf = dst;
      iconv_t cd;

      if (src == NULL || dst == NULL || len <= 0) {
          return;
      }

      inlen = strlen(src);
      // Keep one byte of dst back for the terminator written below.
      outlen = (size_t)len - 1;

      // iconv wants a non-const input pointer and advances it, so convert from
      // a private copy. The copy is sized by the source, not by dst's capacity.
      inbuf_hold = malloc(inlen + 1);
      if (inbuf_hold == NULL) {
          return;
      }
      memcpy(inbuf_hold, src, inlen + 1);
      inbuf = inbuf_hold;

      cd = iconv_open("UTF-8", "GBK");
      if (cd != (iconv_t)-1) {
          // iconv reports failure as (size_t)-1; a positive result is a count
          // of non-reversible conversions, not an error.
          if (iconv(cd, &inbuf, &inlen, &outbuf, &outlen) == (size_t)-1) {
              printf("iconv failed err: %s\n", strerror(errno));
          }
          *outbuf = '\0';
          iconv_close(cd);
      }
      free(inbuf_hold); // free the saved pointer; iconv has advanced inbuf
  }
  180. #endif
  // Check whether the NUL-terminated string str is well-formed UTF-8 text.
  // ASCII is restricted to tab, LF, CR and the printable range 0x20..0x7E;
  // other control characters and DEL are rejected. Multi-byte sequences must
  // be shortest-form, must not encode surrogates, and must not exceed U+10FFFF.
  // Returns 1 when the string is acceptable (a NULL pointer also yields 1),
  // 0 otherwise.
  static int is_utf8(const char* str)
  {
      const unsigned char* p = (const unsigned char*)str;

      if (str == NULL)
          return 1;

      while (*p != 0) {
          const unsigned char lead = p[0];
          // Allowed range of the second byte; narrowed for some lead bytes below.
          unsigned char second_lo = 0x80;
          unsigned char second_hi = 0xBF;
          int tail; // number of bytes that follow the lead byte
          int k;

          if (lead == 0x09 || lead == 0x0A || lead == 0x0D ||
              (lead >= 0x20 && lead <= 0x7E)) {
              p += 1;
              continue;
          }

          if (lead >= 0xC2 && lead <= 0xDF) {
              // non-overlong 2-byte
              tail = 1;
          } else if (lead >= 0xE0 && lead <= 0xEF) {
              tail = 2;
              if (lead == 0xE0)
                  second_lo = 0xA0; // excluding overlongs
              else if (lead == 0xED)
                  second_hi = 0x9F; // excluding surrogates
          } else if (lead >= 0xF0 && lead <= 0xF4) {
              tail = 3;
              if (lead == 0xF0)
                  second_lo = 0x90; // planes 1-3
              else if (lead == 0xF4)
                  second_hi = 0x8F; // plane 16
          } else {
              return 0;
          }

          // Bytes are examined strictly in order and the scan stops at the
          // first mismatch, so a terminating NUL is never stepped over.
          if (p[1] < second_lo || p[1] > second_hi)
              return 0;
          for (k = 2; k <= tail; ++k) {
              if (p[k] < 0x80 || p[k] > 0xBF)
                  return 0;
          }
          p += tail + 1;
      }
      return 1;
  }
  // Check whether the NUL-terminated string str is valid UTF-8 by decoding
  // every sequence and inspecting the resulting code point. Rejected are:
  // invalid lead or continuation bytes, code points above U+10FFFF,
  // surrogates (U+D800..U+DFFF) and overlong encodings.
  // Returns 1 when valid (a NULL pointer also yields 1), 0 otherwise.
  int is_valid_utf8(const char* str)
  {
      // Smallest code point that legitimately needs a sequence of each width.
      static const unsigned int smallest[5] = { 0, 0x0000, 0x0080, 0x0800, 0x10000 };
      const unsigned char* p = (const unsigned char*)str;

      if (!str) return 1;

      for (;;) {
          const unsigned int lead = *p;
          unsigned int cp;
          int width, k;

          if (lead == 0x00)
              break;

          if (lead < 0x80) {
              // U+0000 to U+007F
              width = 1;
              cp = lead;
          } else if ((lead >> 5) == 0x06) {
              // U+0080 to U+07FF
              width = 2;
              cp = lead & 0x1F;
          } else if ((lead >> 4) == 0x0E) {
              // U+0800 to U+FFFF
              width = 3;
              cp = lead & 0x0F;
          } else if ((lead >> 3) == 0x1E) {
              // U+10000 to U+10FFFF
              width = 4;
              cp = lead & 0x07;
          } else {
              return 0;
          }

          ++p;
          for (k = 1; k < width; ++k, ++p) {
              // Every trailing byte must look like 10xxxxxx.
              if ((*p >> 6) != 0x02)
                  return 0;
              cp = (cp << 6) | (*p & 0x3F);
          }

          if (cp > 0x10FFFF)
              return 0;
          if (cp >= 0xD800 && cp <= 0xDFFF)
              return 0;
          // Overlong: the value would have fit in a shorter sequence.
          if (cp < smallest[width])
              return 0;
      }
      return 1;
  }
  322. int s2w(char* locale_charset, char* inbuf, size_t inlen, wchar_t* outbuf, size_t outlen)
  323. {
  324. #ifdef _MSC_VER
  325. wchar_t* wstr = NULL;
  326. int n = MultiByteToWideChar(CP_ACP, 0, inbuf, -1, NULL, 0);
  327. if (outbuf == NULL || outlen == 0)
  328. return n;
  329. if (n > 0) {
  330. wstr = malloc(sizeof(wchar_t) * (n + 1));
  331. if (wstr == NULL) {
  332. return 0;
  333. }
  334. memset(wstr, 0, (n + 1) * sizeof(wchar_t));
  335. MultiByteToWideChar(CP_ACP, 0, inbuf, -1, &wstr[0], n);
  336. wcscpy_s(outbuf, outlen, wstr);
  337. free(wstr);
  338. return n;
  339. }
  340. return 0;
  341. #else
  342. unsigned len = 0;
  343. if (inbuf == NULL || inlen == 0) {
  344. return 0;
  345. }
  346. const char* origin = setlocale(LC_CTYPE, NULL);
  347. WLog_DBG(TAG, "%s: origin locale: %s, aim locale: %s, data:%s", __FUNCTION__, origin, locale_charset, inbuf);
  348. if (NULL == setlocale(LC_CTYPE, locale_charset)) {
  349. WLog_DBG(TAG, "setlocale with \"%s\" failed: %d", locale_charset, errno);
  350. return -1;
  351. }
  352. len = mbstowcs(NULL, inbuf, inlen);
  353. if (len == (unsigned)-1) {
  354. WLog_DBG(TAG, "mbstowcs failed: %d, %s", errno, strerror(errno));
  355. goto on_end;
  356. }
  357. len += 1;
  358. if (outbuf == NULL || outlen == 0) {
  359. WLog_DBG(TAG, "mbstowcs to fetch need capacity: %d", len);
  360. goto on_end;
  361. }
  362. wmemset(outbuf, 0, outlen);
  363. len = mbstowcs(outbuf, inbuf, outlen > len ? len : outlen);
  364. if (len == (unsigned)-1) {
  365. WLog_DBG(TAG, "mbstowcs after new malloc failed: %d", errno);
  366. goto on_end;
  367. }
  368. WLog_DBG(TAG, "after mbstowcs returned: %ls(%d)", outbuf, len);
  369. on_end:
  370. setlocale(LC_CTYPE, origin);
  371. return len;
  372. #endif
  373. }
  374. int w2s(char* locale_charset, wchar_t* inbuf, size_t inlen, char* outbuf, size_t outlen)
  375. {
  376. #if defined(_MSC_VER)
  377. char* str = NULL;
  378. int n = 0;
  379. n = WideCharToMultiByte(CP_ACP, 0, inbuf, -1, NULL, 0, NULL, NULL);
  380. if (outbuf == NULL || outlen == 0)
  381. return n;
  382. if (n > 0) {
  383. str = malloc(sizeof(char) * (n + 1));
  384. if (str == NULL) {
  385. return 0;
  386. }
  387. memset(str, 0, sizeof(char) * (n + 1));
  388. WideCharToMultiByte(CP_ACP, 0, inbuf, -1, &str[0], n, NULL, NULL);
  389. memset(outbuf, 0, sizeof(char) * (outlen));
  390. strcpy_s(outbuf, outlen, str);
  391. free(str);
  392. return n;
  393. }
  394. return 0;
  395. #else
  396. unsigned len;
  397. char* str = NULL;
  398. char* origin = NULL;
  399. len = wcslen(inbuf);
  400. if (inbuf == NULL || inlen == 0) {
  401. return 0;
  402. }
  403. origin = setlocale(LC_CTYPE, NULL);
  404. WLog_DBG(TAG, "%s: origin locale: %s, aim locale: %s, data: %ls", __FUNCTION__, origin, locale_charset, inbuf);
  405. if (NULL == setlocale(LC_CTYPE, locale_charset)) {
  406. WLog_ERR(TAG, "setlocale with \"%s\" failed: %d", locale_charset, errno);
  407. }
  408. len = wcstombs(NULL, inbuf, inlen);
  409. if (len == (unsigned)-1) {
  410. WLog_DBG(TAG, "wcstombs failed: %d", errno);
  411. goto on_end;
  412. }
  413. len += 1;
  414. if (outbuf == NULL || outlen == 0) {
  415. WLog_DBG(TAG, "wcstombs to fetch need capacity: %d", len);
  416. goto on_end;
  417. }
  418. memset(outbuf, 0, outlen * sizeof(char));
  419. len = wcstombs(outbuf, inbuf, len > outlen ? outlen : len);
  420. if (len == (unsigned)-1) {
  421. WLog_DBG(TAG, "wcstombs after new malloc failed: %d", errno);
  422. goto on_end;
  423. }
  424. WLog_DBG(TAG, "after wcstombs returned: %s(%d)", outbuf, len);
  425. on_end:
  426. setlocale(LC_CTYPE, origin);
  427. return len;
  428. #endif //_MSC_VER
  429. }
  430. TOOLKIT_API int toolkit_utf82gbk(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
  431. {
  432. //return code_convert("utf-8", "gb2312", inbuf, inlen, outbuf, outlen);
  433. int in = 0;
  434. wchar_t* p = NULL;
  435. int n = 0;
  436. n = s2w("zh_CN.utf8", inbuf, inlen, NULL, 0);
  437. if (n <= 0) { return n; }
  438. p = malloc(sizeof(wchar_t) * n);
  439. if (p == NULL) {
  440. return 0;
  441. }
  442. wmemset(p, 0, n);
  443. n = s2w("zh_CN.utf8", inbuf, inlen, p, n);
  444. n = w2s("zh_CN.gbk", p, n, outbuf, outlen);
  445. free(p);
  446. return n;
  447. }
  448. TOOLKIT_API int toolkit_gbk2utf8(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
  449. {
  450. //return code_convert("gb2312", "utf-8", inbuf, inlen, outbuf, outlen);
  451. int in = 0;
  452. wchar_t* p = NULL;
  453. int n = 0;
  454. n = s2w("zh_CN.gbk", inbuf, inlen, NULL, 0);
  455. if (n <= 0) { return n; }
  456. p = malloc(sizeof(wchar_t) * n);
  457. wmemset(p, 0, n);
  458. n = s2w("zh_CN.gbk", inbuf, inlen, p, n);
  459. n = w2s("zh_CN.utf8", p, n, outbuf, outlen);
  460. free(p);
  461. return n;
  462. }
  463. /*when the text file is UTF-8 without BOM, the following function will mistakenly regard it as a ANSI file.*/
  464. TOOLKIT_API char_encoding detect_file_encoding(const char* file_path)
  465. {
  466. FILE* fp = NULL;
  467. long file_size = 0;
  468. char* buf = NULL;
  469. char_encoding result = unknown;
  470. unsigned char two[20];
  471. unsigned char unic[] = { 0xFF, 0xFE, 0x00 }; // Unicode file header
  472. unsigned char unic_big[] = { 0xFE, 0xFF,0x00 }; // Unicode big endian file header
  473. unsigned char utf8[] = { 0xEF, 0xBB, 0xBF }; // UTF_8 file header
  474. fp = fopen(file_path, "rb");
  475. if (NULL == fp) {
  476. WLog_ERR(TAG, "fopen(%s) failed: %d", file_path, errno);
  477. return result;
  478. }
  479. fseek(fp, 0, SEEK_SET);
  480. memset(two, 0, sizeof(two));
  481. if (fread(two, 1, 2, fp) != 2)
  482. goto on_end;
  483. if (two[0] == unic[0] && two[1] == unic[1])
  484. result = unicode;
  485. else if (two[0] == unic_big[0] && two[1] == unic_big[1])
  486. result = unicode_with_big_endian;
  487. else if (two[0] == utf8[0] && two[1] == utf8[1] /*&& two[2] == utf8[2]*/)
  488. result = unicode_with_bom;
  489. else
  490. result = ansi;
  491. on_end:
  492. WLog_DBG(TAG, "firstchar 0x%X, 0x%X: %d", two[0], two[1], result);
  493. fclose(fp);
  494. return result;
  495. }
  496. TOOLKIT_API int toolkit_detect_utf8_file(const char* file_path)
  497. {
  498. FILE* f = NULL;
  499. long file_size = 0;
  500. char* buf = NULL;
  501. int result = 1;
  502. f = fopen(file_path, "rb");
  503. if (NULL == f) {
  504. return -1;
  505. }
  506. fseek(f, 0, SEEK_END);
  507. file_size = ftell(f);
  508. if (0 == file_size) {
  509. fclose(f);
  510. return 1;
  511. }
  512. fseek(f, 0, SEEK_SET);
  513. buf = (char*)malloc((file_size+1) * sizeof(char));
  514. memset(buf, 0, file_size + 1);
  515. fread(buf, file_size, 1, f);
  516. fclose(f);
  517. result = is_utf8(buf);
  518. free(buf);
  519. return result;
  520. }
  521. TOOLKIT_API int toolkit_detect_utf8_str(const char* str)
  522. {
  523. //if (validate_utf8(str, strlen(str)) == UTF8_ACCEPT) return 1;
  524. return is_utf8(str);
  525. }