unicode.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522
  1. /**
  2. * WinPR: Windows Portable Runtime
  3. * Unicode Conversion (CRT)
  4. *
  5. * Copyright 2012 Marc-Andre Moreau <marcandre.moreau@gmail.com>
  6. *
  7. * Licensed under the Apache License, Version 2.0 (the "License");
  8. * you may not use this file except in compliance with the License.
  9. * You may obtain a copy of the License at
  10. *
  11. * http://www.apache.org/licenses/LICENSE-2.0
  12. *
  13. * Unless required by applicable law or agreed to in writing, software
  14. * distributed under the License is distributed on an "AS IS" BASIS,
  15. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. * See the License for the specific language governing permissions and
  17. * limitations under the License.
  18. */
  19. #ifdef HAVE_CONFIG_H
  20. #include "config.h"
  21. #endif
  22. #include <errno.h>
  23. #include <wctype.h>
  24. #include <winpr/crt.h>
  25. #include <winpr/error.h>
  26. #include <winpr/print.h>
  27. #ifndef _WIN32
  28. #if defined(WITH_ICU)
  29. #include <unicode/ucnv.h>
  30. #include <unicode/ustring.h>
  31. #else
  32. #include "utf.h"
  33. #endif
  34. #include "../log.h"
  35. #define TAG WINPR_TAG("unicode")
  36. /**
  37. * Notes on cross-platform Unicode portability:
  38. *
  39. * Unicode has many possible Unicode Transformation Format (UTF) encodings,
  40. * where some of the most commonly used are UTF-8, UTF-16 and sometimes UTF-32.
  41. *
  42. * The number in the UTF encoding name (8, 16, 32) refers to the number of bits
  43. * per code unit. A code unit is the minimal bit combination that can represent
  44. * a unit of encoded text in the given encoding. For instance, UTF-8 encodes
  45. * the English alphabet using 8 bits (or one byte) each, just like in ASCII.
  46. *
  47. * However, the total number of code points (values in the Unicode codespace)
  48. * only fits completely within 32 bits. This means that for UTF-8 and UTF-16,
  49. * more than one code unit may be required to fully encode a specific value.
  50. * UTF-8 and UTF-16 are variable-width encodings, while UTF-32 is fixed-width.
  51. *
  52. * UTF-8 has the advantage of being backwards compatible with ASCII, and is
  53. * one of the most commonly used Unicode encodings.
  54. *
  55. * UTF-16 is used everywhere in the Windows API. The strategy employed by
  56. * Microsoft to provide backwards compatibility in their API was to create
  57. * an ANSI and a Unicode version of the same function, ending with A (ANSI)
  58. * and W (Wide character, or UTF-16 Unicode). In headers, the original
  59. * function name is replaced by a macro that defines to either the ANSI
  60. * or Unicode version based on the definition of the _UNICODE macro.
  61. *
  62. * UTF-32 has the advantage of being fixed width, but wastes a lot of space
  63. * for English text (4x more than UTF-8, 2x more than UTF-16).
  64. *
  65. * In C, wide character strings are often defined with the wchar_t type.
  66. * Many functions are provided to deal with those wide character strings,
  67. * such as wcslen (strlen equivalent) or wprintf (printf equivalent).
  68. *
  69. * This may lead to some confusion, since many of these functions exist
  70. * on both Windows and Linux, but they are *not* the same!
  71. *
  72. * This sample hello world is a good example:
  73. *
  74. * #include <wchar.h>
  75. *
  76. * wchar_t hello[] = L"Hello, World!\n";
  77. *
  78. * int main(int argc, char** argv)
  79. * {
  80. * wprintf(hello);
  81. * wprintf(L"sizeof(wchar_t): %d\n", sizeof(wchar_t));
  82. * return 0;
  83. * }
  84. *
  85. * There is a reason why the sample prints the size of the wchar_t type:
  86. * On Windows, wchar_t is two bytes (UTF-16), while on most other systems
  87. * it is 4 bytes (UTF-32). This means that if you write code on Windows,
  88. * use L"" to define a string which is meant to be UTF-16 and not UTF-32,
  89. * you will have a little surprise when trying to port your code to Linux.
  90. *
  91. * Since the Windows API uses UTF-16, not UTF-32, WinPR defines the WCHAR
  92. * type to always be 2-bytes long and uses it instead of wchar_t. Do not
  93. * ever use wchar_t with WinPR unless you know what you are doing.
  94. *
  95. * As for L"", it is unfortunately unusable in a portable way, unless a
  96. * special option is passed to GCC to define wchar_t as being two bytes.
  97. * For string constants that must be UTF-16, it is a pain, but they can
  98. * be defined in a portable way like this:
  99. *
  100. * WCHAR hello[] = { 'H','e','l','l','o','\0' };
  101. *
  102. * Such strings cannot be passed to native functions like wcslen(), which
  103. * may expect a different wchar_t size. For this reason, WinPR provides
  104. * _wcslen, which expects UTF-16 WCHAR strings on all platforms.
  105. *
  106. */
  107. /*
  108. * Conversion to Unicode (UTF-16)
  109. * MultiByteToWideChar: http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072/
  110. *
  111. * cbMultiByte is an input size in bytes (BYTE)
  112. * cchWideChar is an output size in wide characters (WCHAR)
  113. *
  114. * Null-terminated UTF-8 strings:
  115. *
  116. * cchWideChar *cannot* be assumed to be cbMultiByte since UTF-8 is variable-width!
  117. *
  118. * Instead, obtain the required cchWideChar output size like this:
  119. * cchWideChar = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR) lpMultiByteStr, -1, NULL, 0);
  120. *
  121. * A value of -1 for cbMultiByte indicates that the input string is null-terminated,
  122. * and the null terminator *will* be processed. The size returned by MultiByteToWideChar
  123. * will therefore include the null terminator. Equivalent behavior can be obtained by
  124. * computing the length in bytes of the input buffer, including the null terminator:
  125. *
  126. * cbMultiByte = strlen((char*) lpMultiByteStr) + 1;
  127. *
  128. * An output buffer of the proper size can then be allocated:
  129. *
  130. * lpWideCharStr = (LPWSTR) malloc(cchWideChar * sizeof(WCHAR));
  131. *
  132. * Since cchWideChar is an output size in wide characters, the actual buffer size is:
  133. * (cchWideChar * sizeof(WCHAR)) or (cchWideChar * 2)
  134. *
  135. * Finally, perform the conversion:
  136. *
  137. * cchWideChar = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR) lpMultiByteStr, -1, lpWideCharStr,
  138. * cchWideChar);
  139. *
  140. * The value returned by MultiByteToWideChar corresponds to the number of wide characters written
  141. * to the output buffer, and should match the value obtained on the first call to
  142. * MultiByteToWideChar.
  143. *
  144. */
  145. int MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
  146. LPWSTR lpWideCharStr, int cchWideChar)
  147. {
  148. LPWSTR targetStart;
  149. #if !defined(WITH_ICU)
  150. const BYTE* sourceStart;
  151. int length;
  152. ConversionResult result;
  153. #endif
  154. /* If cbMultiByte is 0, the function fails */
  155. if ((cbMultiByte == 0) || (cbMultiByte < -1))
  156. return 0;
  157. /* If cbMultiByte is -1, the string is null-terminated */
  158. if (cbMultiByte == -1)
  159. {
  160. size_t len = lpMultiByteStr ? strnlen((const char*)lpMultiByteStr, INT32_MAX) : INT32_MAX;
  161. if (len >= INT32_MAX)
  162. return 0;
  163. cbMultiByte = (int)len + 1;
  164. }
  165. /*
  166. * if cchWideChar is 0, the function returns the required buffer size
  167. * in characters for lpWideCharStr and makes no use of the output parameter itself.
  168. */
  169. #if defined(WITH_ICU)
  170. {
  171. UErrorCode error;
  172. int32_t targetLength;
  173. int32_t targetCapacity;
  174. switch (CodePage)
  175. {
  176. case CP_ACP:
  177. case CP_UTF8:
  178. break;
  179. default:
  180. WLog_ERR(TAG, "Unsupported encoding %u", CodePage);
  181. return 0;
  182. }
  183. targetStart = lpWideCharStr;
  184. targetCapacity = cchWideChar;
  185. error = U_ZERO_ERROR;
  186. if (cchWideChar == 0)
  187. {
  188. u_strFromUTF8(NULL, 0, &targetLength, lpMultiByteStr, cbMultiByte, &error);
  189. cchWideChar = targetLength;
  190. }
  191. else
  192. {
  193. u_strFromUTF8(targetStart, targetCapacity, &targetLength, lpMultiByteStr, cbMultiByte,
  194. &error);
  195. cchWideChar = U_SUCCESS(error) ? targetLength : 0;
  196. }
  197. }
  198. #else
  199. if (cchWideChar == 0)
  200. {
  201. sourceStart = (const BYTE*)lpMultiByteStr;
  202. targetStart = (WCHAR*)NULL;
  203. result = ConvertUTF8toUTF16(&sourceStart, &sourceStart[cbMultiByte], &targetStart, NULL,
  204. strictConversion);
  205. length = targetStart - ((WCHAR*)NULL);
  206. }
  207. else
  208. {
  209. sourceStart = (const BYTE*)lpMultiByteStr;
  210. targetStart = lpWideCharStr;
  211. result = ConvertUTF8toUTF16(&sourceStart, &sourceStart[cbMultiByte], &targetStart,
  212. &targetStart[cchWideChar], strictConversion);
  213. length = targetStart - ((WCHAR*)lpWideCharStr);
  214. }
  215. cchWideChar = (result == conversionOK) ? length : 0;
  216. #endif
  217. return cchWideChar;
  218. }
  219. /*
  220. * Conversion from Unicode (UTF-16)
  221. * WideCharToMultiByte: http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130/
  222. *
  223. * cchWideChar is an input size in wide characters (WCHAR)
  224. * cbMultiByte is an output size in bytes (BYTE)
  225. *
  226. * Null-terminated UTF-16 strings:
  227. *
  228. * cbMultiByte *cannot* be assumed to be cchWideChar since UTF-8 is variable-width!
  229. *
  230. * Instead, obtain the required cbMultiByte output size like this:
  231. * cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) lpWideCharStr, -1, NULL, 0, NULL, NULL);
  232. *
  233. * A value of -1 for cchWideChar indicates that the input string is null-terminated,
  234. * and the null terminator *will* be processed. The size returned by WideCharToMultiByte
  235. * will therefore include the null terminator. Equivalent behavior can be obtained by
  236. * computing the length in wide characters of the input buffer, including the null terminator:
  237. *
  238. * cchWideChar = _wcslen((WCHAR*) lpWideCharStr) + 1;
  239. *
  240. * An output buffer of the proper size can then be allocated:
  241. * lpMultiByteStr = (LPSTR) malloc(cbMultiByte);
  242. *
  243. * Since cbMultiByte is an output size in bytes, it is the same as the buffer size
  244. *
  245. * Finally, perform the conversion:
  246. *
  247. * cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) lpWideCharStr, -1, lpMultiByteStr,
  248. * cbMultiByte, NULL, NULL);
  249. *
  250. * The value returned by WideCharToMultiByte corresponds to the number of bytes written
  251. * to the output buffer, and should match the value obtained on the first call to
  252. * WideCharToMultiByte.
  253. *
  254. */
  255. int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
  256. LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
  257. LPBOOL lpUsedDefaultChar)
  258. {
  259. #if !defined(WITH_ICU)
  260. int length;
  261. const WCHAR* sourceStart;
  262. ConversionResult result;
  263. BYTE* targetStart;
  264. #else
  265. char* targetStart;
  266. #endif
  267. /* If cchWideChar is 0, the function fails */
  268. if ((cchWideChar == 0) || (cchWideChar < -1))
  269. return 0;
  270. /* If cchWideChar is -1, the string is null-terminated */
  271. if (cchWideChar == -1)
  272. {
  273. size_t len = lpWideCharStr ? _wcslen(lpWideCharStr) : INT32_MAX;
  274. if (len >= INT32_MAX)
  275. return 0;
  276. cchWideChar = (int)len + 1;
  277. }
  278. /*
  279. * if cbMultiByte is 0, the function returns the required buffer size
  280. * in bytes for lpMultiByteStr and makes no use of the output parameter itself.
  281. */
  282. #if defined(WITH_ICU)
  283. {
  284. UErrorCode error;
  285. int32_t targetLength;
  286. int32_t targetCapacity;
  287. switch (CodePage)
  288. {
  289. case CP_ACP:
  290. case CP_UTF8:
  291. break;
  292. default:
  293. WLog_ERR(TAG, "Unsupported encoding %u", CodePage);
  294. return 0;
  295. }
  296. targetStart = lpMultiByteStr;
  297. targetCapacity = cbMultiByte;
  298. error = U_ZERO_ERROR;
  299. if (cbMultiByte == 0)
  300. {
  301. u_strToUTF8(NULL, 0, &targetLength, lpWideCharStr, cchWideChar, &error);
  302. cbMultiByte = targetLength;
  303. }
  304. else
  305. {
  306. u_strToUTF8(targetStart, targetCapacity, &targetLength, lpWideCharStr, cchWideChar,
  307. &error);
  308. cbMultiByte = U_SUCCESS(error) ? targetLength : 0;
  309. }
  310. }
  311. #else
  312. if (cbMultiByte == 0)
  313. {
  314. sourceStart = (WCHAR*)lpWideCharStr;
  315. targetStart = (BYTE*)NULL;
  316. result = ConvertUTF16toUTF8(&sourceStart, &sourceStart[cchWideChar], &targetStart, NULL,
  317. strictConversion);
  318. length = targetStart - ((BYTE*)NULL);
  319. }
  320. else
  321. {
  322. sourceStart = (WCHAR*)lpWideCharStr;
  323. targetStart = (BYTE*)lpMultiByteStr;
  324. result = ConvertUTF16toUTF8(&sourceStart, &sourceStart[cchWideChar], &targetStart,
  325. &targetStart[cbMultiByte], strictConversion);
  326. length = targetStart - ((BYTE*)lpMultiByteStr);
  327. }
  328. cbMultiByte = (result == conversionOK) ? length : 0;
  329. #endif
  330. return cbMultiByte;
  331. }
  332. #endif
  333. /**
  334. * ConvertToUnicode is a convenience wrapper for MultiByteToWideChar:
  335. *
  336. * If the lpWideCharStr parameter for the converted string points to NULL
  337. * or if the cchWideChar parameter is set to 0 this function will automatically
  338. * allocate the required memory which is guaranteed to be null-terminated
  339. * after the conversion, even if the source c string isn't.
  340. *
  341. * If the cbMultiByte parameter is set to -1 the passed lpMultiByteStr must
  342. * be null-terminated and the required length for the converted string will be
  343. * calculated accordingly.
  344. */
  345. int ConvertToUnicode(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
  346. LPWSTR* lpWideCharStr, int cchWideChar)
  347. {
  348. int status;
  349. BOOL allocate = FALSE;
  350. if (!lpMultiByteStr)
  351. return 0;
  352. if (!lpWideCharStr)
  353. return 0;
  354. if (cbMultiByte == -1)
  355. {
  356. size_t len = strnlen(lpMultiByteStr, INT_MAX);
  357. if (len >= INT_MAX)
  358. return 0;
  359. cbMultiByte = (int)(len + 1);
  360. }
  361. if (cchWideChar == 0)
  362. {
  363. cchWideChar = MultiByteToWideChar(CodePage, dwFlags, lpMultiByteStr, cbMultiByte, NULL, 0);
  364. allocate = TRUE;
  365. }
  366. if (cchWideChar < 1)
  367. return 0;
  368. if (!(*lpWideCharStr))
  369. allocate = TRUE;
  370. if (allocate)
  371. {
  372. *lpWideCharStr = (LPWSTR)calloc(cchWideChar + 1, sizeof(WCHAR));
  373. if (!(*lpWideCharStr))
  374. {
  375. return 0;
  376. }
  377. }
  378. status = MultiByteToWideChar(CodePage, dwFlags, lpMultiByteStr, cbMultiByte, *lpWideCharStr,
  379. cchWideChar);
  380. if (status != cchWideChar)
  381. {
  382. if (allocate)
  383. {
  384. free(*lpWideCharStr);
  385. *lpWideCharStr = NULL;
  386. }
  387. status = 0;
  388. }
  389. return status;
  390. }
  391. /**
  392. * ConvertFromUnicode is a convenience wrapper for WideCharToMultiByte:
  393. *
  394. * If the lpMultiByteStr parameter for the converted string points to NULL
  395. * or if the cbMultiByte parameter is set to 0 this function will automatically
  396. * allocate the required memory which is guaranteed to be null-terminated
  397. * after the conversion, even if the source unicode string isn't.
  398. *
  399. * If the cchWideChar parameter is set to -1 the passed lpWideCharStr must
  400. * be null-terminated and the required length for the converted string will be
  401. * calculated accordingly.
  402. */
  403. int ConvertFromUnicode(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
  404. LPSTR* lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
  405. LPBOOL lpUsedDefaultChar)
  406. {
  407. int status;
  408. BOOL allocate = FALSE;
  409. if (!lpWideCharStr)
  410. return 0;
  411. if (!lpMultiByteStr)
  412. return 0;
  413. if (cchWideChar == -1)
  414. cchWideChar = (int)(_wcslen(lpWideCharStr) + 1);
  415. if (cbMultiByte == 0)
  416. {
  417. cbMultiByte =
  418. WideCharToMultiByte(CodePage, dwFlags, lpWideCharStr, cchWideChar, NULL, 0, NULL, NULL);
  419. allocate = TRUE;
  420. }
  421. if (cbMultiByte < 1)
  422. return 0;
  423. if (!(*lpMultiByteStr))
  424. allocate = TRUE;
  425. if (allocate)
  426. {
  427. *lpMultiByteStr = (LPSTR)calloc(1, cbMultiByte + 1);
  428. if (!(*lpMultiByteStr))
  429. {
  430. return 0;
  431. }
  432. }
  433. status = WideCharToMultiByte(CodePage, dwFlags, lpWideCharStr, cchWideChar, *lpMultiByteStr,
  434. cbMultiByte, lpDefaultChar, lpUsedDefaultChar);
  435. if ((status != cbMultiByte) && allocate)
  436. {
  437. status = 0;
  438. }
  439. if ((status <= 0) && allocate)
  440. {
  441. free(*lpMultiByteStr);
  442. *lpMultiByteStr = NULL;
  443. }
  444. return status;
  445. }
  446. /**
  447. * Swap Unicode byte order (UTF16LE <-> UTF16BE)
  448. */
  449. void ByteSwapUnicode(WCHAR* wstr, int length)
  450. {
  451. WCHAR* end = &wstr[length];
  452. while (wstr < end)
  453. {
  454. *wstr = _byteswap_ushort(*wstr);
  455. wstr++;
  456. }
  457. }