Commit | Line | Data |
---|---|---|
8690e634 | 1 | /* Convert multibyte character to wide character. |
c0c3707f | 2 | Copyright (C) 1999-2002, 2005-2019 Free Software Foundation, Inc. |
8690e634 JK |
3 | Written by Bruno Haible <bruno@clisp.org>, 2008. |
4 | ||
5 | This program is free software: you can redistribute it and/or modify | |
6 | it under the terms of the GNU General Public License as published by | |
7 | the Free Software Foundation; either version 3 of the License, or | |
8 | (at your option) any later version. | |
9 | ||
10 | This program is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU General Public License | |
c0c3707f | 16 | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
8690e634 JK |
17 | |
18 | #include <config.h> | |
19 | ||
20 | /* Specification. */ | |
21 | #include <wchar.h> | |
22 | ||
49e4877c PA |
23 | #if C_LOCALE_MAYBE_EILSEQ |
24 | # include "hard-locale.h" | |
25 | # include <locale.h> | |
26 | #endif | |
27 | ||
8690e634 JK |
28 | #if GNULIB_defined_mbstate_t |
29 | /* Implement mbrtowc() on top of mbtowc(). */ | |
30 | ||
31 | # include <errno.h> | |
32 | # include <stdlib.h> | |
33 | ||
34 | # include "localcharset.h" | |
35 | # include "streq.h" | |
36 | # include "verify.h" | |
c0c3707f CB |
37 | # include "glthread/lock.h" |
38 | ||
39 | # ifndef FALLTHROUGH | |
40 | # if __GNUC__ < 7 | |
41 | # define FALLTHROUGH ((void) 0) | |
42 | # else | |
43 | # define FALLTHROUGH __attribute__ ((__fallthrough__)) | |
44 | # endif | |
45 | # endif | |
46 | ||
/* Classification of the character encoding of the current locale.
   Each encoding that needs dedicated handling has its own value; every
   other encoding is enc_other.  All values are non-negative, so a negative
   integer can serve as a "not yet determined" marker.  */
typedef enum
{
  enc_other,      /* other */
  enc_utf8,       /* UTF-8 */
  enc_eucjp,      /* EUC-JP */
  enc_94,         /* EUC-KR, GB2312, BIG5 */
  enc_euctw,      /* EUC-TW */
  enc_gb18030,    /* GB18030 */
  enc_sjis        /* SJIS */
} enc_t;
58 | static inline enc_t | |
59 | locale_enc (void) | |
60 | { | |
61 | const char *encoding = locale_charset (); | |
62 | if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) | |
63 | return enc_utf8; | |
64 | if (STREQ_OPT (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)) | |
65 | return enc_eucjp; | |
66 | if (STREQ_OPT (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) | |
67 | || STREQ_OPT (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) | |
68 | || STREQ_OPT (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)) | |
69 | return enc_94; | |
70 | if (STREQ_OPT (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)) | |
71 | return enc_euctw; | |
72 | if (STREQ_OPT (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) | |
73 | return enc_gb18030; | |
74 | if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0)) | |
75 | return enc_sjis; | |
76 | return enc_other; | |
77 | } | |
78 | ||
# if GNULIB_WCHAR_SINGLE
/* When we know that the locale does not change, provide a speedup by
   caching the value of locale_enc.  A negative value means that the
   classification has not been computed yet.  */
static int cached_locale_enc = -1;

/* Return the classification of the locale's encoding, calling locale_enc ()
   only the first time.  */
static inline enc_t
locale_enc_cached (void)
{
  if (cached_locale_enc < 0)
    cached_locale_enc = locale_enc ();
  return cached_locale_enc;
}
# else
/* By default, don't make assumptions, hence no caching.  */
#  define locale_enc_cached locale_enc
# endif
8690e634 | 94 | |
c0c3707f CB |
95 | /* This lock protects the internal state of mbtowc against multiple simultaneous |
96 | calls of mbrtowc. */ | |
97 | gl_lock_define_initialized(static, mbtowc_lock) | |
8690e634 JK |
98 | |
99 | verify (sizeof (mbstate_t) >= 4); | |
100 | ||
101 | static char internal_state[4]; | |
102 | ||
103 | size_t | |
104 | mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) | |
105 | { | |
106 | char *pstate = (char *)ps; | |
107 | ||
108 | if (s == NULL) | |
109 | { | |
110 | pwc = NULL; | |
111 | s = ""; | |
112 | n = 1; | |
113 | } | |
114 | ||
115 | if (n == 0) | |
116 | return (size_t)(-2); | |
117 | ||
118 | /* Here n > 0. */ | |
119 | ||
120 | if (pstate == NULL) | |
121 | pstate = internal_state; | |
122 | ||
123 | { | |
124 | size_t nstate = pstate[0]; | |
125 | char buf[4]; | |
126 | const char *p; | |
127 | size_t m; | |
c0c3707f CB |
128 | enc_t enc; |
129 | int res; | |
8690e634 JK |
130 | |
131 | switch (nstate) | |
132 | { | |
133 | case 0: | |
134 | p = s; | |
135 | m = n; | |
136 | break; | |
137 | case 3: | |
138 | buf[2] = pstate[3]; | |
c0c3707f | 139 | FALLTHROUGH; |
8690e634 JK |
140 | case 2: |
141 | buf[1] = pstate[2]; | |
c0c3707f | 142 | FALLTHROUGH; |
8690e634 JK |
143 | case 1: |
144 | buf[0] = pstate[1]; | |
145 | p = buf; | |
146 | m = nstate; | |
147 | buf[m++] = s[0]; | |
148 | if (n >= 2 && m < 4) | |
149 | { | |
150 | buf[m++] = s[1]; | |
151 | if (n >= 3 && m < 4) | |
152 | buf[m++] = s[2]; | |
153 | } | |
154 | break; | |
155 | default: | |
156 | errno = EINVAL; | |
157 | return (size_t)(-1); | |
158 | } | |
159 | ||
160 | /* Here m > 0. */ | |
161 | ||
c0c3707f | 162 | enc = locale_enc_cached (); |
8690e634 | 163 | |
c0c3707f | 164 | if (enc == enc_utf8) /* UTF-8 */ |
5e8754f9 | 165 | { |
c0c3707f CB |
166 | /* Achieve multi-thread safety by not calling mbtowc() at all. */ |
167 | /* Cf. unistr/u8-mbtouc.c. */ | |
168 | unsigned char c = (unsigned char) p[0]; | |
5e8754f9 | 169 | |
c0c3707f | 170 | if (c < 0x80) |
8690e634 | 171 | { |
c0c3707f CB |
172 | if (pwc != NULL) |
173 | *pwc = c; | |
174 | res = (c == 0 ? 0 : 1); | |
175 | goto success; | |
8690e634 | 176 | } |
c0c3707f | 177 | if (c >= 0xc2) |
8690e634 | 178 | { |
c0c3707f | 179 | if (c < 0xe0) |
8690e634 | 180 | { |
c0c3707f | 181 | if (m == 1) |
8690e634 | 182 | goto incomplete; |
c0c3707f | 183 | else /* m >= 2 */ |
8690e634 JK |
184 | { |
185 | unsigned char c2 = (unsigned char) p[1]; | |
186 | ||
c0c3707f CB |
187 | if ((c2 ^ 0x80) < 0x40) |
188 | { | |
189 | if (pwc != NULL) | |
190 | *pwc = ((unsigned int) (c & 0x1f) << 6) | |
191 | | (unsigned int) (c2 ^ 0x80); | |
192 | res = 2; | |
193 | goto success; | |
194 | } | |
8690e634 JK |
195 | } |
196 | } | |
c0c3707f | 197 | else if (c < 0xf0) |
8690e634 | 198 | { |
c0c3707f | 199 | if (m == 1) |
8690e634 | 200 | goto incomplete; |
c0c3707f CB |
201 | else |
202 | { | |
203 | unsigned char c2 = (unsigned char) p[1]; | |
8690e634 | 204 | |
c0c3707f CB |
205 | if ((c2 ^ 0x80) < 0x40 |
206 | && (c >= 0xe1 || c2 >= 0xa0) | |
207 | && (c != 0xed || c2 < 0xa0)) | |
208 | { | |
209 | if (m == 2) | |
210 | goto incomplete; | |
211 | else /* m >= 3 */ | |
212 | { | |
213 | unsigned char c3 = (unsigned char) p[2]; | |
8690e634 | 214 | |
c0c3707f CB |
215 | if ((c3 ^ 0x80) < 0x40) |
216 | { | |
217 | if (pwc != NULL) | |
218 | *pwc = ((unsigned int) (c & 0x0f) << 12) | |
219 | | ((unsigned int) (c2 ^ 0x80) << 6) | |
220 | | (unsigned int) (c3 ^ 0x80); | |
221 | res = 3; | |
222 | goto success; | |
223 | } | |
224 | } | |
225 | } | |
226 | } | |
8690e634 | 227 | } |
c0c3707f | 228 | else if (c <= 0xf4) |
8690e634 | 229 | { |
c0c3707f | 230 | if (m == 1) |
8690e634 | 231 | goto incomplete; |
c0c3707f | 232 | else |
8690e634 JK |
233 | { |
234 | unsigned char c2 = (unsigned char) p[1]; | |
235 | ||
c0c3707f CB |
236 | if ((c2 ^ 0x80) < 0x40 |
237 | && (c >= 0xf1 || c2 >= 0x90) | |
238 | && (c < 0xf4 || (c == 0xf4 && c2 < 0x90))) | |
8690e634 JK |
239 | { |
240 | if (m == 2) | |
241 | goto incomplete; | |
c0c3707f | 242 | else |
8690e634 JK |
243 | { |
244 | unsigned char c3 = (unsigned char) p[2]; | |
245 | ||
c0c3707f CB |
246 | if ((c3 ^ 0x80) < 0x40) |
247 | { | |
248 | if (m == 3) | |
249 | goto incomplete; | |
250 | else /* m >= 4 */ | |
251 | { | |
252 | unsigned char c4 = (unsigned char) p[3]; | |
253 | ||
254 | if ((c4 ^ 0x80) < 0x40) | |
255 | { | |
256 | if (pwc != NULL) | |
257 | *pwc = ((unsigned int) (c & 0x07) << 18) | |
258 | | ((unsigned int) (c2 ^ 0x80) << 12) | |
259 | | ((unsigned int) (c3 ^ 0x80) << 6) | |
260 | | (unsigned int) (c4 ^ 0x80); | |
261 | res = 4; | |
262 | goto success; | |
263 | } | |
264 | } | |
265 | } | |
8690e634 JK |
266 | } |
267 | } | |
268 | } | |
269 | } | |
8690e634 | 270 | } |
c0c3707f CB |
271 | goto invalid; |
272 | } | |
273 | else | |
274 | { | |
275 | /* The hidden internal state of mbtowc would make this function not | |
276 | multi-thread safe. Achieve multi-thread safety through a lock. */ | |
277 | gl_lock_lock (mbtowc_lock); | |
8690e634 | 278 | |
c0c3707f CB |
279 | /* Put the hidden internal state of mbtowc into its initial state. |
280 | This is needed at least with glibc, uClibc, and MSVC CRT. | |
281 | See <https://sourceware.org/bugzilla/show_bug.cgi?id=9674>. */ | |
282 | mbtowc (NULL, NULL, 0); | |
8690e634 | 283 | |
c0c3707f | 284 | res = mbtowc (pwc, p, m); |
8690e634 | 285 | |
c0c3707f CB |
286 | gl_lock_unlock (mbtowc_lock); |
287 | ||
288 | if (res >= 0) | |
289 | { | |
290 | if (pwc != NULL && ((*pwc == 0) != (res == 0))) | |
291 | abort (); | |
292 | goto success; | |
293 | } | |
294 | ||
295 | /* mbtowc does not distinguish between invalid and incomplete multibyte | |
296 | sequences. But mbrtowc needs to make this distinction. | |
297 | There are two possible approaches: | |
298 | - Use iconv() and its return value. | |
299 | - Use built-in knowledge about the possible encodings. | |
300 | Given the low quality of implementation of iconv() on the systems | |
301 | that lack mbrtowc(), we use the second approach. | |
302 | The possible encodings are: | |
303 | - 8-bit encodings, | |
304 | - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, | |
305 | - UTF-8 (already handled above). | |
306 | Use specialized code for each. */ | |
307 | if (m >= 4 || m >= MB_CUR_MAX) | |
308 | goto invalid; | |
309 | /* Here MB_CUR_MAX > 1 and 0 < m < 4. */ | |
310 | switch (enc) | |
8690e634 | 311 | { |
c0c3707f CB |
312 | /* As a reference for this code, you can use the GNU libiconv |
313 | implementation. Look for uses of the RET_TOOFEW macro. */ | |
314 | ||
315 | case enc_eucjp: /* EUC-JP */ | |
316 | { | |
317 | if (m == 1) | |
318 | { | |
319 | unsigned char c = (unsigned char) p[0]; | |
320 | ||
321 | if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f) | |
322 | goto incomplete; | |
323 | } | |
324 | if (m == 2) | |
325 | { | |
326 | unsigned char c = (unsigned char) p[0]; | |
327 | ||
328 | if (c == 0x8f) | |
329 | { | |
330 | unsigned char c2 = (unsigned char) p[1]; | |
331 | ||
332 | if (c2 >= 0xa1 && c2 < 0xff) | |
333 | goto incomplete; | |
334 | } | |
335 | } | |
336 | goto invalid; | |
337 | } | |
338 | ||
339 | case enc_94: /* EUC-KR, GB2312, BIG5 */ | |
340 | { | |
341 | if (m == 1) | |
342 | { | |
343 | unsigned char c = (unsigned char) p[0]; | |
344 | ||
345 | if (c >= 0xa1 && c < 0xff) | |
346 | goto incomplete; | |
347 | } | |
348 | goto invalid; | |
349 | } | |
350 | ||
351 | case enc_euctw: /* EUC-TW */ | |
352 | { | |
353 | if (m == 1) | |
354 | { | |
355 | unsigned char c = (unsigned char) p[0]; | |
356 | ||
357 | if ((c >= 0xa1 && c < 0xff) || c == 0x8e) | |
358 | goto incomplete; | |
359 | } | |
360 | else /* m == 2 || m == 3 */ | |
361 | { | |
362 | unsigned char c = (unsigned char) p[0]; | |
363 | ||
364 | if (c == 0x8e) | |
365 | goto incomplete; | |
366 | } | |
367 | goto invalid; | |
368 | } | |
369 | ||
370 | case enc_gb18030: /* GB18030 */ | |
371 | { | |
372 | if (m == 1) | |
373 | { | |
374 | unsigned char c = (unsigned char) p[0]; | |
375 | ||
376 | if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe)) | |
377 | goto incomplete; | |
378 | } | |
379 | else /* m == 2 || m == 3 */ | |
380 | { | |
381 | unsigned char c = (unsigned char) p[0]; | |
382 | ||
383 | if (c >= 0x90 && c <= 0xe3) | |
384 | { | |
385 | unsigned char c2 = (unsigned char) p[1]; | |
386 | ||
387 | if (c2 >= 0x30 && c2 <= 0x39) | |
388 | { | |
389 | if (m == 2) | |
390 | goto incomplete; | |
391 | else /* m == 3 */ | |
392 | { | |
393 | unsigned char c3 = (unsigned char) p[2]; | |
394 | ||
395 | if (c3 >= 0x81 && c3 <= 0xfe) | |
396 | goto incomplete; | |
397 | } | |
398 | } | |
399 | } | |
400 | } | |
401 | goto invalid; | |
402 | } | |
403 | ||
404 | case enc_sjis: /* SJIS */ | |
405 | { | |
406 | if (m == 1) | |
407 | { | |
408 | unsigned char c = (unsigned char) p[0]; | |
409 | ||
410 | if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea) | |
411 | || (c >= 0xf0 && c <= 0xf9)) | |
412 | goto incomplete; | |
413 | } | |
414 | goto invalid; | |
415 | } | |
416 | ||
417 | default: | |
418 | /* An unknown multibyte encoding. */ | |
419 | goto incomplete; | |
8690e634 | 420 | } |
8690e634 | 421 | } |
8690e634 | 422 | |
c0c3707f CB |
423 | success: |
424 | /* res >= 0 is the corrected return value of mbtowc (pwc, p, m). */ | |
425 | if (nstate >= (res > 0 ? res : 1)) | |
426 | abort (); | |
427 | res -= nstate; | |
428 | pstate[0] = 0; | |
429 | return res; | |
430 | ||
431 | incomplete: | |
432 | { | |
433 | size_t k = nstate; | |
434 | /* Here 0 <= k < m < 4. */ | |
435 | pstate[++k] = s[0]; | |
436 | if (k < m) | |
437 | { | |
438 | pstate[++k] = s[1]; | |
439 | if (k < m) | |
440 | pstate[++k] = s[2]; | |
441 | } | |
442 | if (k != m) | |
443 | abort (); | |
8690e634 | 444 | } |
c0c3707f CB |
445 | pstate[0] = m; |
446 | return (size_t)(-2); | |
447 | ||
448 | invalid: | |
449 | errno = EILSEQ; | |
450 | /* The conversion state is undefined, says POSIX. */ | |
451 | return (size_t)(-1); | |
8690e634 JK |
452 | } |
453 | } | |
454 | ||
455 | #else | |
456 | /* Override the system's mbrtowc() function. */ | |
457 | ||
458 | # undef mbrtowc | |
459 | ||
/* Wrapper around the system's mbrtowc() that applies the workarounds
   enabled by the MBRTOWC_*_BUG and C_LOCALE_MAYBE_EILSEQ macros.
   Arguments and return value are those of mbrtowc().  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  /* A NULL S is equivalent to the call mbrtowc (NULL, "", 1, ps).  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* Always give the system function somewhere to store the result, so that
     the checks below can inspect it.  */
  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A successfully converted NUL character must yield 0.  */
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* If the conversion failed or was incomplete and the locale is not a
     "hard" one, treat the first byte as a single-byte character.  */
  if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
533 | ||
534 | #endif |