Commit | Line | Data |
---|---|---|
8690e634 | 1 | /* Convert multibyte character to wide character. |
7a6dbc2f | 2 | Copyright (C) 1999-2002, 2005-2018 Free Software Foundation, Inc. |
8690e634 JK |
3 | Written by Bruno Haible <bruno@clisp.org>, 2008. |
4 | ||
5 | This program is free software: you can redistribute it and/or modify | |
6 | it under the terms of the GNU General Public License as published by | |
7 | the Free Software Foundation; either version 3 of the License, or | |
8 | (at your option) any later version. | |
9 | ||
10 | This program is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU General Public License | |
7a6dbc2f | 16 | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
8690e634 JK |
17 | |
18 | #include <config.h> | |
19 | ||
20 | /* Specification. */ | |
21 | #include <wchar.h> | |
22 | ||
49e4877c PA |
23 | #if C_LOCALE_MAYBE_EILSEQ |
24 | # include "hard-locale.h" | |
25 | # include <locale.h> | |
26 | #endif | |
27 | ||
8690e634 JK |
28 | #if GNULIB_defined_mbstate_t |
29 | /* Implement mbrtowc() on top of mbtowc(). */ | |
30 | ||
31 | # include <errno.h> | |
32 | # include <stdlib.h> | |
33 | ||
34 | # include "localcharset.h" | |
35 | # include "streq.h" | |
36 | # include "verify.h" | |
37 | ||
7a6dbc2f SDJ |
/* FALLTHROUGH marks an intentional fall-through between two 'case' labels.
   With a GNU C version of 7 or newer it expands to the fallthrough statement
   attribute; otherwise it expands to a no-op expression.  A definition
   provided earlier (e.g. by config.h) is left untouched.  */
# ifndef FALLTHROUGH
#  if 7 <= __GNUC__
#   define FALLTHROUGH __attribute__ ((__fallthrough__))
#  else
#   define FALLTHROUGH ((void) 0)
#  endif
# endif
45 | ||
/* Classes of locale encodings that mbrtowc() treats specially.  */
typedef enum {
  enc_other,      /* other */
  enc_utf8,       /* UTF-8 */
  enc_eucjp,      /* EUC-JP */
  enc_94,         /* EUC-KR, GB2312, BIG5 */
  enc_euctw,      /* EUC-TW */
  enc_gb18030,    /* GB18030 */
  enc_sjis        /* SJIS */
} enc_t;

/* Returns the class of the current locale's encoding, determined by
   comparing the name returned by locale_charset () against the known
   encoding names.  Any name that is not recognized yields enc_other.  */
static inline enc_t
locale_enc (void)
{
  const char *charset = locale_charset ();
  enc_t kind;

  if (STREQ_OPT (charset, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
    kind = enc_utf8;
  else if (STREQ_OPT (charset, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
    kind = enc_eucjp;
  else if (STREQ_OPT (charset, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
           || STREQ_OPT (charset, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
           || STREQ_OPT (charset, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
    kind = enc_94;
  else if (STREQ_OPT (charset, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
    kind = enc_euctw;
  else if (STREQ_OPT (charset, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
    kind = enc_gb18030;
  else if (STREQ_OPT (charset, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
    kind = enc_sjis;
  else
    kind = enc_other;

  return kind;
}
77 | ||
#if GNULIB_WCHAR_SINGLE
/* The locale is known not to change, so locale_enc () needs to be evaluated
   only once.  A negative value means "not determined yet".  */
static int cached_locale_enc = -1;

/* Returns the class of the locale encoding, calling locale_enc () on the
   first use and reusing the stored result afterwards.  */
static inline enc_t
locale_enc_cached (void)
{
  int enc = cached_locale_enc;

  if (enc < 0)
    {
      enc = locale_enc ();
      cached_locale_enc = enc;
    }
  return enc;
}
#else
/* No assumption about the locale is made by default: every use queries
   locale_enc () afresh.  */
# define locale_enc_cached locale_enc
#endif
8690e634 JK |
93 | |
94 | verify (sizeof (mbstate_t) >= 4); | |
95 | ||
96 | static char internal_state[4]; | |
97 | ||
98 | size_t | |
99 | mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) | |
100 | { | |
101 | char *pstate = (char *)ps; | |
102 | ||
103 | if (s == NULL) | |
104 | { | |
105 | pwc = NULL; | |
106 | s = ""; | |
107 | n = 1; | |
108 | } | |
109 | ||
110 | if (n == 0) | |
111 | return (size_t)(-2); | |
112 | ||
113 | /* Here n > 0. */ | |
114 | ||
115 | if (pstate == NULL) | |
116 | pstate = internal_state; | |
117 | ||
118 | { | |
119 | size_t nstate = pstate[0]; | |
120 | char buf[4]; | |
121 | const char *p; | |
122 | size_t m; | |
123 | ||
124 | switch (nstate) | |
125 | { | |
126 | case 0: | |
127 | p = s; | |
128 | m = n; | |
129 | break; | |
130 | case 3: | |
131 | buf[2] = pstate[3]; | |
7a6dbc2f | 132 | FALLTHROUGH; |
8690e634 JK |
133 | case 2: |
134 | buf[1] = pstate[2]; | |
7a6dbc2f | 135 | FALLTHROUGH; |
8690e634 JK |
136 | case 1: |
137 | buf[0] = pstate[1]; | |
138 | p = buf; | |
139 | m = nstate; | |
140 | buf[m++] = s[0]; | |
141 | if (n >= 2 && m < 4) | |
142 | { | |
143 | buf[m++] = s[1]; | |
144 | if (n >= 3 && m < 4) | |
145 | buf[m++] = s[2]; | |
146 | } | |
147 | break; | |
148 | default: | |
149 | errno = EINVAL; | |
150 | return (size_t)(-1); | |
151 | } | |
152 | ||
153 | /* Here m > 0. */ | |
154 | ||
155 | # if __GLIBC__ || defined __UCLIBC__ | |
7a6dbc2f | 156 | /* Work around bug <https://sourceware.org/bugzilla/show_bug.cgi?id=9674> */ |
8690e634 JK |
157 | mbtowc (NULL, NULL, 0); |
158 | # endif | |
159 | { | |
160 | int res = mbtowc (pwc, p, m); | |
161 | ||
162 | if (res >= 0) | |
163 | { | |
164 | if (pwc != NULL && ((*pwc == 0) != (res == 0))) | |
165 | abort (); | |
166 | if (nstate >= (res > 0 ? res : 1)) | |
167 | abort (); | |
168 | res -= nstate; | |
169 | pstate[0] = 0; | |
170 | return res; | |
171 | } | |
172 | ||
173 | /* mbtowc does not distinguish between invalid and incomplete multibyte | |
174 | sequences. But mbrtowc needs to make this distinction. | |
175 | There are two possible approaches: | |
176 | - Use iconv() and its return value. | |
177 | - Use built-in knowledge about the possible encodings. | |
178 | Given the low quality of implementation of iconv() on the systems that | |
179 | lack mbrtowc(), we use the second approach. | |
180 | The possible encodings are: | |
181 | - 8-bit encodings, | |
182 | - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, | |
183 | - UTF-8. | |
184 | Use specialized code for each. */ | |
185 | if (m >= 4 || m >= MB_CUR_MAX) | |
186 | goto invalid; | |
187 | /* Here MB_CUR_MAX > 1 and 0 < m < 4. */ | |
7a6dbc2f SDJ |
188 | switch (locale_enc_cached ()) |
189 | { | |
190 | case enc_utf8: /* UTF-8 */ | |
8690e634 JK |
191 | { |
192 | /* Cf. unistr/u8-mblen.c. */ | |
193 | unsigned char c = (unsigned char) p[0]; | |
194 | ||
195 | if (c >= 0xc2) | |
196 | { | |
197 | if (c < 0xe0) | |
198 | { | |
199 | if (m == 1) | |
200 | goto incomplete; | |
201 | } | |
202 | else if (c < 0xf0) | |
203 | { | |
204 | if (m == 1) | |
205 | goto incomplete; | |
206 | if (m == 2) | |
207 | { | |
208 | unsigned char c2 = (unsigned char) p[1]; | |
209 | ||
210 | if ((c2 ^ 0x80) < 0x40 | |
211 | && (c >= 0xe1 || c2 >= 0xa0) | |
212 | && (c != 0xed || c2 < 0xa0)) | |
213 | goto incomplete; | |
214 | } | |
215 | } | |
216 | else if (c <= 0xf4) | |
217 | { | |
218 | if (m == 1) | |
219 | goto incomplete; | |
220 | else /* m == 2 || m == 3 */ | |
221 | { | |
222 | unsigned char c2 = (unsigned char) p[1]; | |
223 | ||
224 | if ((c2 ^ 0x80) < 0x40 | |
225 | && (c >= 0xf1 || c2 >= 0x90) | |
226 | && (c < 0xf4 || (c == 0xf4 && c2 < 0x90))) | |
227 | { | |
228 | if (m == 2) | |
229 | goto incomplete; | |
230 | else /* m == 3 */ | |
231 | { | |
232 | unsigned char c3 = (unsigned char) p[2]; | |
233 | ||
234 | if ((c3 ^ 0x80) < 0x40) | |
235 | goto incomplete; | |
236 | } | |
237 | } | |
238 | } | |
239 | } | |
240 | } | |
241 | goto invalid; | |
242 | } | |
243 | ||
244 | /* As a reference for this code, you can use the GNU libiconv | |
245 | implementation. Look for uses of the RET_TOOFEW macro. */ | |
246 | ||
7a6dbc2f | 247 | case enc_eucjp: /* EUC-JP */ |
8690e634 JK |
248 | { |
249 | if (m == 1) | |
250 | { | |
251 | unsigned char c = (unsigned char) p[0]; | |
252 | ||
253 | if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f) | |
254 | goto incomplete; | |
255 | } | |
256 | if (m == 2) | |
257 | { | |
258 | unsigned char c = (unsigned char) p[0]; | |
259 | ||
260 | if (c == 0x8f) | |
261 | { | |
262 | unsigned char c2 = (unsigned char) p[1]; | |
263 | ||
264 | if (c2 >= 0xa1 && c2 < 0xff) | |
265 | goto incomplete; | |
266 | } | |
267 | } | |
268 | goto invalid; | |
269 | } | |
7a6dbc2f SDJ |
270 | |
271 | case enc_94: /* EUC-KR, GB2312, BIG5 */ | |
8690e634 JK |
272 | { |
273 | if (m == 1) | |
274 | { | |
275 | unsigned char c = (unsigned char) p[0]; | |
276 | ||
277 | if (c >= 0xa1 && c < 0xff) | |
278 | goto incomplete; | |
279 | } | |
280 | goto invalid; | |
281 | } | |
7a6dbc2f SDJ |
282 | |
283 | case enc_euctw: /* EUC-TW */ | |
8690e634 JK |
284 | { |
285 | if (m == 1) | |
286 | { | |
287 | unsigned char c = (unsigned char) p[0]; | |
288 | ||
289 | if ((c >= 0xa1 && c < 0xff) || c == 0x8e) | |
290 | goto incomplete; | |
291 | } | |
292 | else /* m == 2 || m == 3 */ | |
293 | { | |
294 | unsigned char c = (unsigned char) p[0]; | |
295 | ||
296 | if (c == 0x8e) | |
297 | goto incomplete; | |
298 | } | |
299 | goto invalid; | |
300 | } | |
7a6dbc2f SDJ |
301 | |
302 | case enc_gb18030: /* GB18030 */ | |
8690e634 JK |
303 | { |
304 | if (m == 1) | |
305 | { | |
306 | unsigned char c = (unsigned char) p[0]; | |
307 | ||
308 | if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe)) | |
309 | goto incomplete; | |
310 | } | |
311 | else /* m == 2 || m == 3 */ | |
312 | { | |
313 | unsigned char c = (unsigned char) p[0]; | |
314 | ||
315 | if (c >= 0x90 && c <= 0xe3) | |
316 | { | |
317 | unsigned char c2 = (unsigned char) p[1]; | |
318 | ||
319 | if (c2 >= 0x30 && c2 <= 0x39) | |
320 | { | |
321 | if (m == 2) | |
322 | goto incomplete; | |
323 | else /* m == 3 */ | |
324 | { | |
325 | unsigned char c3 = (unsigned char) p[2]; | |
326 | ||
327 | if (c3 >= 0x81 && c3 <= 0xfe) | |
328 | goto incomplete; | |
329 | } | |
330 | } | |
331 | } | |
332 | } | |
333 | goto invalid; | |
334 | } | |
7a6dbc2f SDJ |
335 | |
336 | case enc_sjis: /* SJIS */ | |
8690e634 JK |
337 | { |
338 | if (m == 1) | |
339 | { | |
340 | unsigned char c = (unsigned char) p[0]; | |
341 | ||
342 | if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea) | |
343 | || (c >= 0xf0 && c <= 0xf9)) | |
344 | goto incomplete; | |
345 | } | |
346 | goto invalid; | |
347 | } | |
348 | ||
7a6dbc2f SDJ |
349 | default: |
350 | /* An unknown multibyte encoding. */ | |
351 | goto incomplete; | |
352 | } | |
8690e634 JK |
353 | |
354 | incomplete: | |
355 | { | |
356 | size_t k = nstate; | |
357 | /* Here 0 <= k < m < 4. */ | |
358 | pstate[++k] = s[0]; | |
359 | if (k < m) | |
360 | { | |
361 | pstate[++k] = s[1]; | |
362 | if (k < m) | |
363 | pstate[++k] = s[2]; | |
364 | } | |
365 | if (k != m) | |
366 | abort (); | |
367 | } | |
368 | pstate[0] = m; | |
369 | return (size_t)(-2); | |
370 | ||
371 | invalid: | |
372 | errno = EILSEQ; | |
373 | /* The conversion state is undefined, says POSIX. */ | |
374 | return (size_t)(-1); | |
375 | } | |
376 | } | |
377 | } | |
378 | ||
379 | #else | |
380 | /* Override the system's mbrtowc() function. */ | |
381 | ||
382 | # undef mbrtowc | |
383 | ||
/* Replacement for the system's mbrtowc(), with workarounds selected by the
   MBRTOWC_*_BUG and C_LOCALE_MAYBE_EILSEQ configuration macros.
   Converts the multibyte character at S (at most N bytes) using conversion
   state PS, storing the wide character in *PWC when PWC is non-NULL.
   Returns what the system mbrtowc() returns, adjusted by whichever
   workarounds are compiled in:
     - MBRTOWC_NULL_ARG2_BUG, MBRTOWC_RETVAL_BUG, MBRTOWC_EMPTY_INPUT_BUG:
       a NULL S is turned into the equivalent call with S = "", N = 1.
     - MBRTOWC_EMPTY_INPUT_BUG: N == 0 yields (size_t) -2 directly.
     - MBRTOWC_RETVAL_BUG: when the state is not initial, the rest of the
       character is parsed byte for byte and the byte count is computed here.
     - MBRTOWC_NUL_RETVAL_BUG: a converted null wide character yields 0.
     - C_LOCALE_MAYBE_EILSEQ: when the LC_CTYPE locale is not a hard locale,
       an error result is replaced by a one-byte conversion of *S.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* Always give the system function somewhere to store the result, so that
     the workarounds below can inspect *pwc.  */
  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* S can still be NULL here when none of the workarounds that normalize a
     NULL S is compiled in; it must not be dereferenced in that case, so the
     error result of the system function is passed through unchanged.  */
  if ((size_t) -2 <= ret && s != NULL && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
457 | ||
458 | #endif |