Update gnulib to current upstream master
[deliverable/binutils-gdb.git] / gdb / gnulib / import / mbrtowc.c
CommitLineData
/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2018 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

18#include <config.h>
19
20/* Specification. */
21#include <wchar.h>
22
49e4877c
PA
23#if C_LOCALE_MAYBE_EILSEQ
24# include "hard-locale.h"
25# include <locale.h>
26#endif
27
8690e634
JK
28#if GNULIB_defined_mbstate_t
29/* Implement mbrtowc() on top of mbtowc(). */
30
31# include <errno.h>
32# include <stdlib.h>
33
34# include "localcharset.h"
35# include "streq.h"
36# include "verify.h"
37
/* FALLTHROUGH marks an intentional fall-through between 'case' labels.
   It is only defined here if nothing else has defined it already.
   For __GNUC__ < 7 it expands to a no-op expression; otherwise it expands
   to the __fallthrough__ statement attribute.  */
# ifndef FALLTHROUGH
#  if __GNUC__ < 7
#   define FALLTHROUGH ((void) 0)
#  else
#   define FALLTHROUGH __attribute__ ((__fallthrough__))
#  endif
# endif
45
/* Classification of the encodings that get special treatment when an
   incomplete multibyte sequence has to be told apart from an invalid one.
   Every encoding that is not listed explicitly is classified as
   enc_other.  */
typedef enum {
  enc_other,      /* other */
  enc_utf8,       /* UTF-8 */
  enc_eucjp,      /* EUC-JP */
  enc_94,         /* EUC-KR, GB2312, BIG5 */
  enc_euctw,      /* EUC-TW */
  enc_gb18030,    /* GB18030 */
  enc_sjis        /* SJIS */
} enc_t;
57static inline enc_t
58locale_enc (void)
59{
60 const char *encoding = locale_charset ();
61 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
62 return enc_utf8;
63 if (STREQ_OPT (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
64 return enc_eucjp;
65 if (STREQ_OPT (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
66 || STREQ_OPT (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
67 || STREQ_OPT (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
68 return enc_94;
69 if (STREQ_OPT (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
70 return enc_euctw;
71 if (STREQ_OPT (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
72 return enc_gb18030;
73 if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
74 return enc_sjis;
75 return enc_other;
76}
77
#if GNULIB_WCHAR_SINGLE
/* When we know that the locale does not change, provide a speedup by
   caching the value of locale_enc.  */

/* Cached result of locale_enc (); a negative value means "not yet
   computed".  */
static int cached_locale_enc = -1;

/* Returns the classification of the current locale's encoding, calling
   locale_enc () only while the cache still holds a negative value.  */
static inline enc_t
locale_enc_cached (void)
{
  if (cached_locale_enc < 0)
    cached_locale_enc = locale_enc ();
  return cached_locale_enc;
}
#else
/* By default, don't make assumptions, hence no caching.  */
# define locale_enc_cached locale_enc
#endif
93
94verify (sizeof (mbstate_t) >= 4);
95
96static char internal_state[4];
97
98size_t
99mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
100{
101 char *pstate = (char *)ps;
102
103 if (s == NULL)
104 {
105 pwc = NULL;
106 s = "";
107 n = 1;
108 }
109
110 if (n == 0)
111 return (size_t)(-2);
112
113 /* Here n > 0. */
114
115 if (pstate == NULL)
116 pstate = internal_state;
117
118 {
119 size_t nstate = pstate[0];
120 char buf[4];
121 const char *p;
122 size_t m;
123
124 switch (nstate)
125 {
126 case 0:
127 p = s;
128 m = n;
129 break;
130 case 3:
131 buf[2] = pstate[3];
7a6dbc2f 132 FALLTHROUGH;
8690e634
JK
133 case 2:
134 buf[1] = pstate[2];
7a6dbc2f 135 FALLTHROUGH;
8690e634
JK
136 case 1:
137 buf[0] = pstate[1];
138 p = buf;
139 m = nstate;
140 buf[m++] = s[0];
141 if (n >= 2 && m < 4)
142 {
143 buf[m++] = s[1];
144 if (n >= 3 && m < 4)
145 buf[m++] = s[2];
146 }
147 break;
148 default:
149 errno = EINVAL;
150 return (size_t)(-1);
151 }
152
153 /* Here m > 0. */
154
155# if __GLIBC__ || defined __UCLIBC__
7a6dbc2f 156 /* Work around bug <https://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
8690e634
JK
157 mbtowc (NULL, NULL, 0);
158# endif
159 {
160 int res = mbtowc (pwc, p, m);
161
162 if (res >= 0)
163 {
164 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
165 abort ();
166 if (nstate >= (res > 0 ? res : 1))
167 abort ();
168 res -= nstate;
169 pstate[0] = 0;
170 return res;
171 }
172
173 /* mbtowc does not distinguish between invalid and incomplete multibyte
174 sequences. But mbrtowc needs to make this distinction.
175 There are two possible approaches:
176 - Use iconv() and its return value.
177 - Use built-in knowledge about the possible encodings.
178 Given the low quality of implementation of iconv() on the systems that
179 lack mbrtowc(), we use the second approach.
180 The possible encodings are:
181 - 8-bit encodings,
182 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
183 - UTF-8.
184 Use specialized code for each. */
185 if (m >= 4 || m >= MB_CUR_MAX)
186 goto invalid;
187 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
7a6dbc2f
SDJ
188 switch (locale_enc_cached ())
189 {
190 case enc_utf8: /* UTF-8 */
8690e634
JK
191 {
192 /* Cf. unistr/u8-mblen.c. */
193 unsigned char c = (unsigned char) p[0];
194
195 if (c >= 0xc2)
196 {
197 if (c < 0xe0)
198 {
199 if (m == 1)
200 goto incomplete;
201 }
202 else if (c < 0xf0)
203 {
204 if (m == 1)
205 goto incomplete;
206 if (m == 2)
207 {
208 unsigned char c2 = (unsigned char) p[1];
209
210 if ((c2 ^ 0x80) < 0x40
211 && (c >= 0xe1 || c2 >= 0xa0)
212 && (c != 0xed || c2 < 0xa0))
213 goto incomplete;
214 }
215 }
216 else if (c <= 0xf4)
217 {
218 if (m == 1)
219 goto incomplete;
220 else /* m == 2 || m == 3 */
221 {
222 unsigned char c2 = (unsigned char) p[1];
223
224 if ((c2 ^ 0x80) < 0x40
225 && (c >= 0xf1 || c2 >= 0x90)
226 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
227 {
228 if (m == 2)
229 goto incomplete;
230 else /* m == 3 */
231 {
232 unsigned char c3 = (unsigned char) p[2];
233
234 if ((c3 ^ 0x80) < 0x40)
235 goto incomplete;
236 }
237 }
238 }
239 }
240 }
241 goto invalid;
242 }
243
244 /* As a reference for this code, you can use the GNU libiconv
245 implementation. Look for uses of the RET_TOOFEW macro. */
246
7a6dbc2f 247 case enc_eucjp: /* EUC-JP */
8690e634
JK
248 {
249 if (m == 1)
250 {
251 unsigned char c = (unsigned char) p[0];
252
253 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
254 goto incomplete;
255 }
256 if (m == 2)
257 {
258 unsigned char c = (unsigned char) p[0];
259
260 if (c == 0x8f)
261 {
262 unsigned char c2 = (unsigned char) p[1];
263
264 if (c2 >= 0xa1 && c2 < 0xff)
265 goto incomplete;
266 }
267 }
268 goto invalid;
269 }
7a6dbc2f
SDJ
270
271 case enc_94: /* EUC-KR, GB2312, BIG5 */
8690e634
JK
272 {
273 if (m == 1)
274 {
275 unsigned char c = (unsigned char) p[0];
276
277 if (c >= 0xa1 && c < 0xff)
278 goto incomplete;
279 }
280 goto invalid;
281 }
7a6dbc2f
SDJ
282
283 case enc_euctw: /* EUC-TW */
8690e634
JK
284 {
285 if (m == 1)
286 {
287 unsigned char c = (unsigned char) p[0];
288
289 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
290 goto incomplete;
291 }
292 else /* m == 2 || m == 3 */
293 {
294 unsigned char c = (unsigned char) p[0];
295
296 if (c == 0x8e)
297 goto incomplete;
298 }
299 goto invalid;
300 }
7a6dbc2f
SDJ
301
302 case enc_gb18030: /* GB18030 */
8690e634
JK
303 {
304 if (m == 1)
305 {
306 unsigned char c = (unsigned char) p[0];
307
308 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
309 goto incomplete;
310 }
311 else /* m == 2 || m == 3 */
312 {
313 unsigned char c = (unsigned char) p[0];
314
315 if (c >= 0x90 && c <= 0xe3)
316 {
317 unsigned char c2 = (unsigned char) p[1];
318
319 if (c2 >= 0x30 && c2 <= 0x39)
320 {
321 if (m == 2)
322 goto incomplete;
323 else /* m == 3 */
324 {
325 unsigned char c3 = (unsigned char) p[2];
326
327 if (c3 >= 0x81 && c3 <= 0xfe)
328 goto incomplete;
329 }
330 }
331 }
332 }
333 goto invalid;
334 }
7a6dbc2f
SDJ
335
336 case enc_sjis: /* SJIS */
8690e634
JK
337 {
338 if (m == 1)
339 {
340 unsigned char c = (unsigned char) p[0];
341
342 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
343 || (c >= 0xf0 && c <= 0xf9))
344 goto incomplete;
345 }
346 goto invalid;
347 }
348
7a6dbc2f
SDJ
349 default:
350 /* An unknown multibyte encoding. */
351 goto incomplete;
352 }
8690e634
JK
353
354 incomplete:
355 {
356 size_t k = nstate;
357 /* Here 0 <= k < m < 4. */
358 pstate[++k] = s[0];
359 if (k < m)
360 {
361 pstate[++k] = s[1];
362 if (k < m)
363 pstate[++k] = s[2];
364 }
365 if (k != m)
366 abort ();
367 }
368 pstate[0] = m;
369 return (size_t)(-2);
370
371 invalid:
372 errno = EILSEQ;
373 /* The conversion state is undefined, says POSIX. */
374 return (size_t)(-1);
375 }
376 }
377}
378
379#else
380/* Override the system's mbrtowc() function. */
381
382# undef mbrtowc
383
/* Wrapper around the system's mbrtowc () that applies a set of workarounds,
   each one compiled in only if the corresponding macro is nonzero:

     MBRTOWC_NULL_ARG2_BUG, MBRTOWC_RETVAL_BUG, MBRTOWC_EMPTY_INPUT_BUG
         If S is NULL, proceed as if PWC were NULL, S were "" and N were 1.
     MBRTOWC_EMPTY_INPUT_BUG
         Return (size_t) -2 right away if N is 0.
     MBRTOWC_RETVAL_BUG
         If the state is not an initial state, feed the input to mbrtowc ()
         one byte at a time and count the bytes here.
     MBRTOWC_NUL_RETVAL_BUG
         Return 0 if the converted wide character is null.
     C_LOCALE_MAYBE_EILSEQ
         If the conversion fails or is incomplete and hard_locale (LC_CTYPE)
         is false, return the first byte of S as a wide character.

   If PWC is NULL, the result is stored in a local variable, so that the
   code below can always inspect *PWC.
   With none of the macros set, the result is simply that of mbrtowc ().  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        /* All N bytes were consumed without completing a character.  */
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* NOTE(review): S is dereferenced below.  When none of the three macros
     guarding the S == NULL normalization above is set, S may still be NULL
     here -- confirm that RET cannot be (size_t) -1 or (size_t) -2 in that
     configuration.  */
  if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
457
458#endif
This page took 0.523635 seconds and 4 git commands to generate.