1 /* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2015 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2008.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #if GNULIB_defined_mbstate_t
24 /* Implement mbrtowc() on top of mbtowc(). */
29 # include "localcharset.h"
34 verify (sizeof (mbstate_t) >= 4);
36 static char internal_state
[4];
39 mbrtowc (wchar_t *pwc
, const char *s
, size_t n
, mbstate_t *ps
)
41 char *pstate
= (char *)ps
;
56 pstate
= internal_state
;
59 size_t nstate
= pstate
[0];
95 # if __GLIBC__ || defined __UCLIBC__
96 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
97 mbtowc (NULL
, NULL
, 0);
100 int res
= mbtowc (pwc
, p
, m
);
104 if (pwc
!= NULL
&& ((*pwc
== 0) != (res
== 0)))
106 if (nstate
>= (res
> 0 ? res
: 1))
113 /* mbtowc does not distinguish between invalid and incomplete multibyte
114 sequences. But mbrtowc needs to make this distinction.
115 There are two possible approaches:
116 - Use iconv() and its return value.
117 - Use built-in knowledge about the possible encodings.
118 Given the low quality of implementation of iconv() on the systems that
119 lack mbrtowc(), we use the second approach.
120 The possible encodings are:
122 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
124 Use specialized code for each. */
125 if (m
>= 4 || m
>= MB_CUR_MAX
)
127 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
129 const char *encoding
= locale_charset ();
131 if (STREQ_OPT (encoding
, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
133 /* Cf. unistr/u8-mblen.c. */
134 unsigned char c
= (unsigned char) p
[0];
149 unsigned char c2
= (unsigned char) p
[1];
151 if ((c2
^ 0x80) < 0x40
152 && (c
>= 0xe1 || c2
>= 0xa0)
153 && (c
!= 0xed || c2
< 0xa0))
161 else /* m == 2 || m == 3 */
163 unsigned char c2
= (unsigned char) p
[1];
165 if ((c2
^ 0x80) < 0x40
166 && (c
>= 0xf1 || c2
>= 0x90)
167 && (c
< 0xf4 || (c
== 0xf4 && c2
< 0x90)))
173 unsigned char c3
= (unsigned char) p
[2];
175 if ((c3
^ 0x80) < 0x40)
185 /* As a reference for this code, you can use the GNU libiconv
186 implementation. Look for uses of the RET_TOOFEW macro. */
188 if (STREQ_OPT (encoding
,
189 "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
193 unsigned char c
= (unsigned char) p
[0];
195 if ((c
>= 0xa1 && c
< 0xff) || c
== 0x8e || c
== 0x8f)
200 unsigned char c
= (unsigned char) p
[0];
204 unsigned char c2
= (unsigned char) p
[1];
206 if (c2
>= 0xa1 && c2
< 0xff)
212 if (STREQ_OPT (encoding
,
213 "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
214 || STREQ_OPT (encoding
,
215 "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
216 || STREQ_OPT (encoding
,
217 "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
221 unsigned char c
= (unsigned char) p
[0];
223 if (c
>= 0xa1 && c
< 0xff)
228 if (STREQ_OPT (encoding
,
229 "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
233 unsigned char c
= (unsigned char) p
[0];
235 if ((c
>= 0xa1 && c
< 0xff) || c
== 0x8e)
238 else /* m == 2 || m == 3 */
240 unsigned char c
= (unsigned char) p
[0];
247 if (STREQ_OPT (encoding
,
248 "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
252 unsigned char c
= (unsigned char) p
[0];
254 if ((c
>= 0x90 && c
<= 0xe3) || (c
>= 0xf8 && c
<= 0xfe))
257 else /* m == 2 || m == 3 */
259 unsigned char c
= (unsigned char) p
[0];
261 if (c
>= 0x90 && c
<= 0xe3)
263 unsigned char c2
= (unsigned char) p
[1];
265 if (c2
>= 0x30 && c2
<= 0x39)
271 unsigned char c3
= (unsigned char) p
[2];
273 if (c3
>= 0x81 && c3
<= 0xfe)
281 if (STREQ_OPT (encoding
, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
285 unsigned char c
= (unsigned char) p
[0];
287 if ((c
>= 0x81 && c
<= 0x9f) || (c
>= 0xe0 && c
<= 0xea)
288 || (c
>= 0xf0 && c
<= 0xf9))
294 /* An unknown multibyte encoding. */
301 /* Here 0 <= k < m < 4. */
317 /* The conversion state is undefined, says POSIX. */
324 /* Override the system's mbrtowc() function. */
329 rpl_mbrtowc (wchar_t *pwc
, const char *s
, size_t n
, mbstate_t *ps
)
331 # if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
340 # if MBRTOWC_EMPTY_INPUT_BUG
345 # if MBRTOWC_RETVAL_BUG
347 static mbstate_t internal_state
;
349 /* Override mbrtowc's internal state. We cannot call mbsinit() on the
350 hidden internal state, but we can call it on our variable. */
352 ps
= &internal_state
;
356 /* Parse the rest of the multibyte character byte for byte. */
358 for (; n
> 0; s
++, n
--)
361 size_t ret
= mbrtowc (&wc
, s
, 1, ps
);
363 if (ret
== (size_t)(-1))
366 if (ret
!= (size_t)(-2))
368 /* The multibyte character has been completed. */
371 return (wc
== 0 ? 0 : count
);
379 # if MBRTOWC_NUL_RETVAL_BUG
382 size_t ret
= mbrtowc (&wc
, s
, n
, ps
);
384 if (ret
!= (size_t)(-1) && ret
!= (size_t)(-2))
395 # if MBRTOWC_NULL_ARG1_BUG
402 return mbrtowc (pwc
, s
, n
, ps
);
This page took 0.041606 seconds and 4 git commands to generate.