Commit | Line | Data |
---|---|---|
5df4cba6 SM |
1 | /* Convert multibyte character to wide character. |
2 | Copyright (C) 1999-2002, 2005-2020 Free Software Foundation, Inc. | |
3 | ||
4 | This program is free software: you can redistribute it and/or modify | |
5 | it under the terms of the GNU General Public License as published by | |
6 | the Free Software Foundation; either version 3 of the License, or | |
7 | (at your option) any later version. | |
8 | ||
9 | This program is distributed in the hope that it will be useful, | |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | GNU General Public License for more details. | |
13 | ||
14 | You should have received a copy of the GNU General Public License | |
15 | along with this program. If not, see <https://www.gnu.org/licenses/>. */ | |
16 | ||
17 | /* Written by Bruno Haible <bruno@clisp.org>, 2008. */ | |
18 | ||
19 | /* This file contains the body of the mbrtowc and mbrtoc32 functions, | |
20 | when GNULIB_defined_mbstate_t is defined. */ | |
21 | ||
22 | /* Prologue: normalize the arguments and pick the state bytes to work on. ps, s, n and pwc come from the enclosing function. */ char *pstate = (char *)ps; | |
23 | ||
24 | if (s == NULL) | |
25 | { | |
26 | pwc = NULL; /* Do not store a converted character. */ | |
27 | s = ""; /* Convert the empty string instead ... */ | |
28 | n = 1; /* ... looking at exactly one byte of it. */ | |
29 | } | |
30 | ||
31 | if (n == 0) | |
32 | return (size_t)(-2); /* No input bytes to examine: same result as for an incomplete character. */ | |
33 | ||
34 | /* Here n > 0. */ | |
35 | ||
36 | if (pstate == NULL) | |
37 | pstate = internal_state; /* No caller-supplied state: use internal_state, which is declared outside this fragment. */ | |
38 | ||
39 | { /* Main step: assemble the bytes to decode, try to convert them, and on failure classify the input as incomplete or invalid. */ | |
40 | size_t nstate = pstate[0]; /* Number of bytes saved in the state (written at the incomplete label below). */ | |
41 | char buf[4]; /* Saved bytes followed by new bytes from s, when nstate > 0. */ | |
42 | const char *p; /* Start of the bytes to decode. */ | |
43 | size_t m; /* Number of bytes available at p. */ | |
44 | enc_t enc; | |
45 | int res; | |
46 | ||
47 | switch (nstate) | |
48 | { | |
49 | case 0: /* Nothing saved: decode directly from s. */ | |
50 | p = s; | |
51 | m = n; | |
52 | break; | |
53 | case 3: /* Cases 3, 2, 1 fall through to copy pstate[1..nstate] into buf[0..nstate-1]. */ | |
54 | buf[2] = pstate[3]; | |
55 | FALLTHROUGH; | |
56 | case 2: | |
57 | buf[1] = pstate[2]; | |
58 | FALLTHROUGH; | |
59 | case 1: | |
60 | buf[0] = pstate[1]; | |
61 | p = buf; | |
62 | m = nstate; | |
63 | buf[m++] = s[0]; /* Append up to three bytes from s, limited by n and by the size of buf. */ | |
64 | if (n >= 2 && m < 4) | |
65 | { | |
66 | buf[m++] = s[1]; | |
67 | if (n >= 3 && m < 4) | |
68 | buf[m++] = s[2]; | |
69 | } | |
70 | break; | |
71 | default: /* A saved byte count outside 0..3 is rejected. */ | |
72 | errno = EINVAL; | |
73 | return (size_t)(-1); | |
74 | } | |
75 | ||
76 | /* Here m > 0. */ | |
77 | ||
78 | enc = locale_encoding_classification (); | |
79 | ||
80 | if (enc == enc_utf8) /* UTF-8 */ | |
81 | { | |
82 | /* Achieve | |
83 | - multi-thread safety and | |
84 | - the ability to produce wide character values > WCHAR_MAX | |
85 | by not calling mbtowc() at all. */ | |
86 | #include "mbrtowc-impl-utf8.h" | |
87 | } | |
88 | else | |
89 | { | |
90 | /* The hidden internal state of mbtowc would make this function not | |
91 | multi-thread safe. Achieve multi-thread safety through a lock. */ | |
92 | wchar_t wc; | |
93 | res = mbtowc_with_lock (&wc, p, m); | |
94 | ||
95 | if (res >= 0) | |
96 | { | |
97 | if ((wc == 0) != (res == 0)) /* Sanity check: a zero result and a null wide character must coincide. */ | |
98 | abort (); | |
99 | if (pwc != NULL) | |
100 | *pwc = wc; | |
101 | goto success; | |
102 | } | |
103 | ||
104 | /* mbtowc does not distinguish between invalid and incomplete multibyte | |
105 | sequences. But mbrtowc needs to make this distinction. | |
106 | There are two possible approaches: | |
107 | - Use iconv() and its return value. | |
108 | - Use built-in knowledge about the possible encodings. | |
109 | Given the low quality of implementation of iconv() on the systems | |
110 | that lack mbrtowc(), we use the second approach. | |
111 | The possible encodings are: | |
112 | - 8-bit encodings, | |
113 | - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, | |
114 | - UTF-8 (already handled above). | |
115 | Use specialized code for each. */ | |
116 | if (m >= 4 || m >= MB_CUR_MAX) /* Too long to be saved in the state, or already at the locale's maximum character length. */ | |
117 | goto invalid; | |
118 | /* Here MB_CUR_MAX > 1 and 0 < m < 4. */ | |
119 | switch (enc) /* Each case inspects the m bytes seen so far and chooses between incomplete and invalid. */ | |
120 | { | |
121 | /* As a reference for this code, you can use the GNU libiconv | |
122 | implementation. Look for uses of the RET_TOOFEW macro. */ | |
123 | ||
124 | case enc_eucjp: /* EUC-JP */ | |
125 | { | |
126 | if (m == 1) | |
127 | { | |
128 | unsigned char c = (unsigned char) p[0]; | |
129 | ||
130 | if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f) | |
131 | goto incomplete; | |
132 | } | |
133 | if (m == 2) | |
134 | { | |
135 | unsigned char c = (unsigned char) p[0]; | |
136 | ||
137 | if (c == 0x8f) | |
138 | { | |
139 | unsigned char c2 = (unsigned char) p[1]; | |
140 | ||
141 | if (c2 >= 0xa1 && c2 < 0xff) | |
142 | goto incomplete; | |
143 | } | |
144 | } | |
145 | goto invalid; | |
146 | } | |
147 | ||
148 | case enc_94: /* EUC-KR, GB2312, BIG5 */ | |
149 | { | |
150 | if (m == 1) | |
151 | { | |
152 | unsigned char c = (unsigned char) p[0]; | |
153 | ||
154 | if (c >= 0xa1 && c < 0xff) | |
155 | goto incomplete; | |
156 | } | |
157 | goto invalid; | |
158 | } | |
159 | ||
160 | case enc_euctw: /* EUC-TW */ | |
161 | { | |
162 | if (m == 1) | |
163 | { | |
164 | unsigned char c = (unsigned char) p[0]; | |
165 | ||
166 | if ((c >= 0xa1 && c < 0xff) || c == 0x8e) | |
167 | goto incomplete; | |
168 | } | |
169 | else /* m == 2 || m == 3 */ | |
170 | { | |
171 | unsigned char c = (unsigned char) p[0]; | |
172 | ||
173 | if (c == 0x8e) | |
174 | goto incomplete; | |
175 | } | |
176 | goto invalid; | |
177 | } | |
178 | ||
179 | case enc_gb18030: /* GB18030 */ | |
180 | { | |
181 | if (m == 1) | |
182 | { | |
183 | unsigned char c = (unsigned char) p[0]; | |
184 | ||
185 | if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe)) | |
186 | goto incomplete; | |
187 | } | |
188 | else /* m == 2 || m == 3 */ | |
189 | { | |
190 | unsigned char c = (unsigned char) p[0]; | |
191 | ||
192 | if (c >= 0x90 && c <= 0xe3) | |
193 | { | |
194 | unsigned char c2 = (unsigned char) p[1]; | |
195 | ||
196 | if (c2 >= 0x30 && c2 <= 0x39) | |
197 | { | |
198 | if (m == 2) | |
199 | goto incomplete; | |
200 | else /* m == 3 */ | |
201 | { | |
202 | unsigned char c3 = (unsigned char) p[2]; | |
203 | ||
204 | if (c3 >= 0x81 && c3 <= 0xfe) | |
205 | goto incomplete; | |
206 | } | |
207 | } | |
208 | } | |
209 | } | |
210 | goto invalid; | |
211 | } | |
212 | ||
213 | case enc_sjis: /* SJIS */ | |
214 | { | |
215 | if (m == 1) | |
216 | { | |
217 | unsigned char c = (unsigned char) p[0]; | |
218 | ||
219 | if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea) | |
220 | || (c >= 0xf0 && c <= 0xf9)) | |
221 | goto incomplete; | |
222 | } | |
223 | goto invalid; | |
224 | } | |
225 | ||
226 | default: | |
227 | /* An unknown multibyte encoding. */ | |
228 | goto incomplete; | |
229 | } | |
230 | } | |
231 | ||
232 | success: | |
233 | /* res >= 0 is the corrected return value of | |
234 | mbtowc_with_lock (&wc, p, m). */ | |
235 | if (nstate >= (res > 0 ? res : 1)) /* The character must extend past the bytes that were already saved. */ | |
236 | abort (); | |
237 | res -= nstate; /* Count only the bytes taken from s in this call. */ | |
238 | pstate[0] = 0; /* No bytes remain saved. */ | |
239 | return res; | |
240 | ||
241 | incomplete: | |
242 | { | |
243 | size_t k = nstate; | |
244 | /* Here 0 <= k < m < 4. */ | |
245 | pstate[++k] = s[0]; /* Store the new bytes from s after the k bytes already saved. */ | |
246 | if (k < m) | |
247 | { | |
248 | pstate[++k] = s[1]; | |
249 | if (k < m) | |
250 | pstate[++k] = s[2]; | |
251 | } | |
252 | if (k != m) /* All m bytes must now be stored. */ | |
253 | abort (); | |
254 | } | |
255 | pstate[0] = m; /* Record how many bytes are saved for the next call. */ | |
256 | return (size_t)(-2); | |
257 | ||
258 | invalid: | |
259 | errno = EILSEQ; | |
260 | /* The conversion state is undefined, says POSIX. */ | |
261 | return (size_t)(-1); | |
262 | } |