Commit | Line | Data |
---|---|---|
8690e634 JK |
1 | /* Determine a canonical name for the current locale's character encoding. |
2 | ||
7a6dbc2f | 3 | Copyright (C) 2000-2006, 2008-2018 Free Software Foundation, Inc. |
8690e634 JK |
4 | |
5 | This program is free software; you can redistribute it and/or modify | |
6 | it under the terms of the GNU General Public License as published by | |
7 | the Free Software Foundation; either version 3, or (at your option) | |
8 | any later version. | |
9 | ||
10 | This program is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU General Public License along | |
7a6dbc2f | 16 | with this program; if not, see <https://www.gnu.org/licenses/>. */ |
8690e634 JK |
17 | |
18 | /* Written by Bruno Haible <bruno@clisp.org>. */ | |
19 | ||
20 | #include <config.h> | |
21 | ||
22 | /* Specification. */ | |
23 | #include "localcharset.h" | |
24 | ||
8690e634 JK |
25 | #include <stddef.h> |
26 | #include <stdio.h> | |
27 | #include <string.h> | |
28 | #include <stdlib.h> | |
29 | ||
30 | #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET | |
31 | # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */ | |
32 | #endif | |
33 | ||
7a6dbc2f | 34 | #if defined _WIN32 && !defined __CYGWIN__ |
8690e634 | 35 | # define WINDOWS_NATIVE |
4a626d0a | 36 | # include <locale.h> |
8690e634 JK |
37 | #endif |
38 | ||
39 | #if defined __EMX__ | |
40 | /* Assume EMX program runs on OS/2, even if compiled under DOS. */ | |
41 | # ifndef OS2 | |
42 | # define OS2 | |
43 | # endif | |
44 | #endif | |
45 | ||
46 | #if !defined WINDOWS_NATIVE | |
8690e634 JK |
47 | # if HAVE_LANGINFO_CODESET |
48 | # include <langinfo.h> | |
49 | # else | |
7a6dbc2f | 50 | # if 0 /* see comment regarding use of setlocale(), below */ |
8690e634 JK |
51 | # include <locale.h> |
52 | # endif | |
53 | # endif | |
54 | # ifdef __CYGWIN__ | |
55 | # define WIN32_LEAN_AND_MEAN | |
56 | # include <windows.h> | |
57 | # endif | |
58 | #elif defined WINDOWS_NATIVE | |
59 | # define WIN32_LEAN_AND_MEAN | |
60 | # include <windows.h> | |
61 | #endif | |
62 | #if defined OS2 | |
63 | # define INCL_DOS | |
64 | # include <os2.h> | |
65 | #endif | |
66 | ||
4a626d0a PA |
67 | /* For MB_CUR_MAX_L */ |
68 | #if defined DARWIN7 | |
69 | # include <xlocale.h> | |
70 | #endif | |
71 | ||
8690e634 | 72 | |
7a6dbc2f | 73 | #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2 |
8690e634 | 74 | |
7a6dbc2f SDJ |
75 | /* On these platforms, we use a mapping from non-canonical encoding name |
76 | to GNU canonical encoding name. */ | |
8690e634 | 77 | |
7a6dbc2f SDJ |
78 | /* With glibc-2.1 or newer, we don't need any canonicalization, |
79 | because glibc has iconv and both glibc and libiconv support all | |
80 | GNU canonical names directly. */ | |
81 | # if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__) | |
8690e634 | 82 | |
7a6dbc2f | 83 | struct table_entry /* One row of alias_table: maps a non-canonical encoding name to its GNU canonical name. */ |
8690e634 | 84 | { |
7a6dbc2f SDJ |
85 | const char alias[11+1]; /* Non-canonical encoding name used on this platform; at most 11 characters plus the terminating NUL. */ |
86 | const char canonical[11+1]; /* Corresponding GNU canonical encoding name; at most 11 characters plus the terminating NUL. */ | |
87 | }; | |
88 | ||
89 | /* Table of platform-dependent mappings from a non-canonical encoding name (alias) to the GNU canonical name. Within each platform section the entries are sorted in ascending byte order of the alias (digits, then upper case, then lower case); the order must be preserved because locale_charset() looks names up with a binary search. The last entry of each section has no trailing comma, so the initializer is well-formed only when at most one section is active; when none is, the dummy entry at the end is used. Commented-out entries either map a name to itself or have no known canonical name ("?"). */ | |
90 | static const struct table_entry alias_table[] = | |
91 | { | |
92 | # if defined __FreeBSD__ /* FreeBSD */ | |
93 | /*{ "ARMSCII-8", "ARMSCII-8" },*/ | |
94 | { "Big5", "BIG5" }, | |
95 | { "C", "ASCII" }, | |
96 | /*{ "CP1131", "CP1131" },*/ | |
97 | /*{ "CP1251", "CP1251" },*/ | |
98 | /*{ "CP866", "CP866" },*/ | |
99 | /*{ "GB18030", "GB18030" },*/ | |
100 | /*{ "GB2312", "GB2312" },*/ | |
101 | /*{ "GBK", "GBK" },*/ | |
102 | /*{ "ISCII-DEV", "?" },*/ | |
103 | { "ISO8859-1", "ISO-8859-1" }, | |
104 | { "ISO8859-13", "ISO-8859-13" }, | |
105 | { "ISO8859-15", "ISO-8859-15" }, | |
106 | { "ISO8859-2", "ISO-8859-2" }, | |
107 | { "ISO8859-5", "ISO-8859-5" }, | |
108 | { "ISO8859-7", "ISO-8859-7" }, | |
109 | { "ISO8859-9", "ISO-8859-9" }, | |
110 | /*{ "KOI8-R", "KOI8-R" },*/ | |
111 | /*{ "KOI8-U", "KOI8-U" },*/ | |
112 | { "SJIS", "SHIFT_JIS" }, | |
113 | { "US-ASCII", "ASCII" }, | |
114 | { "eucCN", "GB2312" }, | |
115 | { "eucJP", "EUC-JP" }, | |
116 | { "eucKR", "EUC-KR" } | |
117 | # define alias_table_defined | |
118 | # endif | |
119 | # if defined __NetBSD__ /* NetBSD */ | |
120 | { "646", "ASCII" }, | |
121 | /*{ "ARMSCII-8", "ARMSCII-8" },*/ | |
122 | /*{ "BIG5", "BIG5" },*/ | |
123 | { "Big5-HKSCS", "BIG5-HKSCS" }, | |
124 | /*{ "CP1251", "CP1251" },*/ | |
125 | /*{ "CP866", "CP866" },*/ | |
126 | /*{ "GB18030", "GB18030" },*/ | |
127 | /*{ "GB2312", "GB2312" },*/ | |
128 | { "ISO8859-1", "ISO-8859-1" }, | |
129 | { "ISO8859-13", "ISO-8859-13" }, | |
130 | { "ISO8859-15", "ISO-8859-15" }, | |
131 | { "ISO8859-2", "ISO-8859-2" }, | |
132 | { "ISO8859-4", "ISO-8859-4" }, | |
133 | { "ISO8859-5", "ISO-8859-5" }, | |
134 | { "ISO8859-7", "ISO-8859-7" }, | |
135 | /*{ "KOI8-R", "KOI8-R" },*/ | |
136 | /*{ "KOI8-U", "KOI8-U" },*/ | |
137 | /*{ "PT154", "PT154" },*/ | |
138 | { "SJIS", "SHIFT_JIS" }, | |
139 | { "eucCN", "GB2312" }, | |
140 | { "eucJP", "EUC-JP" }, | |
141 | { "eucKR", "EUC-KR" }, | |
142 | { "eucTW", "EUC-TW" } | |
143 | # define alias_table_defined | |
144 | # endif | |
145 | # if defined __OpenBSD__ /* OpenBSD */ | |
146 | { "646", "ASCII" }, | |
147 | { "ISO8859-1", "ISO-8859-1" }, | |
148 | { "ISO8859-13", "ISO-8859-13" }, | |
149 | { "ISO8859-15", "ISO-8859-15" }, | |
150 | { "ISO8859-2", "ISO-8859-2" }, | |
151 | { "ISO8859-4", "ISO-8859-4" }, | |
152 | { "ISO8859-5", "ISO-8859-5" }, | |
153 | { "ISO8859-7", "ISO-8859-7" } | |
154 | # define alias_table_defined | |
155 | # endif | |
156 | # if defined __APPLE__ && defined __MACH__ /* Mac OS X */ | |
157 | /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is | |
158 | useless: | |
159 | - It returns the empty string when LANG is set to a locale of the | |
160 | form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8 | |
161 | LC_CTYPE file. | |
162 | - The environment variables LANG, LC_CTYPE, LC_ALL are not set by | |
163 | the system; nl_langinfo(CODESET) returns "US-ASCII" in this case. | |
164 | - The documentation says: | |
165 | "... all code that calls BSD system routines should ensure | |
166 | that the const *char parameters of these routines are in UTF-8 | |
167 | encoding. All BSD system functions expect their string | |
168 | parameters to be in UTF-8 encoding and nothing else." | |
169 | It also says | |
170 | "An additional caveat is that string parameters for files, | |
171 | paths, and other file-system entities must be in canonical | |
172 | UTF-8. In a canonical UTF-8 Unicode string, all decomposable | |
173 | characters are decomposed ..." | |
174 | but this is not true: You can pass non-decomposed UTF-8 strings | |
175 | to file system functions, and it is the OS which will convert | |
176 | them to decomposed UTF-8 before accessing the file system. | |
177 | - The Apple Terminal application displays UTF-8 by default. | |
178 | - However, other applications are free to use different encodings: | |
179 | - xterm uses ISO-8859-1 by default. | |
180 | - TextEdit uses MacRoman by default. | |
181 | We prefer UTF-8 over decomposed UTF-8-MAC because one should | |
182 | minimize the use of decomposed Unicode. Unfortunately, through the | |
183 | Darwin file system, decomposed UTF-8 strings are leaked into user | |
184 | space nevertheless. | |
185 | Then there are also the locales with encodings other than US-ASCII | |
186 | and UTF-8. These locales can be occasionally useful to users (e.g. | |
187 | when grepping through ISO-8859-1 encoded text files), when all their | |
188 | file names are in US-ASCII. | |
189 | */ | |
190 | { "ARMSCII-8", "ARMSCII-8" }, | |
191 | { "Big5", "BIG5" }, | |
192 | { "Big5HKSCS", "BIG5-HKSCS" }, | |
193 | { "CP1131", "CP1131" }, | |
194 | { "CP1251", "CP1251" }, | |
195 | { "CP866", "CP866" }, | |
196 | { "CP949", "CP949" }, | |
197 | { "GB18030", "GB18030" }, | |
198 | { "GB2312", "GB2312" }, | |
199 | { "GBK", "GBK" }, | |
200 | /*{ "ISCII-DEV", "?" },*/ | |
201 | { "ISO8859-1", "ISO-8859-1" }, | |
202 | { "ISO8859-13", "ISO-8859-13" }, | |
203 | { "ISO8859-15", "ISO-8859-15" }, | |
204 | { "ISO8859-2", "ISO-8859-2" }, | |
205 | { "ISO8859-4", "ISO-8859-4" }, | |
206 | { "ISO8859-5", "ISO-8859-5" }, | |
207 | { "ISO8859-7", "ISO-8859-7" }, | |
208 | { "ISO8859-9", "ISO-8859-9" }, | |
209 | { "KOI8-R", "KOI8-R" }, | |
210 | { "KOI8-U", "KOI8-U" }, | |
211 | { "PT154", "PT154" }, | |
212 | { "SJIS", "SHIFT_JIS" }, | |
213 | { "eucCN", "GB2312" }, | |
214 | { "eucJP", "EUC-JP" }, | |
215 | { "eucKR", "EUC-KR" } | |
216 | # define alias_table_defined | |
217 | # endif | |
218 | # if defined _AIX /* AIX */ | |
219 | /*{ "GBK", "GBK" },*/ | |
220 | { "IBM-1046", "CP1046" }, | |
221 | { "IBM-1124", "CP1124" }, | |
222 | { "IBM-1129", "CP1129" }, | |
223 | { "IBM-1252", "CP1252" }, | |
224 | { "IBM-850", "CP850" }, | |
225 | { "IBM-856", "CP856" }, | |
226 | { "IBM-921", "ISO-8859-13" }, | |
227 | { "IBM-922", "CP922" }, | |
228 | { "IBM-932", "CP932" }, | |
229 | { "IBM-943", "CP943" }, | |
230 | { "IBM-eucCN", "GB2312" }, | |
231 | { "IBM-eucJP", "EUC-JP" }, | |
232 | { "IBM-eucKR", "EUC-KR" }, | |
233 | { "IBM-eucTW", "EUC-TW" }, | |
234 | { "ISO8859-1", "ISO-8859-1" }, | |
235 | { "ISO8859-15", "ISO-8859-15" }, | |
236 | { "ISO8859-2", "ISO-8859-2" }, | |
237 | { "ISO8859-5", "ISO-8859-5" }, | |
238 | { "ISO8859-6", "ISO-8859-6" }, | |
239 | { "ISO8859-7", "ISO-8859-7" }, | |
240 | { "ISO8859-8", "ISO-8859-8" }, | |
241 | { "ISO8859-9", "ISO-8859-9" }, | |
242 | { "TIS-620", "TIS-620" }, | |
243 | /*{ "UTF-8", "UTF-8" },*/ | |
244 | { "big5", "BIG5" } | |
245 | # define alias_table_defined | |
246 | # endif | |
247 | # if defined __hpux /* HP-UX */ | |
248 | { "SJIS", "SHIFT_JIS" }, | |
249 | { "arabic8", "HP-ARABIC8" }, | |
250 | { "big5", "BIG5" }, | |
251 | { "cp1251", "CP1251" }, | |
252 | { "eucJP", "EUC-JP" }, | |
253 | { "eucKR", "EUC-KR" }, | |
254 | { "eucTW", "EUC-TW" }, | |
255 | { "gb18030", "GB18030" }, | |
256 | { "greek8", "HP-GREEK8" }, | |
257 | { "hebrew8", "HP-HEBREW8" }, | |
258 | { "hkbig5", "BIG5-HKSCS" }, | |
259 | { "hp15CN", "GB2312" }, | |
260 | { "iso88591", "ISO-8859-1" }, | |
261 | { "iso885913", "ISO-8859-13" }, | |
262 | { "iso885915", "ISO-8859-15" }, | |
263 | { "iso88592", "ISO-8859-2" }, | |
264 | { "iso88594", "ISO-8859-4" }, | |
265 | { "iso88595", "ISO-8859-5" }, | |
266 | { "iso88596", "ISO-8859-6" }, | |
267 | { "iso88597", "ISO-8859-7" }, | |
268 | { "iso88598", "ISO-8859-8" }, | |
269 | { "iso88599", "ISO-8859-9" }, | |
270 | { "kana8", "HP-KANA8" }, | |
271 | { "koi8r", "KOI8-R" }, | |
272 | { "roman8", "HP-ROMAN8" }, | |
273 | { "tis620", "TIS-620" }, | |
274 | { "turkish8", "HP-TURKISH8" }, | |
275 | { "utf8", "UTF-8" } | |
276 | # define alias_table_defined | |
277 | # endif | |
278 | # if defined __sgi /* IRIX */ | |
279 | { "ISO8859-1", "ISO-8859-1" }, | |
280 | { "ISO8859-15", "ISO-8859-15" }, | |
281 | { "ISO8859-2", "ISO-8859-2" }, | |
282 | { "ISO8859-5", "ISO-8859-5" }, | |
283 | { "ISO8859-7", "ISO-8859-7" }, | |
284 | { "ISO8859-9", "ISO-8859-9" }, | |
285 | { "eucCN", "GB2312" }, | |
286 | { "eucJP", "EUC-JP" }, | |
287 | { "eucKR", "EUC-KR" }, | |
288 | { "eucTW", "EUC-TW" } | |
289 | # define alias_table_defined | |
290 | # endif | |
291 | # if defined __osf__ /* OSF/1 */ | |
292 | /*{ "GBK", "GBK" },*/ | |
293 | { "ISO8859-1", "ISO-8859-1" }, | |
294 | { "ISO8859-15", "ISO-8859-15" }, | |
295 | { "ISO8859-2", "ISO-8859-2" }, | |
296 | { "ISO8859-4", "ISO-8859-4" }, | |
297 | { "ISO8859-5", "ISO-8859-5" }, | |
298 | { "ISO8859-7", "ISO-8859-7" }, | |
299 | { "ISO8859-8", "ISO-8859-8" }, | |
300 | { "ISO8859-9", "ISO-8859-9" }, | |
301 | { "KSC5601", "CP949" }, | |
302 | { "SJIS", "SHIFT_JIS" }, | |
303 | { "TACTIS", "TIS-620" }, | |
304 | /*{ "UTF-8", "UTF-8" },*/ | |
305 | { "big5", "BIG5" }, | |
306 | { "cp850", "CP850" }, | |
307 | { "dechanyu", "DEC-HANYU" }, | |
308 | { "dechanzi", "GB2312" }, | |
309 | { "deckanji", "DEC-KANJI" }, | |
310 | { "deckorean", "EUC-KR" }, | |
311 | { "eucJP", "EUC-JP" }, | |
312 | { "eucKR", "EUC-KR" }, | |
313 | { "eucTW", "EUC-TW" }, | |
314 | { "sdeckanji", "EUC-JP" } | |
315 | # define alias_table_defined | |
316 | # endif | |
317 | # if defined __sun /* Solaris */ | |
318 | { "5601", "EUC-KR" }, | |
319 | { "646", "ASCII" }, | |
320 | /*{ "BIG5", "BIG5" },*/ | |
321 | { "Big5-HKSCS", "BIG5-HKSCS" }, | |
322 | { "GB18030", "GB18030" }, | |
323 | /*{ "GBK", "GBK" },*/ | |
324 | { "ISO8859-1", "ISO-8859-1" }, | |
325 | { "ISO8859-11", "TIS-620" }, | |
326 | { "ISO8859-13", "ISO-8859-13" }, | |
327 | { "ISO8859-15", "ISO-8859-15" }, | |
328 | { "ISO8859-2", "ISO-8859-2" }, | |
329 | { "ISO8859-3", "ISO-8859-3" }, | |
330 | { "ISO8859-4", "ISO-8859-4" }, | |
331 | { "ISO8859-5", "ISO-8859-5" }, | |
332 | { "ISO8859-6", "ISO-8859-6" }, | |
333 | { "ISO8859-7", "ISO-8859-7" }, | |
334 | { "ISO8859-8", "ISO-8859-8" }, | |
335 | { "ISO8859-9", "ISO-8859-9" }, | |
336 | { "PCK", "SHIFT_JIS" }, | |
337 | { "TIS620.2533", "TIS-620" }, | |
338 | /*{ "UTF-8", "UTF-8" },*/ | |
339 | { "ansi-1251", "CP1251" }, | |
340 | { "cns11643", "EUC-TW" }, | |
341 | { "eucJP", "EUC-JP" }, | |
342 | { "gb2312", "GB2312" }, | |
343 | { "koi8-r", "KOI8-R" } | |
344 | # define alias_table_defined | |
345 | # endif | |
346 | # if defined __minix /* Minix */ | |
347 | { "646", "ASCII" } | |
348 | # define alias_table_defined | |
349 | # endif | |
350 | # if defined WINDOWS_NATIVE || defined __CYGWIN__ /* Windows */ | |
351 | { "CP1361", "JOHAB" }, | |
352 | { "CP20127", "ASCII" }, | |
353 | { "CP20866", "KOI8-R" }, | |
354 | { "CP20936", "GB2312" }, | |
355 | { "CP21866", "KOI8-RU" }, | |
356 | { "CP28591", "ISO-8859-1" }, | |
357 | { "CP28592", "ISO-8859-2" }, | |
358 | { "CP28593", "ISO-8859-3" }, | |
359 | { "CP28594", "ISO-8859-4" }, | |
360 | { "CP28595", "ISO-8859-5" }, | |
361 | { "CP28596", "ISO-8859-6" }, | |
362 | { "CP28597", "ISO-8859-7" }, | |
363 | { "CP28598", "ISO-8859-8" }, | |
364 | { "CP28599", "ISO-8859-9" }, | |
365 | { "CP28605", "ISO-8859-15" }, | |
366 | { "CP38598", "ISO-8859-8" }, | |
367 | { "CP51932", "EUC-JP" }, | |
368 | { "CP51936", "GB2312" }, | |
369 | { "CP51949", "EUC-KR" }, | |
370 | { "CP51950", "EUC-TW" }, | |
371 | { "CP54936", "GB18030" }, | |
372 | { "CP65001", "UTF-8" }, | |
373 | { "CP936", "GBK" } | |
374 | # define alias_table_defined | |
375 | # endif | |
376 | # if defined OS2 /* OS/2 */ | |
377 | /* The list of encodings is taken from "List of OS/2 Codepages" | |
378 | by Alex Taylor: | |
379 | <http://altsan.org/os2/toolkits/uls/index.html#codepages>. | |
380 | See also "IBM Globalization - Code page identifiers": | |
381 | <https://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>. */ | |
382 | { "CP1089", "ISO-8859-6" }, | |
383 | { "CP1208", "UTF-8" }, | |
384 | { "CP1381", "GB2312" }, | |
385 | { "CP1386", "GBK" }, | |
386 | { "CP3372", "EUC-JP" }, | |
387 | { "CP813", "ISO-8859-7" }, | |
388 | { "CP819", "ISO-8859-1" }, | |
389 | { "CP878", "KOI8-R" }, | |
390 | { "CP912", "ISO-8859-2" }, | |
391 | { "CP913", "ISO-8859-3" }, | |
392 | { "CP914", "ISO-8859-4" }, | |
393 | { "CP915", "ISO-8859-5" }, | |
394 | { "CP916", "ISO-8859-8" }, | |
395 | { "CP920", "ISO-8859-9" }, | |
396 | { "CP921", "ISO-8859-13" }, | |
397 | { "CP923", "ISO-8859-15" }, | |
398 | { "CP954", "EUC-JP" }, | |
399 | { "CP964", "EUC-TW" }, | |
400 | { "CP970", "EUC-KR" } | |
401 | # define alias_table_defined | |
402 | # endif | |
403 | # if defined VMS /* OpenVMS */ | |
404 | /* The list of encodings is taken from the OpenVMS 7.3-1 documentation | |
405 | "Compaq C Run-Time Library Reference Manual for OpenVMS systems" | |
406 | section 10.7 "Handling Different Character Sets". */ | |
407 | { "DECHANYU", "DEC-HANYU" }, | |
408 | { "DECHANZI", "GB2312" }, | |
409 | { "DECKANJI", "DEC-KANJI" }, | |
410 | { "DECKOREAN", "EUC-KR" }, | |
411 | { "ISO8859-1", "ISO-8859-1" }, | |
412 | { "ISO8859-2", "ISO-8859-2" }, | |
413 | { "ISO8859-5", "ISO-8859-5" }, | |
414 | { "ISO8859-7", "ISO-8859-7" }, | |
415 | { "ISO8859-8", "ISO-8859-8" }, | |
416 | { "ISO8859-9", "ISO-8859-9" }, | |
417 | { "SDECKANJI", "EUC-JP" }, | |
418 | { "SJIS", "SHIFT_JIS" }, | |
419 | { "eucJP", "EUC-JP" }, | |
420 | { "eucTW", "EUC-TW" } | |
421 | # define alias_table_defined | |
422 | # endif | |
423 | # ifndef alias_table_defined | |
424 | /* Just a dummy entry, to avoid a C syntax error. */ | |
425 | { "", "" } | |
426 | # endif | |
427 | }; |
8690e634 | 428 | |
7a6dbc2f | 429 | # endif |
8690e634 JK |
430 | |
431 | #else | |
432 | ||
7a6dbc2f SDJ |
433 | /* On these platforms, we use a mapping from locale name to GNU canonical |
434 | encoding name. */ | |
8690e634 | 435 | |
7a6dbc2f SDJ |
436 | struct table_entry /* One row of locale_table: maps a locale name to the GNU canonical name of its encoding. */ |
437 | { |
438 | const char locale[17+1]; /* Locale name; at most 17 characters plus the terminating NUL. */ |
439 | const char canonical[11+1]; /* GNU canonical encoding name; at most 11 characters plus the terminating NUL. */ |
440 | }; |
441 | ||
442 | /* Table of platform-dependent mappings from a locale name to the GNU canonical encoding name. Within each platform section the entries are sorted in ascending byte order of the locale name (upper case before lower case); keep that order when adding entries. NOTE(review): the code that searches this table is outside this excerpt - presumably it relies on the ordering; confirm before reordering. The last entry of each section has no trailing comma, so the initializer is well-formed only when at most one section is active; when none is, the dummy entry at the end is used. */ |
443 | static const struct table_entry locale_table[] = |
444 | { |
445 | # if defined __FreeBSD__ /* FreeBSD 4.2 */ |
446 | { "cs_CZ.ISO_8859-2", "ISO-8859-2" }, |
447 | { "da_DK.DIS_8859-15", "ISO-8859-15" }, |
448 | { "da_DK.ISO_8859-1", "ISO-8859-1" }, |
449 | { "de_AT.DIS_8859-15", "ISO-8859-15" }, |
450 | { "de_AT.ISO_8859-1", "ISO-8859-1" }, |
451 | { "de_CH.DIS_8859-15", "ISO-8859-15" }, |
452 | { "de_CH.ISO_8859-1", "ISO-8859-1" }, |
453 | { "de_DE.DIS_8859-15", "ISO-8859-15" }, |
454 | { "de_DE.ISO_8859-1", "ISO-8859-1" }, |
455 | { "en_AU.DIS_8859-15", "ISO-8859-15" }, |
456 | { "en_AU.ISO_8859-1", "ISO-8859-1" }, |
457 | { "en_CA.DIS_8859-15", "ISO-8859-15" }, |
458 | { "en_CA.ISO_8859-1", "ISO-8859-1" }, |
459 | { "en_GB.DIS_8859-15", "ISO-8859-15" }, |
460 | { "en_GB.ISO_8859-1", "ISO-8859-1" }, |
461 | { "en_US.DIS_8859-15", "ISO-8859-15" }, |
462 | { "en_US.ISO_8859-1", "ISO-8859-1" }, |
463 | { "es_ES.DIS_8859-15", "ISO-8859-15" }, |
464 | { "es_ES.ISO_8859-1", "ISO-8859-1" }, |
465 | { "fi_FI.DIS_8859-15", "ISO-8859-15" }, |
466 | { "fi_FI.ISO_8859-1", "ISO-8859-1" }, |
467 | { "fr_BE.DIS_8859-15", "ISO-8859-15" }, |
468 | { "fr_BE.ISO_8859-1", "ISO-8859-1" }, |
469 | { "fr_CA.DIS_8859-15", "ISO-8859-15" }, |
470 | { "fr_CA.ISO_8859-1", "ISO-8859-1" }, |
471 | { "fr_CH.DIS_8859-15", "ISO-8859-15" }, |
472 | { "fr_CH.ISO_8859-1", "ISO-8859-1" }, |
473 | { "fr_FR.DIS_8859-15", "ISO-8859-15" }, |
474 | { "fr_FR.ISO_8859-1", "ISO-8859-1" }, |
475 | { "hr_HR.ISO_8859-2", "ISO-8859-2" }, |
476 | { "hu_HU.ISO_8859-2", "ISO-8859-2" }, |
477 | { "is_IS.DIS_8859-15", "ISO-8859-15" }, |
478 | { "is_IS.ISO_8859-1", "ISO-8859-1" }, |
479 | { "it_CH.DIS_8859-15", "ISO-8859-15" }, |
480 | { "it_CH.ISO_8859-1", "ISO-8859-1" }, |
481 | { "it_IT.DIS_8859-15", "ISO-8859-15" }, |
482 | { "it_IT.ISO_8859-1", "ISO-8859-1" }, |
483 | { "ja_JP.EUC", "EUC-JP" }, |
484 | { "ja_JP.SJIS", "SHIFT_JIS" }, |
485 | { "ja_JP.Shift_JIS", "SHIFT_JIS" }, |
486 | { "ko_KR.EUC", "EUC-KR" }, |
487 | { "la_LN.ASCII", "ASCII" }, |
488 | { "la_LN.DIS_8859-15", "ISO-8859-15" }, |
489 | { "la_LN.ISO_8859-1", "ISO-8859-1" }, |
490 | { "la_LN.ISO_8859-2", "ISO-8859-2" }, |
491 | { "la_LN.ISO_8859-4", "ISO-8859-4" }, |
492 | { "lt_LN.ASCII", "ASCII" }, |
493 | { "lt_LN.DIS_8859-15", "ISO-8859-15" }, |
494 | { "lt_LN.ISO_8859-1", "ISO-8859-1" }, |
495 | { "lt_LN.ISO_8859-2", "ISO-8859-2" }, |
496 | { "lt_LT.ISO_8859-4", "ISO-8859-4" }, |
497 | { "nl_BE.DIS_8859-15", "ISO-8859-15" }, |
498 | { "nl_BE.ISO_8859-1", "ISO-8859-1" }, |
499 | { "nl_NL.DIS_8859-15", "ISO-8859-15" }, |
500 | { "nl_NL.ISO_8859-1", "ISO-8859-1" }, |
501 | { "no_NO.DIS_8859-15", "ISO-8859-15" }, |
502 | { "no_NO.ISO_8859-1", "ISO-8859-1" }, |
503 | { "pl_PL.ISO_8859-2", "ISO-8859-2" }, |
504 | { "pt_PT.DIS_8859-15", "ISO-8859-15" }, |
505 | { "pt_PT.ISO_8859-1", "ISO-8859-1" }, |
506 | { "ru_RU.CP866", "CP866" }, |
507 | { "ru_RU.ISO_8859-5", "ISO-8859-5" }, |
508 | { "ru_RU.KOI8-R", "KOI8-R" }, |
509 | { "ru_SU.CP866", "CP866" }, |
510 | { "ru_SU.ISO_8859-5", "ISO-8859-5" }, |
511 | { "ru_SU.KOI8-R", "KOI8-R" }, |
512 | { "sl_SI.ISO_8859-2", "ISO-8859-2" }, |
513 | { "sv_SE.DIS_8859-15", "ISO-8859-15" }, |
514 | { "sv_SE.ISO_8859-1", "ISO-8859-1" }, |
515 | { "uk_UA.KOI8-U", "KOI8-U" }, |
516 | { "zh_CN.EUC", "GB2312" }, |
517 | { "zh_TW.BIG5", "BIG5" }, |
518 | { "zh_TW.Big5", "BIG5" } |
519 | # define locale_table_defined |
8690e634 | 520 | # endif |
7a6dbc2f SDJ |
521 | # if defined __DJGPP__ /* DOS / DJGPP 2.03 */ |
522 | /* The encodings given here may not all be correct. |
523 | If you find that the encoding given for your language and |
524 | country is not the one your DOS machine actually uses, just |
525 | correct it in this file, and send a mail to |
526 | Juan Manuel Guerrero <juan.guerrero@gmx.de> |
527 | and <bug-gnulib@gnu.org>. */ |
528 | { "C", "ASCII" }, |
529 | { "ar", "CP864" }, |
530 | { "ar_AE", "CP864" }, |
531 | { "ar_DZ", "CP864" }, |
532 | { "ar_EG", "CP864" }, |
533 | { "ar_IQ", "CP864" }, |
534 | { "ar_IR", "CP864" }, |
535 | { "ar_JO", "CP864" }, |
536 | { "ar_KW", "CP864" }, |
537 | { "ar_MA", "CP864" }, |
538 | { "ar_OM", "CP864" }, |
539 | { "ar_QA", "CP864" }, |
540 | { "ar_SA", "CP864" }, |
541 | { "ar_SY", "CP864" }, |
542 | { "be", "CP866" }, |
543 | { "be_BE", "CP866" }, |
544 | { "bg", "CP866" }, /* not CP855 ?? */ |
545 | { "bg_BG", "CP866" }, /* not CP855 ?? */ |
546 | { "ca", "CP850" }, |
547 | { "ca_ES", "CP850" }, |
548 | { "cs", "CP852" }, |
549 | { "cs_CZ", "CP852" }, |
550 | { "da", "CP865" }, /* not CP850 ?? */ |
551 | { "da_DK", "CP865" }, /* not CP850 ?? */ |
552 | { "de", "CP850" }, |
553 | { "de_AT", "CP850" }, |
554 | { "de_CH", "CP850" }, |
555 | { "de_DE", "CP850" }, |
556 | { "el", "CP869" }, |
557 | { "el_GR", "CP869" }, |
558 | { "en", "CP850" }, |
559 | { "en_AU", "CP850" }, /* not CP437 ?? */ |
560 | { "en_CA", "CP850" }, |
561 | { "en_GB", "CP850" }, |
562 | { "en_NZ", "CP437" }, |
563 | { "en_US", "CP437" }, |
564 | { "en_ZA", "CP850" }, /* not CP437 ?? */ |
565 | { "eo", "CP850" }, |
566 | { "eo_EO", "CP850" }, |
567 | { "es", "CP850" }, |
568 | { "es_AR", "CP850" }, |
569 | { "es_BO", "CP850" }, |
570 | { "es_CL", "CP850" }, |
571 | { "es_CO", "CP850" }, |
572 | { "es_CR", "CP850" }, |
573 | { "es_CU", "CP850" }, |
574 | { "es_DO", "CP850" }, |
575 | { "es_EC", "CP850" }, |
576 | { "es_ES", "CP850" }, |
577 | { "es_GT", "CP850" }, |
578 | { "es_HN", "CP850" }, |
579 | { "es_MX", "CP850" }, |
580 | { "es_NI", "CP850" }, |
581 | { "es_PA", "CP850" }, |
582 | { "es_PE", "CP850" }, |
583 | { "es_PY", "CP850" }, |
584 | { "es_SV", "CP850" }, |
585 | { "es_UY", "CP850" }, |
586 | { "es_VE", "CP850" }, |
587 | { "et", "CP850" }, |
588 | { "et_EE", "CP850" }, |
589 | { "eu", "CP850" }, |
590 | { "eu_ES", "CP850" }, |
591 | { "fi", "CP850" }, |
592 | { "fi_FI", "CP850" }, |
593 | { "fr", "CP850" }, |
594 | { "fr_BE", "CP850" }, |
595 | { "fr_CA", "CP850" }, |
596 | { "fr_CH", "CP850" }, |
597 | { "fr_FR", "CP850" }, |
598 | { "ga", "CP850" }, |
599 | { "ga_IE", "CP850" }, |
600 | { "gd", "CP850" }, |
601 | { "gd_GB", "CP850" }, |
602 | { "gl", "CP850" }, |
603 | { "gl_ES", "CP850" }, |
604 | { "he", "CP862" }, |
605 | { "he_IL", "CP862" }, |
606 | { "hr", "CP852" }, |
607 | { "hr_HR", "CP852" }, |
608 | { "hu", "CP852" }, |
609 | { "hu_HU", "CP852" }, |
610 | { "id", "CP850" }, /* not CP437 ?? */ |
611 | { "id_ID", "CP850" }, /* not CP437 ?? */ |
612 | { "is", "CP861" }, /* not CP850 ?? */ |
613 | { "is_IS", "CP861" }, /* not CP850 ?? */ |
614 | { "it", "CP850" }, |
615 | { "it_CH", "CP850" }, |
616 | { "it_IT", "CP850" }, |
617 | { "ja", "CP932" }, |
618 | { "ja_JP", "CP932" }, |
619 | { "kr", "CP949" }, /* not CP934 ?? */ |
620 | { "kr_KR", "CP949" }, /* not CP934 ?? */ |
621 | { "lt", "CP775" }, |
622 | { "lt_LT", "CP775" }, |
623 | { "lv", "CP775" }, |
624 | { "lv_LV", "CP775" }, |
625 | { "mk", "CP866" }, /* not CP855 ?? */ |
626 | { "mk_MK", "CP866" }, /* not CP855 ?? */ |
627 | { "mt", "CP850" }, |
628 | { "mt_MT", "CP850" }, |
629 | { "nb", "CP865" }, /* not CP850 ?? */ |
630 | { "nb_NO", "CP865" }, /* not CP850 ?? */ |
631 | { "nl", "CP850" }, |
632 | { "nl_BE", "CP850" }, |
633 | { "nl_NL", "CP850" }, |
634 | { "nn", "CP865" }, /* not CP850 ?? */ |
635 | { "nn_NO", "CP865" }, /* not CP850 ?? */ |
636 | { "no", "CP865" }, /* not CP850 ?? */ |
637 | { "no_NO", "CP865" }, /* not CP850 ?? */ |
638 | { "pl", "CP852" }, |
639 | { "pl_PL", "CP852" }, |
640 | { "pt", "CP850" }, |
641 | { "pt_BR", "CP850" }, |
642 | { "pt_PT", "CP850" }, |
643 | { "ro", "CP852" }, |
644 | { "ro_RO", "CP852" }, |
645 | { "ru", "CP866" }, |
646 | { "ru_RU", "CP866" }, |
647 | { "sk", "CP852" }, |
648 | { "sk_SK", "CP852" }, |
649 | { "sl", "CP852" }, |
650 | { "sl_SI", "CP852" }, |
651 | { "sq", "CP852" }, |
652 | { "sq_AL", "CP852" }, |
653 | { "sr", "CP852" }, /* CP852 or CP866 or CP855 ?? */ |
654 | { "sr_CS", "CP852" }, /* CP852 or CP866 or CP855 ?? */ |
655 | { "sr_YU", "CP852" }, /* CP852 or CP866 or CP855 ?? */ |
656 | { "sv", "CP850" }, |
657 | { "sv_SE", "CP850" }, |
658 | { "th", "CP874" }, |
659 | { "th_TH", "CP874" }, |
660 | { "tr", "CP857" }, |
661 | { "tr_TR", "CP857" }, |
662 | { "uk", "CP1125" }, |
663 | { "uk_UA", "CP1125" }, |
664 | { "zh_CN", "GBK" }, |
665 | { "zh_TW", "CP950" } /* not CP938 ?? */ |
666 | # define locale_table_defined |
8690e634 | 667 | # endif |
7a6dbc2f SDJ |
668 | # ifndef locale_table_defined |
669 | /* Just a dummy entry, to avoid a C syntax error. */ |
670 | { "", "" } |
4a626d0a | 671 | # endif |
7a6dbc2f | 672 | }; |
8690e634 | 673 | |
7a6dbc2f | 674 | #endif |
8690e634 | 675 | |
8690e634 JK |
676 | |
677 | /* Determine the current locale's character encoding, and canonicalize it | |
7a6dbc2f | 678 | into one of the canonical names listed in localcharset.h. |
8690e634 JK |
679 | The result must not be freed; it is statically allocated. |
680 | If the canonical name cannot be determined, the result is a non-canonical | |
681 | name. */ | |
682 | ||
683 | #ifdef STATIC | |
684 | STATIC | |
685 | #endif | |
686 | const char * | |
687 | locale_charset (void) | |
688 | { | |
689 | const char *codeset; | |
8690e634 | 690 | |
7a6dbc2f | 691 | #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2 |
8690e634 JK |
692 | |
693 | # if HAVE_LANGINFO_CODESET | |
694 | ||
695 | /* Most systems support nl_langinfo (CODESET) nowadays. */ | |
696 | codeset = nl_langinfo (CODESET); | |
697 | ||
698 | # ifdef __CYGWIN__ | |
699 | /* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always | |
700 | returns "US-ASCII". Return the suffix of the locale name from the | |
701 | environment variables (if present) or the codepage as a number. */ | |
702 | if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0) | |
703 | { | |
704 | const char *locale; | |
705 | static char buf[2 + 10 + 1]; | |
706 | ||
707 | locale = getenv ("LC_ALL"); | |
708 | if (locale == NULL || locale[0] == '\0') | |
709 | { | |
710 | locale = getenv ("LC_CTYPE"); | |
711 | if (locale == NULL || locale[0] == '\0') | |
712 | locale = getenv ("LANG"); | |
713 | } | |
714 | if (locale != NULL && locale[0] != '\0') | |
715 | { | |
716 | /* If the locale name contains an encoding after the dot, return | |
717 | it. */ | |
718 | const char *dot = strchr (locale, '.'); | |
719 | ||
720 | if (dot != NULL) | |
721 | { | |
722 | const char *modifier; | |
723 | ||
724 | dot++; | |
725 | /* Look for the possible @... trailer and remove it, if any. */ | |
726 | modifier = strchr (dot, '@'); | |
727 | if (modifier == NULL) | |
728 | return dot; | |
729 | if (modifier - dot < sizeof (buf)) | |
730 | { | |
731 | memcpy (buf, dot, modifier - dot); | |
732 | buf [modifier - dot] = '\0'; | |
733 | return buf; | |
734 | } | |
735 | } | |
736 | } | |
737 | ||
738 | /* The Windows API has a function returning the locale's codepage as a | |
739 | number: GetACP(). This encoding is used by Cygwin, unless the user | |
740 | has set the environment variable CYGWIN=codepage:oem (which very few | |
741 | people do). | |
742 | Output directed to console windows needs to be converted (to | |
743 | GetOEMCP() if the console is using a raster font, or to | |
744 | GetConsoleOutputCP() if it is using a TrueType font). Cygwin does | |
745 | this conversion transparently (see winsup/cygwin/fhandler_console.cc), | |
746 | converting to GetConsoleOutputCP(). This leads to correct results, | |
747 | except when SetConsoleOutputCP has been called and a raster font is | |
748 | in use. */ | |
749 | sprintf (buf, "CP%u", GetACP ()); | |
750 | codeset = buf; | |
751 | } | |
752 | # endif | |
753 | ||
7a6dbc2f SDJ |
754 | if (codeset == NULL) |
755 | /* The canonical name cannot be determined. */ | |
756 | codeset = ""; | |
8690e634 | 757 | |
7a6dbc2f | 758 | # elif defined WINDOWS_NATIVE |
8690e634 JK |
759 | |
760 | static char buf[2 + 10 + 1]; | |
761 | ||
4a626d0a PA |
762 | /* The Windows API has a function returning the locale's codepage as |
763 | a number, but the value doesn't change according to what the | |
764 | 'setlocale' call specified. So we use it as a last resort, in | |
765 | case the string returned by 'setlocale' doesn't specify the | |
766 | codepage. */ | |
767 | char *current_locale = setlocale (LC_ALL, NULL); | |
768 | char *pdot; | |
769 | ||
770 | /* If they set different locales for different categories, | |
771 | 'setlocale' will return a semi-colon separated list of locale | |
772 | values. To make sure we use the correct one, we choose LC_CTYPE. */ | |
773 | if (strchr (current_locale, ';')) | |
774 | current_locale = setlocale (LC_CTYPE, NULL); | |
775 | ||
776 | pdot = strrchr (current_locale, '.'); | |
7a6dbc2f | 777 | if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf)) |
4a626d0a PA |
778 | sprintf (buf, "CP%s", pdot + 1); |
779 | else | |
780 | { | |
781 | /* The Windows API has a function returning the locale's codepage as a | |
782 | number: GetACP(). | |
783 | When the output goes to a console window, it needs to be provided in | |
784 | GetOEMCP() encoding if the console is using a raster font, or in | |
785 | GetConsoleOutputCP() encoding if it is using a TrueType font. | |
786 | But in GUI programs and for output sent to files and pipes, GetACP() | |
787 | encoding is the best bet. */ | |
788 | sprintf (buf, "CP%u", GetACP ()); | |
789 | } | |
8690e634 JK |
790 | codeset = buf; |
791 | ||
7a6dbc2f | 792 | # elif defined OS2 |
8690e634 JK |
793 | |
794 | const char *locale; | |
795 | static char buf[2 + 10 + 1]; | |
796 | ULONG cp[3]; | |
797 | ULONG cplen; | |
798 | ||
4a626d0a PA |
799 | codeset = NULL; |
800 | ||
8690e634 JK |
801 | /* Allow user to override the codeset, as set in the operating system, |
802 | with standard language environment variables. */ | |
803 | locale = getenv ("LC_ALL"); | |
804 | if (locale == NULL || locale[0] == '\0') | |
805 | { | |
806 | locale = getenv ("LC_CTYPE"); | |
807 | if (locale == NULL || locale[0] == '\0') | |
808 | locale = getenv ("LANG"); | |
809 | } | |
810 | if (locale != NULL && locale[0] != '\0') | |
811 | { | |
812 | /* If the locale name contains an encoding after the dot, return it. */ | |
813 | const char *dot = strchr (locale, '.'); | |
814 | ||
815 | if (dot != NULL) | |
816 | { | |
817 | const char *modifier; | |
818 | ||
819 | dot++; | |
820 | /* Look for the possible @... trailer and remove it, if any. */ | |
821 | modifier = strchr (dot, '@'); | |
822 | if (modifier == NULL) | |
823 | return dot; | |
824 | if (modifier - dot < sizeof (buf)) | |
825 | { | |
826 | memcpy (buf, dot, modifier - dot); | |
827 | buf [modifier - dot] = '\0'; | |
828 | return buf; | |
829 | } | |
830 | } | |
831 | ||
4a626d0a PA |
832 | /* For the POSIX locale, don't use the system's codepage. */ |
833 | if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0) | |
834 | codeset = ""; | |
8690e634 | 835 | } |
4a626d0a PA |
836 | |
837 | if (codeset == NULL) | |
8690e634 JK |
838 | { |
839 | /* OS/2 has a function returning the locale's codepage as a number. */ | |
840 | if (DosQueryCp (sizeof (cp), cp, &cplen)) | |
841 | codeset = ""; | |
842 | else | |
843 | { | |
844 | sprintf (buf, "CP%u", cp[0]); | |
845 | codeset = buf; | |
846 | } | |
847 | } | |
848 | ||
7a6dbc2f | 849 | # else |
8690e634 | 850 | |
7a6dbc2f | 851 | # error "Add code for other platforms here." |
8690e634 | 852 | |
7a6dbc2f SDJ |
853 | # endif |
854 | ||
855 | /* Resolve alias. */ | |
856 | { | |
857 | # ifdef alias_table_defined | |
858 | /* On some platforms, UTF-8 locales are the most frequently used ones. | |
859 | Speed up the common case and slow down the less common cases by | |
860 | testing for this case first. */ | |
861 | # if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__ | |
862 | if (strcmp (codeset, "UTF-8") == 0) | |
863 | goto done_table_lookup; | |
864 | else | |
865 | # endif | |
8690e634 | 866 | { |
7a6dbc2f SDJ |
867 | const struct table_entry * const table = alias_table; |
868 | size_t const table_size = | |
869 | sizeof (alias_table) / sizeof (struct table_entry); | |
870 | /* The table is sorted. Perform a binary search. */ | |
871 | size_t hi = table_size; | |
872 | size_t lo = 0; | |
873 | while (lo < hi) | |
874 | { | |
875 | /* Invariant: | |
876 | for i < lo, strcmp (table[i].alias, codeset) < 0, | |
877 | for i >= hi, strcmp (table[i].alias, codeset) > 0. */ | |
878 | size_t mid = (hi + lo) >> 1; /* >= lo, < hi */ | |
879 | int cmp = strcmp (table[mid].alias, codeset); | |
880 | if (cmp < 0) | |
881 | lo = mid + 1; | |
882 | else if (cmp > 0) | |
883 | hi = mid; | |
884 | else | |
885 | { | |
886 | /* Found an i with | |
887 | strcmp (table[i].alias, codeset) == 0. */ | |
888 | codeset = table[mid].canonical; | |
889 | goto done_table_lookup; | |
890 | } | |
891 | } | |
8690e634 | 892 | } |
7a6dbc2f SDJ |
893 | if (0) |
894 | done_table_lookup: ; | |
895 | else | |
896 | # endif | |
897 | { | |
898 | /* Did not find it in the table. */ | |
899 | /* On Mac OS X, all modern locales use the UTF-8 encoding. | |
900 | BeOS and Haiku have a single locale, and it has UTF-8 encoding. */ | |
901 | # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__ | |
902 | codeset = "UTF-8"; | |
903 | # else | |
904 | /* Don't return an empty string. GNU libc and GNU libiconv interpret | |
905 | the empty string as denoting "the locale's character encoding", | |
906 | thus GNU libiconv would call this function a second time. */ | |
907 | if (codeset[0] == '\0') | |
908 | codeset = "ASCII"; | |
909 | # endif | |
910 | } | |
911 | } | |
8690e634 | 912 | |
7a6dbc2f SDJ |
913 | #else |
914 | ||
915 | /* On old systems which lack it, use setlocale or getenv. */ | |
916 | const char *locale = NULL; | |
917 | ||
918 | /* But most old systems don't have a complete set of locales. Some | |
919 | (like DJGPP) have only the C locale. Therefore we don't use setlocale | |
920 | here; it would return "C" when it doesn't support the locale name the | |
921 | user has set. */ | |
922 | # if 0 | |
923 | locale = setlocale (LC_CTYPE, NULL); | |
924 | # endif | |
925 | if (locale == NULL || locale[0] == '\0') | |
926 | { | |
927 | locale = getenv ("LC_ALL"); | |
928 | if (locale == NULL || locale[0] == '\0') | |
929 | { | |
930 | locale = getenv ("LC_CTYPE"); | |
931 | if (locale == NULL || locale[0] == '\0') | |
932 | locale = getenv ("LANG"); | |
933 | if (locale == NULL) | |
934 | locale = ""; | |
935 | } | |
936 | } | |
937 | ||
938 | /* Map locale name to canonical encoding name. */ | |
939 | { | |
940 | # ifdef locale_table_defined | |
941 | const struct table_entry * const table = locale_table; | |
942 | size_t const table_size = | |
943 | sizeof (locale_table) / sizeof (struct table_entry); | |
944 | /* The table is sorted. Perform a binary search. */ | |
945 | size_t hi = table_size; | |
946 | size_t lo = 0; | |
947 | while (lo < hi) | |
948 | { | |
949 | /* Invariant: | |
950 | for i < lo, strcmp (table[i].locale, locale) < 0, | |
951 | for i >= hi, strcmp (table[i].locale, locale) > 0. */ | |
952 | size_t mid = (hi + lo) >> 1; /* >= lo, < hi */ | |
953 | int cmp = strcmp (table[mid].locale, locale); | |
954 | if (cmp < 0) | |
955 | lo = mid + 1; | |
956 | else if (cmp > 0) | |
957 | hi = mid; | |
958 | else | |
959 | { | |
960 | /* Found an i with | |
961 | strcmp (table[i].locale, locale) == 0. */ | |
962 | codeset = table[mid].canonical; | |
963 | goto done_table_lookup; | |
964 | } | |
965 | } | |
966 | if (0) | |
967 | done_table_lookup: ; | |
968 | else | |
969 | # endif | |
970 | { | |
971 | /* Did not find it in the table. */ | |
972 | /* On Mac OS X, all modern locales use the UTF-8 encoding. | |
973 | BeOS and Haiku have a single locale, and it has UTF-8 encoding. */ | |
974 | # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__ | |
975 | codeset = "UTF-8"; | |
976 | # else | |
977 | /* The canonical name cannot be determined. */ | |
978 | /* Don't return an empty string. GNU libc and GNU libiconv interpret | |
979 | the empty string as denoting "the locale's character encoding", | |
980 | thus GNU libiconv would call this function a second time. */ | |
981 | codeset = "ASCII"; | |
982 | # endif | |
983 | } | |
984 | } | |
985 | ||
986 | #endif | |
8690e634 | 987 | |
a512b375 JB |
988 | #ifdef DARWIN7 |
989 | /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8" | |
990 | (the default codeset) does not work when MB_CUR_MAX is 1. */ | |
4a626d0a | 991 | if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1) |
a512b375 JB |
992 | codeset = "ASCII"; |
993 | #endif | |
994 | ||
8690e634 JK |
995 | return codeset; |
996 | } |