1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Determine a canonical name for the current locale's character encoding.
5 Copyright (C) 2000-2006, 2008-2010 Free Software Foundation, Inc.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation,
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 /* Written by Bruno Haible <bruno@clisp.org>. */
26 #include "localcharset.h"
34 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
35 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
38 #if defined _WIN32 || defined __WIN32__
43 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
49 #if !defined WIN32_NATIVE
51 # if HAVE_LANGINFO_CODESET
52 # include <langinfo.h>
54 # if 0 /* see comment below */
59 # define WIN32_LEAN_AND_MEAN
62 #elif defined WIN32_NATIVE
63 # define WIN32_LEAN_AND_MEAN
71 #if ENABLE_RELOCATABLE
72 # include "relocatable.h"
74 # define relocate(pathname) (pathname)
79 # include "configmake.h"
82 /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
87 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
88 /* Win32, Cygwin, OS/2, DOS */
89 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
92 #ifndef DIRECTORY_SEPARATOR
93 # define DIRECTORY_SEPARATOR '/'
97 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
100 #if HAVE_DECL_GETC_UNLOCKED
102 # define getc getc_unlocked
105 /* The following static variable is declared 'volatile' to avoid a
106 possible multithread problem in the function get_charset_aliases. If we
107 are running in a threaded environment, and if two threads initialize
108 'charset_aliases' simultaneously, both will produce the same value,
109 and everything will be ok if the two assignments to 'charset_aliases'
110 are atomic. But I don't know what will happen if the two assignments mix. */
112 # define volatile /* empty */
114 /* Pointer to the contents of the charset.alias file, if it has already been
115 read, else NULL. Its format is:
116 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
117 static const char * volatile charset_aliases;
119 /* Return a pointer to the contents of the charset.alias file. */
121 get_charset_aliases (void)
125 cp = charset_aliases;
128 #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
130 const char *base = "charset.alias";
133 /* Make it possible to override the charset.alias location. This is
134 necessary for running the testsuite before "make install". */
135 dir = getenv ("CHARSETALIASDIR");
136 if (dir == NULL || dir[0] == '\0')
137 dir = relocate (LIBDIR);
139 /* Concatenate dir and base into freshly allocated file_name. */
141 size_t dir_len = strlen (dir);
142 size_t base_len = strlen (base);
143 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
144 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
145 if (file_name != NULL)
147 memcpy (file_name, dir, dir_len);
149 file_name[dir_len] = DIRECTORY_SEPARATOR;
150 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
154 if (file_name == NULL)
155 /* Out of memory. Treat the file as empty. */
161 /* Open the file. Reject symbolic links on platforms that support
162 O_NOFOLLOW. This is a security feature. Without it, an attacker
163 could retrieve parts of the contents (namely, the tail of the
164 first line that starts with "* ") of an arbitrary file by placing
165 a symbolic link to that file under the name "charset.alias" in
166 some writable directory and defining the environment variable
167 CHARSETALIASDIR to point to that directory. */
168 fd = open (file_name,
169 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
171 /* File not found. Treat it as empty. */
177 fp = fdopen (fd, "r");
180 /* Out of memory. Treat the file as empty. */
186 /* Parse the file's contents. */
187 char *res_ptr = NULL;
201 if (c == '\n' || c == ' ' || c == '\t')
205 /* Skip comment, to end of line. */
208 while (!(c == EOF || c == '\n'));
214 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
218 old_res_ptr = res_ptr;
221 res_size = l1 + 1 + l2 + 1;
222 res_ptr = (char *) malloc (res_size + 1);
226 res_size += l1 + 1 + l2 + 1;
227 res_ptr = (char *) realloc (res_ptr, res_size + 1);
233 if (old_res_ptr != NULL)
237 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
238 strcpy (res_ptr + res_size - (l2 + 1), buf2);
245 *(res_ptr + res_size) = '\0';
257 /* To avoid the trouble of installing a file that is shared by many
258 GNU packages -- many packaging systems have problems with this --,
259 simply inline the aliases here. */
260 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
261 "ISO8859-2" "\0" "ISO-8859-2" "\0"
262 "ISO8859-4" "\0" "ISO-8859-4" "\0"
263 "ISO8859-5" "\0" "ISO-8859-5" "\0"
264 "ISO8859-7" "\0" "ISO-8859-7" "\0"
265 "ISO8859-9" "\0" "ISO-8859-9" "\0"
266 "ISO8859-13" "\0" "ISO-8859-13" "\0"
267 "ISO8859-15" "\0" "ISO-8859-15" "\0"
268 "KOI8-R" "\0" "KOI8-R" "\0"
269 "KOI8-U" "\0" "KOI8-U" "\0"
270 "CP866" "\0" "CP866" "\0"
271 "CP949" "\0" "CP949" "\0"
272 "CP1131" "\0" "CP1131" "\0"
273 "CP1251" "\0" "CP1251" "\0"
274 "eucCN" "\0" "GB2312" "\0"
275 "GB2312" "\0" "GB2312" "\0"
276 "eucJP" "\0" "EUC-JP" "\0"
277 "eucKR" "\0" "EUC-KR" "\0"
278 "Big5" "\0" "BIG5" "\0"
279 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
280 "GBK" "\0" "GBK" "\0"
281 "GB18030" "\0" "GB18030" "\0"
282 "SJIS" "\0" "SHIFT_JIS" "\0"
283 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
284 "PT154" "\0" "PT154" "\0"
285 /*"ISCII-DEV" "\0" "?" "\0"*/
286 "*" "\0" "UTF-8" "\0";
290 /* To avoid the troubles of an extra file charset.alias_vms in the
291 sources of many GNU packages, simply inline the aliases here. */
292 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
293 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
294 section 10.7 "Handling Different Character Sets". */
295 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
296 "ISO8859-2" "\0" "ISO-8859-2" "\0"
297 "ISO8859-5" "\0" "ISO-8859-5" "\0"
298 "ISO8859-7" "\0" "ISO-8859-7" "\0"
299 "ISO8859-8" "\0" "ISO-8859-8" "\0"
300 "ISO8859-9" "\0" "ISO-8859-9" "\0"
302 "eucJP" "\0" "EUC-JP" "\0"
303 "SJIS" "\0" "SHIFT_JIS" "\0"
304 "DECKANJI" "\0" "DEC-KANJI" "\0"
305 "SDECKANJI" "\0" "EUC-JP" "\0"
307 "eucTW" "\0" "EUC-TW" "\0"
308 "DECHANYU" "\0" "DEC-HANYU" "\0"
309 "DECHANZI" "\0" "GB2312" "\0"
311 "DECKOREAN" "\0" "EUC-KR" "\0";
314 # if defined WIN32_NATIVE || defined __CYGWIN__
315 /* To avoid the troubles of installing a separate file in the same
316 directory as the DLL and of retrieving the DLL's directory at
317 runtime, simply inline the aliases here. */
319 cp = "CP936" "\0" "GBK" "\0"
320 "CP1361" "\0" "JOHAB" "\0"
321 "CP20127" "\0" "ASCII" "\0"
322 "CP20866" "\0" "KOI8-R" "\0"
323 "CP20936" "\0" "GB2312" "\0"
324 "CP21866" "\0" "KOI8-RU" "\0"
325 "CP28591" "\0" "ISO-8859-1" "\0"
326 "CP28592" "\0" "ISO-8859-2" "\0"
327 "CP28593" "\0" "ISO-8859-3" "\0"
328 "CP28594" "\0" "ISO-8859-4" "\0"
329 "CP28595" "\0" "ISO-8859-5" "\0"
330 "CP28596" "\0" "ISO-8859-6" "\0"
331 "CP28597" "\0" "ISO-8859-7" "\0"
332 "CP28598" "\0" "ISO-8859-8" "\0"
333 "CP28599" "\0" "ISO-8859-9" "\0"
334 "CP28605" "\0" "ISO-8859-15" "\0"
335 "CP38598" "\0" "ISO-8859-8" "\0"
336 "CP51932" "\0" "EUC-JP" "\0"
337 "CP51936" "\0" "GB2312" "\0"
338 "CP51949" "\0" "EUC-KR" "\0"
339 "CP51950" "\0" "EUC-TW" "\0"
340 "CP54936" "\0" "GB18030" "\0"
341 "CP65001" "\0" "UTF-8" "\0";
345 charset_aliases = cp;
351 /* Determine the current locale's character encoding, and canonicalize it
352 into one of the canonical names listed in config.charset.
353 The result must not be freed; it is statically allocated.
354 If the canonical name cannot be determined, the result is a non-canonical name. */
361 locale_charset (void)
366 #if !(defined WIN32_NATIVE || defined OS2)
368 # if HAVE_LANGINFO_CODESET
370 /* Most systems support nl_langinfo (CODESET) nowadays. */
371 codeset = nl_langinfo (CODESET);
374 /* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always
375 returns "US-ASCII". Return the suffix of the locale name from the
376 environment variables (if present) or the codepage as a number. */
377 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
380 static char buf[2 + 10 + 1];
382 locale = getenv ("LC_ALL");
383 if (locale == NULL || locale[0] == '\0')
385 locale = getenv ("LC_CTYPE");
386 if (locale == NULL || locale[0] == '\0')
387 locale = getenv ("LANG");
389 if (locale != NULL && locale[0] != '\0')
391 /* If the locale name contains an encoding after the dot, return it. */
393 const char *dot = strchr (locale, '.');
397 const char *modifier;
400 /* Look for the possible @... trailer and remove it, if any. */
401 modifier = strchr (dot, '@');
402 if (modifier == NULL)
404 if (modifier - dot < sizeof (buf))
406 memcpy (buf, dot, modifier - dot);
407 buf [modifier - dot] = '\0';
413 /* Woe32 has a function returning the locale's codepage as a number:
414 GetACP(). This encoding is used by Cygwin, unless the user has set
415 the environment variable CYGWIN=codepage:oem (which very few people do).
417 Output directed to console windows needs to be converted (to
418 GetOEMCP() if the console is using a raster font, or to
419 GetConsoleOutputCP() if it is using a TrueType font). Cygwin does
420 this conversion transparently (see winsup/cygwin/fhandler_console.cc),
421 converting to GetConsoleOutputCP(). This leads to correct results,
422 except when SetConsoleOutputCP has been called and a raster font is in use. */
424 sprintf (buf, "CP%u", GetACP ());
431 /* On old systems which lack it, use setlocale or getenv. */
432 const char *locale = NULL;
434 /* But most old systems don't have a complete set of locales. Some
435 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
436 use setlocale here; it would return "C" when it doesn't support the
437 locale name the user has set. */
439 locale = setlocale (LC_CTYPE, NULL);
441 if (locale == NULL || locale[0] == '\0')
443 locale = getenv ("LC_ALL");
444 if (locale == NULL || locale[0] == '\0')
446 locale = getenv ("LC_CTYPE");
447 if (locale == NULL || locale[0] == '\0')
448 locale = getenv ("LANG");
452 /* On some old systems, one used to set locale = "iso8859_1". On others,
453 you set it to "language_COUNTRY.charset". In any case, we resolve it
454 through the charset.alias file. */
459 #elif defined WIN32_NATIVE
461 static char buf[2 + 10 + 1];
463 /* Woe32 has a function returning the locale's codepage as a number: GetACP().
465 When the output goes to a console window, it needs to be provided in
466 GetOEMCP() encoding if the console is using a raster font, or in
467 GetConsoleOutputCP() encoding if it is using a TrueType font.
468 But in GUI programs and for output sent to files and pipes, GetACP()
469 encoding is the best bet. */
470 sprintf (buf, "CP%u", GetACP ());
476 static char buf[2 + 10 + 1];
480 /* Allow user to override the codeset, as set in the operating system,
481 with standard language environment variables. */
482 locale = getenv ("LC_ALL");
483 if (locale == NULL || locale[0] == '\0')
485 locale = getenv ("LC_CTYPE");
486 if (locale == NULL || locale[0] == '\0')
487 locale = getenv ("LANG");
489 if (locale != NULL && locale[0] != '\0')
491 /* If the locale name contains an encoding after the dot, return it. */
492 const char *dot = strchr (locale, '.');
496 const char *modifier;
499 /* Look for the possible @... trailer and remove it, if any. */
500 modifier = strchr (dot, '@');
501 if (modifier == NULL)
503 if (modifier - dot < sizeof (buf))
505 memcpy (buf, dot, modifier - dot);
506 buf [modifier - dot] = '\0';
511 /* Resolve through the charset.alias file. */
516 /* OS/2 has a function returning the locale's codepage as a number. */
517 if (DosQueryCp (sizeof (cp), cp, &cplen))
521 sprintf (buf, "CP%u", cp[0]);
529 /* The canonical name cannot be determined. */
533 for (aliases = get_charset_aliases ();
535 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
536 if (strcmp (codeset, aliases) == 0
537 || (aliases[0] == '*' && aliases[1] == '\0'))
539 codeset = aliases + strlen (aliases) + 1;
543 /* Don't return an empty string. GNU libc and GNU libiconv interpret
544 the empty string as denoting "the locale's character encoding",
545 thus GNU libiconv would call this function a second time. */
546 if (codeset[0] == '\0')