(Merge 3.2) Issue #16455: On FreeBSD and Solaris, if the locale is C, the

author Victor Stinner <victor.stinner@gmail.com>

Thu, 3 Jan 2013 00:21:07 +0000 (01:21 +0100)

committer Victor Stinner <victor.stinner@gmail.com>

Thu, 3 Jan 2013 00:21:07 +0000 (01:21 +0100)
author Victor Stinner <victor.stinner@gmail.com>
Thu, 3 Jan 2013 00:21:07 +0000 (01:21 +0100)
committer Victor Stinner <victor.stinner@gmail.com>
Thu, 3 Jan 2013 00:21:07 +0000 (01:21 +0100)
diff --cc Misc/NEWS
Simple merge
diff --cc Objects/unicodeobject.c

index 0b9d65291f70245b25d8fded825609620a478401,f8c738bc75f898eff571f138b1c6be78ee625ca6..3b307ed1d9b12e3425515e81a44607572828c827
--- 1/Objects/unicodeobject.c
--- 2/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@@ -3089,16 -1473,17 +3089,16 @@@ PyUnicode_Decode(const char *s
       Py_buffer info;
       char lower[11];  /* Enough for any encoding shortcut */
   
- -    if (encoding == NULL)
- -        encoding = PyUnicode_GetDefaultEncoding();
- -
       /* Shortcuts for common default encodings */
-     if (normalize_encoding(encoding, lower, sizeof(lower))) {
+     if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
- -        if (strcmp(lower, "utf-8") == 0)
- -            return PyUnicode_DecodeUTF8(s, size, errors);
+ +        if ((strcmp(lower, "utf-8") == 0) ||
+ +            (strcmp(lower, "utf8") == 0))
+ +            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
           else if ((strcmp(lower, "latin-1") == 0) ||
+ +                 (strcmp(lower, "latin1") == 0) ||
                    (strcmp(lower, "iso-8859-1") == 0))
               return PyUnicode_DecodeLatin1(s, size, errors);
- -#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
+ +#ifdef HAVE_MBCS
           else if (strcmp(lower, "mbcs") == 0)
               return PyUnicode_DecodeMBCS(s, size, errors);
   #endif
@@@ -3454,27 -1691,46 +3454,27 @@@ PyUnicode_AsEncodedString(PyObject *uni
           return NULL;
       }
   
- -    if (encoding == NULL)
- -        encoding = PyUnicode_GetDefaultEncoding();
- -
       /* Shortcuts for common default encodings */
-     if (normalize_encoding(encoding, lower, sizeof(lower))) {
+     if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
- -        if (strcmp(lower, "utf-8") == 0)
- -            return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
- -                                        PyUnicode_GET_SIZE(unicode),
- -                                        errors);
+ +        if ((strcmp(lower, "utf-8") == 0) ||
+ +            (strcmp(lower, "utf8") == 0))
+ +        {
+ +            if (errors == NULL || strcmp(errors, "strict") == 0)
+ +                return _PyUnicode_AsUTF8String(unicode, NULL);
+ +            else
+ +                return _PyUnicode_AsUTF8String(unicode, errors);
+ +        }
           else if ((strcmp(lower, "latin-1") == 0) ||
+ +                 (strcmp(lower, "latin1") == 0) ||
                    (strcmp(lower, "iso-8859-1") == 0))
- -            return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
- -                                          PyUnicode_GET_SIZE(unicode),
- -                                          errors);
- -#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
+ +            return _PyUnicode_AsLatin1String(unicode, errors);
+ +#ifdef HAVE_MBCS
           else if (strcmp(lower, "mbcs") == 0)
- -            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
- -                                        PyUnicode_GET_SIZE(unicode),
- -                                        errors);
+ +            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
   #endif
           else if (strcmp(lower, "ascii") == 0)
- -            return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
- -                                         PyUnicode_GET_SIZE(unicode),
- -                                         errors);
- -    }
- -    /* During bootstrap, we may need to find the encodings
- -       package, to load the file system encoding, and require the
- -       file system encoding in order to load the encodings
- -       package.
- -
- -       Break out of this dependency by assuming that the path to
- -       the encodings module is ASCII-only.  XXX could try wcstombs
- -       instead, if the file system encoding is the locale's
- -       encoding. */
- -    if (Py_FileSystemDefaultEncoding &&
- -             strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
- -             !PyThreadState_GET()->interp->codecs_initialized)
- -        return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
- -                                     PyUnicode_GET_SIZE(unicode),
- -                                     errors);
+ +            return _PyUnicode_AsASCIIString(unicode, errors);
+ +    }
   
       /* Encode via the codec registry */
       v = PyCodec_Encode(unicode, encoding, errors);
diff --cc Python/fileutils.c

index 2e9ea359da0fcf322a87d14260fa1c431f357b15,53e8a470e952b30194ce85e1448a4c9a2c4813ce..b7c42e8e85b09ecbc586dd86d1e29f6093c72cc1
--- 1/Python/fileutils.c
--- 2/Python/fileutils.c
+++ b/Python/fileutils.c
@@@ -12,37 -12,182 +13,212 @@@
   extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
   #endif
   
- #ifdef HAVE_STAT
+ +PyObject *
+ +_Py_device_encoding(int fd)
+ +{
+ +#if defined(MS_WINDOWS) || defined(MS_WIN64)
+ +    UINT cp;
+ +#endif
+ +    if (!_PyVerify_fd(fd) || !isatty(fd)) {
+ +        Py_RETURN_NONE;
+ +    }
+ +#if defined(MS_WINDOWS) || defined(MS_WIN64)
+ +    if (fd == 0)
+ +        cp = GetConsoleCP();
+ +    else if (fd == 1 || fd == 2)
+ +        cp = GetConsoleOutputCP();
+ +    else
+ +        cp = 0;
+ +    /* GetConsoleCP() and GetConsoleOutputCP() return 0 if the application
+ +       has no console */
+ +    if (cp != 0)
+ +        return PyUnicode_FromFormat("cp%u", (unsigned int)cp);
+ +#elif defined(CODESET)
+ +    {
+ +        char *codeset = nl_langinfo(CODESET);
+ +        if (codeset != NULL && codeset[0] != 0)
+ +            return PyUnicode_FromString(codeset);
+ +    }
+ +#endif
+ +    Py_RETURN_NONE;
+ +}
+ +
+ #if !defined(__APPLE__) && !defined(MS_WINDOWS)
+ extern int _Py_normalize_encoding(const char *, char *, size_t);
+ 
+ /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
+    On these operating systems, nl_langinfo(CODESET) announces an alias of the
+    ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
+    ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
+    locale.getpreferredencoding() codec. For example, if command line arguments
+    are decoded by mbstowcs() and encoded back by os.fsencode(), we get a
+    UnicodeEncodeError instead of retrieving the original byte string.
+ 
+    The workaround is enabled if setlocale(LC_CTYPE, NULL) returns "C",
+    nl_langinfo(CODESET) announces "ascii" (or an alias to ASCII), and at least
+    one byte in range 0x80-0xff can be decoded from the locale encoding. The
+    workaround is also enabled on error, for example if getting the locale
+    failed.
+ 
+    Values of force_ascii:
+ 
+        1: the workaround is used: _Py_wchar2char() uses
+           encode_ascii_surrogateescape() and _Py_char2wchar() uses
+           decode_ascii_surrogateescape()
+        0: the workaround is not used: _Py_wchar2char() uses wcstombs() and
+           _Py_char2wchar() uses mbstowcs()
+       -1: unknown, need to call check_force_ascii() to get the value
+ */
+ static int force_ascii = -1;
+ 
+ /* Decide whether the ASCII/surrogateescape workaround must be used.
+ 
+    Return 1 if the ASCII encoding must be forced: the LC_CTYPE locale is "C",
+    nl_langinfo(CODESET) announces ASCII (or one of its aliases), but mbstowcs()
+    is able to decode at least one byte in the range 0x80-0xff. 1 is also
+    returned on error (the locale or the codeset cannot be retrieved or
+    normalized) and when nl_langinfo(CODESET) is not available.
+ 
+    Return 0 otherwise: the LC_CTYPE locale is not "C", the announced codeset
+    is not an alias of ASCII, or the locale encoding really is ASCII. */
+ static int
+ check_force_ascii(void)
+ {
+     char *loc;
+ #if defined(HAVE_LANGINFO_H) && defined(CODESET)
+     char *codeset, **alias;
+     char encoding[100];
+     int is_ascii;
+     unsigned int i;
+     /* Names are compared against the output of _Py_normalize_encoding() */
+     char* ascii_aliases[] = {
+         "ascii",
+         "646",
+         "ansi-x3.4-1968",
+         "ansi-x3-4-1968",
+         "ansi-x3.4-1986",
+         "cp367",
+         "csascii",
+         "ibm367",
+         "iso646-us",
+         "iso-646.irv-1991",
+         "iso-ir-6",
+         "us",
+         "us-ascii",
+         NULL
+     };
+ #endif
+ 
+     loc = setlocale(LC_CTYPE, NULL);
+     if (loc == NULL)
+         goto error;
+     if (strcmp(loc, "C") != 0) {
+         /* the LC_CTYPE locale is different than C */
+         return 0;
+     }
+ 
+ #if defined(HAVE_LANGINFO_H) && defined(CODESET)
+     codeset = nl_langinfo(CODESET);
+     if (!codeset || codeset[0] == '\0') {
+         /* CODESET is not set or empty */
+         goto error;
+     }
+     if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding)))
+         goto error;
+ 
+     is_ascii = 0;
+     for (alias=ascii_aliases; *alias != NULL; alias++) {
+         if (strcmp(encoding, *alias) == 0) {
+             is_ascii = 1;
+             break;
+         }
+     }
+     if (!is_ascii) {
+         /* nl_langinfo(CODESET) is not "ascii" or an alias of ASCII */
+         return 0;
+     }
+ 
+     /* Probe every non-ASCII byte, 0xff included */
+     for (i=0x80; i<=0xff; i++) {
+         /* mbstowcs() expects a NUL-terminated multibyte string */
+         unsigned char buf[2];
+         wchar_t wch;
+         size_t res;
+ 
+         buf[0] = (unsigned char)i;
+         buf[1] = '\0';
+         res = mbstowcs(&wch, (char*)buf, 1);
+         if (res != (size_t)-1) {
+             /* decoding a non-ASCII character from the locale encoding
+                succeeded: the locale encoding is not ASCII, force ASCII */
+             return 1;
+         }
+     }
+     /* None of the bytes in the range 0x80-0xff can be decoded from the locale
+        encoding: the locale encoding is really ASCII */
+     return 0;
+ #else
+     /* nl_langinfo(CODESET) is not available: always force ASCII */
+     return 1;
+ #endif
+ 
+ error:
+     /* if an error occurred, force the ASCII encoding */
+     return 1;
+ }
+ 
+ static char*
+ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos)
+ {
+     char *result = NULL, *out;
+     size_t len, i;
+     wchar_t ch;
+ 
+     if (error_pos != NULL)
+         *error_pos = (size_t)-1;
+ 
+     len = wcslen(text);
+ 
+     result = PyMem_Malloc(len + 1);  /* +1 for NUL byte */
+     if (result == NULL)
+         return NULL;
+ 
+     out = result;
+     for (i=0; i<len; i++) {
+         ch = text[i];
+ 
+         if (ch <= 0x7f) {
+             /* ASCII character */
+             *out++ = (char)ch;
+         }
+         else if (0xdc80 <= ch && ch <= 0xdcff) {
+             /* UTF-8b surrogate */
+             *out++ = (char)(ch - 0xdc00);
+         }
+         else {
+             if (error_pos != NULL)
+                 *error_pos = i;
+             PyMem_Free(result);
+             return NULL;
+         }
+     }
+     *out = '\0';
+     return result;
+ }
+ #endif   /* !defined(__APPLE__) && !defined(MS_WINDOWS) */
+ 
+ #if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC))
+ static wchar_t*
+ decode_ascii_surrogateescape(const char *arg, size_t *size)
+ {
+     wchar_t *res;
+     unsigned char *in;
+     wchar_t *out;
+ 
+     res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
+     if (!res)
+         return NULL;
+ 
+     in = (unsigned char*)arg;
+     out = res;
+     while(*in)
+         if(*in < 128)
+             *out++ = *in++;
+         else
+             *out++ = 0xdc00 + *in++;
+     *out = 0;
+     if (size != NULL)
+         *size = out - res;
+     return res;
+ }
+ #endif
+ 
   
   /* Decode a byte string from the locale encoding with the
      surrogateescape error handler (undecodable bytes are decoded as characters
@@@ -164,24 -323,13 +357,14 @@@ _Py_char2wchar(const char* arg, size_t 
       /* Cannot use C locale for escaping; manually escape as if charset
          is ASCII (i.e. escape all bytes > 128. This will still roundtrip
          correctly in the locale's charset, which must be an ASCII superset. */
-     res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
-     if (!res)
+     res = decode_ascii_surrogateescape(arg, size);
+     if (res == NULL)
           goto oom;
-     in = (unsigned char*)arg;
-     out = res;
-     while(*in)
-         if(*in < 128)
-             *out++ = *in++;
-         else
-             *out++ = 0xdc00 + *in++;
-     *out = 0;
   #endif   /* HAVE_MBRTOWC */
-     if (size != NULL)
-         *size = out - res;
       return res;
   oom:
- -    fprintf(stderr, "out of memory\n");
+ +    if (size != NULL)
+ +        *size = (size_t)-1;
       return NULL;
   #endif   /* __APPLE__ */
   }
author	Victor Stinner <victor.stinner@gmail.com>
	Thu, 3 Jan 2013 00:21:07 +0000 (01:21 +0100)
committer	Victor Stinner <victor.stinner@gmail.com>
	Thu, 3 Jan 2013 00:21:07 +0000 (01:21 +0100)
		1	2
Misc/NEWS	patch \|	diff1 \|	diff2 \|	blob \| history
Objects/unicodeobject.c	patch \|	diff1 \|	diff2 \|	blob \| history
Python/fileutils.c	patch \|	diff1 \|	diff2 \|	blob \| history