Issue #16455: On FreeBSD and Solaris, if the locale is C, the

vstinner · vstinner · commit d45c7f8d74d3 · 2012-12-04T01:34:47.000+01:00
ASCII/surrogateescape codec is now used, instead of the locale encoding, to
decode the command line arguments. This change fixes inconsistencies with
os.fsencode() and os.fsdecode() because these operating systems announce an
ASCII locale encoding, whereas the ISO-8859-1 encoding is used in practice.
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
@@ -1742,7 +1742,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
 /* Encode a Unicode object to the current locale encoding. The encoder is
    strict is *surrogateescape* is equal to zero, otherwise the
    "surrogateescape" error handler is used. Return a bytes object. The string
-   cannot contain embedded null characters.. */
+   cannot contain embedded null characters. */
 
 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
     PyObject *unicode,
diff --git a/Lib/test/test_cmd_line_script.py b/Lib/test/test_cmd_line_script.py
@@ -367,11 +367,10 @@ def test_non_ascii(self):
         # Mac OS X denies the creation of a file with an invalid UTF-8 name.
         # Windows allows to create a name with an arbitrary bytes name, but
         # Python cannot a undecodable bytes argument to a subprocess.
-        #if (support.TESTFN_UNDECODABLE
-        #and sys.platform not in ('win32', 'darwin')):
-        #    name = os.fsdecode(support.TESTFN_UNDECODABLE)
-        #elif support.TESTFN_NONASCII:
-        if support.TESTFN_NONASCII:
+        if (support.TESTFN_UNDECODABLE
+        and sys.platform not in ('win32', 'darwin')):
+            name = os.fsdecode(support.TESTFN_UNDECODABLE)
+        elif support.TESTFN_NONASCII:
             name = support.TESTFN_NONASCII
         else:
             self.skipTest("need support.TESTFN_NONASCII")
diff --git a/Misc/NEWS b/Misc/NEWS
@@ -10,6 +10,12 @@ What's New in Python 3.4.0 Alpha 1?
 Core and Builtins
 -----------------
 
+- Issue #16455: On FreeBSD and Solaris, if the locale is C, the
+  ASCII/surrogateescape codec is now used, instead of the locale encoding, to
+  decode the command line arguments. This change fixes inconsistencies with
+  os.fsencode() and os.fsdecode() because these operating systems announce an
+  ASCII locale encoding, whereas the ISO-8859-1 encoding is used in practice.
+
 - Issue #16562: Optimize dict equality testing.  Patch by Serhiy Storchaka.
 
 - Issue #16588: Silence unused-but-set warnings in Python/thread_pthread
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -2863,10 +2863,10 @@ PyUnicode_FromEncodedObject(register PyObject *obj,
 /* Convert encoding to lower case and replace '_' with '-' in order to
    catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1),
    1 on success. */
-static int
-normalize_encoding(const char *encoding,
-                   char *lower,
-                   size_t lower_len)
+int
+_Py_normalize_encoding(const char *encoding,
+                       char *lower,
+                       size_t lower_len)
 {
     const char *e;
     char *l;
@@ -2908,7 +2908,7 @@ PyUnicode_Decode(const char *s,
     char lower[11];  /* Enough for any encoding shortcut */
 
     /* Shortcuts for common default encodings */
-    if (normalize_encoding(encoding, lower, sizeof(lower))) {
+    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
         if ((strcmp(lower, "utf-8") == 0) ||
             (strcmp(lower, "utf8") == 0))
             return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
@@ -3110,7 +3110,8 @@ locale_error_handler(const char *errors, int *surrogateescape)
         *surrogateescape = 0;
         return 0;
     }
-    if (strcmp(errors, "surrogateescape") == 0) {
+    if (errors == "surrogateescape"
+        || strcmp(errors, "surrogateescape") == 0) {
         *surrogateescape = 1;
         return 0;
     }
@@ -3148,7 +3149,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
     }
 
     if (surrogateescape) {
-        /* locale encoding with surrogateescape */
+        /* "surrogateescape" error handler */
         char *str;
 
         str = _Py_wchar2char(wstr, &error_pos);
@@ -3168,6 +3169,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
         PyMem_Free(str);
     }
     else {
+        /* strict mode */
         size_t len, len2;
 
         len = wcstombs(NULL, wstr, 0);
@@ -3273,7 +3275,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
     }
 
     /* Shortcuts for common default encodings */
-    if (normalize_encoding(encoding, lower, sizeof(lower))) {
+    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
         if ((strcmp(lower, "utf-8") == 0) ||
             (strcmp(lower, "utf8") == 0))
         {
@@ -3413,8 +3415,8 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
         return NULL;
     }
 
-    if (surrogateescape)
-    {
+    if (surrogateescape) {
+        /* "surrogateescape" error handler */
         wstr = _Py_char2wchar(str, &wlen);
         if (wstr == NULL) {
             if (wlen == (size_t)-1)
@@ -3428,6 +3430,7 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
         PyMem_Free(wstr);
     }
     else {
+        /* strict mode */
 #ifndef HAVE_BROKEN_MBSTOWCS
         wlen = mbstowcs(NULL, str, 0);
 #else
@@ -3447,7 +3450,6 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                 return PyErr_NoMemory();
         }
 
-        /* This shouldn't fail now */
         wlen2 = mbstowcs(wstr, str, wlen+1);
         if (wlen2 == (size_t)-1) {
             if (wstr != smallbuf)
diff --git a/Python/fileutils.c b/Python/fileutils.c