bpo-34403: On HP-UX, force ASCII for C locale (GH-8969)

vstinner · web-flow · commit d500e5307aec · 2018-08-28T17:27:36.000+02:00
On HP-UX with C or POSIX locale, sys.getfilesystemencoding() now returns
"ascii" instead of "roman8" (when the UTF-8 Mode is disabled and the C locale
is not coerced).

nl_langinfo(CODESET) announces "roman8" whereas mbstowcs() uses the Latin1
encoding in practice.
diff --git a/Include/fileutils.h b/Include/fileutils.h
@@ -170,6 +170,11 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
 
 #endif   /* Py_LIMITED_API */
 
+
+#ifdef Py_BUILD_CORE
+PyAPI_FUNC(int) _Py_GetForceASCII(void);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-08-28-10-49-55.bpo-34403.4Q3LzP.rst b/Misc/NEWS.d/next/Core and Builtins/2018-08-28-10-49-55.bpo-34403.4Q3LzP.rst
@@ -0,0 +1,3 @@
+On HP-UX with C or POSIX locale, sys.getfilesystemencoding() now returns
+"ascii" instead of "roman8" (when the UTF-8 Mode is disabled and the C locale
+is not coerced).
diff --git a/Python/coreconfig.c b/Python/coreconfig.c
@@ -828,18 +828,21 @@ config_read_complex_options(_PyCoreConfig *config)
 static void
 config_init_locale(_PyCoreConfig *config)
 {
-    if (_Py_LegacyLocaleDetected()) {
+    if (config->coerce_c_locale < 0) {
         /* The C locale enables the C locale coercion (PEP 538) */
-        if (config->coerce_c_locale < 0) {
+        if (_Py_LegacyLocaleDetected()) {
             config->coerce_c_locale = 1;
         }
     }
+
 #ifndef MS_WINDOWS
-    const char *ctype_loc = setlocale(LC_CTYPE, NULL);
-    if (ctype_loc != NULL
-       && (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "POSIX") == 0)) {
+    if (config->utf8_mode < 0) {
         /* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */
-        if (config->utf8_mode < 0) {
+        const char *ctype_loc = setlocale(LC_CTYPE, NULL);
+        if (ctype_loc != NULL
+           && (strcmp(ctype_loc, "C") == 0
+               || strcmp(ctype_loc, "POSIX") == 0))
+        {
             config->utf8_mode = 1;
         }
     }
diff --git a/Python/fileutils.c b/Python/fileutils.c
@@ -72,8 +72,8 @@ _Py_device_encoding(int fd)
 
 extern int _Py_normalize_encoding(const char *, char *, size_t);
 
-/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
-   On these operating systems, nl_langinfo(CODESET) announces an alias of the
+/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
+   and POSIX locale. nl_langinfo(CODESET) announces an alias of the
    ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
    ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
    locale.getpreferredencoding() codec. For example, if command line arguments
@@ -86,6 +86,10 @@ extern int _Py_normalize_encoding(const char *, char *, size_t);
    workaround is also enabled on error, for example if getting the locale
    failed.
 
+   On HP-UX with the C locale or the POSIX locale, nl_langinfo(CODESET)
+   announces "roman8" but mbstowcs() uses Latin1 in practice. Also force the
+   ASCII encoding in this case.
+
    Values of force_ascii:
 
        1: the workaround is used: Py_EncodeLocale() uses
@@ -100,13 +104,46 @@ static int force_ascii = -1;
 static int
 check_force_ascii(void)
 {
-    char *loc;
+    char *loc = setlocale(LC_CTYPE, NULL);
+    if (loc == NULL) {
+        goto error;
+    }
+    if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) {
+        /* the LC_CTYPE locale is different than C and POSIX */
+        return 0;
+    }
+
 #if defined(HAVE_LANGINFO_H) && defined(CODESET)
-    char *codeset, **alias;
+    const char *codeset = nl_langinfo(CODESET);
+    if (!codeset || codeset[0] == '\0') {
+        /* CODESET is not set or empty */
+        goto error;
+    }
+
     char encoding[20];   /* longest name: "iso_646.irv_1991\0" */
-    int is_ascii;
-    unsigned int i;
-    char* ascii_aliases[] = {
+    if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
+        goto error;
+    }
+
+#ifdef __hpux
+    if (strcmp(encoding, "roman8") == 0) {
+        unsigned char ch;
+        wchar_t wch;
+        size_t res;
+
+        ch = (unsigned char)0xA7;
+        res = mbstowcs(&wch, (char*)&ch, 1);
+        if (res != (size_t)-1 && wch == L'\xA7') {
+            /* On HP-UX with the C locale or the POSIX locale,
+               nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
+               Latin1 encoding in practice. Force ASCII in this case.
+
+               Roman8 decodes 0xA7 to U+00CF. Latin1 decodes 0xA7 to U+00A7. */
+            return 1;
+        }
+    }
+#else
+    const char* ascii_aliases[] = {
         "ascii",
         /* Aliases from Lib/encodings/aliases.py */
         "646",
@@ -123,27 +160,9 @@ check_force_ascii(void)
         "us_ascii",
         NULL
     };
-#endif
-
-    loc = setlocale(LC_CTYPE, NULL);
-    if (loc == NULL)
-        goto error;
-    if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) {
-        /* the LC_CTYPE locale is different than C */
-        return 0;
-    }
-
-#if defined(HAVE_LANGINFO_H) && defined(CODESET)
-    codeset = nl_langinfo(CODESET);
-    if (!codeset || codeset[0] == '\0') {
-        /* CODESET is not set or empty */
-        goto error;
-    }
-    if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding)))
-        goto error;
 
-    is_ascii = 0;
-    for (alias=ascii_aliases; *alias != NULL; alias++) {
+    int is_ascii = 0;
+    for (const char **alias=ascii_aliases; *alias != NULL; alias++) {
         if (strcmp(encoding, *alias) == 0) {
             is_ascii = 1;
             break;
@@ -154,13 +173,14 @@ check_force_ascii(void)
         return 0;
     }
 
-    for (i=0x80; i<0xff; i++) {
-        unsigned char ch;
-        wchar_t wch;
+    for (unsigned int i=0x80; i<=0xff; i++) {
+        char ch[1];
+        wchar_t wch[1];
         size_t res;
 
-        ch = (unsigned char)i;
-        res = mbstowcs(&wch, (char*)&ch, 1);
+        unsigned uch = (unsigned char)i;
+        ch[0] = (char)uch;
+        res = mbstowcs(wch, ch, 1);
         if (res != (size_t)-1) {
             /* decoding a non-ASCII character from the locale encoding succeed:
                the locale encoding is not ASCII, force ASCII */
@@ -169,17 +189,29 @@ check_force_ascii(void)
     }
     /* None of the bytes in the range 0x80-0xff can be decoded from the locale
        encoding: the locale encoding is really ASCII */
+#endif   /* !defined(__hpux) */
     return 0;
 #else
     /* nl_langinfo(CODESET) is not available: always force ASCII */
     return 1;
-#endif
+#endif   /* defined(HAVE_LANGINFO_H) && defined(CODESET) */
 
 error:
     /* if an error occurred, force the ASCII encoding */
     return 1;
 }
 
+
+int
+_Py_GetForceASCII(void)
+{
+    if (force_ascii == -1) {
+        force_ascii = check_force_ascii();
+    }
+    return force_ascii;
+}
+
+
 static int
 encode_ascii(const wchar_t *text, char **str,
              size_t *error_pos, const char **reason,
@@ -234,6 +266,12 @@ encode_ascii(const wchar_t *text, char **str,
     *str = result;
     return 0;
 }
+#else
+int
+_Py_GetForceASCII(void)
+{
+    return 0;
+}
 #endif   /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */
 
 
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
@@ -1576,21 +1576,25 @@ initfsencoding(PyInterpreterState *interp)
         Py_FileSystemDefaultEncodeErrors = "surrogatepass";
     }
 #else
-    if (Py_FileSystemDefaultEncoding == NULL &&
-        interp->core_config.utf8_mode)
-    {
-        Py_FileSystemDefaultEncoding = "utf-8";
-        Py_HasFileSystemDefaultEncoding = 1;
-    }
-    else if (Py_FileSystemDefaultEncoding == NULL) {
-        Py_FileSystemDefaultEncoding = get_locale_encoding();
-        if (Py_FileSystemDefaultEncoding == NULL) {
-            return _Py_INIT_ERR("Unable to get the locale encoding");
+    if (Py_FileSystemDefaultEncoding == NULL) {
+        if (interp->core_config.utf8_mode) {
+            Py_FileSystemDefaultEncoding = "utf-8";
+            Py_HasFileSystemDefaultEncoding = 1;
+        }
+        else if (_Py_GetForceASCII()) {
+            Py_FileSystemDefaultEncoding = "ascii";
+            Py_HasFileSystemDefaultEncoding = 1;
         }
+        else {
+            Py_FileSystemDefaultEncoding = get_locale_encoding();
+            if (Py_FileSystemDefaultEncoding == NULL) {
+                return _Py_INIT_ERR("Unable to get the locale encoding");
+            }
 
-        Py_HasFileSystemDefaultEncoding = 0;
-        interp->fscodec_initialized = 1;
-        return _Py_INIT_OK();
+            Py_HasFileSystemDefaultEncoding = 0;
+            interp->fscodec_initialized = 1;
+            return _Py_INIT_OK();
+        }
     }
 #endif
 

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+On HP-UX with C or POSIX locale, sys.getfilesystemencoding() now returns`
	`2`	`+"ascii" instead of "roman8" (when the UTF-8 Mode is disabled and the C locale`
	`3`	`+is not coerced).`
Original file line number	Diff line number	Diff line change
`@@ -828,18 +828,21 @@ config_read_complex_options(_PyCoreConfig *config)`
`828`	`828`	`static void`
`829`	`829`	`config_init_locale(_PyCoreConfig *config)`
`830`	`830`	`{`
`831`		`- if (_Py_LegacyLocaleDetected()) {`
	`831`	`+ if (config->coerce_c_locale < 0) {`
`832`	`832`	`/* The C locale enables the C locale coercion (PEP 538) */`
`833`		`- if (config->coerce_c_locale < 0) {`
	`833`	`+ if (_Py_LegacyLocaleDetected()) {`
`834`	`834`	`config->coerce_c_locale = 1;`
`835`	`835`	`}`
`836`	`836`	`}`
	`837`	`+`
`837`	`838`	`#ifndef MS_WINDOWS`
`838`		`- const char *ctype_loc = setlocale(LC_CTYPE, NULL);`
`839`		`- if (ctype_loc != NULL`
`840`		`- && (strcmp(ctype_loc, "C") == 0 \|\| strcmp(ctype_loc, "POSIX") == 0)) {`
	`839`	`+ if (config->utf8_mode < 0) {`
`841`	`840`	`/* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */`
`842`		`- if (config->utf8_mode < 0) {`
	`841`	`+ const char *ctype_loc = setlocale(LC_CTYPE, NULL);`
	`842`	`+ if (ctype_loc != NULL`
	`843`	`+ && (strcmp(ctype_loc, "C") == 0`
	`844`	`+ \|\| strcmp(ctype_loc, "POSIX") == 0))`
	`845`	`+ {`
`843`	`846`	`config->utf8_mode = 1;`
`844`	`847`	`}`
`845`	`848`	`}`