Skip to content

Commit d45c7f8

Browse files
committed
Issue #16455: On FreeBSD and Solaris, if the locale is C, the
ASCII/surrogateescape codec is now used, instead of the locale encoding, to decode the command line arguments. This change fixes inconsistencies with os.fsencode() and os.fsdecode() because these operating systems announce an ASCII locale encoding, whereas the ISO-8859-1 encoding is used in practice.
1 parent ca9f8b2 commit d45c7f8

File tree

5 files changed

+241
-40
lines changed

5 files changed

+241
-40
lines changed

Include/unicodeobject.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1742,7 +1742,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
17421742
/* Encode a Unicode object to the current locale encoding. The encoder is
17431743
strict if *surrogateescape* is equal to zero, otherwise the
17441744
"surrogateescape" error handler is used. Return a bytes object. The string
1745-
cannot contain embedded null characters.. */
1745+
cannot contain embedded null characters. */
17461746

17471747
PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
17481748
PyObject *unicode,

Lib/test/test_cmd_line_script.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -367,11 +367,10 @@ def test_non_ascii(self):
367367
# Mac OS X denies the creation of a file with an invalid UTF-8 name.
368368
# Windows allows creating a file with an arbitrary bytes name, but
369369
# Python cannot pass an undecodable bytes argument to a subprocess.
370-
#if (support.TESTFN_UNDECODABLE
371-
#and sys.platform not in ('win32', 'darwin')):
372-
# name = os.fsdecode(support.TESTFN_UNDECODABLE)
373-
#elif support.TESTFN_NONASCII:
374-
if support.TESTFN_NONASCII:
370+
if (support.TESTFN_UNDECODABLE
371+
and sys.platform not in ('win32', 'darwin')):
372+
name = os.fsdecode(support.TESTFN_UNDECODABLE)
373+
elif support.TESTFN_NONASCII:
375374
name = support.TESTFN_NONASCII
376375
else:
377376
self.skipTest("need support.TESTFN_NONASCII")

Misc/NEWS

+6
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@ What's New in Python 3.4.0 Alpha 1?
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #16455: On FreeBSD and Solaris, if the locale is C, the
14+
ASCII/surrogateescape codec is now used, instead of the locale encoding, to
15+
decode the command line arguments. This change fixes inconsistencies with
16+
os.fsencode() and os.fsdecode() because these operating systems announce an
17+
ASCII locale encoding, whereas the ISO-8859-1 encoding is used in practice.
18+
1319
- Issue #16562: Optimize dict equality testing. Patch by Serhiy Storchaka.
1420

1521
- Issue #16588: Silence unused-but-set warnings in Python/thread_pthread

Objects/unicodeobject.c

+13-11
Original file line numberDiff line numberDiff line change
@@ -2863,10 +2863,10 @@ PyUnicode_FromEncodedObject(register PyObject *obj,
28632863
/* Convert encoding to lower case and replace '_' with '-' in order to
28642864
catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1),
28652865
1 on success. */
2866-
static int
2867-
normalize_encoding(const char *encoding,
2868-
char *lower,
2869-
size_t lower_len)
2866+
int
2867+
_Py_normalize_encoding(const char *encoding,
2868+
char *lower,
2869+
size_t lower_len)
28702870
{
28712871
const char *e;
28722872
char *l;
@@ -2908,7 +2908,7 @@ PyUnicode_Decode(const char *s,
29082908
char lower[11]; /* Enough for any encoding shortcut */
29092909

29102910
/* Shortcuts for common default encodings */
2911-
if (normalize_encoding(encoding, lower, sizeof(lower))) {
2911+
if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
29122912
if ((strcmp(lower, "utf-8") == 0) ||
29132913
(strcmp(lower, "utf8") == 0))
29142914
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
@@ -3110,7 +3110,8 @@ locale_error_handler(const char *errors, int *surrogateescape)
31103110
*surrogateescape = 0;
31113111
return 0;
31123112
}
3113-
if (strcmp(errors, "surrogateescape") == 0) {
3113+
if (errors == "surrogateescape"
3114+
|| strcmp(errors, "surrogateescape") == 0) {
31143115
*surrogateescape = 1;
31153116
return 0;
31163117
}
@@ -3148,7 +3149,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
31483149
}
31493150

31503151
if (surrogateescape) {
3151-
/* locale encoding with surrogateescape */
3152+
/* "surrogateescape" error handler */
31523153
char *str;
31533154

31543155
str = _Py_wchar2char(wstr, &error_pos);
@@ -3168,6 +3169,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
31683169
PyMem_Free(str);
31693170
}
31703171
else {
3172+
/* strict mode */
31713173
size_t len, len2;
31723174

31733175
len = wcstombs(NULL, wstr, 0);
@@ -3273,7 +3275,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
32733275
}
32743276

32753277
/* Shortcuts for common default encodings */
3276-
if (normalize_encoding(encoding, lower, sizeof(lower))) {
3278+
if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
32773279
if ((strcmp(lower, "utf-8") == 0) ||
32783280
(strcmp(lower, "utf8") == 0))
32793281
{
@@ -3413,8 +3415,8 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
34133415
return NULL;
34143416
}
34153417

3416-
if (surrogateescape)
3417-
{
3418+
if (surrogateescape) {
3419+
/* "surrogateescape" error handler */
34183420
wstr = _Py_char2wchar(str, &wlen);
34193421
if (wstr == NULL) {
34203422
if (wlen == (size_t)-1)
@@ -3428,6 +3430,7 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
34283430
PyMem_Free(wstr);
34293431
}
34303432
else {
3433+
/* strict mode */
34313434
#ifndef HAVE_BROKEN_MBSTOWCS
34323435
wlen = mbstowcs(NULL, str, 0);
34333436
#else
@@ -3447,7 +3450,6 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
34473450
return PyErr_NoMemory();
34483451
}
34493452

3450-
/* This shouldn't fail now */
34513453
wlen2 = mbstowcs(wstr, str, wlen+1);
34523454
if (wlen2 == (size_t)-1) {
34533455
if (wstr != smallbuf)

0 commit comments

Comments
 (0)