Skip to content

Commit d500e53

Browse files
authored
bpo-34403: On HP-UX, force ASCII for C locale (GH-8969)
On HP-UX with the C or POSIX locale, sys.getfilesystemencoding() now returns "ascii" instead of "roman8" (when the UTF-8 Mode is disabled and the C locale is not coerced). nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses the Latin1 encoding in practice.
1 parent 5cb2589 commit d500e53

File tree

5 files changed

+105
-52
lines changed

5 files changed

+105
-52
lines changed

Diff for: Include/fileutils.h

+5
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,11 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
170170

171171
#endif /* Py_LIMITED_API */
172172

173+
174+
#ifdef Py_BUILD_CORE
175+
PyAPI_FUNC(int) _Py_GetForceASCII(void);
176+
#endif
177+
173178
#ifdef __cplusplus
174179
}
175180
#endif
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
On HP-UX with C or POSIX locale, sys.getfilesystemencoding() now returns
2+
"ascii" instead of "roman8" (when the UTF-8 Mode is disabled and the C locale
3+
is not coerced).

Diff for: Python/coreconfig.c

+9-6
Original file line numberDiff line numberDiff line change
@@ -828,18 +828,21 @@ config_read_complex_options(_PyCoreConfig *config)
828828
static void
829829
config_init_locale(_PyCoreConfig *config)
830830
{
831-
if (_Py_LegacyLocaleDetected()) {
831+
if (config->coerce_c_locale < 0) {
832832
/* The C locale enables the C locale coercion (PEP 538) */
833-
if (config->coerce_c_locale < 0) {
833+
if (_Py_LegacyLocaleDetected()) {
834834
config->coerce_c_locale = 1;
835835
}
836836
}
837+
837838
#ifndef MS_WINDOWS
838-
const char *ctype_loc = setlocale(LC_CTYPE, NULL);
839-
if (ctype_loc != NULL
840-
&& (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "POSIX") == 0)) {
839+
if (config->utf8_mode < 0) {
841840
/* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */
842-
if (config->utf8_mode < 0) {
841+
const char *ctype_loc = setlocale(LC_CTYPE, NULL);
842+
if (ctype_loc != NULL
843+
&& (strcmp(ctype_loc, "C") == 0
844+
|| strcmp(ctype_loc, "POSIX") == 0))
845+
{
843846
config->utf8_mode = 1;
844847
}
845848
}

Diff for: Python/fileutils.c

+71-33
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,8 @@ _Py_device_encoding(int fd)
7272

7373
extern int _Py_normalize_encoding(const char *, char *, size_t);
7474

75-
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
76-
On these operating systems, nl_langinfo(CODESET) announces an alias of the
75+
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
76+
and POSIX locale. nl_langinfo(CODESET) announces an alias of the
7777
ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
7878
ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
7979
locale.getpreferredencoding() codec. For example, if command line arguments
@@ -86,6 +86,10 @@ extern int _Py_normalize_encoding(const char *, char *, size_t);
8686
workaround is also enabled on error, for example if getting the locale
8787
failed.
8888
89+
On HP-UX with the C locale or the POSIX locale, nl_langinfo(CODESET)
90+
announces "roman8" but mbstowcs() uses Latin1 in practice. Also force the
91+
ASCII encoding in this case.
92+
8993
Values of force_ascii:
9094
9195
1: the workaround is used: Py_EncodeLocale() uses
@@ -100,13 +104,46 @@ static int force_ascii = -1;
100104
static int
101105
check_force_ascii(void)
102106
{
103-
char *loc;
107+
char *loc = setlocale(LC_CTYPE, NULL);
108+
if (loc == NULL) {
109+
goto error;
110+
}
111+
if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) {
112+
/* the LC_CTYPE locale is different than C and POSIX */
113+
return 0;
114+
}
115+
104116
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
105-
char *codeset, **alias;
117+
const char *codeset = nl_langinfo(CODESET);
118+
if (!codeset || codeset[0] == '\0') {
119+
/* CODESET is not set or empty */
120+
goto error;
121+
}
122+
106123
char encoding[20]; /* longest name: "iso_646.irv_1991\0" */
107-
int is_ascii;
108-
unsigned int i;
109-
char* ascii_aliases[] = {
124+
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
125+
goto error;
126+
}
127+
128+
#ifdef __hpux
129+
if (strcmp(encoding, "roman8") == 0) {
130+
unsigned char ch;
131+
wchar_t wch;
132+
size_t res;
133+
134+
ch = (unsigned char)0xA7;
135+
res = mbstowcs(&wch, (char*)&ch, 1);
136+
if (res != (size_t)-1 && wch == L'\xA7') {
137+
/* On HP-UX with the C locale or the POSIX locale,
138+
nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
139+
Latin1 encoding in practice. Force ASCII in this case.
140+
141+
Roman8 decodes 0xA7 to U+00CF. Latin1 decodes 0xA7 to U+00A7. */
142+
return 1;
143+
}
144+
}
145+
#else
146+
const char* ascii_aliases[] = {
110147
"ascii",
111148
/* Aliases from Lib/encodings/aliases.py */
112149
"646",
@@ -123,27 +160,9 @@ check_force_ascii(void)
123160
"us_ascii",
124161
NULL
125162
};
126-
#endif
127-
128-
loc = setlocale(LC_CTYPE, NULL);
129-
if (loc == NULL)
130-
goto error;
131-
if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) {
132-
/* the LC_CTYPE locale is different than C */
133-
return 0;
134-
}
135-
136-
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
137-
codeset = nl_langinfo(CODESET);
138-
if (!codeset || codeset[0] == '\0') {
139-
/* CODESET is not set or empty */
140-
goto error;
141-
}
142-
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding)))
143-
goto error;
144163

145-
is_ascii = 0;
146-
for (alias=ascii_aliases; *alias != NULL; alias++) {
164+
int is_ascii = 0;
165+
for (const char **alias=ascii_aliases; *alias != NULL; alias++) {
147166
if (strcmp(encoding, *alias) == 0) {
148167
is_ascii = 1;
149168
break;
@@ -154,13 +173,14 @@ check_force_ascii(void)
154173
return 0;
155174
}
156175

157-
for (i=0x80; i<0xff; i++) {
158-
unsigned char ch;
159-
wchar_t wch;
176+
for (unsigned int i=0x80; i<=0xff; i++) {
177+
char ch[1];
178+
wchar_t wch[1];
160179
size_t res;
161180

162-
ch = (unsigned char)i;
163-
res = mbstowcs(&wch, (char*)&ch, 1);
181+
unsigned uch = (unsigned char)i;
182+
ch[0] = (char)uch;
183+
res = mbstowcs(wch, ch, 1);
164184
if (res != (size_t)-1) {
165185
/* decoding a non-ASCII character from the locale encoding succeeded:
166186
the locale encoding is not ASCII, force ASCII */
@@ -169,17 +189,29 @@ check_force_ascii(void)
169189
}
170190
/* None of the bytes in the range 0x80-0xff can be decoded from the locale
171191
encoding: the locale encoding is really ASCII */
192+
#endif /* !defined(__hpux) */
172193
return 0;
173194
#else
174195
/* nl_langinfo(CODESET) is not available: always force ASCII */
175196
return 1;
176-
#endif
197+
#endif /* defined(HAVE_LANGINFO_H) && defined(CODESET) */
177198

178199
error:
179200
/* if an error occurred, force the ASCII encoding */
180201
return 1;
181202
}
182203

204+
205+
int
206+
_Py_GetForceASCII(void)
207+
{
208+
if (force_ascii == -1) {
209+
force_ascii = check_force_ascii();
210+
}
211+
return force_ascii;
212+
}
213+
214+
183215
static int
184216
encode_ascii(const wchar_t *text, char **str,
185217
size_t *error_pos, const char **reason,
@@ -234,6 +266,12 @@ encode_ascii(const wchar_t *text, char **str,
234266
*str = result;
235267
return 0;
236268
}
269+
#else
270+
int
271+
_Py_GetForceASCII(void)
272+
{
273+
return 0;
274+
}
237275
#endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */
238276

239277

Diff for: Python/pylifecycle.c

+17-13
Original file line numberDiff line numberDiff line change
@@ -1576,21 +1576,25 @@ initfsencoding(PyInterpreterState *interp)
15761576
Py_FileSystemDefaultEncodeErrors = "surrogatepass";
15771577
}
15781578
#else
1579-
if (Py_FileSystemDefaultEncoding == NULL &&
1580-
interp->core_config.utf8_mode)
1581-
{
1582-
Py_FileSystemDefaultEncoding = "utf-8";
1583-
Py_HasFileSystemDefaultEncoding = 1;
1584-
}
1585-
else if (Py_FileSystemDefaultEncoding == NULL) {
1586-
Py_FileSystemDefaultEncoding = get_locale_encoding();
1587-
if (Py_FileSystemDefaultEncoding == NULL) {
1588-
return _Py_INIT_ERR("Unable to get the locale encoding");
1579+
if (Py_FileSystemDefaultEncoding == NULL) {
1580+
if (interp->core_config.utf8_mode) {
1581+
Py_FileSystemDefaultEncoding = "utf-8";
1582+
Py_HasFileSystemDefaultEncoding = 1;
1583+
}
1584+
else if (_Py_GetForceASCII()) {
1585+
Py_FileSystemDefaultEncoding = "ascii";
1586+
Py_HasFileSystemDefaultEncoding = 1;
15891587
}
1588+
else {
1589+
Py_FileSystemDefaultEncoding = get_locale_encoding();
1590+
if (Py_FileSystemDefaultEncoding == NULL) {
1591+
return _Py_INIT_ERR("Unable to get the locale encoding");
1592+
}
15901593

1591-
Py_HasFileSystemDefaultEncoding = 0;
1592-
interp->fscodec_initialized = 1;
1593-
return _Py_INIT_OK();
1594+
Py_HasFileSystemDefaultEncoding = 0;
1595+
interp->fscodec_initialized = 1;
1596+
return _Py_INIT_OK();
1597+
}
15941598
}
15951599
#endif
15961600

0 commit comments

Comments
 (0)