@@ -72,8 +72,8 @@ _Py_device_encoding(int fd)
72
72
73
73
extern int _Py_normalize_encoding (const char * , char * , size_t );
74
74
75
- /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
76
- On these operating systems, nl_langinfo(CODESET) announces an alias of the
75
+ /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
76
+ and POSIX locale. nl_langinfo(CODESET) announces an alias of the
77
77
ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
78
78
ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
79
79
locale.getpreferredencoding() codec. For example, if command line arguments
@@ -86,6 +86,10 @@ extern int _Py_normalize_encoding(const char *, char *, size_t);
86
86
workaround is also enabled on error, for example if getting the locale
87
87
failed.
88
88
89
+ On HP-UX with the C locale or the POSIX locale, nl_langinfo(CODESET)
90
+ announces "roman8" but mbstowcs() uses Latin1 in practice. Also force the
91
+ ASCII encoding in this case.
92
+
89
93
Values of force_ascii:
90
94
91
95
1: the workaround is used: Py_EncodeLocale() uses
@@ -100,13 +104,46 @@ static int force_ascii = -1;
100
104
static int
101
105
check_force_ascii (void )
102
106
{
103
- char * loc ;
107
+ char * loc = setlocale (LC_CTYPE , NULL );
108
+ if (loc == NULL ) {
109
+ goto error ;
110
+ }
111
+ if (strcmp (loc , "C" ) != 0 && strcmp (loc , "POSIX" ) != 0 ) {
112
+ /* the LC_CTYPE locale is different than C and POSIX */
113
+ return 0 ;
114
+ }
115
+
104
116
#if defined(HAVE_LANGINFO_H ) && defined(CODESET )
105
- char * codeset , * * alias ;
117
+ const char * codeset = nl_langinfo (CODESET );
118
+ if (!codeset || codeset [0 ] == '\0' ) {
119
+ /* CODESET is not set or empty */
120
+ goto error ;
121
+ }
122
+
106
123
char encoding [20 ]; /* longest name: "iso_646.irv_1991\0" */
107
- int is_ascii ;
108
- unsigned int i ;
109
- char * ascii_aliases [] = {
124
+ if (!_Py_normalize_encoding (codeset , encoding , sizeof (encoding ))) {
125
+ goto error ;
126
+ }
127
+
128
+ #ifdef __hpux
129
+ if (strcmp (encoding , "roman8" ) == 0 ) {
130
+ unsigned char ch ;
131
+ wchar_t wch ;
132
+ size_t res ;
133
+
134
+ ch = (unsigned char )0xA7 ;
135
+ res = mbstowcs (& wch , (char * )& ch , 1 );
136
+ if (res != (size_t )-1 && wch == L'\xA7' ) {
137
+ /* On HP-UX with the C locale or the POSIX locale,
138
+ nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
139
+ Latin1 encoding in practice. Force ASCII in this case.
140
+
141
+ Roman8 decodes 0xA7 to U+00CF. Latin1 decodes 0xA7 to U+00A7. */
142
+ return 1 ;
143
+ }
144
+ }
145
+ #else
146
+ const char * ascii_aliases [] = {
110
147
"ascii" ,
111
148
/* Aliases from Lib/encodings/aliases.py */
112
149
"646" ,
@@ -123,27 +160,9 @@ check_force_ascii(void)
123
160
"us_ascii" ,
124
161
NULL
125
162
};
126
- #endif
127
-
128
- loc = setlocale (LC_CTYPE , NULL );
129
- if (loc == NULL )
130
- goto error ;
131
- if (strcmp (loc , "C" ) != 0 && strcmp (loc , "POSIX" ) != 0 ) {
132
- /* the LC_CTYPE locale is different than C */
133
- return 0 ;
134
- }
135
-
136
- #if defined(HAVE_LANGINFO_H ) && defined(CODESET )
137
- codeset = nl_langinfo (CODESET );
138
- if (!codeset || codeset [0 ] == '\0' ) {
139
- /* CODESET is not set or empty */
140
- goto error ;
141
- }
142
- if (!_Py_normalize_encoding (codeset , encoding , sizeof (encoding )))
143
- goto error ;
144
163
145
- is_ascii = 0 ;
146
- for (alias = ascii_aliases ; * alias != NULL ; alias ++ ) {
164
+ int is_ascii = 0 ;
165
+ for (const char * * alias = ascii_aliases ; * alias != NULL ; alias ++ ) {
147
166
if (strcmp (encoding , * alias ) == 0 ) {
148
167
is_ascii = 1 ;
149
168
break ;
@@ -154,13 +173,14 @@ check_force_ascii(void)
154
173
return 0 ;
155
174
}
156
175
157
- for (i = 0x80 ; i < 0xff ; i ++ ) {
158
- unsigned char ch ;
159
- wchar_t wch ;
176
+ for (unsigned int i = 0x80 ; i <= 0xff ; i ++ ) {
177
+ char ch [ 1 ] ;
178
+ wchar_t wch [ 1 ] ;
160
179
size_t res ;
161
180
162
- ch = (unsigned char )i ;
163
- res = mbstowcs (& wch , (char * )& ch , 1 );
181
+ unsigned uch = (unsigned char )i ;
182
+ ch [0 ] = (char )uch ;
183
+ res = mbstowcs (wch , ch , 1 );
164
184
if (res != (size_t )-1 ) {
165
185
/* decoding a non-ASCII character from the locale encoding succeeded:
166
186
the locale encoding is not ASCII, force ASCII */
@@ -169,17 +189,29 @@ check_force_ascii(void)
169
189
}
170
190
/* None of the bytes in the range 0x80-0xff can be decoded from the locale
171
191
encoding: the locale encoding is really ASCII */
192
+ #endif /* !defined(__hpux) */
172
193
return 0 ;
173
194
#else
174
195
/* nl_langinfo(CODESET) is not available: always force ASCII */
175
196
return 1 ;
176
- #endif
197
+ #endif /* defined(HAVE_LANGINFO_H) && defined(CODESET) */
177
198
178
199
error :
179
200
/* if an error occurred, force the ASCII encoding */
180
201
return 1 ;
181
202
}
182
203
204
+
205
+ int
206
+ _Py_GetForceASCII (void )
207
+ {
208
+ if (force_ascii == -1 ) {
209
+ force_ascii = check_force_ascii ();
210
+ }
211
+ return force_ascii ;
212
+ }
213
+
214
+
183
215
static int
184
216
encode_ascii (const wchar_t * text , char * * str ,
185
217
size_t * error_pos , const char * * reason ,
@@ -234,6 +266,12 @@ encode_ascii(const wchar_t *text, char **str,
234
266
* str = result ;
235
267
return 0 ;
236
268
}
269
+ #else
270
+ int
271
+ _Py_GetForceASCII (void )
272
+ {
273
+ return 0 ;
274
+ }
237
275
#endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */
238
276
239
277
0 commit comments