Skip to content

Commit 29fd712

Browse files
committed
Minimal change that disables (AFAICT) the interpolation of \u and \U inside
raw string literals. I added a whole bunch of tests but am still not sure I am testing all paths through the code. I really think the code could be simplified quite a bit.
1 parent aec75c3 commit 29fd712

File tree

2 files changed

+160
-8
lines changed

2 files changed

+160
-8
lines changed

Lib/test/test_strlit.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
r"""Test correct treatment of various string literals by the parser.
2+
3+
There are four types of string literals:
4+
5+
'abc' -- normal str
6+
r'abc' -- raw str
7+
b'xyz' -- normal bytes
8+
br'xyz' -- raw bytes
9+
10+
The difference between normal and raw strings is of course that in a
11+
raw string, \ escapes (while still used to determine the end of the
12+
literal) are not interpreted, so that r'\x00' contains four
13+
characters: a backslash, an x, and two zeros; while '\x00' contains a
14+
single character (code point zero).
15+
16+
The tricky thing is what should happen when non-ASCII bytes are used
17+
inside literals. For bytes literals, this is considered illegal. But
18+
for str literals, those bytes are supposed to be decoded using the
19+
encoding declared for the file (UTF-8 by default).
20+
21+
We have to test this with various file encodings. We also test it with
22+
exec()/eval(), which uses a different code path.
23+
24+
This file is really about correct treatment of encodings and
25+
backslashes. It doesn't concern itself with issues like single
26+
vs. double quotes or singly- vs. triply-quoted strings: that's dealt
27+
with elsewhere (I assume).
28+
"""
29+
30+
import os
31+
import sys
32+
import shutil
33+
import tempfile
34+
import unittest
35+
36+
37+
# Source text of the module that check_encoding() writes out and imports.
# The single "%s" on the first line is filled with the encoding name to
# form the coding cookie.  Each line is a raw string so that the backslash
# escapes arrive in the generated file untouched and are interpreted (or
# not) by the parser under test, not by this module.
TEMPLATE = "\n".join((
    r"# coding: %s",
    r"a = 'x'",
    r"assert ord(a) == 120",
    r"b = '\x01'",
    r"assert ord(b) == 1",
    r"c = r'\x01'",
    r"assert list(map(ord, c)) == [92, 120, 48, 49]",
    r"d = '\x81'",
    r"assert ord(d) == 0x81",
    r"e = r'\x81'",
    r"assert list(map(ord, e)) == [92, 120, 56, 49]",
    r"f = '\u1881'",
    r"assert ord(f) == 0x1881",
    r"g = r'\u1881'",
    r"assert list(map(ord, g)) == [92, 117, 49, 56, 56, 49]",
    "",  # empty last item so the text ends with a newline
))
53+
54+
55+
def byte(i):
    """Return a bytes object of length one whose only item is the integer *i*."""
    return bytes((i,))
57+
58+
59+
class TestLiterals(unittest.TestCase):
    """Check how str and bytes literals, normal and raw, are parsed.

    The literals are exercised two ways: directly through eval(), and by
    importing modules generated from TEMPLATE in several source encodings.
    """

    def setUp(self):
        # Give every test a private directory at the front of sys.path so
        # that check_encoding() can import the modules it writes there.
        self.save_path = sys.path[:]
        self.tmpdir = tempfile.mkdtemp()
        sys.path.insert(0, self.tmpdir)

    def tearDown(self):
        # Restore the contents in place rather than rebinding sys.path, so
        # the list object itself stays the same for anyone holding a
        # reference to it.
        sys.path[:] = self.save_path
        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def test_template(self):
        # Check that the template doesn't contain any non-printables
        # except for \n.  Uses assertTrue rather than a bare assert so the
        # check is not stripped when running under -O.
        for c in TEMPLATE:
            self.assertTrue(c == '\n' or ' ' <= c <= '~', repr(c))

    # In the eval tests below, each escape is tried twice: once inside a raw
    # outer literal (so eval() sees the backslash sequence itself) and once
    # inside a normal outer literal (so eval() sees the already-decoded
    # character).

    def test_eval_str_normal(self):
        self.assertEqual(eval(""" 'x' """), 'x')
        self.assertEqual(eval(r""" '\x01' """), chr(1))
        self.assertEqual(eval(""" '\x01' """), chr(1))
        self.assertEqual(eval(r""" '\x81' """), chr(0x81))
        self.assertEqual(eval(""" '\x81' """), chr(0x81))
        self.assertEqual(eval(r""" '\u1881' """), chr(0x1881))
        self.assertEqual(eval(""" '\u1881' """), chr(0x1881))

    def test_eval_str_raw(self):
        self.assertEqual(eval(""" r'x' """), 'x')
        self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01')
        self.assertEqual(eval(""" r'\x01' """), chr(1))
        self.assertEqual(eval(r""" r'\x81' """), '\\' + 'x81')
        self.assertEqual(eval(""" r'\x81' """), chr(0x81))
        self.assertEqual(eval(r""" r'\u1881' """), '\\' + 'u1881')
        self.assertEqual(eval(""" r'\u1881' """), chr(0x1881))

    def test_eval_bytes_normal(self):
        self.assertEqual(eval(""" b'x' """), b'x')
        self.assertEqual(eval(r""" b'\x01' """), byte(1))
        self.assertEqual(eval(""" b'\x01' """), byte(1))
        self.assertEqual(eval(r""" b'\x81' """), byte(0x81))
        # A literal non-ASCII character inside a bytes literal is rejected.
        self.assertRaises(SyntaxError, eval, """ b'\x81' """)
        self.assertEqual(eval(r""" b'\u1881' """), b'\\' + b'u1881')
        self.assertRaises(SyntaxError, eval, """ b'\u1881' """)

    def test_eval_bytes_raw(self):
        self.assertEqual(eval(""" br'x' """), b'x')
        self.assertEqual(eval(r""" br'\x01' """), b'\\' + b'x01')
        self.assertEqual(eval(""" br'\x01' """), byte(1))
        self.assertEqual(eval(r""" br'\x81' """), b"\\" + b"x81")
        # A literal non-ASCII character inside a bytes literal is rejected.
        self.assertRaises(SyntaxError, eval, """ br'\x81' """)
        self.assertEqual(eval(r""" br'\u1881' """), b"\\" + b"u1881")
        self.assertRaises(SyntaxError, eval, """ br'\u1881' """)

    def check_encoding(self, encoding, extra=""):
        """Write TEMPLATE plus *extra* as a module in *encoding* and import it.

        The module is created in self.tmpdir under a name derived from
        *encoding*.  The assert statements in the generated source run at
        import time, so any exception raised by the import propagates to
        the caller.
        """
        modname = "xx_" + encoding.replace("-", "_")
        fn = os.path.join(self.tmpdir, modname + ".py")
        with open(fn, "w", encoding=encoding) as f:
            f.write(TEMPLATE % encoding)
            f.write(extra)
        try:
            __import__(modname)
        finally:
            # Drop the module whether or not the import succeeded so that
            # nothing from this test lingers in sys.modules.
            sys.modules.pop(modname, None)

    def test_file_utf_8(self):
        extra = "z = '\u1234'; assert ord(z) == 0x1234\n"
        self.check_encoding("utf-8", extra)

    def test_file_utf_8_error(self):
        # The appended line puts a non-ASCII character in a bytes literal.
        extra = "b'\x80'\n"
        self.assertRaises(SyntaxError, self.check_encoding, "utf-8", extra)

    def test_file_utf8(self):
        self.check_encoding("utf8")

    def test_file_iso_8859_1(self):
        self.check_encoding("iso-8859-1")

    def test_file_latin_1(self):
        self.check_encoding("latin-1")

    def test_file_latin9(self):
        self.check_encoding("latin9")
144+
145+
if __name__ == "__main__":
    # Hack so that error messages containing non-ASCII can be printed:
    # set the private _encoding attribute on both standard streams
    # (stdout first, then stderr) before handing control to unittest.
    for stream in (sys.stdout, sys.stderr):
        stream._encoding = "utf-8"
    unittest.main()

Python/ast.c

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1292,7 +1292,7 @@ ast_for_atom(struct compiling *c, const node *n)
12921292
case STRING: {
12931293
PyObject *str = parsestrplus(c, n, &bytesmode);
12941294
if (!str) {
1295-
if (PyErr_ExceptionMatches(PyExc_UnicodeError)){
1295+
if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
12961296
PyObject *type, *value, *tback, *errstr;
12971297
PyErr_Fetch(&type, &value, &tback);
12981298
errstr = ((PyUnicodeErrorObject *)value)->reason;
@@ -3117,6 +3117,7 @@ decode_unicode(const char *s, size_t len, int rawmode, const char *encoding)
31173117
char *buf;
31183118
char *p;
31193119
const char *end;
3120+
31203121
if (encoding == NULL) {
31213122
buf = (char *)s;
31223123
u = NULL;
@@ -3218,7 +3219,7 @@ parsestr(const node *n, const char *encoding, int *bytesmode)
32183219
return NULL;
32193220
}
32203221
}
3221-
if (!*bytesmode) {
3222+
if (!*bytesmode && !rawmode) {
32223223
return decode_unicode(s, len, rawmode, encoding);
32233224
}
32243225
if (*bytesmode) {
@@ -3238,21 +3239,25 @@ parsestr(const node *n, const char *encoding, int *bytesmode)
32383239
if (rawmode || strchr(s, '\\') == NULL) {
32393240
if (need_encoding) {
32403241
PyObject *v, *u = PyUnicode_DecodeUTF8(s, len, NULL);
3241-
if (u == NULL)
3242-
return NULL;
3242+
if (u == NULL || !*bytesmode)
3243+
return u;
32433244
v = PyUnicode_AsEncodedString(u, encoding, NULL);
32443245
Py_DECREF(u);
32453246
return v;
3246-
} else {
3247+
} else if (*bytesmode) {
32473248
return PyString_FromStringAndSize(s, len);
3249+
} else if (strcmp(encoding, "utf-8") == 0) {
3250+
return PyUnicode_FromStringAndSize(s, len);
3251+
} else {
3252+
return PyUnicode_DecodeLatin1(s, len, NULL);
32483253
}
32493254
}
32503255

32513256
return PyString_DecodeEscape(s, len, NULL, 1,
32523257
need_encoding ? encoding : NULL);
32533258
}
32543259

3255-
/* Build a Python string object out of a STRING atom. This takes care of
3260+
/* Build a Python string object out of a STRING+ atom. This takes care of
32563261
* compile-time literal catenation, calling parsestr() on each piece, and
32573262
* pasting the intermediate results together.
32583263
*/
@@ -3272,8 +3277,7 @@ parsestrplus(struct compiling *c, const node *n, int *bytesmode)
32723277
if (s == NULL)
32733278
goto onError;
32743279
if (*bytesmode != subbm) {
3275-
ast_error(n, "cannot mix bytes and nonbytes"
3276-
"literals");
3280+
ast_error(n, "cannot mix bytes and nonbytes literals");
32773281
goto onError;
32783282
}
32793283
if (PyString_Check(v) && PyString_Check(s)) {

0 commit comments

Comments
 (0)