|
r"""Test correct treatment of various string literals by the parser.

There are four types of string literals:

    'abc'   -- normal str
    r'abc'  -- raw str
    b'xyz'  -- normal bytes
    br'xyz' -- raw bytes

The difference between normal and raw strings is of course that in a
raw string, \ escapes (while still used to determine the end of the
literal) are not interpreted, so that r'\x00' contains four
characters: a backslash, an x, and two zeros; while '\x00' contains a
single character (code point zero).

The tricky thing is what should happen when non-ASCII bytes are used
inside literals. For bytes literals, this is considered illegal. But
for str literals, those bytes are supposed to be decoded using the
encoding declared for the file (UTF-8 by default).

We have to test this with various file encodings. We also test it with
exec()/eval(), which uses a different code path.

This file is really about correct treatment of encodings and
backslashes. It doesn't concern itself with issues like single
vs. double quotes or singly- vs. triply-quoted strings: that's dealt
with elsewhere (I assume).
"""
| 29 | + |
| 30 | +import os |
| 31 | +import sys |
| 32 | +import shutil |
| 33 | +import tempfile |
| 34 | +import unittest |
| 35 | + |
| 36 | + |
# Source text of a small module, written to disk once per encoding under
# test.  The single %s is filled with the encoding name to form the
# "# coding:" declaration on the first line.  The literal is raw so that
# the backslash escapes reach the generated file untouched and are
# interpreted (or not) when that file is compiled.  Each assignment is
# followed by an assert that checks the code points the literal produced.
TEMPLATE = r"""# coding: %s
a = 'x'
assert ord(a) == 120
b = '\x01'
assert ord(b) == 1
c = r'\x01'
assert list(map(ord, c)) == [92, 120, 48, 49]
d = '\x81'
assert ord(d) == 0x81
e = r'\x81'
assert list(map(ord, e)) == [92, 120, 56, 49]
f = '\u1881'
assert ord(f) == 0x1881
g = r'\u1881'
assert list(map(ord, g)) == [92, 117, 49, 56, 56, 49]
"""
| 53 | + |
| 54 | + |
def byte(i):
    """Return a bytes object of length one whose only item is *i*.

    *i* is passed to bytes() inside a one-element list, so it must be an
    integer in range(256); bytes() raises ValueError otherwise.
    """
    return bytes([i])
| 57 | + |
| 58 | + |
class TestLiterals(unittest.TestCase):
    """Check how str and bytes literals are decoded.

    The test_eval_* methods pass literal source text to eval().  The
    test_file_* methods write TEMPLATE to a module file in a given
    encoding and import it, so the asserts inside the template run
    against the compiled file.
    """

    def setUp(self):
        # Remember sys.path and put a fresh scratch directory first so
        # that the modules written by check_encoding() can be imported
        # by name.
        self.save_path = sys.path[:]
        self.tmpdir = tempfile.mkdtemp()
        sys.path.insert(0, self.tmpdir)

    def tearDown(self):
        # Restore the contents in place rather than rebinding sys.path,
        # so that anything holding a reference to the list object keeps
        # seeing the real path.
        sys.path[:] = self.save_path
        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def test_template(self):
        # Check that the template doesn't contain any non-printables
        # except for \n.  A unittest assertion is used instead of a bare
        # assert statement so the check still runs under "python -O".
        for c in TEMPLATE:
            self.assertTrue(c == '\n' or ' ' <= c <= '~', repr(c))

    # In the eval tests below, an r"""...""" argument hands the backslash
    # escape to eval() unchanged, while a plain """...""" argument has the
    # escape resolved first, so eval() sees the resulting character inside
    # the inner literal.

    def test_eval_str_normal(self):
        self.assertEqual(eval(""" 'x' """), 'x')
        self.assertEqual(eval(r""" '\x01' """), chr(1))
        self.assertEqual(eval(""" '\x01' """), chr(1))
        self.assertEqual(eval(r""" '\x81' """), chr(0x81))
        self.assertEqual(eval(""" '\x81' """), chr(0x81))
        self.assertEqual(eval(r""" '\u1881' """), chr(0x1881))
        self.assertEqual(eval(""" '\u1881' """), chr(0x1881))

    def test_eval_str_raw(self):
        self.assertEqual(eval(""" r'x' """), 'x')
        self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01')
        self.assertEqual(eval(""" r'\x01' """), chr(1))
        self.assertEqual(eval(r""" r'\x81' """), '\\' + 'x81')
        self.assertEqual(eval(""" r'\x81' """), chr(0x81))
        self.assertEqual(eval(r""" r'\u1881' """), '\\' + 'u1881')
        self.assertEqual(eval(""" r'\u1881' """), chr(0x1881))

    def test_eval_bytes_normal(self):
        self.assertEqual(eval(""" b'x' """), b'x')
        self.assertEqual(eval(r""" b'\x01' """), byte(1))
        self.assertEqual(eval(""" b'\x01' """), byte(1))
        self.assertEqual(eval(r""" b'\x81' """), byte(0x81))
        # A non-ASCII character inside a bytes literal must be rejected.
        self.assertRaises(SyntaxError, eval, """ b'\x81' """)
        self.assertEqual(eval(r""" b'\u1881' """), b'\\' + b'u1881')
        self.assertRaises(SyntaxError, eval, """ b'\u1881' """)

    def test_eval_bytes_raw(self):
        self.assertEqual(eval(""" br'x' """), b'x')
        self.assertEqual(eval(r""" br'\x01' """), b'\\' + b'x01')
        self.assertEqual(eval(""" br'\x01' """), byte(1))
        self.assertEqual(eval(r""" br'\x81' """), b"\\" + b"x81")
        # A non-ASCII character inside a bytes literal must be rejected.
        self.assertRaises(SyntaxError, eval, """ br'\x81' """)
        self.assertEqual(eval(r""" br'\u1881' """), b"\\" + b"u1881")
        self.assertRaises(SyntaxError, eval, """ br'\u1881' """)

    def check_encoding(self, encoding, extra=""):
        """Write TEMPLATE (plus *extra*) as a module in *encoding* and import it.

        The module is named "xx_" followed by *encoding* with dashes
        replaced by underscores, and is created in self.tmpdir.  Any
        exception raised while writing or importing the module
        propagates to the caller.
        """
        modname = "xx_" + encoding.replace("-", "_")
        fn = os.path.join(self.tmpdir, modname + ".py")
        with open(fn, "w", encoding=encoding) as f:
            f.write(TEMPLATE % encoding)
            f.write(extra)
        __import__(modname)
        # Drop the module again so it does not stay cached in sys.modules.
        del sys.modules[modname]

    def test_file_utf_8(self):
        extra = "z = '\u1234'; assert ord(z) == 0x1234\n"
        self.check_encoding("utf-8", extra)

    def test_file_utf_8_error(self):
        # The extra line puts a non-ASCII character inside a bytes literal.
        extra = "b'\x80'\n"
        self.assertRaises(SyntaxError, self.check_encoding, "utf-8", extra)

    def test_file_utf8(self):
        self.check_encoding("utf8")

    def test_file_iso_8859_1(self):
        self.check_encoding("iso-8859-1")

    def test_file_latin_1(self):
        self.check_encoding("latin-1")

    def test_file_latin9(self):
        self.check_encoding("latin9")
| 144 | + |
if __name__ == "__main__":
    # Ask the standard text streams to encode as UTF-8 so that error
    # messages containing non-ASCII can be printed.  This goes through
    # the public reconfigure() method rather than assigning the private
    # _encoding attribute; the method is looked up defensively because
    # sys.stdout / sys.stderr may have been replaced by objects that do
    # not provide it.
    for stream in (sys.stdout, sys.stderr):
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding="utf-8")
    unittest.main()
0 commit comments