Skip to content

Commit 60d12b3

Browse files
committed
parser: implement string escapes
1 parent 6125042 commit 60d12b3

File tree

6 files changed

+312
-18
lines changed

6 files changed

+312
-18
lines changed

notes.txt

+9-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@ Limitations
1111

1212
FIXME parsing 00007 succeeds, whereas it should raise an error
1313

14-
FIXME "\tstring\n" isn't lexing properly :-(
15-
1614
FIXME interesting crash with compiling `int("42"), sausage=11)`
1715
Compile error: SystemError: [interface conversion: *ast.Call is not ast.SetCtxer: missing method SetCtx]
1816

@@ -88,6 +86,15 @@ Arg struct {
8886

8987
And pass args using []Arg instead of StringDict
9088

89+
Compiler
90+
========
91+
92+
Complete but without optimisation.
93+
94+
Easy wins
95+
* Constant folding, e.g. -1 is currently compiled as LOAD_CONST 1; NEGATE
96+
* Jump optimisation - compiler emits lots of jumps to jumps
97+
9198
Testing
9299
=======
93100

parser/lexer.go

+6-1
Original file line numberDiff line numberDiff line change
@@ -818,7 +818,12 @@ found:
818818
}
819819
foundEndOfString:
820820
if !rawString {
821-
// FIXME expand / sequences
821+
var err error
822+
buf, err = DecodeEscape(buf, byteString)
823+
if err != nil {
824+
x.Errorf("Decode error: %v", err)
825+
return eofError, nil
826+
}
822827
}
823828
if byteString {
824829
return STRING, py.Bytes(buf.Bytes())

parser/lexer_test.go

+14-14
Original file line numberDiff line numberDiff line change
@@ -546,49 +546,49 @@ func TestLexerReadString(t *testing.T) {
546546
{``, eof, nil, ``},
547547
{`1`, eof, nil, `1`},
548548

549-
{`""a`, STRING, py.String(``), `a`},
550-
{`u"abc"`, STRING, py.String(`abc`), ``},
551-
{`"a\nc"`, STRING, py.String(`a\nc`), ``},
549+
{`""a`, STRING, py.String(""), `a`},
550+
{`u"abc"`, STRING, py.String("abc"), ``},
551+
{`"a\nc"`, STRING, py.String("a\nc"), ``},
552552
{`r"a\nc"`, STRING, py.String(`a\nc`), ``},
553-
{`"a\"c"`, STRING, py.String(`a\"c`), ``},
554-
{`"a\\"+`, STRING, py.String(`a\\`), `+`},
555-
{`"a`, eofError, nil, `a`},
553+
{`"a\"c"`, STRING, py.String("a\"c"), ``},
554+
{`"a\\"+`, STRING, py.String("a\\"), `+`},
555+
{`"a`, eofError, nil, "a"},
556556
{"\"a\n", eofError, nil, "a\n"},
557557
{"\"a\\\nb\"c", STRING, py.String(`ab`), `c`},
558558

559559
{`''a`, STRING, py.String(``), `a`},
560560
{`U'abc'`, STRING, py.String(`abc`), ``},
561-
{`'a\nc'`, STRING, py.String(`a\nc`), ``},
561+
{`'a\nc'`, STRING, py.String("a\nc"), ``},
562562
{`R'a\nc'`, STRING, py.String(`a\nc`), ``},
563-
{`'a\'c'`, STRING, py.String(`a\'c`), ``},
563+
{`'a\'c'`, STRING, py.String("a'c"), ``},
564564
{`'\n`, eofError, nil, `\n`},
565565
{`'a`, eofError, nil, `a`},
566566
{"'\\\n\\\npotato\\\nX\\\n'c", STRING, py.String(`potatoX`), `c`},
567567

568568
{`""""""a`, STRING, py.String(``), `a`},
569569
{`u"""abc"""`, STRING, py.String(`abc`), ``},
570-
{`"""a\nc"""`, STRING, py.String(`a\nc`), ``},
570+
{`"""a\nc"""`, STRING, py.String("a\nc"), ``},
571571
{`r"""a\"""c"""`, STRING, py.String(`a\"""c`), ``},
572-
{`"""a\"""c"""`, STRING, py.String(`a\"""c`), ``},
572+
{`"""a\"""c"""`, STRING, py.String(`a"""c`), ``},
573573
{`"""a`, eofError, nil, `a`},
574574
{"\"\"\"a\nb\nc\n\"\"\"\n", STRING, py.String("a\nb\nc\n"), "\n"},
575575
{"\"\"\"a\nb\nc\na", eofError, nil, "a"},
576576
{"\"\"\"a\\\nb\"\"\"c", STRING, py.String(`ab`), `c`},
577577

578578
{`''''''a`, STRING, py.String(``), `a`},
579579
{`U'''abc'''`, STRING, py.String(`abc`), ``},
580-
{`'''a\nc'''`, STRING, py.String(`a\nc`), ``},
580+
{`'''a\nc'''`, STRING, py.String("a\nc"), ``},
581581
{`R'''a\nc'''`, STRING, py.String(`a\nc`), ``},
582-
{`'''a\'''c'''`, STRING, py.String(`a\'''c`), ``},
582+
{`'''a\'''c'''`, STRING, py.String(`a'''c`), ``},
583583
{`'''a`, eofError, nil, `a`},
584584
{"'''a\nb\nc\n'''\n", STRING, py.String("a\nb\nc\n"), "\n"},
585585
{"'''a\nb\nc\na", eofError, nil, "a"},
586586
{"'''\\\na\\\nb\\\n'''c", STRING, py.String(`ab`), `c`},
587587

588588
{`b""a`, STRING, py.Bytes{}, "a"},
589589
{`b'abc'`, STRING, py.Bytes(string(`abc`)), ``},
590-
{`B"""a\nc"""`, STRING, py.Bytes(string(`a\nc`)), ``},
591-
{`B'''a\"c'''`, STRING, py.Bytes(string(`a\"c`)), ``},
590+
{`B"""a\nc"""`, STRING, py.Bytes(string("a\nc")), ``},
591+
{`B'''a\"c'''`, STRING, py.Bytes(string(`a"c`)), ``},
592592

593593
{`rb""a`, STRING, py.Bytes{}, "a"},
594594
{`bR'abc'`, STRING, py.Bytes(string(`abc`)), ``},

parser/stringescape.go

+144
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
package parser
2+
3+
import (
4+
"bytes"
5+
"strconv"
6+
7+
"github.com/ncw/gpython/py"
8+
)
9+
10+
// DecodeEscape unescapes a backslash-escaped buffer
11+
//
12+
// byteMode indicates whether we are creating a unicode string or a bytes output
13+
func DecodeEscape(in *bytes.Buffer, byteMode bool) (out *bytes.Buffer, err error) {
14+
// Early exit if no escape sequences
15+
// NB in.Bytes() is cheap
16+
inBytes := in.Bytes()
17+
if bytes.IndexRune(inBytes, '\\') < 0 {
18+
return in, nil
19+
}
20+
out = new(bytes.Buffer)
21+
runes := bytes.Runes(inBytes)
22+
decodeHex := func(what byte, i, size int) error {
23+
i++
24+
if i+size <= len(runes) {
25+
cout, err := strconv.ParseInt(string(runes[i:i+size]), 16, 32)
26+
if err != nil {
27+
return py.ExceptionNewf(py.ValueError, "invalid \\%c escape at position %d", what, i-2)
28+
}
29+
if byteMode {
30+
out.WriteByte(byte(cout))
31+
} else {
32+
out.WriteRune(rune(cout))
33+
}
34+
} else {
35+
return py.ExceptionNewf(py.ValueError, "truncated \\%c escape at position %d", what, i-2)
36+
}
37+
return nil
38+
}
39+
ignoreEscape := false
40+
for i := 0; i < len(runes); i++ {
41+
c := runes[i]
42+
if c != '\\' {
43+
out.WriteRune(c)
44+
continue
45+
}
46+
i++
47+
if i >= len(runes) {
48+
return nil, py.ExceptionNewf(py.ValueError, "Trailing \\ in string")
49+
}
50+
c = runes[i]
51+
switch c {
52+
case '\n':
53+
case '\\':
54+
out.WriteRune('\\')
55+
case '\'':
56+
out.WriteRune('\'')
57+
case '"':
58+
out.WriteRune('"')
59+
case 'b':
60+
out.WriteRune('\b')
61+
case 'f':
62+
out.WriteRune('\014') // FF
63+
case 't':
64+
out.WriteRune('\t')
65+
case 'n':
66+
out.WriteRune('\n')
67+
case 'r':
68+
out.WriteRune('\r')
69+
case 'v':
70+
out.WriteRune('\013') // VT
71+
case 'a':
72+
out.WriteRune('\007') // BEL, not classic C
73+
case '0', '1', '2', '3', '4', '5', '6', '7':
74+
// 1 to 3 characters of octal escape
75+
cout := c - '0'
76+
if i+1 < len(runes) && '0' <= runes[i+1] && runes[i+1] <= '7' {
77+
i++
78+
cout = (cout << 3) + runes[i] - '0'
79+
if i+1 < len(runes) && '0' <= runes[i+1] && runes[i+1] <= '7' {
80+
i++
81+
cout = (cout << 3) + runes[i] - '0'
82+
}
83+
}
84+
if byteMode {
85+
out.WriteByte(byte(cout))
86+
} else {
87+
out.WriteRune(cout)
88+
}
89+
case 'x':
90+
// \xhh exactly 2 characters of hex
91+
err = decodeHex('x', i, 2)
92+
if err != nil {
93+
return nil, err
94+
}
95+
i += 2
96+
// FIXME In a bytes literal, hexadecimal and
97+
// octal escapes denote the byte with the
98+
// given value. In a string literal, these
99+
// escapes denote a Unicode character with the
100+
// given value.
101+
case 'u':
102+
// \uxxxx Character with 16-bit hex value xxxx - 4 characters required
103+
if byteMode {
104+
ignoreEscape = true
105+
break
106+
}
107+
err = decodeHex('u', i, 4)
108+
if err != nil {
109+
return nil, err
110+
}
111+
i += 4
112+
case 'U':
113+
// \Uxxxxxxxx Character with 32-bit hex value xxxxxxxx - 8 characters required
114+
if byteMode {
115+
ignoreEscape = true
116+
break
117+
}
118+
119+
err = decodeHex('U', i, 8)
120+
if err != nil {
121+
return nil, err
122+
}
123+
i += 8
124+
case 'N':
125+
// \N{name} Character named name in the Unicode database
126+
if byteMode {
127+
ignoreEscape = true
128+
break
129+
}
130+
// FIXME go can't do this as builtin so ignore for the moment
131+
ignoreEscape = true
132+
default:
133+
ignoreEscape = true
134+
break
135+
}
136+
// ignore unrecognised escape
137+
if ignoreEscape {
138+
i--
139+
out.WriteRune('\\')
140+
ignoreEscape = false
141+
}
142+
}
143+
return out, nil
144+
}

parser/stringescape_test.go

+138
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
package parser
2+
3+
import (
4+
"bytes"
5+
"testing"
6+
7+
"github.com/ncw/gpython/py"
8+
)
9+
10+
func TestDecodeEscape(t *testing.T) {
11+
for _, test := range []struct {
12+
in string
13+
want string
14+
errString string
15+
byteMode bool
16+
}{
17+
// Stringmode tests
18+
{``, ``, "", false},
19+
{`Potato`, `Potato`, "", false},
20+
{`Potato\`, ``, `Trailing \ in string`, false},
21+
{`\Potato`, `\Potato`, "", false},
22+
{`n\\`, `n\`, "", false},
23+
{`\'x`, `'x`, "", false},
24+
{`\"`, `"`, "", false},
25+
{"\\\n", ``, "", false},
26+
{`\b`, "\010", "", false},
27+
{`\f`, "\014", "", false},
28+
{`\t`, "\011", "", false},
29+
{`\n`, "\012", "", false},
30+
{`\r`, "\015", "", false},
31+
{`\v`, "\013", "", false},
32+
{`\a`, "\007", "", false},
33+
{`\1`, "\001", "", false},
34+
{`\12`, "\012", "", false},
35+
{`\123`, "\123", "", false},
36+
{`\777`, "\u01ff", "", false},
37+
{`\1\12\123\1234`, "\001\012\123\123" + "4", "", false},
38+
{`a\1a\12a\123a`, "a\001a\012a\123a", "", false},
39+
{`\x`, "", `truncated \x escape at position 0`, false},
40+
{`\x1`, "", `truncated \x escape at position 0`, false},
41+
{`\x11`, "\x11", "", false},
42+
{`\xzz`, "", `invalid \x escape at position 0`, false},
43+
{`{\x11}`, "{\x11}", "", false},
44+
{`\x01\x8a\xff`, "\x01\u008a\u00ff", "", false},
45+
{`\x01\x8A\xFF`, "\x01\u008a\u00ff", "", false},
46+
{`\u`, "", `truncated \u escape at position 0`, false},
47+
{`\u1`, "", `truncated \u escape at position 0`, false},
48+
{`\u12`, "", `truncated \u escape at position 0`, false},
49+
{`z\u134`, "", `truncated \u escape at position 1`, false},
50+
{`\u1234`, "\u1234", "", false},
51+
{`z\uzzzz`, "", `invalid \u escape at position 1`, false},
52+
{`{\u1234}`, "{\u1234}", "", false},
53+
{`\U00000001\U0000018a\U000012ff`, "\U00000001\U0000018a\U000012ff", "", false},
54+
{`\U00000001\U0000018A\U000012FF`, "\U00000001\U0000018a\U000012ff", "", false},
55+
{`\U0000`, "", `truncated \U escape at position 0`, false},
56+
{`\U00001`, "", `truncated \U escape at position 0`, false},
57+
{`\U000012`, "", `truncated \U escape at position 0`, false},
58+
{`z\U0000134`, "", `truncated \U escape at position 1`, false},
59+
{`\U00001234`, "\U00001234", "", false},
60+
{`z\Uzzzzzzzz`, "", `invalid \U escape at position 1`, false},
61+
{`{\U00001234}`, "{\U00001234}", "", false},
62+
{`\U00000001\U0000018a\U000012ff`, "\U00000001\U0000018a\U000012ff", "", false},
63+
{`\U00000001\U0000018A\U000012FF`, "\U00000001\U0000018a\U000012ff", "", false},
64+
{`\N{potato}`, `\N{potato}`, "", false},
65+
66+
// Bytemode tests
67+
{``, ``, "", true},
68+
{`Potato`, `Potato`, "", true},
69+
{`Potato\`, ``, `Trailing \ in string`, true},
70+
{`\Potato`, `\Potato`, "", true},
71+
{`n\\`, `n\`, "", true},
72+
{`\'x`, `'x`, "", true},
73+
{`\"`, `"`, "", true},
74+
{"\\\n", ``, "", true},
75+
{`\b`, "\010", "", true},
76+
{`\f`, "\014", "", true},
77+
{`\t`, "\011", "", true},
78+
{`\n`, "\012", "", true},
79+
{`\r`, "\015", "", true},
80+
{`\v`, "\013", "", true},
81+
{`\a`, "\007", "", true},
82+
{`\1`, "\001", "", true},
83+
{`\12`, "\012", "", true},
84+
{`\123`, "\123", "", true},
85+
{`\777`, "\xff", "", true},
86+
{`\1\12\123\1234`, "\001\012\123\123" + "4", "", true},
87+
{`a\1a\12a\123a`, "a\001a\012a\123a", "", true},
88+
{`\x`, "", `truncated \x escape at position 0`, true},
89+
{`\x1`, "", `truncated \x escape at position 0`, true},
90+
{`\x11`, "\x11", "", true},
91+
{`\xzz`, "", `invalid \x escape at position 0`, true},
92+
{`{\x11}`, "{\x11}", "", true},
93+
{`\x01\x8a\xff`, "\x01\x8a\xff", "", true},
94+
{`\x01\x8A\xFF`, "\x01\x8a\xff", "", true},
95+
{`\u`, `\u`, "", true},
96+
{`\u1`, `\u1`, "", true},
97+
{`\u12`, `\u12`, "", true},
98+
{`z\u134`, `z\u134`, "", true},
99+
{`\u1234`, `\u1234`, "", true},
100+
{`z\uzzzz`, `z\uzzzz`, "", true},
101+
{`{\u1234}`, `{\u1234}`, "", true},
102+
{`\U00000001\U0000018a\U000012ff`, `\U00000001\U0000018a\U000012ff`, "", true},
103+
{`\U00000001\U0000018A\U000012FF`, `\U00000001\U0000018A\U000012FF`, "", true},
104+
{`\U0000`, `\U0000`, "", true},
105+
{`\U00001`, `\U00001`, "", true},
106+
{`\U000012`, `\U000012`, "", true},
107+
{`z\U0000134`, `z\U0000134`, "", true},
108+
{`\U00001234`, `\U00001234`, "", true},
109+
{`z\Uzzzzzzzz`, `z\Uzzzzzzzz`, "", true},
110+
{`{\U00001234}`, `{\U00001234}`, "", true},
111+
{`\U00000001\U0000018a\U000012ff`, `\U00000001\U0000018a\U000012ff`, "", true},
112+
{`\U00000001\U0000018A\U000012FF`, `\U00000001\U0000018A\U000012FF`, "", true},
113+
{`\N{potato}`, `\N{potato}`, "", true},
114+
} {
115+
in := bytes.NewBufferString(test.in)
116+
out, err := DecodeEscape(in, test.byteMode)
117+
if err != nil {
118+
if test.errString == "" {
119+
t.Errorf("%q: not expecting error but got: %v", test.in, err)
120+
} else {
121+
exc := err.(*py.Exception)
122+
args := exc.Args.(py.Tuple)
123+
if string(args[0].(py.String)) != test.errString {
124+
t.Errorf("%q: want error %q but got %q", test.in, test.errString, args[0])
125+
}
126+
}
127+
continue
128+
}
129+
if test.errString != "" {
130+
t.Errorf("%q: expecting error but didn't get one", test.in)
131+
continue
132+
}
133+
got := out.String()
134+
if test.want != got {
135+
t.Errorf("%q: want %q but got %q", test.in, test.want, got)
136+
}
137+
}
138+
}

py/tests/int.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def assertRaises(expecting, s, base=None):
9090
doc="whitespace"
9191
assert int(" +100000", 0) == +tenE5
9292
assert int("+100000 ", 0) == +tenE5
93-
# FIXME broken in lexer? assert int("\t\t\t\t100000\t\t\t\t", 0) == tenE5
93+
assert int("\t\t\t\t100000\t\t\t\t", 0) == tenE5
9494
assert int(" 100000 ", 0) == tenE5
9595

9696
doc="sigils"

0 commit comments

Comments
 (0)