Skip to content

Commit 96ec934

Browse files
committed
Issue #24619: Simplify async/await tokenization.
This commit simplifies async/await tokenization in tokenizer.c, tokenize.py & lib2to3/tokenize.py. The previous solution was to keep a stack of async-def & def blocks, whereas the new approach is simply to remember the position of the outermost async-def block. This change won't bring any parsing performance improvements, but it makes the code much easier to read and validate.
1 parent f315c1c commit 96ec934

File tree

7 files changed

+183
-132
lines changed

7 files changed

+183
-132
lines changed

Lib/lib2to3/pgen2/tokenize.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -366,10 +366,11 @@ def generate_tokens(readline):
366366
contline = None
367367
indents = [0]
368368

369-
# 'stashed' and 'ctx' are used for async/await parsing
369+
# 'stashed' and 'async_*' are used for async/await parsing
370370
stashed = None
371-
ctx = [('sync', 0)]
372-
in_async = 0
371+
async_def = False
372+
async_def_indent = 0
373+
async_def_nl = False
373374

374375
while 1: # loop over lines in stream
375376
try:
@@ -438,15 +439,18 @@ def generate_tokens(readline):
438439
("<tokenize>", lnum, pos, line))
439440
indents = indents[:-1]
440441

441-
cur_indent = indents[-1]
442-
while len(ctx) > 1 and ctx[-1][1] >= cur_indent:
443-
if ctx[-1][0] == 'async':
444-
in_async -= 1
445-
assert in_async >= 0
446-
ctx.pop()
442+
if async_def and async_def_indent >= indents[-1]:
443+
async_def = False
444+
async_def_nl = False
445+
async_def_indent = 0
447446

448447
yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
449448

449+
if async_def and async_def_nl and async_def_indent >= indents[-1]:
450+
async_def = False
451+
async_def_nl = False
452+
async_def_indent = 0
453+
450454
else: # continued statement
451455
if not line:
452456
raise TokenError("EOF in multi-line statement", (lnum, 0))
@@ -466,10 +470,13 @@ def generate_tokens(readline):
466470
newline = NEWLINE
467471
if parenlev > 0:
468472
newline = NL
473+
elif async_def:
474+
async_def_nl = True
469475
if stashed:
470476
yield stashed
471477
stashed = None
472478
yield (newline, token, spos, epos, line)
479+
473480
elif initial == '#':
474481
assert not token.endswith("\n")
475482
if stashed:
@@ -508,7 +515,7 @@ def generate_tokens(readline):
508515
yield (STRING, token, spos, epos, line)
509516
elif initial in namechars: # ordinary name
510517
if token in ('async', 'await'):
511-
if in_async:
518+
if async_def:
512519
yield (ASYNC if token == 'async' else AWAIT,
513520
token, spos, epos, line)
514521
continue
@@ -523,15 +530,13 @@ def generate_tokens(readline):
523530
and stashed[0] == NAME
524531
and stashed[1] == 'async'):
525532

526-
ctx.append(('async', indents[-1]))
527-
in_async += 1
533+
async_def = True
534+
async_def_indent = indents[-1]
528535

529536
yield (ASYNC, stashed[1],
530537
stashed[2], stashed[3],
531538
stashed[4])
532539
stashed = None
533-
else:
534-
ctx.append(('sync', indents[-1]))
535540

536541
if stashed:
537542
yield stashed

Lib/lib2to3/tests/test_parser.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,32 @@ def test_await_expr(self):
6767
await x
6868
""")
6969

70+
self.validate("""async def foo():
71+
72+
def foo(): pass
73+
74+
def foo(): pass
75+
76+
await x
77+
""")
78+
79+
self.validate("""async def foo(): return await a""")
80+
81+
self.validate("""def foo():
82+
def foo(): pass
83+
async def foo(): await x
84+
""")
85+
7086
self.invalid_syntax("await x")
7187
self.invalid_syntax("""def foo():
7288
await x""")
7389

90+
self.invalid_syntax("""def foo():
91+
def foo(): pass
92+
async def foo(): pass
93+
await x
94+
""")
95+
7496
def test_async_var(self):
7597
self.validate("""async = 1""")
7698
self.validate("""await = 1""")

Lib/test/test_coroutines.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,7 @@ async def bar(): return await_
330330
async def f():
331331
async def g(): pass
332332
await z
333+
await = 1
333334
self.assertTrue(inspect.iscoroutinefunction(f))
334335

335336

Lib/test/test_tokenize.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -840,6 +840,79 @@
840840
OP ')' (1, 19) (1, 20)
841841
OP ':' (1, 20) (1, 21)
842842
AWAIT 'await' (1, 22) (1, 27)
843+
844+
>>> dump_tokens('''def f():
845+
...
846+
... def baz(): pass
847+
... async def bar(): pass
848+
...
849+
... await = 2''')
850+
ENCODING 'utf-8' (0, 0) (0, 0)
851+
NAME 'def' (1, 0) (1, 3)
852+
NAME 'f' (1, 4) (1, 5)
853+
OP '(' (1, 5) (1, 6)
854+
OP ')' (1, 6) (1, 7)
855+
OP ':' (1, 7) (1, 8)
856+
NEWLINE '\\n' (1, 8) (1, 9)
857+
NL '\\n' (2, 0) (2, 1)
858+
INDENT ' ' (3, 0) (3, 2)
859+
NAME 'def' (3, 2) (3, 5)
860+
NAME 'baz' (3, 6) (3, 9)
861+
OP '(' (3, 9) (3, 10)
862+
OP ')' (3, 10) (3, 11)
863+
OP ':' (3, 11) (3, 12)
864+
NAME 'pass' (3, 13) (3, 17)
865+
NEWLINE '\\n' (3, 17) (3, 18)
866+
ASYNC 'async' (4, 2) (4, 7)
867+
NAME 'def' (4, 8) (4, 11)
868+
NAME 'bar' (4, 12) (4, 15)
869+
OP '(' (4, 15) (4, 16)
870+
OP ')' (4, 16) (4, 17)
871+
OP ':' (4, 17) (4, 18)
872+
NAME 'pass' (4, 19) (4, 23)
873+
NEWLINE '\\n' (4, 23) (4, 24)
874+
NL '\\n' (5, 0) (5, 1)
875+
NAME 'await' (6, 2) (6, 7)
876+
OP '=' (6, 8) (6, 9)
877+
NUMBER '2' (6, 10) (6, 11)
878+
DEDENT '' (7, 0) (7, 0)
879+
880+
>>> dump_tokens('''async def f():
881+
...
882+
... def baz(): pass
883+
... async def bar(): pass
884+
...
885+
... await = 2''')
886+
ENCODING 'utf-8' (0, 0) (0, 0)
887+
ASYNC 'async' (1, 0) (1, 5)
888+
NAME 'def' (1, 6) (1, 9)
889+
NAME 'f' (1, 10) (1, 11)
890+
OP '(' (1, 11) (1, 12)
891+
OP ')' (1, 12) (1, 13)
892+
OP ':' (1, 13) (1, 14)
893+
NEWLINE '\\n' (1, 14) (1, 15)
894+
NL '\\n' (2, 0) (2, 1)
895+
INDENT ' ' (3, 0) (3, 2)
896+
NAME 'def' (3, 2) (3, 5)
897+
NAME 'baz' (3, 6) (3, 9)
898+
OP '(' (3, 9) (3, 10)
899+
OP ')' (3, 10) (3, 11)
900+
OP ':' (3, 11) (3, 12)
901+
NAME 'pass' (3, 13) (3, 17)
902+
NEWLINE '\\n' (3, 17) (3, 18)
903+
ASYNC 'async' (4, 2) (4, 7)
904+
NAME 'def' (4, 8) (4, 11)
905+
NAME 'bar' (4, 12) (4, 15)
906+
OP '(' (4, 15) (4, 16)
907+
OP ')' (4, 16) (4, 17)
908+
OP ':' (4, 17) (4, 18)
909+
NAME 'pass' (4, 19) (4, 23)
910+
NEWLINE '\\n' (4, 23) (4, 24)
911+
NL '\\n' (5, 0) (5, 1)
912+
AWAIT 'await' (6, 2) (6, 7)
913+
OP '=' (6, 8) (6, 9)
914+
NUMBER '2' (6, 10) (6, 11)
915+
DEDENT '' (7, 0) (7, 0)
843916
"""
844917

845918
from test import support

Lib/tokenize.py

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -498,10 +498,11 @@ def _tokenize(readline, encoding):
498498
contline = None
499499
indents = [0]
500500

501-
# 'stashed' and 'ctx' are used for async/await parsing
501+
# 'stashed' and 'async_*' are used for async/await parsing
502502
stashed = None
503-
ctx = [('sync', 0)]
504-
in_async = 0
503+
async_def = False
504+
async_def_indent = 0
505+
async_def_nl = False
505506

506507
if encoding is not None:
507508
if encoding == "utf-8-sig":
@@ -579,15 +580,18 @@ def _tokenize(readline, encoding):
579580
("<tokenize>", lnum, pos, line))
580581
indents = indents[:-1]
581582

582-
cur_indent = indents[-1]
583-
while len(ctx) > 1 and ctx[-1][1] >= cur_indent:
584-
if ctx[-1][0] == 'async':
585-
in_async -= 1
586-
assert in_async >= 0
587-
ctx.pop()
583+
if async_def and async_def_indent >= indents[-1]:
584+
async_def = False
585+
async_def_nl = False
586+
async_def_indent = 0
588587

589588
yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
590589

590+
if async_def and async_def_nl and async_def_indent >= indents[-1]:
591+
async_def = False
592+
async_def_nl = False
593+
async_def_indent = 0
594+
591595
else: # continued statement
592596
if not line:
593597
raise TokenError("EOF in multi-line statement", (lnum, 0))
@@ -609,8 +613,13 @@ def _tokenize(readline, encoding):
609613
if stashed:
610614
yield stashed
611615
stashed = None
612-
yield TokenInfo(NL if parenlev > 0 else NEWLINE,
613-
token, spos, epos, line)
616+
if parenlev > 0:
617+
yield TokenInfo(NL, token, spos, epos, line)
618+
else:
619+
yield TokenInfo(NEWLINE, token, spos, epos, line)
620+
if async_def:
621+
async_def_nl = True
622+
614623
elif initial == '#':
615624
assert not token.endswith("\n")
616625
if stashed:
@@ -644,7 +653,7 @@ def _tokenize(readline, encoding):
644653
yield TokenInfo(STRING, token, spos, epos, line)
645654
elif initial.isidentifier(): # ordinary name
646655
if token in ('async', 'await'):
647-
if in_async:
656+
if async_def:
648657
yield TokenInfo(
649658
ASYNC if token == 'async' else AWAIT,
650659
token, spos, epos, line)
@@ -660,15 +669,13 @@ def _tokenize(readline, encoding):
660669
and stashed.type == NAME
661670
and stashed.string == 'async'):
662671

663-
ctx.append(('async', indents[-1]))
664-
in_async += 1
672+
async_def = True
673+
async_def_indent = indents[-1]
665674

666675
yield TokenInfo(ASYNC, stashed.string,
667676
stashed.start, stashed.end,
668677
stashed.line)
669678
stashed = None
670-
else:
671-
ctx.append(('sync', indents[-1]))
672679

673680
if stashed:
674681
yield stashed

0 commit comments

Comments
 (0)