[Parser] Don't modify the current token kind when cutting off parsing

ahoppen · ahoppen · commit b888dc0e4050 · 2021-11-09T12:28:10.000+01:00
Previously, when we reached the maximum nesting level, we changed the current token’s kind to an EOF token. Many places in the parser are not set up to expect the current token’s kind to change underneath them. The intended workaround was to check whether pushing a structure marker failed (in which case the token kind had been changed) and to bail out of parsing if it did. This was fragile and caused assertion failures in assertion-enabled builds.

Instead of changing the current token’s kind and failing to push the structure marker, tell the lexer to cut off lexing, essentially making the input buffer end at the current position. The parser will still consume its current token (`Parser.Tok`) and the next token that the lexer has already lexed (`Lexer.NextToken`) before reaching the emulated EOF token. Thus two more tokens are parsed than before, but that shouldn’t make much of a difference.
diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h
@@ -133,6 +133,11 @@ class Lexer {
   /// the next token doesn't have a comment.
   const char *CommentStart;
 
+  /// If this is not \c nullptr, all tokens after this point are treated as eof.
+  /// Used to cut off lexing early when we detect that the nesting level is too
+  /// deep.
+  const char *LexerCutOffPoint = nullptr;
+
   Lexer(const Lexer&) = delete;
   void operator=(const Lexer&) = delete;
 
@@ -222,6 +227,28 @@ class Lexer {
     lexImpl();
   }
 
+  /// Cut off lexing at the current position. The next token to be lexed will
+  /// be an EOF token, even if there is still source code to be lexed.
+  /// The current and next token (returned by \c peekNextToken ) are not
+  /// modified. The token after \c NextToken will be the EOF token.
+  void cutOffLexing() {
+    // If we already have a cut off point, don't push it further towards the
+    // back.
+    if (LexerCutOffPoint == nullptr || LexerCutOffPoint >= CurPtr) {
+      LexerCutOffPoint = CurPtr;
+    }
+  }
+
+  /// If a lexer cut off point has been set, returns the buffer offset at which
+  /// lexing is cut off; otherwise returns \c None.
+  Optional<size_t> lexingCutOffOffset() const {
+    if (LexerCutOffPoint) {
+      return LexerCutOffPoint - BufferStart;
+    } else {
+      return None;
+    }
+  }
+
   bool isKeepingComments() const {
     return RetainComments == CommentRetentionMode::ReturnAsTokens;
   }
diff --git a/include/swift/Parse/Parser.h b/include/swift/Parse/Parser.h
@@ -221,12 +221,6 @@ class Parser {
   /// The location of the previous token.
   SourceLoc PreviousLoc;
 
-  /// Stop parsing immediately.
-  void cutOffParsing() {
-    // Cut off parsing by acting as if we reached the end-of-file.
-    Tok.setKind(tok::eof);
-  }
-
   /// Use this to assert that the parser has advanced the lexing location, e.g.
   /// before a specific parser function has returned.
   class AssertParserMadeProgressBeforeLeavingScopeRAII {
@@ -329,35 +323,21 @@ class Parser {
 
   /// An RAII object that notes when we have seen a structure marker.
   class StructureMarkerRAII {
-    Parser *const P;
+    Parser &P;
 
     /// Max nesting level
     // TODO: customizable.
     enum { MaxDepth = 256 };
 
-    StructureMarkerRAII(Parser *parser) : P(parser) {}
-
-    /// Have the parser start the new Structure or fail if already too deep.
-    bool pushStructureMarker(Parser &parser, SourceLoc loc,
-                             StructureMarkerKind kind);
+    StructureMarkerRAII(Parser &parser) : P(parser) {}
 
   public:
-    StructureMarkerRAII(Parser &parser, SourceLoc loc, StructureMarkerKind kind)
-        : StructureMarkerRAII(
-              pushStructureMarker(parser, loc, kind) ? &parser : nullptr) {}
+    StructureMarkerRAII(Parser &parser, SourceLoc loc,
+                        StructureMarkerKind kind);
 
     StructureMarkerRAII(Parser &parser, const Token &tok);
 
-    /// Did we fail to push the new structure?
-    bool isFailed() {
-      return P == nullptr;
-    }
-
-    ~StructureMarkerRAII() {
-      if (P != nullptr) {
-        P->StructureMarkers.pop_back();
-      }
-    }
+    ~StructureMarkerRAII() { P.StructureMarkers.pop_back(); }
   };
   friend class StructureMarkerRAII;
 
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
@@ -2349,7 +2349,11 @@ void Lexer::lexImpl() {
 
   // Remember the start of the token so we can form the text range.
   const char *TokStart = CurPtr;
-  
+
+  if (LexerCutOffPoint && CurPtr >= LexerCutOffPoint) {
+    return formToken(tok::eof, TokStart);
+  }
+
   switch (*CurPtr++) {
   default: {
     char const *Tmp = CurPtr-1;
diff --git a/lib/Parse/ParseExpr.cpp b/lib/Parse/ParseExpr.cpp
@@ -3121,10 +3121,6 @@ ParserStatus Parser::parseExprList(tok leftTok, tok rightTok,
                                    SourceLoc &rightLoc, SyntaxKind Kind) {
   StructureMarkerRAII ParsingExprList(*this, Tok);
   
-  if (ParsingExprList.isFailed()) {
-    return makeParserError();
-  }
-
   leftLoc = consumeToken(leftTok);
   return parseList(rightTok, leftLoc, rightLoc, /*AllowSepAfterLast=*/false,
                    rightTok == tok::r_paren ? diag::expected_rparen_expr_list
diff --git a/lib/Parse/ParseGeneric.cpp b/lib/Parse/ParseGeneric.cpp
@@ -57,10 +57,6 @@ Parser::parseGenericParametersBeforeWhere(SourceLoc LAngleLoc,
     // Note that we're parsing a declaration.
     StructureMarkerRAII ParsingDecl(*this, Tok.getLoc(),
                                     StructureMarkerKind::Declaration);
-    
-    if (ParsingDecl.isFailed()) {
-      return makeParserError();
-    }
 
     // Parse attributes.
     DeclAttributes attributes;
diff --git a/lib/Parse/ParsePattern.cpp b/lib/Parse/ParsePattern.cpp
@@ -1216,9 +1216,6 @@ ParserResult<Pattern> Parser::parsePatternTuple() {
   SyntaxParsingContext TuplePatternCtxt(SyntaxContext,
                                         SyntaxKind::TuplePattern);
   StructureMarkerRAII ParsingPatternTuple(*this, Tok);
-  if (ParsingPatternTuple.isFailed()) {
-    return makeParserError();
-  }
   SourceLoc LPLoc = consumeToken(tok::l_paren);
   SourceLoc RPLoc;
 
diff --git a/lib/Parse/ParseStmt.cpp b/lib/Parse/ParseStmt.cpp
@@ -1280,10 +1280,6 @@ ParserResult<PoundAvailableInfo> Parser::parseStmtConditionPoundAvailable() {
 
   StructureMarkerRAII ParsingAvailabilitySpecList(*this, Tok);
 
-  if (ParsingAvailabilitySpecList.isFailed()) {
-    return makeParserError();
-  }
-
   SourceLoc LParenLoc = consumeToken(tok::l_paren);
 
   SmallVector<AvailabilitySpec *, 5> Specs;
diff --git a/lib/Parse/ParseType.cpp b/lib/Parse/ParseType.cpp
@@ -1017,10 +1017,6 @@ ParserResult<TypeRepr> Parser::parseTypeTupleBody() {
   TypeContext.setCreateSyntax(SyntaxKind::TupleType);
   Parser::StructureMarkerRAII ParsingTypeTuple(*this, Tok);
 
-  if (ParsingTypeTuple.isFailed()) {
-    return makeParserError();
-  }
-
   SourceLoc RPLoc, LPLoc = consumeToken(tok::l_paren);
   SourceLoc EllipsisLoc;
   unsigned EllipsisIdx;
diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp
@@ -405,6 +405,9 @@ namespace {
 /// underlying corrected token stream.
 class TokenRecorder: public ConsumeTokenReceiver {
   ASTContext &Ctx;
+  /// The lexer that is being used to lex the source file. Used to query whether
+  /// lexing has been cut off.
+  Lexer &BaseLexer;
   unsigned BufferID;
 
   // Token list ordered by their appearance in the source file.
@@ -425,11 +428,19 @@ class TokenRecorder: public ConsumeTokenReceiver {
   void relexComment(CharSourceRange CommentRange,
                     llvm::SmallVectorImpl<Token> &Scratch) {
     auto &SM = Ctx.SourceMgr;
+    auto EndOffset = SM.getLocOffsetInBuffer(CommentRange.getEnd(), BufferID);
+    if (auto LexerCutOffOffset = BaseLexer.lexingCutOffOffset()) {
+      if (*LexerCutOffOffset < EndOffset) {
+        // If lexing was cut off due to a too deep nesting level, adjust the end
+        // offset to not point past the cut off point.
+        EndOffset = *LexerCutOffOffset;
+      }
+    }
     Lexer L(Ctx.LangOpts, SM, BufferID, nullptr, LexerMode::Swift,
             HashbangMode::Disallowed, CommentRetentionMode::ReturnAsTokens,
             TriviaRetentionMode::WithoutTrivia,
             SM.getLocOffsetInBuffer(CommentRange.getStart(), BufferID),
-            SM.getLocOffsetInBuffer(CommentRange.getEnd(), BufferID));
+            EndOffset);
     while(true) {
       Token Result;
       L.lex(Result);
@@ -441,8 +452,8 @@ class TokenRecorder: public ConsumeTokenReceiver {
   }
 
 public:
-  TokenRecorder(ASTContext &ctx, unsigned BufferID)
-      : Ctx(ctx), BufferID(BufferID) {}
+  TokenRecorder(ASTContext &ctx, Lexer &BaseLexer)
+      : Ctx(ctx), BaseLexer(BaseLexer), BufferID(BaseLexer.getBufferID()) {}
 
   Optional<std::vector<Token>> finalize() override {
     auto &SM = Ctx.SourceMgr;
@@ -516,19 +527,14 @@ class TokenRecorder: public ConsumeTokenReceiver {
 Parser::Parser(std::unique_ptr<Lexer> Lex, SourceFile &SF,
                SILParserStateBase *SIL, PersistentParserState *PersistentState,
                std::shared_ptr<SyntaxParseActions> SPActions)
-  : SourceMgr(SF.getASTContext().SourceMgr),
-    Diags(SF.getASTContext().Diags),
-    SF(SF),
-    L(Lex.release()),
-    SIL(SIL),
-    CurDeclContext(&SF),
-    Context(SF.getASTContext()),
-    TokReceiver(SF.shouldCollectTokens() ?
-                new TokenRecorder(SF.getASTContext(), L->getBufferID()) :
-                new ConsumeTokenReceiver()),
-    SyntaxContext(new SyntaxParsingContext(SyntaxContext, SF,
-                                           L->getBufferID(),
-                                           std::move(SPActions))) {
+    : SourceMgr(SF.getASTContext().SourceMgr), Diags(SF.getASTContext().Diags),
+      SF(SF), L(Lex.release()), SIL(SIL), CurDeclContext(&SF),
+      Context(SF.getASTContext()),
+      TokReceiver(SF.shouldCollectTokens()
+                      ? new TokenRecorder(SF.getASTContext(), *L)
+                      : new ConsumeTokenReceiver()),
+      SyntaxContext(new SyntaxParsingContext(
+          SyntaxContext, SF, L->getBufferID(), std::move(SPActions))) {
   State = PersistentState;
   if (!State) {
     OwnedState.reset(new PersistentParserState());
@@ -880,28 +886,25 @@ getStructureMarkerKindForToken(const Token &tok) {
   }
 }
 
-Parser::StructureMarkerRAII::StructureMarkerRAII(Parser &parser,
-                                                 const Token &tok)
-    : StructureMarkerRAII(parser, tok.getLoc(),
-                          getStructureMarkerKindForToken(tok)) {}
-
-bool Parser::StructureMarkerRAII::pushStructureMarker(
-                                      Parser &parser, SourceLoc loc,    
-                                      StructureMarkerKind kind) {
-  
-  if (parser.StructureMarkers.size() < MaxDepth) {
-    parser.StructureMarkers.push_back({loc, kind, None});
-    return true;
-  } else {
+Parser::StructureMarkerRAII::StructureMarkerRAII(Parser &parser, SourceLoc loc,
+                                                 StructureMarkerKind kind)
+    : StructureMarkerRAII(parser) {
+  parser.StructureMarkers.push_back({loc, kind, None});
+  if (parser.StructureMarkers.size() > MaxDepth) {
     parser.diagnose(loc, diag::structure_overflow, MaxDepth);
     // We need to cut off parsing or we will stack-overflow.
     // But `cutOffParsing` changes the current token to eof, and we may be in
     // a place where `consumeToken()` will be expecting e.g. '[',
     // since we need that to get to the callsite, so this can cause an assert.
-    parser.cutOffParsing();
-    return false;
+    parser.L->cutOffLexing();
   }
 }
+
+Parser::StructureMarkerRAII::StructureMarkerRAII(Parser &parser,
+                                                 const Token &tok)
+    : StructureMarkerRAII(parser, tok.getLoc(),
+                          getStructureMarkerKindForToken(tok)) {}
+
 //===----------------------------------------------------------------------===//
 // Primitive Parsing
 //===----------------------------------------------------------------------===//
diff --git a/validation-test/compiler_crashers_2_fixed/parser-cutoff.swift b/validation-test/compiler_crashers_2_fixed/parser-cutoff.swift