Skip to content

Commit 37f1652

Browse files
committed
Prototype regex literal AST and emission
With `-enable-experimental-string-processing`, start lexing `'` delimiters as regex literals (this is just a placeholder delimiter for now). The contents of the literal are passed to the libswift library, which returns either an error string to be emitted, or null on success. The libswift side isn't yet hooked up to the Swift regex parser, so for now we just emit a dummy diagnostic for regexes starting with quantifiers. If parsing succeeds, we build an AST node which will be emitted as an implicit call to an `init(_regexString:)` initializer of an in-scope `Regex` decl (which will eventually be a known stdlib decl).
1 parent c0f7143 commit 37f1652

28 files changed

+304
-42
lines changed

Diff for: include/swift/AST/DiagnosticsParse.def

+6-3
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,9 @@ ERROR(forbidden_interpolated_string,none,
9191
ERROR(forbidden_extended_escaping_string,none,
9292
"%0 cannot be an extended escaping string literal", (StringRef))
9393

94+
ERROR(regex_literal_parsing_error,none,
95+
"%0", (StringRef))
96+
9497
//------------------------------------------------------------------------------
9598
// MARK: Lexer diagnostics
9699
//------------------------------------------------------------------------------
@@ -108,9 +111,6 @@ ERROR(lex_unprintable_ascii_character,none,
108111
ERROR(lex_invalid_utf8,none,
109112
"invalid UTF-8 found in source file", ())
110113

111-
NOTE(lex_experimental_regex_strawperson,none,
112-
"'%0'", (StringRef))
113-
114114
ERROR(lex_single_quote_string,none,
115115
"single-quoted string literal found, use '\"'", ())
116116
ERROR(lex_invalid_curly_quote,none,
@@ -140,6 +140,9 @@ ERROR(lex_invalid_escape_delimiter,none,
140140
ERROR(lex_invalid_closing_delimiter,none,
141141
"too many '#' characters in closing delimiter", ())
142142

143+
ERROR(lex_unterminated_regex,none,
144+
"unterminated regex literal", ())
145+
143146
ERROR(lex_invalid_unicode_scalar,none,
144147
"invalid unicode scalar", ())
145148
ERROR(lex_unicode_escape_braces,none,

Diff for: include/swift/AST/DiagnosticsSema.def

+3
Original file line numberDiff line numberDiff line change
@@ -3668,6 +3668,9 @@ ERROR(builtin_string_literal_broken_proto,none,
36683668
ERROR(string_literal_broken_proto,none,
36693669
"protocol 'ExpressibleByStringLiteral' is broken", ())
36703670

3671+
ERROR(regex_decl_broken,none,
3672+
"cannot find 'Regex' type in scope", ())
3673+
36713674
// Array literals
36723675
ERROR(should_use_dictionary_literal,none,
36733676
"dictionary of type %0 cannot be %select{used|initialized}1 "

Diff for: include/swift/AST/Expr.h

+32-1
Original file line numberDiff line numberDiff line change
@@ -961,7 +961,38 @@ class InterpolatedStringLiteralExpr : public LiteralExpr {
961961
return E->getKind() == ExprKind::InterpolatedStringLiteral;
962962
}
963963
};
964-
964+
965+
/// A regular expression literal e.g '(a|c)*'.
966+
class RegexLiteralExpr : public LiteralExpr {
967+
SourceLoc Loc;
968+
StringRef RegexText;
969+
Expr *SemanticExpr;
970+
971+
RegexLiteralExpr(SourceLoc loc, StringRef regexText, Expr *semanticExpr,
972+
bool isImplicit)
973+
: LiteralExpr(ExprKind::RegexLiteral, isImplicit), Loc(loc),
974+
RegexText(regexText), SemanticExpr(semanticExpr) {}
975+
976+
public:
977+
static RegexLiteralExpr *createParsed(ASTContext &ctx, SourceLoc loc,
978+
StringRef regexText,
979+
Expr *semanticExpr);
980+
981+
/// Retrieve the raw regex text.
982+
StringRef getRegexText() const { return RegexText; }
983+
984+
/// Retrieve the semantic expression that the regex will be type-checked and
985+
/// emitted as.
986+
Expr *getSemanticExpr() const { return SemanticExpr; }
987+
void setSemanticExpr(Expr *expr) { SemanticExpr = expr; }
988+
989+
SourceRange getSourceRange() const { return Loc; }
990+
991+
static bool classof(const Expr *E) {
992+
return E->getKind() == ExprKind::RegexLiteral;
993+
}
994+
};
995+
965996
/// MagicIdentifierLiteralExpr - A magic identifier like #file which expands
966997
/// out to a literal at SILGen time.
967998
class MagicIdentifierLiteralExpr : public BuiltinLiteralExpr {

Diff for: include/swift/AST/ExprNodes.def

+1
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ ABSTRACT_EXPR(Literal, Expr)
7878
LITERAL_EXPR(MagicIdentifierLiteral, BuiltinLiteralExpr)
7979
EXPR_RANGE(BuiltinLiteral, BooleanLiteral, MagicIdentifierLiteral)
8080
LITERAL_EXPR(InterpolatedStringLiteral, LiteralExpr)
81+
LITERAL_EXPR(RegexLiteral, LiteralExpr)
8182
LITERAL_EXPR(ObjectLiteral, LiteralExpr)
8283
EXPR_RANGE(Literal, NilLiteral, ObjectLiteral)
8384
EXPR(DiscardAssignment, Expr)

Diff for: include/swift/AST/KnownIdentifiers.def

+4
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,10 @@ IDENTIFIER(pullback)
250250
IDENTIFIER(TangentVector)
251251
IDENTIFIER(zero)
252252

253+
// Regex literals
254+
IDENTIFIER(Regex)
255+
IDENTIFIER(_regexString)
256+
253257
// Distributed actors
254258
IDENTIFIER(transport)
255259
IDENTIFIER(using)

Diff for: include/swift/Parse/ExperimentalRegexBridging.h

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ extern "C" {
88
typedef const char *(* ParseRegexStrawperson)(const char *);
99

1010
void Parser_registerParseRegexStrawperson(ParseRegexStrawperson fn);
11+
bool Parser_hasParseRegexStrawperson();
1112

1213
#ifdef __cplusplus
1314
} // extern "C"

Diff for: include/swift/Parse/Lexer.h

+2
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,8 @@ class Lexer {
595595
void lexStringLiteral(unsigned CustomDelimiterLen = 0);
596596
void lexEscapedIdentifier();
597597

598+
void lexRegexLiteral(const char *TokStart);
599+
598600
void tryLexEditorPlaceholder();
599601
const char *findEndOfCurlyQuoteStringLiteral(const char *,
600602
bool EmitDiagnostics);

Diff for: include/swift/Parse/Parser.h

+1
Original file line numberDiff line numberDiff line change
@@ -1579,6 +1579,7 @@ class Parser {
15791579
ParserResult<Expr> parseExprSelector();
15801580
ParserResult<Expr> parseExprSuper();
15811581
ParserResult<Expr> parseExprStringLiteral();
1582+
ParserResult<Expr> parseExprRegexLiteral();
15821583

15831584
StringRef copyAndStripUnderscores(StringRef text);
15841585

Diff for: lib/AST/ASTDumper.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -1907,6 +1907,11 @@ class PrintExpr : public ExprVisitor<PrintExpr> {
19071907
E->getInitializer().dump(OS);
19081908
PrintWithColorRAII(OS, ParenthesisColor) << ')';
19091909
}
1910+
/// Dump a regex literal node, with its semantic expression as a nested child.
void visitRegexLiteralExpr(RegexLiteralExpr *E) {
  printCommon(E, "regex_literal_expr");
  // Break the line before dumping the child so the semantic expression is
  // laid out as a nested node instead of running on from this node's header.
  OS << '\n';
  printRec(E->getSemanticExpr());
  PrintWithColorRAII(OS, ParenthesisColor) << ')';
}
19101915

19111916
void visitObjectLiteralExpr(ObjectLiteralExpr *E) {
19121917
printCommon(E, "object_literal")

Diff for: lib/AST/ASTWalker.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -1152,6 +1152,15 @@ class Traversal : public ASTVisitor<Traversal, Expr*, Stmt*,
11521152
return E;
11531153
}
11541154

1155+
/// Walk the semantic expression of a regex literal and store the walked
/// result back on the node. Returns null if walking the semantic expression
/// produced null, otherwise returns \p E itself.
Expr *visitRegexLiteralExpr(RegexLiteralExpr *E) {
  auto *walkedSemanticExpr = doIt(E->getSemanticExpr());
  if (!walkedSemanticExpr)
    return nullptr;

  E->setSemanticExpr(walkedSemanticExpr);
  return E;
}
1163+
11551164
//===--------------------------------------------------------------------===//
11561165
// Everything Else
11571166
//===--------------------------------------------------------------------===//

Diff for: lib/AST/Expr.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ ConcreteDeclRef Expr::getReferencedDecl(bool stopAtParenExpr) const {
282282
NO_REFERENCE(BooleanLiteral);
283283
NO_REFERENCE(StringLiteral);
284284
NO_REFERENCE(InterpolatedStringLiteral);
285+
NO_REFERENCE(RegexLiteral);
285286
NO_REFERENCE(ObjectLiteral);
286287
NO_REFERENCE(MagicIdentifierLiteral);
287288
NO_REFERENCE(DiscardAssignment);
@@ -590,6 +591,7 @@ bool Expr::canAppendPostfixExpression(bool appendingPostfixOperator) const {
590591
case ExprKind::BooleanLiteral:
591592
case ExprKind::StringLiteral:
592593
case ExprKind::InterpolatedStringLiteral:
594+
case ExprKind::RegexLiteral:
593595
case ExprKind::MagicIdentifierLiteral:
594596
case ExprKind::ObjCSelector:
595597
case ExprKind::KeyPath:
@@ -815,6 +817,7 @@ bool Expr::isValidParentOfTypeExpr(Expr *typeExpr) const {
815817
case ExprKind::StringLiteral:
816818
case ExprKind::MagicIdentifierLiteral:
817819
case ExprKind::InterpolatedStringLiteral:
820+
case ExprKind::RegexLiteral:
818821
case ExprKind::ObjectLiteral:
819822
case ExprKind::DiscardAssignment:
820823
case ExprKind::DeclRef:
@@ -2187,6 +2190,13 @@ SourceLoc TapExpr::getEndLoc() const {
21872190
return SourceLoc();
21882191
}
21892192

2193+
/// Allocate a regex literal node in \p ctx for a literal that was written in
/// source, i.e the node is not marked implicit.
RegexLiteralExpr *RegexLiteralExpr::createParsed(ASTContext &ctx,
                                                 SourceLoc loc,
                                                 StringRef regexText,
                                                 Expr *semanticExpr) {
  auto *literal = new (ctx)
      RegexLiteralExpr(loc, regexText, semanticExpr, /*implicit*/ false);
  return literal;
}
2199+
21902200
void swift::simple_display(llvm::raw_ostream &out, const ClosureExpr *CE) {
21912201
if (!CE) {
21922202
out << "(null)";

Diff for: lib/Parse/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ add_swift_host_library(swiftParse STATIC
1818
ParseIfConfig.cpp
1919
ParsePattern.cpp
2020
Parser.cpp
21+
ParseRegex.cpp
2122
ParseRequests.cpp
2223
ParseStmt.cpp
2324
ParseType.cpp

Diff for: lib/Parse/Lexer.cpp

+41-20
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "swift/AST/Identifier.h"
2121
#include "swift/Basic/LangOptions.h"
2222
#include "swift/Basic/SourceManager.h"
23+
#include "swift/Parse/ExperimentalRegexBridging.h"
2324
#include "swift/Syntax/Trivia.h"
2425
#include "llvm/Support/Compiler.h"
2526
#include "llvm/Support/MathExtras.h"
@@ -30,13 +31,6 @@
3031
// FIXME: Figure out if this can be migrated to LLVM.
3132
#include "clang/Basic/CharInfo.h"
3233

33-
// Regex parser delivered via libSwift
34-
#include "swift/Parse/ExperimentalRegexBridging.h"
35-
static ParseRegexStrawperson parseRegexStrawperson = nullptr;
36-
void Parser_registerParseRegexStrawperson(ParseRegexStrawperson fn) {
37-
parseRegexStrawperson = fn;
38-
}
39-
4034
#include <limits>
4135

4236
using namespace swift;
@@ -1795,9 +1789,6 @@ static void validateMultilineIndents(const Token &Str,
17951789

17961790
/// Emit diagnostics for single-quote string and suggest replacement
17971791
/// with double-quoted equivalent.
1798-
///
1799-
/// Or, if we're in experimental regex mode, we will emit a custom
1800-
/// error message instead, determined by the Swift library.
18011792
void Lexer::diagnoseSingleQuoteStringLiteral(const char *TokStart,
18021793
const char *TokEnd) {
18031794
assert(*TokStart == '\'' && TokEnd[-1] == '\'');
@@ -1807,15 +1798,6 @@ void Lexer::diagnoseSingleQuoteStringLiteral(const char *TokStart,
18071798
auto startLoc = Lexer::getSourceLoc(TokStart);
18081799
auto endLoc = Lexer::getSourceLoc(TokEnd);
18091800

1810-
if (LangOpts.EnableExperimentalStringProcessing) {
1811-
if (parseRegexStrawperson) {
1812-
auto copy = std::string(TokStart, TokEnd-TokStart);
1813-
auto msg = parseRegexStrawperson(copy.c_str());
1814-
assert(msg != nullptr);
1815-
Diags->diagnose(startLoc, diag::lex_experimental_regex_strawperson, msg);
1816-
}
1817-
}
1818-
18191801
SmallString<32> replacement;
18201802
replacement.push_back('"');
18211803
const char *Ptr = TokStart + 1;
@@ -1969,6 +1951,37 @@ const char *Lexer::findEndOfCurlyQuoteStringLiteral(const char *Body,
19691951
}
19701952
}
19711953

1954+
void Lexer::lexRegexLiteral(const char *TokStart) {
1955+
assert(*TokStart == '\'');
1956+
1957+
bool HadError = false;
1958+
while (true) {
1959+
// Check if we reached the end of the literal without terminating.
1960+
if (CurPtr >= BufferEnd || *CurPtr == '\n' || *CurPtr == '\r') {
1961+
diagnose(TokStart, diag::lex_unterminated_regex);
1962+
return formToken(tok::unknown, TokStart);
1963+
}
1964+
1965+
const auto *CharStart = CurPtr;
1966+
uint32_t CharValue = validateUTF8CharacterAndAdvance(CurPtr, BufferEnd);
1967+
if (CharValue == ~0U) {
1968+
diagnose(CharStart, diag::lex_invalid_utf8);
1969+
HadError = true;
1970+
continue;
1971+
}
1972+
if (CharValue == '\\' && (*CurPtr == '\'' || *CurPtr == '\\')) {
1973+
// Skip escaped delimiter or \.
1974+
CurPtr++;
1975+
} else if (CharValue == '\'') {
1976+
// End of literal, stop.
1977+
break;
1978+
}
1979+
}
1980+
if (HadError)
1981+
return formToken(tok::unknown, TokStart);
1982+
1983+
formToken(tok::regex_literal, TokStart);
1984+
}
19721985

19731986
/// lexEscapedIdentifier:
19741987
/// identifier ::= '`' identifier '`'
@@ -2513,8 +2526,16 @@ void Lexer::lexImpl() {
25132526
case '5': case '6': case '7': case '8': case '9':
25142527
return lexNumber();
25152528

2516-
case '"':
25172529
case '\'':
2530+
// If we have experimental string processing enabled, and have the parsing
2531+
// logic for regex literals, lex a single quoted string as a regex literal.
2532+
if (LangOpts.EnableExperimentalStringProcessing &&
2533+
Parser_hasParseRegexStrawperson()) {
2534+
return lexRegexLiteral(TokStart);
2535+
}
2536+
// Otherwise lex as a string literal and emit a diagnostic.
2537+
LLVM_FALLTHROUGH;
2538+
case '"':
25182539
return lexStringLiteral();
25192540

25202541
case '`':

Diff for: lib/Parse/ParseExpr.cpp

+4-1
Original file line numberDiff line numberDiff line change
@@ -1545,7 +1545,10 @@ ParserResult<Expr> Parser::parseExprPrimary(Diag<> ID, bool isExprBasic) {
15451545

15461546
case tok::string_literal: // "foo"
15471547
return parseExprStringLiteral();
1548-
1548+
1549+
case tok::regex_literal:
1550+
return parseExprRegexLiteral();
1551+
15491552
case tok::kw_nil:
15501553
ExprContext.setCreateSyntax(SyntaxKind::NilLiteralExpr);
15511554
return makeParserResult(new (Context)

Diff for: lib/Parse/ParseRegex.cpp

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
//===--- ParseRegex.cpp - Regular expression literal parsing --------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
//
13+
// Regular expression literal parsing
14+
//
15+
//===----------------------------------------------------------------------===//
16+
17+
#include "swift/Parse/Parser.h"
18+
#include "swift/AST/DiagnosticsParse.h"
19+
#include "swift/Parse/ParsedSyntaxRecorder.h"
20+
#include "swift/Parse/SyntaxParsingContext.h"
21+
#include "swift/Syntax/SyntaxKind.h"
22+
23+
// Regex parser delivered via libSwift
24+
#include "swift/Parse/ExperimentalRegexBridging.h"
25+
static ParseRegexStrawperson parseRegexStrawperson = nullptr;
26+
void Parser_registerParseRegexStrawperson(ParseRegexStrawperson fn) {
27+
parseRegexStrawperson = fn;
28+
}
29+
// Exposes the presence of the regex parsing function to the lexer.
30+
bool Parser_hasParseRegexStrawperson() {
31+
return parseRegexStrawperson != nullptr;
32+
}
33+
34+
using namespace swift;
35+
using namespace swift::syntax;
36+
37+
ParserResult<Expr> Parser::parseExprRegexLiteral() {
38+
assert(Tok.is(tok::regex_literal));
39+
assert(parseRegexStrawperson);
40+
41+
SyntaxParsingContext LocalContext(SyntaxContext,
42+
SyntaxKind::RegexLiteralExpr);
43+
// Strip off delimiters.
44+
auto rawText = Tok.getText();
45+
assert(rawText.front() == '\'' && rawText.back() == '\'');
46+
auto regexText = rawText.slice(1, rawText.size() - 1);
47+
48+
// Let the Swift library parse the contents, returning an error, or null if
49+
// successful.
50+
// TODO: We need to be able to pass back a source location to emit the error
51+
// at.
52+
auto *errorStr = parseRegexStrawperson(regexText.str().c_str());
53+
if (errorStr)
54+
diagnose(Tok, diag::regex_literal_parsing_error, errorStr);
55+
56+
auto loc = consumeToken();
57+
58+
// Create an implicit .init(_regexString: "<regex text>") call to serve as the
59+
// semantic expression for the regex. The type-checker will provide it with
60+
// the correct contextual type. We delay the contextual type for a couple of
61+
// reasons:
62+
// 1. We need to delay type lookup until after parsing.
63+
// 2. Even if the AST synthesis were done lazily in e.g a request, we don't
64+
// currently have great support for implicit TypeExprs for unopened generic
65+
// types, as we want to phase out the use of UnboundGenericType. The Regex
66+
// type isn't currently generic, but might be in the future.
67+
auto *regexStringExpr =
68+
new (Context) StringLiteralExpr(Context.AllocateCopy(regexText), loc);
69+
regexStringExpr->setImplicit();
70+
71+
DeclName initName(Context, DeclBaseName::createConstructor(),
72+
{Context.Id__regexString});
73+
DeclNameRef initNameRef(initName);
74+
auto *dotInit = new (Context) UnresolvedMemberExpr(
75+
/*dotLoc*/ loc, DeclNameLoc(loc), initNameRef, /*implicit*/ true);
76+
77+
auto *args =
78+
ArgumentList::forImplicitCallTo(initNameRef, {regexStringExpr}, Context);
79+
auto *call = CallExpr::createImplicit(Context, dotInit, args);
80+
81+
return makeParserResult(
82+
RegexLiteralExpr::createParsed(Context, loc, regexText, call));
83+
}

0 commit comments

Comments
 (0)