@@ -2048,6 +2048,14 @@ const char *Lexer::tryScanRegexLiteral(const char *TokStart, bool MustBeRegex,
2048
2048
2049
2049
bool IsForwardSlash = (*TokStart == ' /' );
2050
2050
2051
+ auto spaceOrTabDescription = [](char c) -> StringRef {
2052
+ switch (c) {
2053
+ case ' ' : return " space" ;
2054
+ case ' \t ' : return " tab" ;
2055
+ default : llvm_unreachable (" Unhandled case" );
2056
+ }
2057
+ };
2058
+
2051
2059
// Check if we're able to lex a `/.../` regex.
2052
2060
if (IsForwardSlash) {
2053
2061
// For `/.../` regex literals, we need to ban space and tab at the start of
@@ -2063,33 +2071,17 @@ const char *Lexer::tryScanRegexLiteral(const char *TokStart, bool MustBeRegex,
2063
2071
// TODO: This heuristic should be sunk into the Swift library once we have a
2064
2072
// way of doing fix-its from there.
2065
2073
auto *RegexContentStart = TokStart + 1 ;
2066
- switch (*RegexContentStart) {
2067
- case ' ' :
2068
- case ' \t ' : {
2074
+ if (*RegexContentStart == ' ' || *RegexContentStart == ' \t ' ) {
2069
2075
if (!MustBeRegex)
2070
2076
return nullptr ;
2071
2077
2072
2078
if (Diags) {
2073
2079
// We must have a regex, so emit an error for space and tab.
2074
- StringRef DiagChar;
2075
- switch (*RegexContentStart) {
2076
- case ' ' :
2077
- DiagChar = " space" ;
2078
- break ;
2079
- case ' \t ' :
2080
- DiagChar = " tab" ;
2081
- break ;
2082
- default :
2083
- llvm_unreachable (" Unhandled case" );
2084
- }
2085
2080
Diags->diagnose (getSourceLoc (RegexContentStart),
2086
- diag::lex_regex_literal_invalid_starting_char, DiagChar)
2081
+ diag::lex_regex_literal_invalid_starting_char,
2082
+ spaceOrTabDescription (*RegexContentStart))
2087
2083
.fixItInsert (getSourceLoc (RegexContentStart), " \\ " );
2088
2084
}
2089
- break ;
2090
- }
2091
- default :
2092
- break ;
2093
2085
}
2094
2086
}
2095
2087
@@ -2106,60 +2098,82 @@ const char *Lexer::tryScanRegexLiteral(const char *TokStart, bool MustBeRegex,
2106
2098
if (Ptr == TokStart)
2107
2099
return nullptr ;
2108
2100
2109
- // If we're lexing `/.../`, error if we ended on the opening of a comment.
2110
- // We prefer to lex the comment as it's more likely than not that is what
2111
- // the user is expecting.
2112
- // TODO: This should be sunk into the Swift library.
2113
- if (IsForwardSlash && Ptr [-1 ] == ' /' && (*Ptr == ' *' || *Ptr == ' /' )) {
2114
- if (!MustBeRegex)
2115
- return nullptr ;
2101
+ // Perform some additional heuristics to see if we can lex `/.../`.
2102
+ // TODO: These should all be sunk into the Swift library.
2103
+ if (IsForwardSlash) {
2104
+ // If we're lexing `/.../`, error if we ended on the opening of a comment.
2105
+ // We prefer to lex the comment as it's more likely than not that is what
2106
+ // the user is expecting.
2107
+ if (Ptr [-1 ] == ' /' && (*Ptr == ' *' || *Ptr == ' /' )) {
2108
+ if (!MustBeRegex)
2109
+ return nullptr ;
2116
2110
2117
- if (Diags) {
2118
- Diags->diagnose (getSourceLoc (TokStart),
2119
- diag::lex_regex_literal_unterminated);
2120
- }
2121
- // Move the pointer back to the '/' of the comment.
2122
- Ptr --;
2123
- }
2124
-
2125
- // If we're tentatively lexing `/.../`, scan to make sure we don't have any
2126
- // unbalanced ')'s. This helps avoid ambiguity with unapplied operator
2127
- // references e.g `reduce(1, /)` and `foo(/, 0) / 2`. This would be invalid
2128
- // regex syntax anyways. This ensures users can surround their operator ref
2129
- // in parens `(/)` to fix the issue. This also applies to prefix operators
2130
- // that can be disambiguated as e.g `(/S.foo)`. Note we need to track whether
2131
- // or not we're in a custom character class `[...]`, as parens are literal
2132
- // there.
2133
- // TODO: This should be sunk into the Swift library.
2134
- if (IsForwardSlash && !MustBeRegex) {
2135
- unsigned CharClassDepth = 0 ;
2136
- unsigned GroupDepth = 0 ;
2137
- for (auto *Cursor = TokStart + 1 ; Cursor < Ptr - 1 ; Cursor++) {
2138
- switch (*Cursor) {
2139
- case ' \\ ' :
2140
- // Skip over the next character of an escape.
2141
- Cursor++;
2142
- break ;
2143
- case ' (' :
2144
- if (CharClassDepth == 0 )
2145
- GroupDepth += 1 ;
2146
- break ;
2147
- case ' )' :
2148
- if (CharClassDepth != 0 )
2111
+ if (Diags) {
2112
+ Diags->diagnose (getSourceLoc (TokStart),
2113
+ diag::lex_regex_literal_unterminated);
2114
+ }
2115
+ // Move the pointer back to the '/' of the comment.
2116
+ Ptr --;
2117
+ }
2118
+ auto *TokEnd = Ptr - 1 ;
2119
+ auto *ContentEnd = TokEnd - 1 ;
2120
+
2121
+ // We also ban unescaped space and tab at the end of a `/.../` literal.
2122
+ if (*TokEnd == ' /' && (TokEnd - TokStart > 2 ) && ContentEnd[-1 ] != ' \\ ' &&
2123
+ (*ContentEnd == ' ' || *ContentEnd == ' \t ' )) {
2124
+ if (!MustBeRegex)
2125
+ return nullptr ;
2126
+
2127
+ if (Diags) {
2128
+ // Diagnose and suggest using a `#/.../#` literal instead. We could
2129
+ // suggest escaping, but that would be wrong if the user has written (?x).
2130
+ // TODO: Should we suggest this for space-as-first character too?
2131
+ Diags->diagnose (getSourceLoc (ContentEnd),
2132
+ diag::lex_regex_literal_invalid_ending_char,
2133
+ spaceOrTabDescription (*ContentEnd))
2134
+ .fixItInsert (getSourceLoc (TokStart), " #" )
2135
+ .fixItInsert (getSourceLoc (Ptr ), " #" );
2136
+ }
2137
+ }
2138
+
2139
+ // If we're tentatively lexing `/.../`, scan to make sure we don't have any
2140
+ // unbalanced ')'s. This helps avoid ambiguity with unapplied operator
2141
+ // references e.g `reduce(1, /)` and `foo(/, 0) / 2`. This would be invalid
2142
+ // regex syntax anyways. This ensures users can surround their operator ref
2143
+ // in parens `(/)` to fix the issue. This also applies to prefix operators
2144
+ // that can be disambiguated as e.g `(/S.foo)`. Note we need to track whether
2145
+ // or not we're in a custom character class `[...]`, as parens are literal
2146
+ // there.
2147
+ if (!MustBeRegex) {
2148
+ unsigned CharClassDepth = 0 ;
2149
+ unsigned GroupDepth = 0 ;
2150
+ for (auto *Cursor = TokStart + 1 ; Cursor < TokEnd; Cursor++) {
2151
+ switch (*Cursor) {
2152
+ case ' \\ ' :
2153
+ // Skip over the next character of an escape.
2154
+ Cursor++;
2155
+ break ;
2156
+ case ' (' :
2157
+ if (CharClassDepth == 0 )
2158
+ GroupDepth += 1 ;
2149
2159
break ;
2160
+ case ' )' :
2161
+ if (CharClassDepth != 0 )
2162
+ break ;
2150
2163
2151
- // Invalid, so bail.
2152
- if (GroupDepth == 0 )
2153
- return nullptr ;
2164
+ // Invalid, so bail.
2165
+ if (GroupDepth == 0 )
2166
+ return nullptr ;
2154
2167
2155
- GroupDepth -= 1 ;
2156
- break ;
2157
- case ' [' :
2158
- CharClassDepth += 1 ;
2159
- break ;
2160
- case ' ]' :
2161
- if (CharClassDepth != 0 )
2162
- CharClassDepth -= 1 ;
2168
+ GroupDepth -= 1 ;
2169
+ break ;
2170
+ case ' [' :
2171
+ CharClassDepth += 1 ;
2172
+ break ;
2173
+ case ' ]' :
2174
+ if (CharClassDepth != 0 )
2175
+ CharClassDepth -= 1 ;
2176
+ }
2163
2177
}
2164
2178
}
2165
2179
}
0 commit comments