Skip to content

Commit fcabb5c

Browse files
Simplify or optimize regexes with polynomial time worst cases (microsoft#44197)
* Simplify or optimize regexes with polynomial time worst cases * PR feedback & cleanup Co-authored-by: David Michon <dmichon-msft@users.noreply.github.com> * Use builtin scanner function for checking whitespace in fallback method (its faster) Co-authored-by: David Michon <dmichon-msft@users.noreply.github.com>
1 parent 2203228 commit fcabb5c

11 files changed

+158
-85
lines changed

src/compiler/commandLineParser.ts

+26-37
Original file line numberDiff line numberDiff line change
@@ -3034,10 +3034,6 @@ namespace ts {
30343034
return filter(map(values, v => convertJsonOption(option.element, v, basePath, errors)), v => !!v);
30353035
}
30363036

3037-
function trimString(s: string) {
3038-
return typeof s.trim === "function" ? s.trim() : s.replace(/^[\s]+|[\s]+$/g, "");
3039-
}
3040-
30413037
/**
30423038
* Tests for a path that ends in a recursive directory wildcard.
30433039
* Matches **, \**, **\, and \**\, but not a**b.
@@ -3051,36 +3047,6 @@ namespace ts {
30513047
*/
30523048
const invalidTrailingRecursionPattern = /(^|\/)\*\*\/?$/;
30533049

3054-
/**
3055-
* Tests for a path where .. appears after a recursive directory wildcard.
3056-
* Matches **\..\*, **\a\..\*, and **\.., but not ..\**\*
3057-
*
3058-
* NOTE: used \ in place of / above to avoid issues with multiline comments.
3059-
*
3060-
* Breakdown:
3061-
* (^|\/) # matches either the beginning of the string or a directory separator.
3062-
* \*\*\/ # matches a recursive directory wildcard "**" followed by a directory separator.
3063-
* (.*\/)? # optionally matches any number of characters followed by a directory separator.
3064-
* \.\. # matches a parent directory path component ".."
3065-
* ($|\/) # matches either the end of the string or a directory separator.
3066-
*/
3067-
const invalidDotDotAfterRecursiveWildcardPattern = /(^|\/)\*\*\/(.*\/)?\.\.($|\/)/;
3068-
3069-
/**
3070-
* Tests for a path containing a wildcard character in a directory component of the path.
3071-
* Matches \*\, \?\, and \a*b\, but not \a\ or \a\*.
3072-
*
3073-
* NOTE: used \ in place of / above to avoid issues with multiline comments.
3074-
*
3075-
* Breakdown:
3076-
* \/ # matches a directory separator.
3077-
* [^/]*? # matches any number of characters excluding directory separators (non-greedy).
3078-
* [*?] # matches either a wildcard character (* or ?)
3079-
* [^/]* # matches any number of characters excluding directory separators (greedy).
3080-
* \/ # matches a directory separator.
3081-
*/
3082-
const watchRecursivePattern = /\/[^/]*?[*?][^/]*\//;
3083-
30843050
/**
30853051
* Matches the portion of a wildcard path that does not contain wildcards.
30863052
* Matches \a of \a\*, or \a\b\c of \a\b\c\?\d.
@@ -3217,6 +3183,20 @@ namespace ts {
32173183
return matchesExcludeWorker(pathToCheck, validatedExcludeSpecs, useCaseSensitiveFileNames, currentDirectory, basePath);
32183184
}
32193185

3186+
function invalidDotDotAfterRecursiveWildcard(s: string) {
3187+
// We used to use the regex /(^|\/)\*\*\/(.*\/)?\.\.($|\/)/ to check for this case, but
3188+
// in v8, that has polynomial performance because the recursive wildcard match - **/ -
3189+
// can be matched in many arbitrary positions when multiple are present, resulting
3190+
// in bad backtracking (and we don't care which is matched - just that some /.. segment
3191+
// comes after some **/ segment).
3192+
const wildcardIndex = startsWith(s, "**/") ? 0 : s.indexOf("/**/");
3193+
if (wildcardIndex === -1) {
3194+
return false;
3195+
}
3196+
const lastDotIndex = endsWith(s, "/..") ? s.length : s.lastIndexOf("/../");
3197+
return lastDotIndex > wildcardIndex;
3198+
}
3199+
32203200
/* @internal */
32213201
export function matchesExclude(
32223202
pathToCheck: string,
@@ -3226,7 +3206,7 @@ namespace ts {
32263206
) {
32273207
return matchesExcludeWorker(
32283208
pathToCheck,
3229-
filter(excludeSpecs, spec => !invalidDotDotAfterRecursiveWildcardPattern.test(spec)),
3209+
filter(excludeSpecs, spec => !invalidDotDotAfterRecursiveWildcard(spec)),
32303210
useCaseSensitiveFileNames,
32313211
currentDirectory
32323212
);
@@ -3268,7 +3248,7 @@ namespace ts {
32683248
if (disallowTrailingRecursion && invalidTrailingRecursionPattern.test(spec)) {
32693249
return [Diagnostics.File_specification_cannot_end_in_a_recursive_directory_wildcard_Asterisk_Asterisk_Colon_0, spec];
32703250
}
3271-
else if (invalidDotDotAfterRecursiveWildcardPattern.test(spec)) {
3251+
else if (invalidDotDotAfterRecursiveWildcard(spec)) {
32723252
return [Diagnostics.File_specification_cannot_contain_a_parent_directory_that_appears_after_a_recursive_directory_wildcard_Asterisk_Asterisk_Colon_0, spec];
32733253
}
32743254
}
@@ -3331,9 +3311,18 @@ namespace ts {
33313311
function getWildcardDirectoryFromSpec(spec: string, useCaseSensitiveFileNames: boolean): { key: string, flags: WatchDirectoryFlags } | undefined {
33323312
const match = wildcardDirectoryPattern.exec(spec);
33333313
if (match) {
3314+
// We check this with a few `indexOf` calls because 3 `indexOf`/`lastIndexOf` calls is
3315+
// less algorithmically complex (roughly O(3n) worst-case) than the regex we used to use,
3316+
// \/[^/]*?[*?][^/]*\/ which was polynominal in v8, since arbitrary sequences of wildcard
3317+
// characters could match any of the central patterns, resulting in bad backtracking.
3318+
const questionWildcardIndex = spec.indexOf("?");
3319+
const starWildcardIndex = spec.indexOf("*");
3320+
const lastDirectorySeperatorIndex = spec.lastIndexOf(directorySeparator);
33343321
return {
33353322
key: useCaseSensitiveFileNames ? match[0] : toFileNameLowerCase(match[0]),
3336-
flags: watchRecursivePattern.test(spec) ? WatchDirectoryFlags.Recursive : WatchDirectoryFlags.None
3323+
flags: (questionWildcardIndex !== -1 && questionWildcardIndex < lastDirectorySeperatorIndex)
3324+
|| (starWildcardIndex !== -1 && starWildcardIndex < lastDirectorySeperatorIndex)
3325+
? WatchDirectoryFlags.Recursive : WatchDirectoryFlags.None
33373326
};
33383327
}
33393328
if (isImplicitGlob(spec)) {

src/compiler/core.ts

+44-4
Original file line numberDiff line numberDiff line change
@@ -2035,11 +2035,51 @@ namespace ts {
20352035
* Takes a string like "jquery-min.4.2.3" and returns "jquery"
20362036
*/
20372037
export function removeMinAndVersionNumbers(fileName: string) {
2038-
// Match a "." or "-" followed by a version number or 'min' at the end of the name
2039-
const trailingMinOrVersion = /[.-]((min)|(\d+(\.\d+)*))$/;
2038+
// We used to use the regex /[.-]((min)|(\d+(\.\d+)*))$/ and would just .replace it twice.
2039+
// Unfortunately, that regex has O(n^2) performance because v8 doesn't match from the end of the string.
2040+
// Instead, we now essentially scan the filename (backwards) ourselves.
2041+
2042+
let end: number = fileName.length;
2043+
2044+
for (let pos = end - 1; pos > 0; pos--) {
2045+
let ch: number = fileName.charCodeAt(pos);
2046+
if (ch >= CharacterCodes._0 && ch <= CharacterCodes._9) {
2047+
// Match a \d+ segment
2048+
do {
2049+
--pos;
2050+
ch = fileName.charCodeAt(pos);
2051+
} while (pos > 0 && ch >= CharacterCodes._0 && ch <= CharacterCodes._9);
2052+
}
2053+
else if (pos > 4 && (ch === CharacterCodes.n || ch === CharacterCodes.N)) {
2054+
// Looking for "min" or "min"
2055+
// Already matched the 'n'
2056+
--pos;
2057+
ch = fileName.charCodeAt(pos);
2058+
if (ch !== CharacterCodes.i && ch !== CharacterCodes.I) {
2059+
break;
2060+
}
2061+
--pos;
2062+
ch = fileName.charCodeAt(pos);
2063+
if (ch !== CharacterCodes.m && ch !== CharacterCodes.M) {
2064+
break;
2065+
}
2066+
--pos;
2067+
ch = fileName.charCodeAt(pos);
2068+
}
2069+
else {
2070+
// This character is not part of either suffix pattern
2071+
break;
2072+
}
2073+
2074+
if (ch !== CharacterCodes.minus && ch !== CharacterCodes.dot) {
2075+
break;
2076+
}
2077+
2078+
end = pos;
2079+
}
20402080

2041-
// The "min" or version may both be present, in either order, so try applying the above twice.
2042-
return fileName.replace(trailingMinOrVersion, "").replace(trailingMinOrVersion, "");
2081+
// end might be fileName.length, in which case this should internally no-op
2082+
return end === fileName.length ? fileName : fileName.slice(0, end);
20432083
}
20442084

20452085
/** Remove an item from an array, moving everything to its right one space left. */

src/compiler/debug.ts

+3
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,9 @@ namespace ts {
471471
// An `Array` with extra properties is rendered as `[A, B, prop1: 1, prop2: 2]`. Most of
472472
// these aren't immediately useful so we trim off the `prop1: ..., prop2: ...` part from the
473473
// formatted string.
474+
// This regex can trigger slow backtracking because of overlapping potential captures.
475+
// We don't care, this is debug code that's only enabled with a debugger attached -
476+
// we're just taking note of it for anyone checking regex performance in the future.
474477
defaultValue = String(defaultValue).replace(/(?:,[\s\w\d_]+:[^,]+)+\]$/, "]");
475478
return `NodeArray ${defaultValue}`;
476479
}

src/compiler/parser.ts

+8-7
Original file line numberDiff line numberDiff line change
@@ -9094,7 +9094,7 @@ namespace ts {
90949094
if (namedArgRegExCache.has(name)) {
90959095
return namedArgRegExCache.get(name)!;
90969096
}
9097-
const result = new RegExp(`(\\s${name}\\s*=\\s*)('|")(.+?)\\2`, "im");
9097+
const result = new RegExp(`(\\s${name}\\s*=\\s*)(?:(?:'([^']*)')|(?:"([^"]*)"))`, "im");
90989098
namedArgRegExCache.set(name, result);
90999099
return result;
91009100
}
@@ -9118,16 +9118,17 @@ namespace ts {
91189118
return; // Missing required argument, don't parse
91199119
}
91209120
else if (matchResult) {
9121+
const value = matchResult[2] || matchResult[3];
91219122
if (arg.captureSpan) {
9122-
const startPos = range.pos + matchResult.index + matchResult[1].length + matchResult[2].length;
9123+
const startPos = range.pos + matchResult.index + matchResult[1].length + 1;
91239124
argument[arg.name] = {
9124-
value: matchResult[3],
9125+
value,
91259126
pos: startPos,
9126-
end: startPos + matchResult[3].length
9127+
end: startPos + value.length
91279128
};
91289129
}
91299130
else {
9130-
argument[arg.name] = matchResult[3];
9131+
argument[arg.name] = value;
91319132
}
91329133
}
91339134
}
@@ -9145,7 +9146,7 @@ namespace ts {
91459146
}
91469147

91479148
if (range.kind === SyntaxKind.MultiLineCommentTrivia) {
9148-
const multiLinePragmaRegEx = /\s*@(\S+)\s*(.*)\s*$/gim; // Defined inline since it uses the "g" flag, which keeps a persistent index (for iterating)
9149+
const multiLinePragmaRegEx = /@(\S+)(\s+.*)?$/gim; // Defined inline since it uses the "g" flag, which keeps a persistent index (for iterating)
91499150
let multiLineMatch: RegExpExecArray | null;
91509151
while (multiLineMatch = multiLinePragmaRegEx.exec(text)) {
91519152
addPragmaForMatch(pragmas, range, PragmaKindFlags.MultiLine, multiLineMatch);
@@ -9170,7 +9171,7 @@ namespace ts {
91709171
function getNamedPragmaArguments(pragma: PragmaDefinition, text: string | undefined): {[index: string]: string} | "fail" {
91719172
if (!text) return {};
91729173
if (!pragma.args) return {};
9173-
const args = text.split(/\s+/);
9174+
const args = trimString(text).split(/\s+/);
91749175
const argMap: {[index: string]: string} = {};
91759176
for (let i = 0; i < pragma.args.length; i++) {
91769177
const argument = pragma.args[i];

src/compiler/program.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ namespace ts {
406406
const lineStart = getPositionOfLineAndCharacter(file, i, 0);
407407
const lineEnd = i < lastLineInFile ? getPositionOfLineAndCharacter(file, i + 1, 0) : file.text.length;
408408
let lineContent = file.text.slice(lineStart, lineEnd);
409-
lineContent = lineContent.replace(/\s+$/g, ""); // trim from end
409+
lineContent = trimStringEnd(lineContent); // trim from end
410410
lineContent = lineContent.replace(/\t/g, " "); // convert tabs to single spaces
411411

412412
// Output the gutter and the actual contents of the line.

0 commit comments

Comments
 (0)