Skip to content

Commit 7e29bcd

Browse files
committed
Generate binary search character-checking functions based on regexps
1 parent 686e622 commit 7e29bcd

12 files changed

+486
-513
lines changed

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,15 @@
1010
],
1111
"scripts": {
1212
"benchmarks": "ts-node benchmarks/benchmarks.ts",
13-
"build": "npm-run-all clean update-known-tlds build:src",
13+
"build": "npm-run-all clean update-known-tlds generate-char-utils build:src",
1414
"build:docs": "npm-run-all build:docs-site build:live-example",
1515
"build:docs-site": "ts-node scripts/build-docs.ts",
1616
"build:src": "ts-node scripts/build.ts",
1717
"build:live-example": "webpack",
1818
"clean": "rimraf dist",
1919
"coverage": "nyc npm run test:unit -- --forbid-only",
2020
"devserver": "webpack-dev-server",
21+
"generate-char-utils": "ts-node scripts/generate-char-utils.ts",
2122
"lint": "eslint src tests scripts",
2223
"lint:fix": "npm run lint -- --fix",
2324
"prepublishOnly": "npm-run-all build test",

scripts/generate-char-utils.ts

Lines changed: 305 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,305 @@
1+
import dedent from 'dedent';
2+
import fs from 'fs';
3+
import path from 'path';
4+
5+
/*
6+
* This script generates functions which check that a single character matches
7+
* a regular expression, but by character code rather than string.
8+
*
9+
* This is a performance enhancement for Autolinker where having a function that
10+
* checks a single character code by integer value via a binary search is far
11+
* more performant than checking against the equivalent regular expression (to
12+
* the tune of 10x faster sometimes). Because these character-checking functions
13+
* are used to process each character of the input string, we want these to be
14+
* as fast as possible.
15+
*/
16+
17+
// Repository root (this script lives one directory below it).
const rootPath = path.normalize(`${__dirname}/..`);

// Name of the npm script that runs this file; embedded in the generated
// files' "how to regenerate" notes.
const generateScriptName = 'generate-char-utils';

// Each entry pairs the name of the function to generate with the regular
// expression whose single-character behavior it must reproduce.
const generated = generateCharUtils([
    ['isAsciiLetterChar', /[A-Za-z]/],
    ['isDigitChar', /\d/],
    ['isQuoteChar', /['"]/],
    ['isWhitespaceChar', /\s/],
]);

// Files to emit, as [path relative to the repo root, contents], in write order.
// (For debugging, console.log() the contents instead of writing them.)
const outputs: [relativePath: string, contents: string][] = [
    [`src/char-utils.ts`, generated.srcFileContents],
    [`tests/char-utils.spec.ts`, generated.specFileContents],
];

for (const [relativePath, contents] of outputs) {
    fs.writeFileSync(`${rootPath}/${relativePath}`, contents, 'utf-8');
    console.log(`Wrote ${relativePath}`);
}
37+
38+
// -------------------------------------------------------------
39+
40+
/**
41+
* Generates the source and spec files for char-utils.ts
*
* Only builds the two file bodies as strings; nothing is written to disk by
* this function.
*
* @param fns List of `[fnName, re]` pairs. For each pair, one exported
*   character-checking function (via `generateCharCompareFn`) and one test
*   (via `generateCharCompareTest`) is emitted.
* @returns `srcFileContents`: the file header, the `Char` const enum and the
*   generated functions. `specFileContents`: the file header, the imports and
*   the generated tests.
42+
*/
43+
function generateCharUtils(fns: [fnName: string, re: RegExp][]): {
44+
srcFileContents: string;
45+
specFileContents: string;
46+
} {
47+
// Banner placed at the top of both generated files.
const fileHeader = dedent`
48+
// NOTE: THIS FILE IS GENERATED. DO NOT EDIT.
49+
// INSTEAD, RUN: npm run ${generateScriptName}
50+
`;
51+
52+
// NOTE: this is a plain (untagged) template literal, so everything between
// the backticks is emitted verbatim, and escapes such as `\\s` / `\\t` below
// become a single backslash (`\s` / `\t`) in the generated file.
const srcFileContents = `
53+
${fileHeader}
54+
55+
/**
56+
* Common UTF-16 character codes used in the program.
57+
*
58+
* This is a 'const' enum, meaning that the numerical value will be inlined into
59+
* the code when TypeScript is compiled.
60+
*/
61+
// prettier-ignore
62+
export const enum Char {
63+
// Letter chars (usually used for scheme testing)
64+
A = 65,
65+
Z = 90,
66+
a = 97,
67+
z = 122,
68+
69+
// Quote chars (used for HTML parsing)
70+
DoubleQuote = 34, // char code for "
71+
SingleQuote = 39, // char code for '
72+
73+
// Digit chars (used for parsing matches)
74+
Zero = 48, // char code for '0'
75+
Nine = 57, // char code for '9'
76+
77+
// Semantically meaningful characters for HTML and Match parsing
78+
NumberSign = 35, // '#' char
79+
OpenParen = 40, // '(' char
80+
CloseParen = 41, // ')' char
81+
Plus = 43, // '+' char
82+
Comma = 44, // ',' char
83+
Dash = 45, // '-' char
84+
Dot = 46, // '.' char
85+
Slash = 47, // '/' char
86+
Colon = 58, // ':' char
87+
SemiColon = 59, // ';' char
88+
Question = 63, // '?' char
89+
AtSign = 64, // '@' char
90+
Underscore = 95, // '_' char
91+
92+
// Whitespace and Line Terminator chars (all used by the /\\s/ RegExp escape)
93+
// These are used for parsing both HTML and matches.
94+
// The order is in UTF-16 value in order to make it easier to write code
95+
// against, but the following are all from the following documents (intermixed):
96+
// - Whitespace: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#white_space
97+
// - Line terminators: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#line_terminators
98+
// - Other Unicode space characters <USP> Characters in the "Space_Separator" general category: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGeneral_Category%3DSpace_Separator%7D
99+
Tab = 9, // U+0009 Horizontal tab '\\t'
100+
// LineFeed = 10, // U+000A Line Feed <LF> New line character in UNIX systems. \\n -- not needed as we'll simply test the range from Tab to CarriageReturn
101+
// VerticalTab = 11, // U+000B Line tabulation <VT> Vertical tabulation \\v -- not needed as we'll simply test the range from Tab to CarriageReturn
102+
// FormFeed = 12, // U+000C Form feed <FF> Page breaking control character (Wikipedia). \\f -- not needed as we'll simply test the range from Tab to CarriageReturn
103+
CarriageReturn = 13, // U+000D Carriage Return <CR> New line character in Commodore and early Mac systems. \\r
104+
Space = 32, // U+0020 Space <SP> Normal space
105+
NoBreakSpace = 160, // U+00A0 No-break space <NBSP> Normal space, but no point at which a line may break
106+
OghamSpace = 5760, // U+1680 OGHAM SPACE MARK
107+
EnQuad = 8192, // U+2000 EN QUAD
108+
// EmQuad = 8193, // U+2001 EM QUAD -- not needed as we'll simply test the range from EnQuad to HairSpace
109+
// EnSpace = 8194, // U+2002 EN SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace
110+
// EmSpace = 8195, // U+2003 EM SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace
111+
// ThreePerEmSpace = 8196, // U+2004 THREE-PER-EM SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace
112+
// FourPerEmSpace = 8197, // U+2005 FOUR-PER-EM SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace
113+
// SizePerEmSpace = 8198, // U+2006 SIX-PER-EM SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace
114+
// FigureSpace = 8199, // U+2007 FIGURE SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace
115+
// PunctuationSpace = 8200, // U+2008 PUNCTUATION SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace
116+
// ThinSpace = 8201, // U+2009 THIN SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace
117+
HairSpace = 8202, // U+200A HAIR SPACE
118+
LineSeparator = 8232, // U+2028 Line Separator <LS>
119+
ParagraphSeparator = 8233, // U+2029 Paragraph Separator <PS>
120+
NarrowNoBreakSpace = 8239, // U+202F NARROW NO-BREAK SPACE
121+
MediumMathematicalSpace = 8287, // U+205F MEDIUM MATHEMATICAL SPACE
122+
IdiographicSpace = 12288, // U+3000 IDEOGRAPHIC SPACE
123+
ZeroWidthNoBreakSpace = 65279, // U+FEFF Zero-width no-break space <ZWNBSP> When not at the start of a script, the BOM marker is a normal whitespace character.
124+
}
125+
126+
${fns.map(fn => generateCharCompareFn(fn[0], fn[1])).join('\n\n')}
127+
`;
128+
129+
// Spec file: imports every generated function by name from the generated
// source file and emits one `describe()` block per function.
const specFileContents = `
130+
${fileHeader}
131+
132+
import { expect } from 'chai';
133+
import { ${fns.map(fn => fn[0]).join(', ')} } from '../src/char-utils';
134+
135+
${fns.map(fn => generateCharCompareTest(fn[0], fn[1])).join('\n\n')}
136+
`;
137+
138+
return { srcFileContents, specFileContents };
139+
}
140+
141+
/**
 * Generates the source text of a character-checking function based on a
 * regular expression.
 *
 * For example:
 *
 *     generateCharCompareFn('isDigit', /\d/);
 *
 * Generates (doc comment omitted):
 *
 *     export function isDigit(c: number): boolean {
 *         return (c >= 48 && c <= 57);
 *     }
 *
 * where 48 is the '0' char and 57 is the '9' character.
 *
 * @param fnName Name of the exported function to generate.
 * @param regExp Regular expression whose single-character matches the
 *   generated function must reproduce.
 * @returns The TypeScript source for the function, including its doc comment.
 * @throws Error (from `toCharCodeRanges`) if `regExp` matches no character.
 */
function generateCharCompareFn(fnName: string, regExp: RegExp): string {
    const charCodes = charCodesFromRe(regExp);
    const charCodeRanges = toCharCodeRanges(charCodes);

    // The regular expression is echoed inside a block comment in the generated
    // file. Include its flags so the comment describes the real pattern, and
    // break up any `*/` sequence (e.g. from /[*/]/) which would otherwise
    // terminate that comment early and corrupt the generated source.
    const regExpForDocs = `/${regExp.source}/${regExp.flags}`.replace(/\*\//g, '*\\/');

    return dedent`
        /**
         * Determines if the given character \`c\` matches the regular expression ${regExpForDocs}
         * by checking it via character code in a binary search fashion.
         *
         * This technique speeds this function up by a factor of ~10x vs. running RegExp.prototype.test()
         * on the character itself.
         *
         * NOTE: This function is generated. Do not edit manually. To regenerate, run:
         *
         *     npm run ${generateScriptName}
         */
        export function ${fnName}(c: number): boolean {
            return ${buildBinarySearchExpr(charCodeRanges)};
        }
    `;
}
177+
178+
/**
 * Given a regular expression, determines which UTF-16 character codes
 * (0 through 65535, inclusive) the regular expression returns true for when
 * tested against the corresponding single-character string.
 *
 * Example:
 *
 *     charCodesFromRe(/\d/);
 *     // -> [48, 49, ..., 56, 57]
 *
 * @param regExp The regular expression to probe. It is not mutated.
 * @returns The matching character codes in ascending order (possibly empty).
 */
function charCodesFromRe(regExp: RegExp): number[] {
    // With the `g` or `y` flag, RegExp.prototype.test() is stateful: a match
    // advances `lastIndex`, which makes the next single-character test fail.
    // Probe with a copy that has those flags removed so each test is independent.
    const probe = new RegExp(regExp.source, regExp.flags.replace(/[gy]/g, ''));

    const charCodes: number[] = [];
    // Inclusive upper bound: 0xFFFF is itself a valid UTF-16 code unit.
    for (let charCode = 0; charCode <= 0xffff; charCode++) {
        if (probe.test(String.fromCharCode(charCode))) {
            charCodes.push(charCode);
        }
    }
    return charCodes;
}
198+
199+
/**
 * Compresses a list of character codes into ranges by merging each run of
 * consecutive codes (each one exactly 1 greater than the previous) into a
 * single `[from, to]` pair. A code with no consecutive neighbor is emitted as a
 * one-element tuple.
 *
 * Example:
 *
 *     toCharCodeRanges([9, 10, 11, 12, 13, 32, 160]);
 *     // -> [[9, 13], [32], [160]]
 *
 * @param charCodes The character codes, in the order they should be scanned.
 * @returns The ranges, in the order their runs appear in `charCodes`.
 * @throws Error if `charCodes` is empty.
 */
function toCharCodeRanges(charCodes: number[]): CharCodeRange[] {
    if (charCodes.length === 0) {
        throw new Error(
            `toCharCodeRanges(): No codes in charCodes array! Likely an invalid RegExp`
        );
    }

    const ranges: CharCodeRange[] = [];

    // Bounds of the run currently being accumulated
    let runStart = charCodes[0];
    let runEnd = runStart;

    // Emits the current run, as a single-element tuple when it holds one code
    const flushRun = (): void => {
        ranges.push(runStart === runEnd ? [runStart] : [runStart, runEnd]);
    };

    for (const code of charCodes.slice(1)) {
        if (code === runEnd + 1) {
            // Still consecutive: extend the current run
            runEnd = code;
        } else {
            // Gap found: close out the current run and start a new one
            flushRun();
            runStart = code;
            runEnd = code;
        }
    }
    flushRun(); // the final run is always still pending here

    return ranges;
}
254+
255+
/**
 * A run of consecutive character codes: either `[from, to]` (both inclusive)
 * or a single-element tuple `[code]` for one standalone character code.
 */
type CharCodeRange = [from: number, to?: number];

/**
 * Given a set of character code ranges, builds a binary search JavaScript
 * expression (as source text) which checks a character code variable named
 * `c` against the ranges.
 *
 * Ex:
 *
 *     buildBinarySearchExpr([[9, 13], [32], [160]]);
 *     // -> '(c < 32 ? (c >= 9 && c <= 13) : (c == 32 || c == 160))'
 *
 * @param ranges The ranges to test against. For the generated `c < pivot`
 *   split to be correct, they must be in ascending order and non-overlapping.
 * @returns The expression source. For an empty `ranges` array this is the
 *   literal `'false'`, since no character code can match.
 */
function buildBinarySearchExpr(ranges: CharCodeRange[]): string {
    if (ranges.length === 0) {
        // Without this guard the recursive case below would split an empty
        // array into two empty arrays forever and overflow the stack.
        return 'false';
    }
    if (ranges.length === 1) {
        return buildComparisonExpr(ranges[0]);
    }
    if (ranges.length === 2) {
        return `(${buildComparisonExpr(ranges[0])} || ${buildComparisonExpr(ranges[1])})`;
    }

    // Three or more ranges: split in half and branch on the first code of the
    // upper half, so each comparison discards half of the remaining ranges.
    const mid = Math.floor(ranges.length / 2);
    const midRange = ranges[mid];
    const leftExpr = buildBinarySearchExpr(ranges.slice(0, mid));
    const rightExpr = buildBinarySearchExpr(ranges.slice(mid));

    return `(c < ${midRange[0]} ? ${leftExpr} : ${rightExpr})`;
}

/**
 * Builds the expression source that tests `c` against a single range: an
 * equality check for a one-element range, otherwise an inclusive bounds check.
 */
function buildComparisonExpr(range: CharCodeRange): string {
    if (range.length === 1) {
        return `c == ${range[0]}`;
    } else {
        return `(c >= ${range[0]} && c <= ${range[1]})`;
    }
}
290+
291+
/**
 * Generates the source text of a spec which checks the generated function
 * `fnName` against the regular expression `re` for every UTF-16 character
 * code (0 through 65535, inclusive).
 *
 * @param fnName Name of the generated function under test.
 * @param re The regular expression the function was generated from.
 * @returns The `describe()` block source for the spec file.
 */
function generateCharCompareTest(fnName: string, re: RegExp): string {
    // Carry the regular expression's flags (e.g. `i`) into the generated spec
    // so the spec checks the same pattern the function was generated from.
    // `g` and `y` are dropped because they make RegExp.prototype.test()
    // stateful across calls and do not affect single-character matching.
    const flags = re.flags.replace(/[gy]/g, '');

    // The test title is emitted inside a template literal in the generated
    // file, so backslashes in the pattern must be doubled to display as-is.
    const titleSource = re.source.replace(/\\/g, '\\\\');

    return dedent`
        describe('${fnName}()', () => {
            it(\`should appropriately return true/false to match the regular expression /${titleSource}/${flags}\`, () => {
                for (let charCode = 0; charCode <= 65535; charCode++) {
                    const char = String.fromCharCode(charCode);
                    const fnResult = ${fnName}(charCode);
                    const regExpResult = /${re.source}/${flags}.test(char);

                    expect(fnResult).to.equal(regExpResult, \`Expected charCode \${charCode} (\${char}) to return \${regExpResult}, but returned \${fnResult}\`);
                }
            });
        });
    `;
}

0 commit comments

Comments
 (0)