|
| 1 | +import dedent from 'dedent'; |
| 2 | +import fs from 'fs'; |
| 3 | +import path from 'path'; |
| 4 | + |
| 5 | +/* |
| 6 | + * This script generates functions which check that a single character matches |
| 7 | + * a regular expression, but by character code rather than string. |
| 8 | + * |
| 9 | + * This is a performance enhancement for Autolinker where having a function that |
| 10 | + * checks a single character code by integer value via a binary search is far |
| 11 | + * more performant than checking against the equivalent regular expression (to |
| 12 | + * the tune of 10x faster sometimes). Because these character-checking functions |
| 13 | + * are used to process each character of the input string, we want these to be |
| 14 | + * as fast as possible. |
| 15 | + */ |
| 16 | + |
// Repository root: one directory above the directory this script lives in.
const rootPath = path.normalize(`${__dirname}/..`);

// Name of the npm script that runs this generator; echoed into the generated
// files so readers know how to regenerate them.
const generateScriptName = 'generate-char-utils';

// Each entry pairs the name of a function to generate with the
// single-character regular expression that function must emulate.
const charFnDefinitions: [fnName: string, re: RegExp][] = [
    ['isAsciiLetterChar', /[A-Za-z]/],
    ['isDigitChar', /\d/],
    ['isQuoteChar', /['"]/],
    ['isWhitespaceChar', /\s/],
];
const { srcFileContents, specFileContents } = generateCharUtils(charFnDefinitions);

const srcFilePath = `src/char-utils.ts`;
const specFilePath = `tests/char-utils.spec.ts`;

// Write the source file first, then the spec file, reporting each one as it
// is written (paths are relative to the repository root).
const outputs: [relativePath: string, contents: string][] = [
    [srcFilePath, srcFileContents],
    [specFilePath, specFileContents],
];
for (const [relativePath, contents] of outputs) {
    fs.writeFileSync(`${rootPath}/${relativePath}`, contents, 'utf-8');
    console.log(`Wrote ${relativePath}`);
}
| 37 | + |
| 38 | +// ------------------------------------------------------------- |
| 39 | + |
| 40 | +/** |
| 41 | + * Generates the contents of the source file and the spec file for char-utils.ts. |
| | + * |
| | + * Both results start with a "this file is generated" banner that names the npm |
| | + * script (`generateScriptName`) to run in order to regenerate them. |
| | + * |
| | + * - The source contents consist of the banner, the `Char` const enum (emitted |
| | + *   verbatim from the template below), and one exported function per entry in |
| | + *   `fns`, each produced by generateCharCompareFn() and separated by a blank line. |
| | + * - The spec contents consist of the banner, the chai `expect` import, an import |
| | + *   of every generated function name from '../src/char-utils', and one test |
| | + *   block per entry in `fns`, each produced by generateCharCompareTest(). |
| | + * |
| | + * The file bodies are plain (untagged) template literals, so backslashes that |
| | + * must appear in the generated output are written doubled in the template. |
| | + * |
| | + * This function only builds strings; it does not write anything to disk. |
| | + * |
| | + * @param fns Tuples of [name of the function to generate, regular expression |
| | + *   that the generated function must match character-for-character]. |
| | + * @returns The full text of the source file and of the spec file. |
| 42 | + */ |
| 43 | +function generateCharUtils(fns: [fnName: string, re: RegExp][]): { |
| 44 | + srcFileContents: string; |
| 45 | + specFileContents: string; |
| 46 | +} { |
| 47 | + const fileHeader = dedent` |
| 48 | + // NOTE: THIS FILE IS GENERATED. DO NOT EDIT. |
| 49 | + // INSTEAD, RUN: npm run ${generateScriptName} |
| 50 | + `; |
| 51 | + |
| 52 | + const srcFileContents = ` |
| 53 | +${fileHeader} |
| 54 | +
|
| 55 | +/** |
| 56 | + * Common UTF-16 character codes used in the program. |
| 57 | + * |
| 58 | + * This is a 'const' enum, meaning that the numerical value will be inlined into |
| 59 | + * the code when TypeScript is compiled. |
| 60 | + */ |
| 61 | +// prettier-ignore |
| 62 | +export const enum Char { |
| 63 | + // Letter chars (usually used for scheme testing) |
| 64 | + A = 65, |
| 65 | + Z = 90, |
| 66 | + a = 97, |
| 67 | + z = 122, |
| 68 | +
|
| 69 | + // Quote chars (used for HTML parsing) |
| 70 | + DoubleQuote = 34, // char code for " |
| 71 | + SingleQuote = 39, // char code for ' |
| 72 | +
|
| 73 | + // Digit chars (used for parsing matches) |
| 74 | + Zero = 48, // char code for '0' |
| 75 | + Nine = 57, // char code for '9' |
| 76 | +
|
| 77 | + // Semantically meaningful characters for HTML and Match parsing |
| 78 | + NumberSign = 35, // '#' char |
| 79 | + OpenParen = 40, // '(' char |
| 80 | + CloseParen = 41, // ')' char |
| 81 | + Plus = 43, // '+' char |
| 82 | + Comma = 44, // ',' char |
| 83 | + Dash = 45, // '-' char |
| 84 | + Dot = 46, // '.' char |
| 85 | + Slash = 47, // '/' char |
| 86 | + Colon = 58, // ':' char |
| 87 | + SemiColon = 59, // ';' char |
| 88 | + Question = 63, // '?' char |
| 89 | + AtSign = 64, // '@' char |
| 90 | + Underscore = 95, // '_' char |
| 91 | +
|
| 92 | + // Whitespace and Line Terminator chars (all used by the /\\s/ RegExp escape) |
| 93 | + // These are used for parsing both HTML and matches. |
| 94 | + // The order is in UTF-16 value in order to make it easier to write code |
| 95 | + // against, but the following are all from the following documents (intermixed): |
| 96 | + // - Whitespace: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#white_space |
| 97 | + // - Line terminators: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#line_terminators |
| 98 | + // - Other Unicode space characters <USP> Characters in the "Space_Separator" general category: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGeneral_Category%3DSpace_Separator%7D |
| 99 | + Tab = 9, // U+0009 Horizontal tab '\\t' |
| 100 | + // LineFeed = 10, // U+000A Line Feed <LF> New line character in UNIX systems. \\n -- not needed as we'll simply test the range from Tab to CarriageReturn |
| 101 | + // VerticalTab = 11, // U+000B Line tabulation <VT> Vertical tabulation \\v -- not needed as we'll simply test the range from Tab to CarriageReturn |
| 102 | + // FormFeed = 12, // U+000C Form feed <FF> Page breaking control character (Wikipedia). \\f -- not needed as we'll simply test the range from Tab to CarriageReturn |
| 103 | + CarriageReturn = 13, // U+000D Carriage Return <CR> New line character in Commodore and early Mac systems. \\r |
| 104 | + Space = 32, // U+0020 Space <SP> Normal space |
| 105 | + NoBreakSpace = 160, // U+00A0 No-break space <NBSP> Normal space, but no point at which a line may break |
| 106 | + OghamSpace = 5760, // U+1680 OGHAM SPACE MARK |
| 107 | + EnQuad = 8192, // U+2000 EN QUAD |
| 108 | + // EmQuad = 8193, // U+2001 EM QUAD -- not needed as we'll simply test the range from EnQuad to HairSpace |
| 109 | + // EnSpace = 8194, // U+2002 EN SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace |
| 110 | + // EmSpace = 8195, // U+2003 EM SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace |
| 111 | + // ThreePerEmSpace = 8196, // U+2004 THREE-PER-EM SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace |
| 112 | + // FourPerEmSpace = 8197, // U+2005 FOUR-PER-EM SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace |
| 113 | + // SizePerEmSpace = 8198, // U+2006 SIX-PER-EM SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace |
| 114 | + // FigureSpace = 8199, // U+2007 FIGURE SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace |
| 115 | + // PunctuationSpace = 8200, // U+2008 PUNCTUATION SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace |
| 116 | + // ThinSpace = 8201, // U+2009 THIN SPACE -- not needed as we'll simply test the range from EnQuad to HairSpace |
| 117 | + HairSpace = 8202, // U+200A HAIR SPACE |
| 118 | + LineSeparator = 8232, // U+2028 Line Separator <LS> |
| 119 | + ParagraphSeparator = 8233, // U+2029 Paragraph Separator <PS> |
| 120 | + NarrowNoBreakSpace = 8239, // U+202F NARROW NO-BREAK SPACE |
| 121 | + MediumMathematicalSpace = 8287, // U+205F MEDIUM MATHEMATICAL SPACE |
| 122 | + IdiographicSpace = 12288, // U+3000 IDEOGRAPHIC SPACE |
| 123 | + ZeroWidthNoBreakSpace = 65279, // U+FEFF Zero-width no-break space <ZWNBSP> When not at the start of a script, the BOM marker is a normal whitespace character. |
| 124 | +} |
| 125 | +
|
| 126 | +${fns.map(fn => generateCharCompareFn(fn[0], fn[1])).join('\n\n')} |
| 127 | +`; |
| 128 | + |
| 129 | + const specFileContents = ` |
| 130 | +${fileHeader} |
| 131 | +
|
| 132 | +import { expect } from 'chai'; |
| 133 | +import { ${fns.map(fn => fn[0]).join(', ')} } from '../src/char-utils'; |
| 134 | +
|
| 135 | +${fns.map(fn => generateCharCompareTest(fn[0], fn[1])).join('\n\n')} |
| 136 | +`; |
| 137 | + |
| 138 | + return { srcFileContents, specFileContents }; |
| 139 | +} |
| 140 | + |
/**
 * Generates the source text of an exported character-checking function that
 * is equivalent to testing a single character against a regular expression,
 * but which operates on the character's code instead.
 *
 * For example:
 *
 *     generateCharCompareFn('isDigit', /\d/);
 *
 * Generates (preceded by a JSDoc block describing the function):
 *
 *     export function isDigit(c: number): boolean {
 *         return (c >= 48 && c <= 57);
 *     }
 *
 * where 48 is the '0' char and 57 is the '9' character.
 *
 * @param fnName Name of the function to generate.
 * @param regExp Regular expression the generated function must emulate.
 * @returns TypeScript source for the documented, exported function.
 * @throws Error (from toCharCodeRanges) if `regExp` matches no character code.
 */
function generateCharCompareFn(fnName: string, regExp: RegExp): string {
    const charCodeRanges = toCharCodeRanges(charCodesFromRe(regExp));

    // The pattern is echoed inside a generated `/** ... */` block. A source
    // such as `[*/]` contains the comment terminator and would end that block
    // early, so write the slash as `\/` (the same pattern, as a regex).
    const docSafeSource = regExp.source.replace(/\*\//g, '*\\/');

    return dedent`
        /**
         * Determines if the given character \`c\` matches the regular expression /${docSafeSource}/
         * by checking it via character code in a binary search fashion.
         *
         * This technique speeds this function up by a factor of ~10x vs. running RegExp.prototype.test()
         * on the character itself.
         *
         * NOTE: This function is generated. Do not edit manually. To regenerate, run:
         *
         *     npm run ${generateScriptName}
         */
        export function ${fnName}(c: number): boolean {
            return ${buildBinarySearchExpr(charCodeRanges)};
        }
    `;
}
| 177 | + |
/**
 * Given a regular expression, determines which UTF-16 character codes
 * (0 through 65535, inclusive) the regular expression returns true for when
 * tested against the one-character string for that code.
 *
 * Example:
 *
 *     charCodesFromRe(/\d/);
 *     // -> [48, 49, ..., 56, 57]
 *
 * @param regExp The regular expression to probe. It is not mutated (its
 *   `lastIndex` is left untouched).
 * @returns The matching character codes in ascending order.
 */
function charCodesFromRe(regExp: RegExp): number[] {
    // test() on a global ('g') or sticky ('y') RegExp resumes from `lastIndex`
    // and advances it after each match, so successive calls would spuriously
    // fail. Probe with a copy that has those two flags removed; every other
    // flag (e.g. 'i') is kept because it changes what matches.
    const statelessRe = new RegExp(regExp.source, regExp.flags.replace(/[gy]/g, ''));

    const charCodes: number[] = [];
    // Inclusive upper bound: 0xFFFF is itself a valid UTF-16 code unit.
    for (let charCode = 0; charCode <= 0xffff; charCode++) {
        if (statelessRe.test(String.fromCharCode(charCode))) {
            charCodes.push(charCode);
        }
    }
    return charCodes;
}
| 198 | + |
/**
 * Given an array of character codes, compresses adjacent codes into ranges.
 *
 * The input may be in any order and may contain duplicates; it is sorted and
 * de-duplicated on a copy (the caller's array is not modified), so the result
 * is always ascending and non-overlapping.
 *
 * Example:
 *
 *     toCharCodeRanges([9, 10, 11, 12, 13, 32, 160]);
 *     // -> [[9, 13], [32], [160]]
 *
 * @param charCodes The character codes to compress.
 * @returns Ascending ranges; a lone code is a one-element tuple (e.g. `[32]`)
 *   and a run of consecutive codes is a `[from, to]` tuple (e.g. `[9, 13]`).
 * @throws Error if `charCodes` is empty.
 */
function toCharCodeRanges(charCodes: number[]): CharCodeRange[] {
    if (charCodes.length === 0) {
        throw new Error(
            `toCharCodeRanges(): No codes in charCodes array! Likely an invalid RegExp`
        );
    }

    // Normalize on a copy: the run detection below only works on ascending,
    // unique codes (a duplicate would otherwise yield overlapping ranges).
    const sortedCodes = Array.from(new Set(charCodes)).sort((a, b) => a - b);

    const ranges: CharCodeRange[] = [];
    let currentRun: [from: number, to: number] | null = null;

    for (const code of sortedCodes) {
        if (currentRun !== null && code === currentRun[1] + 1) {
            // Still consecutive: extend the run being built
            currentRun[1] = code;
        } else {
            // Gap (or very first code): emit the finished run, start a new one
            if (currentRun !== null) {
                ranges.push(collapseRange(currentRun));
            }
            currentRun = [code, code];
        }
    }
    if (currentRun !== null) {
        ranges.push(collapseRange(currentRun)); // the final run
    }
    return ranges;

    // If the range contains just one number, collapse into a single element tuple
    function collapseRange(range: [from: number, to: number]): CharCodeRange {
        return range[0] === range[1]
            ? [range[0]] // single number "range". Ex: [32]
            : [range[0], range[1]]; // multiple number range. Ex: [9, 13]
    }
}

/**
 * A range of character codes: either a single code (`[from]`) or an inclusive
 * span of consecutive codes (`[from, to]`).
 */
type CharCodeRange = [from: number, to?: number];
| 256 | + |
/**
 * Given a set of character code ranges, builds a binary search JavaScript
 * expression (as source text) which checks a character code variable named
 * `c` against the ranges.
 *
 * The ranges must be in ascending order and must not overlap: each level of
 * the generated expression splits on `c < (first code of the middle range)`,
 * sending lower codes to the left half and all others to the right half.
 *
 * Ex:
 *
 *     buildBinarySearchExpr([[9, 13], [32], [160]]);
 *     // -> '(c < 32 ? (c >= 9 && c <= 13) : (c == 32 || c == 160))'
 *
 * @param ranges Ascending, non-overlapping ranges. Must not be empty.
 * @returns The source text of a boolean expression over `c`.
 * @throws Error if `ranges` is empty (there is nothing to compare against;
 *   previously this case recursed until the call stack overflowed).
 */
function buildBinarySearchExpr(ranges: CharCodeRange[]): string {
    if (ranges.length === 0) {
        throw new Error(
            `buildBinarySearchExpr(): No ranges provided! Likely an invalid RegExp`
        );
    }
    if (ranges.length === 1) {
        return buildComparisonExpr(ranges[0]);
    }
    if (ranges.length === 2) {
        return `(${buildComparisonExpr(ranges[0])} || ${buildComparisonExpr(ranges[1])})`;
    }

    // Three or more ranges: split at the middle range and recurse on each half.
    // The middle range itself belongs to the right half (c >= its first code).
    const mid = Math.floor(ranges.length / 2);
    const pivotCode = ranges[mid][0];
    const leftExpr = buildBinarySearchExpr(ranges.slice(0, mid));
    const rightExpr = buildBinarySearchExpr(ranges.slice(mid));

    return `(c < ${pivotCode} ? ${leftExpr} : ${rightExpr})`;
}
| 282 | + |
/**
 * Builds the source text of a boolean expression which checks a character
 * code variable named `c` against a single range.
 *
 * Ex:
 *
 *     buildComparisonExpr([32]);     // -> 'c == 32'
 *     buildComparisonExpr([9, 13]);  // -> '(c >= 9 && c <= 13)'
 *
 * The single-code form is not parenthesized; that is safe where
 * buildBinarySearchExpr() embeds it, because `==` binds tighter than the
 * surrounding `||` and `?:` operators.
 *
 * @param range A single code (`[code]`) or an inclusive `[from, to]` span.
 * @returns The comparison expression for that range.
 */
function buildComparisonExpr(range: CharCodeRange): string {
    const [from, to] = range;

    // Decide on the actual upper bound rather than the tuple's length: the
    // optional slot may be present but undefined (which would otherwise emit
    // `c <= undefined`), and a span whose ends are equal is just one code.
    if (to === undefined || to === from) {
        return `c == ${from}`;
    }
    return `(c >= ${from} && c <= ${to})`;
}
| 290 | + |
/**
 * Generates the source text of a spec block which verifies that a generated
 * character-checking function agrees with its regular expression for every
 * UTF-16 character code (0 through 65535, inclusive).
 *
 * @param fnName Name of the generated function under test.
 * @param re Regular expression the function must agree with.
 * @returns TypeScript source for a `describe()` block containing one test.
 */
function generateCharCompareTest(fnName: string, re: RegExp): string {
    // Carry the flags into the generated literal so the test matches what the
    // generator probed with (e.g. 'i'). 'g' and 'y' only affect `lastIndex`
    // bookkeeping and are meaningless for a one-character test, so drop them.
    const flags = re.flags.replace(/[gy]/g, '');
    const regExpLiteral = `/${re.source}/${flags}`;

    // The test title is emitted inside a template literal in the generated
    // file, so backslashes, backticks and "${" in the pattern must be escaped
    // there or they would change (or break) the generated string.
    const titleRegExp = regExpLiteral.replace(/\\|`|\$\{/g, match => '\\' + match);

    return dedent`
        describe('${fnName}()', () => {
            it(\`should appropriately return true/false to match the regular expression ${titleRegExp}\`, () => {
                for (let charCode = 0; charCode <= 65535; charCode++) {
                    const char = String.fromCharCode(charCode);
                    const fnResult = ${fnName}(charCode);
                    const regExpResult = ${regExpLiteral}.test(char);

                    expect(fnResult).to.equal(regExpResult, \`Expected charCode \${charCode} (\${char}) to return \${regExpResult}, but returned \${fnResult}\`);
                }
            });
        });
    `;
}
0 commit comments