Skip to content

Commit a368445

Browse files
committed
Use optimized isLetterChar() where possible. Improvement of ~400ops/sec on benchmarks
1 parent c7f60d7 commit a368445

File tree

5 files changed

+104
-17
lines changed

5 files changed

+104
-17
lines changed

src/htmlParser/parse-html.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
import { letterRe, digitRe, whitespaceRe, quoteRe, controlCharsRe } from '../regex-lib';
1+
import { isLetterChar } from '../string-utils';
2+
import { digitRe, whitespaceRe, quoteRe, controlCharsRe } from '../regex-lib';
23
import { assertNever } from '../utils';
34

45
// For debugging: search for other "For debugging" lines
@@ -208,7 +209,7 @@ export function parseHtml(
208209
} else if (char === '<') {
209210
// start of another tag (ignore the previous, incomplete one)
210211
startNewTag();
211-
} else if (letterRe.test(char)) {
212+
} else if (isLetterChar(char)) {
212213
// tag name start (and no '/' read)
213214
state = State.TagName;
214215
currentTag = new CurrentTag({ ...currentTag, isOpening: true });
@@ -244,7 +245,7 @@ export function parseHtml(
244245
name: captureTagName(),
245246
});
246247
emitTagAndPreviousTextNode(); // resets to Data state as well
247-
} else if (!letterRe.test(char) && !digitRe.test(char) && char !== ':') {
248+
} else if (!isLetterChar(char) && !digitRe.test(char) && char !== ':') {
248249
// Anything else that does not form an html tag. Note: the colon
249250
// character is accepted for XML namespaced tags
250251
resetToDataState();
@@ -259,7 +260,7 @@ export function parseHtml(
259260
if (char === '>') {
260261
// parse error. Encountered "</>". Skip it without treating as a tag
261262
resetToDataState();
262-
} else if (letterRe.test(char)) {
263+
} else if (isLetterChar(char)) {
263264
state = State.TagName;
264265
} else {
265266
// some other non-tag-like character, don't treat this as a tag

src/parser/uri-utils.ts

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import { alphaNumericAndMarksRe, letterRe, digitRe } from '../regex-lib';
2-
import { isLetterChar, isLetterCharCode } from '../string-utils';
1+
import { alphaNumericAndMarksRe, digitRe } from '../regex-lib';
2+
import { isLetterChar, isLetterCharCode, letterRe } from '../string-utils';
33
import { tldRegex } from './known-tlds';
44

55
/**
@@ -90,9 +90,7 @@ export const isSchemeStartCharCode: (code: number) => boolean = isLetterCharCode
9090
* {@link isSchemeStartChar}.
9191
*/
9292
export function isSchemeChar(char: string): boolean {
93-
return (
94-
letterRe.test(char) || digitRe.test(char) || char === '+' || char === '-' || char === '.'
95-
);
93+
return isLetterChar(char) || digitRe.test(char) || char === '+' || char === '-' || char === '.';
9694
}
9795

9896
/**
@@ -193,6 +191,7 @@ export function isValidSchemeUrl(url: string): boolean {
193191
// - git:something ('something' doesn't look like a host)
194192
// - version:1.0 ('1.0' doesn't look like a host)
195193
if (host.indexOf('.') === -1 || !letterRe.test(host)) {
194+
// `letterRe` RegExp checks for a letter anywhere in the host string
196195
return false;
197196
}
198197
return true;

src/regex-lib.ts

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,16 @@
66
* regular expressions that are shared between source files.
77
*/
88

9-
/**
10-
* Regular expression to match upper and lowercase ASCII letters
11-
*/
12-
export const letterRe = /[A-Za-z]/;
13-
149
/**
1510
* Regular expression to match ASCII digits
1611
*/
12+
// TODO: Remove and replace with isDigit() function
1713
export const digitRe = /[\d]/;
1814

1915
/**
2016
* Regular expression to match everything *except* ASCII digits
2117
*/
18+
// TODO: Remove and replace with !isDigit() function call
2219
export const nonDigitRe = /[\D]/;
2320

2421
/**

src/string-utils.ts

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,26 @@ const enum Char {
99
Z = 90,
1010
a = 97,
1111
z = 122,
12+
Zero = 48, // char code for '0'
13+
Nine = 57, // char code for '9'
1214
}
1315

16+
/**
17+
* Regular expression that matches an upper or lowercase ASCII letter anywhere in a string.
18+
*
19+
* Do not use for single letter checks. The {@link #isLetterChar} and
20+
* {@link #isLetterCharCode} functions are 10x faster.
21+
*/
22+
export const letterRe = /[A-Za-z]/;
23+
1424
/**
1525
* Determines if the given character is a letter char which matches the RegExp
1626
* `/[A-Za-z]/`
1727
*/
1828
export function isLetterChar(char: string): boolean {
19-
// Previous implementation of this function was using the /[A-Za-z]/ regexp,
20-
// but this is 90% slower than testing by char code ranges as numbers
21-
// according to jsperf
29+
// Previous implementation of this function was just testing against the
30+
// /[A-Za-z]/ regexp, but this is 90% slower than testing by char code
31+
// ranges as numbers according to jsperf
2232
//return letterRe.test(char);
2333

2434
return isLetterCharCode(char.charCodeAt(0));

tests/string-utils.spec.ts

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import { expect } from 'chai';
2+
import { isLetterChar, isLetterCharCode } from '../src/string-utils';
3+
4+
describe(`isLetterChar()`, () => {
5+
it(`when given letter characters A-Z and a-z, should return true`, () => {
6+
expect(isLetterChar('A')).to.equal(true);
7+
expect(isLetterChar('B')).to.equal(true);
8+
expect(isLetterChar('C')).to.equal(true);
9+
expect(isLetterChar('M')).to.equal(true);
10+
expect(isLetterChar('X')).to.equal(true);
11+
expect(isLetterChar('Y')).to.equal(true);
12+
expect(isLetterChar('Z')).to.equal(true);
13+
14+
expect(isLetterChar('a')).to.equal(true);
15+
expect(isLetterChar('b')).to.equal(true);
16+
expect(isLetterChar('c')).to.equal(true);
17+
expect(isLetterChar('m')).to.equal(true);
18+
expect(isLetterChar('x')).to.equal(true);
19+
expect(isLetterChar('y')).to.equal(true);
20+
expect(isLetterChar('z')).to.equal(true);
21+
});
22+
23+
it(`when given non-letter characters (i.e. not A-Z or a-z), should return false`, () => {
24+
expect(isLetterChar('1')).to.equal(false);
25+
expect(isLetterChar('5')).to.equal(false);
26+
expect(isLetterChar('9')).to.equal(false);
27+
expect(isLetterChar('!')).to.equal(false);
28+
expect(isLetterChar('[')).to.equal(false); // char between the A-Z and a-z ASCII ranges
29+
expect(isLetterChar('_')).to.equal(false); // char between the A-Z and a-z ASCII ranges
30+
expect(isLetterChar('`')).to.equal(false); // char between the A-Z and a-z ASCII ranges
31+
expect(isLetterChar(' ')).to.equal(false);
32+
expect(isLetterChar('{')).to.equal(false);
33+
expect(isLetterChar('}')).to.equal(false);
34+
expect(isLetterChar(':')).to.equal(false);
35+
expect(isLetterChar(';')).to.equal(false);
36+
expect(isLetterChar('<')).to.equal(false);
37+
expect(isLetterChar('>')).to.equal(false);
38+
expect(isLetterChar('=')).to.equal(false);
39+
expect(isLetterChar('-')).to.equal(false);
40+
});
41+
});
42+
43+
describe(`isLetterCharCode()`, () => {
44+
it(`when given letter characters A-Z and a-z, should return true`, () => {
45+
expect(isLetterCharCode('A'.charCodeAt(0))).to.equal(true);
46+
expect(isLetterCharCode('B'.charCodeAt(0))).to.equal(true);
47+
expect(isLetterCharCode('C'.charCodeAt(0))).to.equal(true);
48+
expect(isLetterCharCode('M'.charCodeAt(0))).to.equal(true);
49+
expect(isLetterCharCode('X'.charCodeAt(0))).to.equal(true);
50+
expect(isLetterCharCode('Y'.charCodeAt(0))).to.equal(true);
51+
expect(isLetterCharCode('Z'.charCodeAt(0))).to.equal(true);
52+
53+
expect(isLetterCharCode('a'.charCodeAt(0))).to.equal(true);
54+
expect(isLetterCharCode('b'.charCodeAt(0))).to.equal(true);
55+
expect(isLetterCharCode('c'.charCodeAt(0))).to.equal(true);
56+
expect(isLetterCharCode('m'.charCodeAt(0))).to.equal(true);
57+
expect(isLetterCharCode('x'.charCodeAt(0))).to.equal(true);
58+
expect(isLetterCharCode('y'.charCodeAt(0))).to.equal(true);
59+
expect(isLetterCharCode('z'.charCodeAt(0))).to.equal(true);
60+
});
61+
62+
it(`when given non-letter characters (i.e. not A-Z or a-z), should return false`, () => {
63+
expect(isLetterCharCode('1'.charCodeAt(0))).to.equal(false);
64+
expect(isLetterCharCode('5'.charCodeAt(0))).to.equal(false);
65+
expect(isLetterCharCode('9'.charCodeAt(0))).to.equal(false);
66+
expect(isLetterCharCode('!'.charCodeAt(0))).to.equal(false);
67+
expect(isLetterCharCode('['.charCodeAt(0))).to.equal(false); // char between the A-Z and a-z ASCII ranges
68+
expect(isLetterCharCode('_'.charCodeAt(0))).to.equal(false); // char between the A-Z and a-z ASCII ranges
69+
expect(isLetterCharCode('`'.charCodeAt(0))).to.equal(false); // char between the A-Z and a-z ASCII ranges
70+
expect(isLetterCharCode(' '.charCodeAt(0))).to.equal(false);
71+
expect(isLetterCharCode('{'.charCodeAt(0))).to.equal(false);
72+
expect(isLetterCharCode('}'.charCodeAt(0))).to.equal(false);
73+
expect(isLetterCharCode(':'.charCodeAt(0))).to.equal(false);
74+
expect(isLetterCharCode(';'.charCodeAt(0))).to.equal(false);
75+
expect(isLetterCharCode('<'.charCodeAt(0))).to.equal(false);
76+
expect(isLetterCharCode('>'.charCodeAt(0))).to.equal(false);
77+
expect(isLetterCharCode('='.charCodeAt(0))).to.equal(false);
78+
expect(isLetterCharCode('-'.charCodeAt(0))).to.equal(false);
79+
});
80+
});

0 commit comments

Comments
 (0)