Use optimized isLetterChar() where possible. Improvement of ~400 ops/sec on benchmarks

gregjacobs · gregjacobs · commit a3684457c8fb · 2025-04-29T16:50:44.000-04:00
diff --git a/src/htmlParser/parse-html.ts b/src/htmlParser/parse-html.ts
@@ -1,4 +1,5 @@
-import { letterRe, digitRe, whitespaceRe, quoteRe, controlCharsRe } from '../regex-lib';
+import { isLetterChar } from '../string-utils';
+import { digitRe, whitespaceRe, quoteRe, controlCharsRe } from '../regex-lib';
 import { assertNever } from '../utils';
 
 // For debugging: search for other "For debugging" lines
@@ -208,7 +209,7 @@ export function parseHtml(
         } else if (char === '<') {
             // start of another tag (ignore the previous, incomplete one)
             startNewTag();
-        } else if (letterRe.test(char)) {
+        } else if (isLetterChar(char)) {
             // tag name start (and no '/' read)
             state = State.TagName;
             currentTag = new CurrentTag({ ...currentTag, isOpening: true });
@@ -244,7 +245,7 @@ export function parseHtml(
                 name: captureTagName(),
             });
             emitTagAndPreviousTextNode(); // resets to Data state as well
-        } else if (!letterRe.test(char) && !digitRe.test(char) && char !== ':') {
+        } else if (!isLetterChar(char) && !digitRe.test(char) && char !== ':') {
             // Anything else that does not form an html tag. Note: the colon
             // character is accepted for XML namespaced tags
             resetToDataState();
@@ -259,7 +260,7 @@ export function parseHtml(
         if (char === '>') {
             // parse error. Encountered "</>". Skip it without treating as a tag
             resetToDataState();
-        } else if (letterRe.test(char)) {
+        } else if (isLetterChar(char)) {
             state = State.TagName;
         } else {
             // some other non-tag-like character, don't treat this as a tag
diff --git a/src/parser/uri-utils.ts b/src/parser/uri-utils.ts
@@ -1,5 +1,5 @@
-import { alphaNumericAndMarksRe, letterRe, digitRe } from '../regex-lib';
-import { isLetterChar, isLetterCharCode } from '../string-utils';
+import { alphaNumericAndMarksRe, digitRe } from '../regex-lib';
+import { isLetterChar, isLetterCharCode, letterRe } from '../string-utils';
 import { tldRegex } from './known-tlds';
 
 /**
@@ -90,9 +90,7 @@ export const isSchemeStartCharCode: (code: number) => boolean = isLetterCharCode
  * {@link isSchemeStartChar}.
  */
 export function isSchemeChar(char: string): boolean {
-    return (
-        letterRe.test(char) || digitRe.test(char) || char === '+' || char === '-' || char === '.'
-    );
+    return isLetterChar(char) || digitRe.test(char) || char === '+' || char === '-' || char === '.';
 }
 
 /**
@@ -193,6 +191,7 @@ export function isValidSchemeUrl(url: string): boolean {
     //   - git:something ('something' doesn't look like a host)
     //   - version:1.0   ('1.0' doesn't look like a host)
     if (host.indexOf('.') === -1 || !letterRe.test(host)) {
+        // Use `letterRe` here, not isLetterChar(): it must find a letter anywhere in `host`
         return false;
     }
     return true;
diff --git a/src/regex-lib.ts b/src/regex-lib.ts
@@ -6,19 +6,16 @@
  * regular expressions that are shared between source files.
  */
 
-/**
- * Regular expression to match upper and lowercase ASCII letters
- */
-export const letterRe = /[A-Za-z]/;
-
 /**
  * Regular expression to match ASCII digits
  */
+// TODO: Remove and replace with isDigit() function
 export const digitRe = /[\d]/;
 
 /**
  * Regular expression to match everything *except* ASCII digits
  */
+// TODO: Remove and replace with !isDigit() function call
 export const nonDigitRe = /[\D]/;
 
 /**
diff --git a/src/string-utils.ts b/src/string-utils.ts
@@ -9,16 +9,26 @@ const enum Char {
     Z = 90,
     a = 97,
     z = 122,
+    Zero = 48, // char code for '0'
+    Nine = 57, // char code for '9'
 }
 
+/**
+ * Regular expression matching a single upper- or lowercase ASCII letter. It is
+ * unanchored, so `test()` tells whether a string contains a letter anywhere.
+ * Do not use for single-character checks: the {@link isLetterChar} and
+ * {@link isLetterCharCode} functions are 10x faster.
+ */
+export const letterRe = /[A-Za-z]/;
+
 /**
  * Determines if the given character is a letter char which matches the RegExp
  * `/[A-Za-z]/`
  */
 export function isLetterChar(char: string): boolean {
-    // Previous implementation of this function was using the /[A-Za-z]/ regexp,
-    // but this is 90% slower than testing by char code ranges as numbers
-    // according to jsperf
+    // Previous implementation of this function was just testing against the
+    // /[A-Za-z]/ regexp, but this is 90% slower than testing by char code
+    // ranges as numbers according to jsperf
     //return letterRe.test(char);
 
     return isLetterCharCode(char.charCodeAt(0));
diff --git a/tests/string-utils.spec.ts b/tests/string-utils.spec.ts
@@ -0,0 +1,80 @@
+import { expect } from 'chai';
+import { isLetterChar, isLetterCharCode } from '../src/string-utils';
+
+describe(`isLetterChar()`, () => {
+    it(`when given letter characters A-Z and a-z, should return true`, () => {
+        expect(isLetterChar('A')).to.equal(true);
+        expect(isLetterChar('B')).to.equal(true);
+        expect(isLetterChar('C')).to.equal(true);
+        expect(isLetterChar('M')).to.equal(true);
+        expect(isLetterChar('X')).to.equal(true);
+        expect(isLetterChar('Y')).to.equal(true);
+        expect(isLetterChar('Z')).to.equal(true);
+
+        expect(isLetterChar('a')).to.equal(true);
+        expect(isLetterChar('b')).to.equal(true);
+        expect(isLetterChar('c')).to.equal(true);
+        expect(isLetterChar('m')).to.equal(true);
+        expect(isLetterChar('x')).to.equal(true);
+        expect(isLetterChar('y')).to.equal(true);
+        expect(isLetterChar('z')).to.equal(true);
+    });
+
+    it(`when given non-letter characters (i.e. not A-Z or a-z), should return false`, () => {
+        expect(isLetterChar('1')).to.equal(false);
+        expect(isLetterChar('5')).to.equal(false);
+        expect(isLetterChar('9')).to.equal(false);
+        expect(isLetterChar('!')).to.equal(false);
+        expect(isLetterChar('[')).to.equal(false); // char code 91: first char after 'Z' (90)
+        expect(isLetterChar('_')).to.equal(false); // char code 95: in the gap between 'Z' (90) and 'a' (97)
+        expect(isLetterChar('`')).to.equal(false); // char code 96: last char before 'a' (97)
+        expect(isLetterChar(' ')).to.equal(false);
+        expect(isLetterChar('{')).to.equal(false);
+        expect(isLetterChar('}')).to.equal(false);
+        expect(isLetterChar(':')).to.equal(false);
+        expect(isLetterChar(';')).to.equal(false);
+        expect(isLetterChar('<')).to.equal(false);
+        expect(isLetterChar('>')).to.equal(false);
+        expect(isLetterChar('=')).to.equal(false);
+        expect(isLetterChar('-')).to.equal(false);
+    });
+});
+
+describe(`isLetterCharCode()`, () => {
+    it(`when given letter characters A-Z and a-z, should return true`, () => {
+        expect(isLetterCharCode('A'.charCodeAt(0))).to.equal(true);
+        expect(isLetterCharCode('B'.charCodeAt(0))).to.equal(true);
+        expect(isLetterCharCode('C'.charCodeAt(0))).to.equal(true);
+        expect(isLetterCharCode('M'.charCodeAt(0))).to.equal(true);
+        expect(isLetterCharCode('X'.charCodeAt(0))).to.equal(true);
+        expect(isLetterCharCode('Y'.charCodeAt(0))).to.equal(true);
+        expect(isLetterCharCode('Z'.charCodeAt(0))).to.equal(true);
+
+        expect(isLetterCharCode('a'.charCodeAt(0))).to.equal(true);
+        expect(isLetterCharCode('b'.charCodeAt(0))).to.equal(true);
+        expect(isLetterCharCode('c'.charCodeAt(0))).to.equal(true);
+        expect(isLetterCharCode('m'.charCodeAt(0))).to.equal(true);
+        expect(isLetterCharCode('x'.charCodeAt(0))).to.equal(true);
+        expect(isLetterCharCode('y'.charCodeAt(0))).to.equal(true);
+        expect(isLetterCharCode('z'.charCodeAt(0))).to.equal(true);
+    });
+
+    it(`when given non-letter characters (i.e. not A-Z or a-z), should return false`, () => {
+        expect(isLetterCharCode('1'.charCodeAt(0))).to.equal(false);
+        expect(isLetterCharCode('5'.charCodeAt(0))).to.equal(false);
+        expect(isLetterCharCode('9'.charCodeAt(0))).to.equal(false);
+        expect(isLetterCharCode('!'.charCodeAt(0))).to.equal(false);
+        expect(isLetterCharCode('['.charCodeAt(0))).to.equal(false); // char code 91: first char after 'Z' (90)
+        expect(isLetterCharCode('_'.charCodeAt(0))).to.equal(false); // char code 95: in the gap between 'Z' (90) and 'a' (97)
+        expect(isLetterCharCode('`'.charCodeAt(0))).to.equal(false); // char code 96: last char before 'a' (97)
+        expect(isLetterCharCode(' '.charCodeAt(0))).to.equal(false);
+        expect(isLetterCharCode('{'.charCodeAt(0))).to.equal(false);
+        expect(isLetterCharCode('}'.charCodeAt(0))).to.equal(false);
+        expect(isLetterCharCode(':'.charCodeAt(0))).to.equal(false);
+        expect(isLetterCharCode(';'.charCodeAt(0))).to.equal(false);
+        expect(isLetterCharCode('<'.charCodeAt(0))).to.equal(false);
+        expect(isLetterCharCode('>'.charCodeAt(0))).to.equal(false);
+        expect(isLetterCharCode('='.charCodeAt(0))).to.equal(false);
+        expect(isLetterCharCode('-'.charCodeAt(0))).to.equal(false);
+    });
+});