Faster readRegexp (#13453)

2021-06-10 19:00:21 -04:00
parent 0eb2853732
commit 6c8b2336f6
5 changed files with 82 additions and 31 deletions
--- a/packages/babel-parser/benchmark/large-regexp/bench.mjs
+++ b/packages/babel-parser/benchmark/large-regexp/bench.mjs
@@ -0,0 +1,22 @@
+import Benchmark from "benchmark";
+import baseline from "@babel-baseline/parser";
+import current from "../../lib/index.js";
+import { report } from "../util.mjs";
+
+const suite = new Benchmark.Suite();
+function createInput(length) {
+  return "const a = /" + "[/\\\\]".repeat(length / 4) + "/igsudm";
+}
+function benchCases(name, implementation, options) {
+  for (const length of [256, 512, 1024, 2048]) {
+    const input = createInput(length);
+    suite.add(`${name} ${length}-size RegExp literal `, () => {
+      implementation.parse(input, options);
+    });
+  }
+}
+
+benchCases("baseline", baseline);
+benchCases("current", current);
+
+suite.on("cycle", report).run();
--- a/packages/babel-parser/benchmark/many-small-all-flags-regexp/bench.mjs
+++ b/packages/babel-parser/benchmark/many-small-all-flags-regexp/bench.mjs
@@ -0,0 +1,22 @@
+import Benchmark from "benchmark";
+import baseline from "../../lib/index-v2.js";
+import current from "../../lib/index.js";
+import { report } from "../util.mjs";
+
+const suite = new Benchmark.Suite();
+function createInput(length) {
+  return "/x/dgimsuy;".repeat(length);
+}
+function benchCases(name, implementation, options) {
+  for (const length of [256, 512, 1024, 2048]) {
+    const input = createInput(length);
+    suite.add(`${name} ${length} small regexp literal with all flags`, () => {
+      implementation.parse(input, options);
+    });
+  }
+}
+
+benchCases("baseline", baseline);
+benchCases("current", current);
+
+suite.on("cycle", report).run();
--- a/packages/babel-parser/package.json
+++ b/packages/babel-parser/package.json
@@ -33,7 +33,7 @@
    "node": ">=6.0.0"
  },
  "devDependencies": {
-    "@babel-baseline/parser": "npm:@babel/parser@^7.14.4",
+    "@babel-baseline/parser": "npm:@babel/parser@^7.14.5",
    "@babel/code-frame": "workspace:*",
    "@babel/helper-fixtures": "workspace:*",
    "@babel/helper-validator-identifier": "workspace:*",
--- a/packages/babel-parser/src/tokenizer/index.js
+++ b/packages/babel-parser/src/tokenizer/index.js
@@ -12,7 +12,6 @@ import { type TokContext, types as ct } from "./context";
 import ParserErrors, { Errors, type ErrorTemplate } from "../parser/error";
 import { SourceLocation } from "../util/location";
 import {
-  lineBreak,
  lineBreakG,
  isNewLine,
  isWhitespace,
@@ -21,7 +20,15 @@ import {
 import State from "./state";
 import type { LookaheadState } from "./state";

-const VALID_REGEX_FLAGS = new Set(["g", "m", "s", "i", "y", "u", "d"]);
+const VALID_REGEX_FLAGS = new Set([
+  charCodes.lowercaseG,
+  charCodes.lowercaseM,
+  charCodes.lowercaseS,
+  charCodes.lowercaseI,
+  charCodes.lowercaseY,
+  charCodes.lowercaseU,
+  charCodes.lowercaseD,
+]); // char codes, not one-character strings: look up with a code point

 // The following character codes are forbidden from being
 // an immediate sibling of NumericLiteralSeparator _
@@ -976,53 +983,52 @@ export default class Tokenizer extends ParserErrors {
  readRegexp(): void {
    const start = this.state.start + 1;
    let escaped, inClass;
-    for (;;) {
-      if (this.state.pos >= this.length) {
+    let { pos } = this.state;
+    for (; ; ++pos) {
+      if (pos >= this.length) {
        throw this.raise(start, Errors.UnterminatedRegExp);
      }
-      const ch = this.input.charAt(this.state.pos);
-      if (lineBreak.test(ch)) {
+      const ch = this.input.charCodeAt(pos);
+      if (isNewLine(ch)) {
        throw this.raise(start, Errors.UnterminatedRegExp);
      }
      if (escaped) {
        escaped = false;
      } else {
-        if (ch === "[") {
+        if (ch === charCodes.leftSquareBracket) {
          inClass = true;
-        } else if (ch === "]" && inClass) {
+        } else if (ch === charCodes.rightSquareBracket && inClass) {
          inClass = false;
-        } else if (ch === "/" && !inClass) {
+        } else if (ch === charCodes.slash && !inClass) {
          break;
        }
-        escaped = ch === "\\";
+        escaped = ch === charCodes.backslash;
      }
-      ++this.state.pos;
    }
-    const content = this.input.slice(start, this.state.pos);
-    ++this.state.pos;
+    const content = this.input.slice(start, pos);
+    ++pos;

    let mods = "";

-    while (this.state.pos < this.length) {
-      const char = this.input[this.state.pos];
-      const charCode = this.codePointAtPos(this.state.pos);
+    while (pos < this.length) {
+      const cp = this.codePointAtPos(pos);
+      // It doesn't matter if cp > 0xffff, the loop will either throw or break because we check on cp
+      const char = String.fromCharCode(cp);

-      if (VALID_REGEX_FLAGS.has(char)) {
-        if (mods.indexOf(char) > -1) {
-          this.raise(this.state.pos + 1, Errors.DuplicateRegExpFlags);
+      if (VALID_REGEX_FLAGS.has(cp)) {
+        if (mods.includes(char)) {
+          this.raise(pos + 1, Errors.DuplicateRegExpFlags);
        }
-      } else if (
-        isIdentifierChar(charCode) ||
-        charCode === charCodes.backslash
-      ) {
-        this.raise(this.state.pos + 1, Errors.MalformedRegExpFlags);
+      } else if (isIdentifierChar(cp) || cp === charCodes.backslash) {
+        this.raise(pos + 1, Errors.MalformedRegExpFlags);
      } else {
        break;
      }

-      ++this.state.pos;
+      ++pos;
      mods += char;
    }
+    this.state.pos = pos;

    this.finishToken(tt.regexp, {
      pattern: content,