perf: minor tokenizer tweaks (#13652)

2021-08-09 21:20:44 +02:00 · 2021-08-09 21:20:44 +02:00 · da1d166ea6
commit da1d166ea6
parent 8a09993e39
4 changed files with 34 additions and 21 deletions
--- a/packages/babel-parser/src/parser/statement.js
+++ b/packages/babel-parser/src/parser/statement.js
@@ -201,11 +201,10 @@ export default class StatementParser extends ExpressionParser {

    if (isIdentifierStart(nextCh)) {
      keywordRelationalOperator.lastIndex = next;
-      const matched = keywordRelationalOperator.exec(this.input);
-      if (matched !== null) {
+      if (keywordRelationalOperator.test(this.input)) {
        // We have seen `in` or `instanceof` so far, now check if the identfier
        // ends here
-        const endCh = this.codePointAtPos(next + matched[0].length);
+        const endCh = this.codePointAtPos(keywordRelationalOperator.lastIndex);
        if (!isIdentifierChar(endCh) && endCh !== charCodes.backslash) {
          return false;
        }
--- a/packages/babel-parser/src/parser/util.js
+++ b/packages/babel-parser/src/parser/util.js
@@ -4,7 +4,7 @@ import { types as tt, TokenType } from "../tokenizer/types";
 import Tokenizer from "../tokenizer";
 import State from "../tokenizer/state";
 import type { Node } from "../types";
-import { lineBreak } from "../util/whitespace";
+import { lineBreak, skipWhiteSpaceToLineBreak } from "../util/whitespace";
 import { isIdentifierChar } from "../util/identifier";
 import ClassScopeHandler from "../util/class-scope";
 import ExpressionScopeHandler from "../util/expression-scope";
@@ -119,9 +119,8 @@ export default class UtilParser extends Tokenizer {
  }

  hasFollowingLineBreak(): boolean {
-    return lineBreak.test(
-      this.input.slice(this.state.end, this.nextTokenStart()),
-    );
+    skipWhiteSpaceToLineBreak.lastIndex = this.state.end;
+    return skipWhiteSpaceToLineBreak.test(this.input);
  }

  // TODO
--- a/packages/babel-parser/src/tokenizer/index.js
+++ b/packages/babel-parser/src/tokenizer/index.js
@@ -237,9 +237,7 @@ export default class Tokenizer extends ParserErrors {

  nextTokenStartSince(pos: number): number {
    skipWhiteSpace.lastIndex = pos;
-    const skip = skipWhiteSpace.exec(this.input);
-    // $FlowIgnore: The skipWhiteSpace ensures to match any string
-    return pos + skip[0].length;
+    return skipWhiteSpace.test(this.input) ? skipWhiteSpace.lastIndex : pos;
  }

  lookaheadCharCode(): number {
@@ -307,18 +305,14 @@ export default class Tokenizer extends ParserErrors {
    let startLoc;
    if (!this.isLookahead) startLoc = this.state.curPosition();
    const start = this.state.pos;
-    const end = this.input.indexOf("*/", this.state.pos + 2);
+    const end = this.input.indexOf("*/", start + 2);
    if (end === -1) throw this.raise(start, Errors.UnterminatedComment);

    this.state.pos = end + 2;
-    lineBreakG.lastIndex = start;
-    let match;
-    while (
-      (match = lineBreakG.exec(this.input)) &&
-      match.index < this.state.pos
-    ) {
+    lineBreakG.lastIndex = start + 2;
+    while (lineBreakG.test(this.input) && lineBreakG.lastIndex <= end) {
      ++this.state.curLine;
-      this.state.lineStart = match.index + match[0].length;
+      this.state.lineStart = lineBreakG.lastIndex;
    }

    // If we are doing a lookahead right now we need to advance the position (above code)
@@ -326,11 +320,10 @@ export default class Tokenizer extends ParserErrors {
    if (this.isLookahead) return;
    /*:: invariant(startLoc) */

-    const value = this.input.slice(start + 2, end);
    const comment = {
      type: "CommentBlock",
-      value: value,
-      start: start,
+      value: this.input.slice(start + 2, end),
+      start,
      end: end + 2,
      loc: new SourceLocation(startLoc, this.state.curPosition()),
    };
--- a/packages/babel-parser/src/util/whitespace.js
+++ b/packages/babel-parser/src/util/whitespace.js
@@ -23,6 +23,28 @@ export function isNewLine(code: number): boolean {

 export const skipWhiteSpace = /(?:\s|\/\/.*|\/\*[^]*?\*\/)*/g;

+export const skipWhiteSpaceInLine =
+  /(?:[^\S\n\r\u2028\u2029]|\/\/.*|\/\*.*?\*\/)*/y;
+
+// Skip whitespace and single-line comments, including /* no newline here */.
+// After this RegExp matches, its lastIndex points to a line terminator, or
+// the start of a multi-line comment (which is effectively a line terminator),
+// or the end of the string.
+export const skipWhiteSpaceToLineBreak = new RegExp(
+  // Unfortunately JS doesn't support Perl's atomic /(?>pattern)/ or
+  // possessive quantifiers, so we use a trick to prevent backtracking
+  // when the look-ahead for line terminator fails.
+  "(?=(" +
+    // Capture the whitespace and comments that should be skipped inside
+    // a look-ahead assertion, and then re-match the group as a unit.
+    skipWhiteSpaceInLine.source +
+    "))\\1" +
+    // Look-ahead for either line terminator, start of multi-line comment,
+    // or end of string.
+    /(?=[\n\r\u2028\u2029]|\/\*(?!.*?\*\/)|$)/.source,
+  "y", // sticky
+);
+
 // https://tc39.github.io/ecma262/#sec-white-space
 export function isWhitespace(code: number): boolean {
  switch (code) {