From da1d166ea6b538aeda1faba0d2438a44f2a0c1af Mon Sep 17 00:00:00 2001
From: Mickey Rose <lightmare@users.noreply.github.com>
Date: Mon, 9 Aug 2021 21:20:44 +0200
Subject: [PATCH] perf: minor tokenizer tweaks (#13652)

---
 packages/babel-parser/src/parser/statement.js |  5 ++---
 packages/babel-parser/src/parser/util.js      |  7 +++---
 packages/babel-parser/src/tokenizer/index.js  | 21 ++++++------------
 packages/babel-parser/src/util/whitespace.js  | 22 +++++++++++++++++++
 4 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/packages/babel-parser/src/parser/statement.js b/packages/babel-parser/src/parser/statement.js
index 5e72f7c094..71bbb711bd 100644
--- a/packages/babel-parser/src/parser/statement.js
+++ b/packages/babel-parser/src/parser/statement.js
@@ -201,11 +201,10 @@ export default class StatementParser extends ExpressionParser {
 
     if (isIdentifierStart(nextCh)) {
       keywordRelationalOperator.lastIndex = next;
-      const matched = keywordRelationalOperator.exec(this.input);
-      if (matched !== null) {
+      if (keywordRelationalOperator.test(this.input)) {
         // We have seen `in` or `instanceof` so far, now check if the identfier
         // ends here
-        const endCh = this.codePointAtPos(next + matched[0].length);
+        const endCh = this.codePointAtPos(keywordRelationalOperator.lastIndex);
         if (!isIdentifierChar(endCh) && endCh !== charCodes.backslash) {
           return false;
         }
diff --git a/packages/babel-parser/src/parser/util.js b/packages/babel-parser/src/parser/util.js
index 648a5743c5..6c73a4b385 100644
--- a/packages/babel-parser/src/parser/util.js
+++ b/packages/babel-parser/src/parser/util.js
@@ -4,7 +4,7 @@ import { types as tt, TokenType } from "../tokenizer/types";
 import Tokenizer from "../tokenizer";
 import State from "../tokenizer/state";
 import type { Node } from "../types";
-import { lineBreak } from "../util/whitespace";
+import { lineBreak, skipWhiteSpaceToLineBreak } from "../util/whitespace";
 import { isIdentifierChar } from "../util/identifier";
 import ClassScopeHandler from "../util/class-scope";
 import ExpressionScopeHandler from "../util/expression-scope";
@@ -119,9 +119,8 @@ export default class UtilParser extends Tokenizer {
   }
 
   hasFollowingLineBreak(): boolean {
-    return lineBreak.test(
-      this.input.slice(this.state.end, this.nextTokenStart()),
-    );
+    skipWhiteSpaceToLineBreak.lastIndex = this.state.end;
+    return skipWhiteSpaceToLineBreak.test(this.input);
   }
 
   // TODO
diff --git a/packages/babel-parser/src/tokenizer/index.js b/packages/babel-parser/src/tokenizer/index.js
index d3cf76f376..85a0a60395 100644
--- a/packages/babel-parser/src/tokenizer/index.js
+++ b/packages/babel-parser/src/tokenizer/index.js
@@ -237,9 +237,7 @@ export default class Tokenizer extends ParserErrors {
 
   nextTokenStartSince(pos: number): number {
     skipWhiteSpace.lastIndex = pos;
-    const skip = skipWhiteSpace.exec(this.input);
-    // $FlowIgnore: The skipWhiteSpace ensures to match any string
-    return pos + skip[0].length;
+    return skipWhiteSpace.test(this.input) ? skipWhiteSpace.lastIndex : pos;
   }
 
   lookaheadCharCode(): number {
@@ -307,18 +305,14 @@ export default class Tokenizer extends ParserErrors {
     let startLoc;
     if (!this.isLookahead) startLoc = this.state.curPosition();
     const start = this.state.pos;
-    const end = this.input.indexOf("*/", this.state.pos + 2);
+    const end = this.input.indexOf("*/", start + 2);
     if (end === -1) throw this.raise(start, Errors.UnterminatedComment);
 
     this.state.pos = end + 2;
-    lineBreakG.lastIndex = start;
-    let match;
-    while (
-      (match = lineBreakG.exec(this.input)) &&
-      match.index < this.state.pos
-    ) {
+    lineBreakG.lastIndex = start + 2;
+    while (lineBreakG.test(this.input) && lineBreakG.lastIndex <= end) {
       ++this.state.curLine;
-      this.state.lineStart = match.index + match[0].length;
+      this.state.lineStart = lineBreakG.lastIndex;
     }
 
     // If we are doing a lookahead right now we need to advance the position (above code)
@@ -326,11 +320,10 @@ export default class Tokenizer extends ParserErrors {
     if (this.isLookahead) return;
     /*:: invariant(startLoc) */
 
-    const value = this.input.slice(start + 2, end);
     const comment = {
       type: "CommentBlock",
-      value: value,
-      start: start,
+      value: this.input.slice(start + 2, end),
+      start,
       end: end + 2,
       loc: new SourceLocation(startLoc, this.state.curPosition()),
     };
diff --git a/packages/babel-parser/src/util/whitespace.js b/packages/babel-parser/src/util/whitespace.js
index 2fdbc3d423..e13eb5c1f2 100644
--- a/packages/babel-parser/src/util/whitespace.js
+++ b/packages/babel-parser/src/util/whitespace.js
@@ -23,6 +23,28 @@ export function isNewLine(code: number): boolean {
 
 export const skipWhiteSpace = /(?:\s|\/\/.*|\/\*[^]*?\*\/)*/g;
 
+export const skipWhiteSpaceInLine =
+  /(?:[^\S\n\r\u2028\u2029]|\/\/.*|\/\*.*?\*\/)*/y;
+
+// Skip whitespace and single-line comments, including /* no newline here */.
+// After this RegExp matches, its lastIndex points to a line terminator, or
+// the start of a multi-line comment (which is effectively a line terminator),
+// or the end of the string.
+export const skipWhiteSpaceToLineBreak = new RegExp(
+  // Unfortunately JS doesn't support Perl's atomic /(?>pattern)/ or
+  // possessive quantifiers, so we use a trick to prevent backtracking
+  // when the look-ahead for line terminator fails.
+  "(?=(" +
+    // Capture the whitespace and comments that should be skipped inside
+    // a look-ahead assertion, and then re-match the group as a unit.
+    skipWhiteSpaceInLine.source +
+    "))\\1" +
+    // Look-ahead for either line terminator, start of multi-line comment,
+    // or end of string.
+    /(?=[\n\r\u2028\u2029]|\/\*(?!.*?\*\/)|$)/.source,
+  "y", // sticky
+);
+
 // https://tc39.github.io/ecma262/#sec-white-space
 export function isWhitespace(code: number): boolean {
   switch (code) {