Update parser whitespace for clarity (#8539)

* Update parser whitespace for clarity

  Has the nice benefit of not requiring a charCode-to-String conversion.

* Remove test262 exceptions

  Per https://github.com/tc39/ecma262/pull/1218#issuecomment-395340891, ECMA-262 follows whatever the latest Unicode version specifies for the Zs (Space_Separator) category. MONGOLIAN VOWEL SEPARATOR (U+180E) was moved to the Cf (Other_Format) category in Unicode 8, so the parser no longer treats it as whitespace.
2018-08-25 19:12:38 -04:00 · 2018-08-25 19:12:38 -04:00 · 5899940156
commit 5899940156
parent 595240f071
3 changed files with 45 additions and 16 deletions
--- a/packages/babel-parser/src/tokenizer/index.js
+++ b/packages/babel-parser/src/tokenizer/index.js
@@ -16,7 +16,7 @@ import {
  lineBreak,
  lineBreakG,
  isNewLine,
-  nonASCIIwhitespace,
+  isWhitespace,
 } from "../util/whitespace";
 import State from "./state";

@@ -331,11 +331,6 @@ export default class Tokenizer extends LocationParser {
    loop: while (this.state.pos < this.input.length) {
      const ch = this.input.charCodeAt(this.state.pos);
      switch (ch) {
-        case charCodes.space:
-        case charCodes.nonBreakingSpace:
-          ++this.state.pos;
-          break;
-
        case charCodes.carriageReturn:
          if (
            this.input.charCodeAt(this.state.pos + 1) === charCodes.lineFeed
@@ -367,11 +362,7 @@ export default class Tokenizer extends LocationParser {
          break;

        default:
-          if (
-            (ch > charCodes.backSpace && ch < charCodes.shiftOut) ||
-            (ch >= charCodes.oghamSpaceMark &&
-              nonASCIIwhitespace.test(String.fromCharCode(ch)))
-          ) {
+          if (isWhitespace(ch)) {
            ++this.state.pos;
          } else {
            break loop;
--- a/packages/babel-parser/src/util/whitespace.js
+++ b/packages/babel-parser/src/util/whitespace.js
@@ -1,13 +1,53 @@
 // @flow

+import * as charCodes from "charcodes";
+
 // Matches a whole line break (where CRLF is considered a single
 // line break). Used to count lines.
-
 export const lineBreak = /\r\n?|\n|\u2028|\u2029/;
 export const lineBreakG = new RegExp(lineBreak.source, "g");

+// https://tc39.github.io/ecma262/#sec-line-terminators
 export function isNewLine(code: number): boolean {
-  return code === 10 || code === 13 || code === 0x2028 || code === 0x2029;
+  switch (code) {
+    case charCodes.lineFeed:
+    case charCodes.carriageReturn:
+    case charCodes.lineSeparator:
+    case charCodes.paragraphSeparator:
+      return true;
+
+    default:
+      return false;
+  }
 }

-export const nonASCIIwhitespace = /[\u1680\u180e\u2000-\u200a\u202f\u205f\u3000\ufeff]/;
+// https://tc39.github.io/ecma262/#sec-white-space
+export function isWhitespace(code: number): boolean {
+  switch (code) {
+    case 0x0009: // CHARACTER TABULATION
+    case 0x000b: // LINE TABULATION
+    case 0x000c: // FORM FEED
+    case charCodes.space:
+    case charCodes.nonBreakingSpace:
+    case charCodes.oghamSpaceMark:
+    case 0x2000: // EN QUAD
+    case 0x2001: // EM QUAD
+    case 0x2002: // EN SPACE
+    case 0x2003: // EM SPACE
+    case 0x2004: // THREE-PER-EM SPACE
+    case 0x2005: // FOUR-PER-EM SPACE
+    case 0x2006: // SIX-PER-EM SPACE
+    case 0x2007: // FIGURE SPACE
+    case 0x2008: // PUNCTUATION SPACE
+    case 0x2009: // THIN SPACE
+    case 0x200a: // HAIR SPACE
+    case 0x202f: // NARROW NO-BREAK SPACE
+    case 0x205f: // MEDIUM MATHEMATICAL SPACE
+    case 0x3000: // IDEOGRAPHIC SPACE
+    case 0xfeff: // ZERO WIDTH NO-BREAK SPACE
+      return true;
+
+    default:
+      return false;
+  }
+}
--- a/scripts/tests/test262/test262_whitelist.txt
+++ b/scripts/tests/test262/test262_whitelist.txt
@@ -683,8 +683,6 @@ language/statements/while/labelled-fn-stmt.js(default)
 language/statements/with/decl-async-fun.js(default)
 language/statements/with/decl-async-gen.js(default)
 language/statements/with/labelled-fn-stmt.js(default)
-language/white-space/mongolian-vowel-separator.js(default)
-language/white-space/mongolian-vowel-separator.js(strict mode)

 language/identifiers/unicode-escape-nls-err.js(default)
 language/identifiers/unicode-escape-nls-err.js(strict mode)