Update parser whitespace for clarity (#8539)

* Update parser whitespace for clarity

Has the nice benefit of not requiring a charCode to String conversion.

* Remove test262 exceptions

Per https://github.com/tc39/ecma262/pull/1218#issuecomment-395340891,
ECMA262 follows whatever the latest Unicode version specifies for the Zs
(Space_Separator) category. MONGOLIAN VOWEL SEPARATOR was moved to the Cf
(Other_Format) category in Unicode 8.
This commit is contained in:
Justin Ridgewell 2018-08-25 19:12:38 -04:00 committed by GitHub
parent 595240f071
commit 5899940156
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 16 deletions

View File

@ -16,7 +16,7 @@ import {
lineBreak,
lineBreakG,
isNewLine,
nonASCIIwhitespace,
isWhitespace,
} from "../util/whitespace";
import State from "./state";
@ -331,11 +331,6 @@ export default class Tokenizer extends LocationParser {
loop: while (this.state.pos < this.input.length) {
const ch = this.input.charCodeAt(this.state.pos);
switch (ch) {
case charCodes.space:
case charCodes.nonBreakingSpace:
++this.state.pos;
break;
case charCodes.carriageReturn:
if (
this.input.charCodeAt(this.state.pos + 1) === charCodes.lineFeed
@ -367,11 +362,7 @@ export default class Tokenizer extends LocationParser {
break;
default:
if (
(ch > charCodes.backSpace && ch < charCodes.shiftOut) ||
(ch >= charCodes.oghamSpaceMark &&
nonASCIIwhitespace.test(String.fromCharCode(ch)))
) {
if (isWhitespace(ch)) {
++this.state.pos;
} else {
break loop;

View File

@ -1,13 +1,53 @@
// @flow
import * as charCodes from "charcodes";
// Pattern for one complete line terminator sequence. A CR immediately
// followed by an LF is consumed as a single break, which is what lets
// this pattern be used for counting lines.
export const lineBreak = /\r\n?|\n|\u2028|\u2029/;
// Same pattern as `lineBreak`, but carrying the global ("g") flag.
export const lineBreakG = new RegExp(lineBreak, "g");
// https://tc39.github.io/ecma262/#sec-line-terminators
//
// Returns true when `code` equals one of the four line terminator
// constants (line feed, carriage return, line separator, paragraph
// separator) and false for every other value.
//
// The function has a single implementation: the named-constant switch.
// A leftover numeric `return` that used to sit above the switch made the
// switch unreachable and duplicated the same four comparisons.
export function isNewLine(code: number): boolean {
  switch (code) {
    case charCodes.lineFeed:
    case charCodes.carriageReturn:
    case charCodes.lineSeparator:
    case charCodes.paragraphSeparator:
      return true;

    default:
      return false;
  }
}
// Matches the non-ASCII whitespace characters from U+1680 upwards.
// U+180E MONGOLIAN VOWEL SEPARATOR is deliberately not listed, so this
// pattern agrees with isWhitespace(), which does not accept it either.
export const nonASCIIwhitespace = /[\u1680\u2000-\u200a\u202f\u205f\u3000\ufeff]/;
// https://tc39.github.io/ecma262/#sec-white-space
//
// Tells whether `code` is one of the whitespace code points enumerated
// below. Every candidate is compared with strict equality, so only exact
// matches count. Line terminators are not in the list.
export function isWhitespace(code: number): boolean {
  // Tab, vertical tab and form feed, followed by the named constants.
  const isLowRangeSpace =
    code === 0x0009 || // CHARACTER TABULATION
    code === 0x000b || // LINE TABULATION
    code === 0x000c || // FORM FEED
    code === charCodes.space ||
    code === charCodes.nonBreakingSpace ||
    code === charCodes.oghamSpaceMark;
  if (isLowRangeSpace) {
    return true;
  }

  // The contiguous run U+2000 .. U+200A, spelled out value by value.
  const isGeneralPunctuationSpace =
    code === 0x2000 || // EN QUAD
    code === 0x2001 || // EM QUAD
    code === 0x2002 || // EN SPACE
    code === 0x2003 || // EM SPACE
    code === 0x2004 || // THREE-PER-EM SPACE
    code === 0x2005 || // FOUR-PER-EM SPACE
    code === 0x2006 || // SIX-PER-EM SPACE
    code === 0x2007 || // FIGURE SPACE
    code === 0x2008 || // PUNCTUATION SPACE
    code === 0x2009 || // THIN SPACE
    code === 0x200a; // HAIR SPACE
  if (isGeneralPunctuationSpace) {
    return true;
  }

  // Remaining isolated code points.
  return (
    code === 0x202f || // NARROW NO-BREAK SPACE
    code === 0x205f || // MEDIUM MATHEMATICAL SPACE
    code === 0x3000 || // IDEOGRAPHIC SPACE
    code === 0xfeff // ZERO WIDTH NO-BREAK SPACE
  );
}

View File

@ -683,8 +683,6 @@ language/statements/while/labelled-fn-stmt.js(default)
language/statements/with/decl-async-fun.js(default)
language/statements/with/decl-async-gen.js(default)
language/statements/with/labelled-fn-stmt.js(default)
language/white-space/mongolian-vowel-separator.js(default)
language/white-space/mongolian-vowel-separator.js(strict mode)
language/identifiers/unicode-escape-nls-err.js(default)
language/identifiers/unicode-escape-nls-err.js(strict mode)