// @flow /*:: declare var invariant; */ import type { Options } from "../options"; import * as N from "../types"; import * as charCodes from "charcodes"; import { isIdentifierStart, isIdentifierChar } from "../util/identifier"; import { types as tt, keywords as keywordTypes, type TokenType } from "./types"; import { type TokContext, types as ct } from "./context"; import ParserErrors, { Errors, type ErrorTemplate } from "../parser/error"; import { SourceLocation } from "../util/location"; import { lineBreakG, isNewLine, isWhitespace, skipWhiteSpace, } from "../util/whitespace"; import State from "./state"; import type { LookaheadState } from "./state"; const VALID_REGEX_FLAGS = new Set([ charCodes.lowercaseG, charCodes.lowercaseM, charCodes.lowercaseS, charCodes.lowercaseI, charCodes.lowercaseY, charCodes.lowercaseU, charCodes.lowercaseD, ]); // The following character codes are forbidden from being // an immediate sibling of NumericLiteralSeparator _ const forbiddenNumericSeparatorSiblings = { decBinOct: [ charCodes.dot, charCodes.uppercaseB, charCodes.uppercaseE, charCodes.uppercaseO, charCodes.underscore, // multiple separators are not allowed charCodes.lowercaseB, charCodes.lowercaseE, charCodes.lowercaseO, ], hex: [ charCodes.dot, charCodes.uppercaseX, charCodes.underscore, // multiple separators are not allowed charCodes.lowercaseX, ], }; const allowedNumericSeparatorSiblings = {}; allowedNumericSeparatorSiblings.bin = [ // 0 - 1 charCodes.digit0, charCodes.digit1, ]; allowedNumericSeparatorSiblings.oct = [ // 0 - 7 ...allowedNumericSeparatorSiblings.bin, charCodes.digit2, charCodes.digit3, charCodes.digit4, charCodes.digit5, charCodes.digit6, charCodes.digit7, ]; allowedNumericSeparatorSiblings.dec = [ // 0 - 9 ...allowedNumericSeparatorSiblings.oct, charCodes.digit8, charCodes.digit9, ]; allowedNumericSeparatorSiblings.hex = [ // 0 - 9, A - F, a - f, ...allowedNumericSeparatorSiblings.dec, charCodes.uppercaseA, charCodes.uppercaseB, charCodes.uppercaseC, charCodes.uppercaseD, charCodes.uppercaseE, charCodes.uppercaseF, charCodes.lowercaseA, charCodes.lowercaseB, charCodes.lowercaseC, charCodes.lowercaseD, charCodes.lowercaseE, charCodes.lowercaseF, ]; // Object type used to represent tokens. Note that normally, tokens // simply exist as properties on the parser object. This is only // used for the onToken callback and the external tokenizer. export class Token { constructor(state: State) { this.type = state.type; this.value = state.value; this.start = state.start; this.end = state.end; this.loc = new SourceLocation(state.startLoc, state.endLoc); } declare type: TokenType; declare value: any; declare start: number; declare end: number; declare loc: SourceLocation; } // ## Tokenizer export default class Tokenizer extends ParserErrors { // Forward-declarations // parser/util.js /*:: +hasPrecedingLineBreak: () => boolean; +unexpected: (pos?: ?number, messageOrType?: ErrorTemplate | TokenType) => empty; +expectPlugin: (name: string, pos?: ?number) => true; */ isLookahead: boolean; // Token store. tokens: Array = []; constructor(options: Options, input: string) { super(); this.state = new State(); this.state.init(options); this.input = input; this.length = input.length; this.isLookahead = false; } pushToken(token: Token | N.Comment) { // Pop out invalid tokens trapped by try-catch parsing. // Those parsing branches are mainly created by typescript and flow plugins. this.tokens.length = this.state.tokensLength; this.tokens.push(token); ++this.state.tokensLength; } // Move to the next token next(): void { this.checkKeywordEscapes(); if (this.options.tokens) { this.pushToken(new Token(this.state)); } this.state.lastTokEnd = this.state.end; this.state.lastTokStart = this.state.start; this.state.lastTokEndLoc = this.state.endLoc; this.state.lastTokStartLoc = this.state.startLoc; this.nextToken(); } // TODO eat(type: TokenType): boolean { if (this.match(type)) { this.next(); return true; } else { return false; } } // TODO match(type: TokenType): boolean { return this.state.type === type; } /** * Create a LookaheadState from current parser state * * @param {State} state * @returns {LookaheadState} * @memberof Tokenizer */ createLookaheadState(state: State): LookaheadState { return { pos: state.pos, value: null, type: state.type, start: state.start, end: state.end, lastTokEnd: state.end, context: [this.curContext()], inType: state.inType, }; } /** * lookahead peeks the next token, skipping changes to token context and * comment stack. For performance it returns a limited LookaheadState * instead of full parser state. * * The { column, line } Loc info is not included in lookahead since such usage * is rare. Although it may return other location properties e.g. `curLine` and * `lineStart`, these properties are not listed in the LookaheadState interface * and thus the returned value is _NOT_ reliable. * * The tokenizer should make best efforts to avoid using any parser state * other than those defined in LookaheadState * * @returns {LookaheadState} * @memberof Tokenizer */ lookahead(): LookaheadState { const old = this.state; // For performance we use a simpified tokenizer state structure // $FlowIgnore this.state = this.createLookaheadState(old); this.isLookahead = true; this.nextToken(); this.isLookahead = false; const curr = this.state; this.state = old; return curr; } nextTokenStart(): number { return this.nextTokenStartSince(this.state.pos); } nextTokenStartSince(pos: number): number { skipWhiteSpace.lastIndex = pos; const skip = skipWhiteSpace.exec(this.input); // $FlowIgnore: The skipWhiteSpace ensures to match any string return pos + skip[0].length; } lookaheadCharCode(): number { return this.input.charCodeAt(this.nextTokenStart()); } codePointAtPos(pos: number): number { // The implementation is based on // https://source.chromium.org/chromium/chromium/src/+/master:v8/src/builtins/builtins-string-gen.cc;l=1455;drc=221e331b49dfefadbc6fa40b0c68e6f97606d0b3;bpv=0;bpt=1 // We reimplement `codePointAt` because `codePointAt` is a V8 builtin which is not inlined by TurboFan (as of M91) // since `input` is mostly ASCII, an inlined `charCodeAt` wins here let cp = this.input.charCodeAt(pos); if ((cp & 0xfc00) === 0xd800 && ++pos < this.input.length) { const trail = this.input.charCodeAt(pos); if ((trail & 0xfc00) === 0xdc00) { cp = 0x10000 + ((cp & 0x3ff) << 10) + (trail & 0x3ff); } } return cp; } // Toggle strict mode. Re-reads the next number or string to please // pedantic tests (`"use strict"; 010;` should fail). setStrict(strict: boolean): void { this.state.strict = strict; if (strict) { // Throw an error for any string decimal escape found before/immediately // after a "use strict" directive. Strict mode will be set at parse // time for any literals that occur after the next node of the strict // directive. this.state.strictErrors.forEach((message, pos) => /* eslint-disable @babel/development-internal/dry-error-messages */ this.raise(pos, message), ); this.state.strictErrors.clear(); } } curContext(): TokContext { return this.state.context[this.state.context.length - 1]; } // Read a single token, updating the parser object's token-related // properties. nextToken(): void { const curContext = this.curContext(); if (!curContext.preserveSpace) this.skipSpace(); this.state.start = this.state.pos; if (!this.isLookahead) this.state.startLoc = this.state.curPosition(); if (this.state.pos >= this.length) { this.finishToken(tt.eof); return; } if (curContext === ct.template) { this.readTmplToken(); } else { this.getTokenFromCode(this.codePointAtPos(this.state.pos)); } } skipBlockComment(): N.CommentBlock | void { let startLoc; if (!this.isLookahead) startLoc = this.state.curPosition(); const start = this.state.pos; const end = this.input.indexOf("*/", this.state.pos + 2); if (end === -1) throw this.raise(start, Errors.UnterminatedComment); this.state.pos = end + 2; lineBreakG.lastIndex = start; let match; while ( (match = lineBreakG.exec(this.input)) && match.index < this.state.pos ) { ++this.state.curLine; this.state.lineStart = match.index + match[0].length; } // If we are doing a lookahead right now we need to advance the position (above code) // but we do not want to push the comment to the state. if (this.isLookahead) return; /*:: invariant(startLoc) */ const value = this.input.slice(start + 2, end); const comment = { type: "CommentBlock", value: value, start: start, end: end + 2, loc: new SourceLocation(startLoc, this.state.curPosition()), }; if (this.options.tokens) this.pushToken(comment); return comment; } skipLineComment(startSkip: number): N.CommentLine | void { const start = this.state.pos; let startLoc; if (!this.isLookahead) startLoc = this.state.curPosition(); let ch = this.input.charCodeAt((this.state.pos += startSkip)); if (this.state.pos < this.length) { while (!isNewLine(ch) && ++this.state.pos < this.length) { ch = this.input.charCodeAt(this.state.pos); } } // If we are doing a lookahead right now we need to advance the position (above code) // but we do not want to push the comment to the state. if (this.isLookahead) return; /*:: invariant(startLoc) */ const end = this.state.pos; const value = this.input.slice(start + startSkip, end); const comment = { type: "CommentLine", value, start, end, loc: new SourceLocation(startLoc, this.state.curPosition()), }; if (this.options.tokens) this.pushToken(comment); return comment; } // Called at the start of the parse and after every token. Skips // whitespace and comments, and. skipSpace(): void { const spaceStart = this.state.pos; const comments = []; loop: while (this.state.pos < this.length) { const ch = this.input.charCodeAt(this.state.pos); switch (ch) { case charCodes.space: case charCodes.nonBreakingSpace: case charCodes.tab: ++this.state.pos; break; case charCodes.carriageReturn: if ( this.input.charCodeAt(this.state.pos + 1) === charCodes.lineFeed ) { ++this.state.pos; } // fall through case charCodes.lineFeed: case charCodes.lineSeparator: case charCodes.paragraphSeparator: ++this.state.pos; ++this.state.curLine; this.state.lineStart = this.state.pos; break; case charCodes.slash: switch (this.input.charCodeAt(this.state.pos + 1)) { case charCodes.asterisk: { const comment = this.skipBlockComment(); if (comment !== undefined) { this.addComment(comment); comments.push(comment); } break; } case charCodes.slash: { const comment = this.skipLineComment(2); if (comment !== undefined) { this.addComment(comment); comments.push(comment); } break; } default: break loop; } break; default: if (isWhitespace(ch)) { ++this.state.pos; } else if (ch === charCodes.dash && !this.inModule) { const pos = this.state.pos; if ( this.input.charCodeAt(pos + 1) === charCodes.dash && this.input.charCodeAt(pos + 2) === charCodes.greaterThan && (spaceStart === 0 || this.state.lineStart > spaceStart) ) { // A `-->` line comment const comment = this.skipLineComment(3); if (comment !== undefined) { this.addComment(comment); comments.push(comment); } } else { break loop; } } else if (ch === charCodes.lessThan && !this.inModule) { const pos = this.state.pos; if ( this.input.charCodeAt(pos + 1) === charCodes.exclamationMark && this.input.charCodeAt(pos + 2) === charCodes.dash && this.input.charCodeAt(pos + 3) === charCodes.dash ) { // `