import ParseError from "./ParseError";
import SourceLocation from "./SourceLocation";
import {Token} from "./Token";
import type {LexerInterface} from "./Token";
import type Settings from "./Settings";
// Regex source fragments from which the lexer's token pattern is assembled.
// They are kept as strings so they can be spliced into larger patterns.

// A single whitespace character: space, carriage return, newline or tab.
const spaceRegexString = "[ \r\n\t]";

// A backslash followed by one or more ASCII letters or "@".
const controlWordRegexString = "\\\\[a-zA-Z@]+";

// A backslash followed by any one UTF-16 code unit outside the surrogate
// range U+D800-U+DFFF.
const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";

// A control word (captured) together with any whitespace that follows it
// (matched but not captured).
const controlWordWhitespaceRegexString =
    "(" + controlWordRegexString + ")" + spaceRegexString + "*";

// A backslash followed by either a newline, or a run of space/CR/tab with an
// optional newline; any further space/CR/tab after that is swallowed too.
const controlSpaceRegexString = "\\\\(\n|[ \r\t]+\n?)[ \r\t]*";

// One character from the range U+0300-U+036F.
const combiningDiacriticalMarkString = "[\u0300-\u036f]";

// Matches a run of one or more such marks at the very end of a string.
export const combiningDiacriticalMarksEndRegex: RegExp =
    new RegExp(combiningDiacriticalMarkString + "+$");

// The complete token pattern. Capture groups, in order:
//   1: a run of whitespace
//   2: the whitespace part of a control space (backslash + whitespace)
//   3: any other token (the whole alternation listed below)
//   4: the delimiter character of a starred \verb*
//   5: the delimiter character of an unstarred \verb
//   6: a control word, without the whitespace that trails it
const tokenRegexString = [
    "(" + spaceRegexString + "+)",
    controlSpaceRegexString,
    "(" + [
        // One code unit from "!" up to U+FFFF, leaving out the backslash,
        // U+2028-U+2029, U+D800-U+F8FF; then any number of combining marks.
        "[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +
            combiningDiacriticalMarkString + "*",
        // A high surrogate followed by a low surrogate, plus combining marks.
        "[\uD800-\uDBFF][\uDC00-\uDFFF]" +
            combiningDiacriticalMarkString + "*",
        // \verb* up to the next occurrence of its delimiter (group 4).
        "\\\\verb\\*([^]).*?\\4",
        // \verb whose delimiter (group 5) is neither "*" nor an ASCII letter.
        "\\\\verb([^*a-zA-Z]).*?\\5",
        controlWordWhitespaceRegexString,
        controlSymbolRegexString,
    ].join("|") + ")",
].join("|");
/**
 * Splits an input string into tokens, handing out one token per call to
 * `lex()`.
 *
 * Matching is done with a single global ("g") regular expression; its
 * `lastIndex` property doubles as the current read position in `input`.
 */
export default class Lexer implements LexerInterface {
    input: string;
    settings: Settings;
    tokenRegex: RegExp;
    // Category codes keyed by token text. `lex()` itself only reacts to
    // code 14 (start of a comment); any other code is stored but not
    // interpreted by this class.
    catcodes: {[key: string]: number};

    /**
     * @param input The text to tokenize.
     * @param settings Receives a `reportNonstrict` call when a comment runs
     *     to the end of the input without a terminating newline.
     */
    constructor(input: string, settings: Settings) {
        this.input = input;
        this.settings = settings;
        this.tokenRegex = new RegExp(tokenRegexString, 'g');
        this.catcodes = {
            "%": 14, // starts a comment (see lex())
            // NOTE(review): 13 is presumably TeX's "active character"
            // category; nothing in this class reads it.
            "~": 13,
        };
    }

    /**
     * Sets (or overrides) the category code recorded for `char`.
     */
    setCatcode(char: string, code: number) {
        this.catcodes[char] = code;
    }

    /**
     * Lexes and returns the next token, advancing `tokenRegex.lastIndex`
     * past it.
     *
     * Once the read position reaches the end of the input, an "EOF" token
     * with an empty source range is returned. A token whose catcode is 14
     * starts a comment: everything up to and including the next newline is
     * skipped and lexing continues after it. Comments are skipped in a loop
     * rather than by recursion, so an arbitrarily long run of consecutive
     * comment lines cannot exhaust the call stack.
     *
     * @throws {ParseError} If no token matches at the current position.
     */
    lex(): Token {
        const input = this.input;
        for (;;) {
            const pos = this.tokenRegex.lastIndex;
            if (pos === input.length) {
                return new Token("EOF", new SourceLocation(this, pos, pos));
            }
            const match = this.tokenRegex.exec(input);
            // A "g" regex searches forward from lastIndex, so a match that
            // begins later than `pos` means the character at `pos` itself
            // could not be tokenized.
            if (match === null || match.index !== pos) {
                throw new ParseError(
                    `Unexpected character: '${input[pos]}'`,
                    new Token(input[pos],
                        new SourceLocation(this, pos, pos + 1)));
            }
            // Group 6: control word without its trailing whitespace.
            // Group 3: any other non-whitespace token.
            // Group 2: control space, normalized to "\\ ".
            // Otherwise the match was a whitespace run, normalized to " ".
            const text = match[6] || match[3] || (match[2] ? "\\ " : " ");

            if (this.catcodes[text] !== 14) {
                return new Token(text, new SourceLocation(this, pos,
                    this.tokenRegex.lastIndex));
            }

            // Comment: drop the rest of the line and lex again from there.
            const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex);
            if (nlIndex === -1) {
                // No newline left: the comment swallows the remaining input,
                // so the next iteration returns EOF.
                this.tokenRegex.lastIndex = input.length;
                this.settings.reportNonstrict("commentAtEnd",
                    "% comment has no terminating newline; LaTeX would " +
                    "fail because of commenting the end of math mode (e.g. $)");
            } else {
                this.tokenRegex.lastIndex = nlIndex + 1;
            }
        }
    }
}