ezno_parser/
lexer.rs

//! Contains lexing logic for the whole of JS + TypeScript type annotations + JSX + other syntax
//!
//! Uses [`TSXToken`]s for data, uses [Span] for location data. Uses [`tokenizer_lib`] for logic.
4
5#![allow(clippy::as_conversions, clippy::cast_possible_truncation)]
6
7use super::{Span, TSXToken};
8use crate::{
9	errors::LexingErrors, html_tag_contains_literal_content, html_tag_is_self_closing, Comments,
10	Quoted,
11};
12use tokenizer_lib::{sized_tokens::TokenStart, Token, TokenSender};
13
14use derive_finite_automaton::{
15	FiniteAutomata, FiniteAutomataConstructor, GetAutomataStateForValue, GetNextResult,
16};
17
// Empty placeholder module: it currently declares no items.
mod html {}
19
20#[allow(clippy::struct_excessive_bools)]
21pub struct LexerOptions {
22	/// Whether to append tokens when lexing. If false will just ignore
23	pub comments: Comments,
24	/// Whether to parse JSX. TypeScript's `<number> 2` breaks the lexer so this can be disabled to allow
25	/// for that syntax
26	pub lex_jsx: bool,
27	/// TODO temp
28	pub allow_unsupported_characters_in_jsx_attribute_keys: bool,
29	pub allow_expressions_in_jsx: bool,
30	pub top_level_html: bool,
31}
32
33impl Default for LexerOptions {
34	fn default() -> Self {
35		Self {
36			comments: Comments::All,
37			lex_jsx: true,
38			allow_unsupported_characters_in_jsx_attribute_keys: true,
39			allow_expressions_in_jsx: true,
40			top_level_html: false,
41		}
42	}
43}
44
/// Returns `true` if `chr` may directly follow a number literal and therefore terminates it.
///
/// Delimiters are every whitespace character plus the punctuation that can begin the next
/// token (operators, brackets, separators, quotes and `#`). Letters, digits, `_` and `.` are
/// *not* delimiters; the number lexing state handles those itself.
fn is_number_delimiter(chr: char) -> bool {
	// Use `char::is_whitespace` rather than listing ' ', '\n' and '\r' only, so that a tab (or
	// any other whitespace the lexer skips between tokens) ends a number just like a space does.
	chr.is_whitespace()
		|| matches!(
			chr,
			',' | ';'
				| '+' | '-'
				| '*' | '/'
				| '&' | '|'
				| '!' | '^'
				| '(' | '{'
				| '[' | ')'
				| '}' | ']'
				| '%' | '='
				| ':' | '<'
				| '>' | '?'
				| '"' | '\''
				| '`' | '#'
		)
}
65
66/// *Tokenizes* script appending Tokens to `sender` using [TokenSender::push]
67/// `offset` represents the start of the source if script is contained in some larger buffer
68///
69/// Returns () if successful, if runs into lexing error will short-circuit
70///
71/// **MARKERS HAVE TO BE IN FORWARD ORDER**
72#[doc(hidden)]
73pub fn lex_script(
74	script: &str,
75	sender: &mut impl TokenSender<TSXToken, crate::TokenStart>,
76	options: &LexerOptions,
77	offset: Option<u32>,
78) -> Result<(), (LexingErrors, Span)> {
79	#[derive(PartialEq, Debug)]
80	enum JSXAttributeValueDelimiter {
81		None,
82		SingleQuote,
83		DoubleQuote,
84	}
85
86	#[derive(PartialEq, Debug, Eq)]
87	enum JSXTagNameDirection {
88		Opening,
89		Closing,
90	}
91
92	#[derive(PartialEq, Debug)]
93	enum JSXLexingState {
94		/// Only for top level html
95		ExpectingOpenChevron,
96		TagName {
97			direction: JSXTagNameDirection,
98			lexed_start: bool,
99		},
100		/// For lexing the close chevron after the slash in self closing tags
101		SelfClosingTagClose,
102		AttributeKey,
103		AttributeEqual,
104		AttributeValue(JSXAttributeValueDelimiter),
105		Comment,
106		Content,
107		/// For script and style tags
108		LiteralContent {
109			last_char_was_open_chevron: bool,
110		},
111	}
112
113	#[derive(PartialEq, Debug)]
114	enum NumberLiteralType {
115		BinaryLiteral,
116		/// strict mode done at the parse level
117		OctalLiteral,
118		HexadecimalLiteral,
119		/// Base 10
120		Decimal {
121			/// has decimal point
122			fractional: bool,
123		},
124		BigInt,
125		Exponent,
126	}
127
128	impl Default for NumberLiteralType {
129		fn default() -> Self {
130			Self::Decimal { fractional: false }
131		}
132	}
133
134	/// Current parsing state of the lexer.
135	#[derive(PartialEq, Debug)]
136	enum LexingState {
137		None,
138		Identifier,
139		Symbol(GetAutomataStateForValue<TSXToken>),
140		// Literals:
141		Number(NumberLiteralType),
142		String {
143			double_quoted: bool,
144			escaped: bool,
145		},
146		TemplateLiteral {
147			interpolation_depth: u16,
148			last_char_was_dollar: bool,
149			escaped: bool,
150		},
151		JSXLiteral {
152			state: JSXLexingState,
153			interpolation_depth: u16,
154			tag_depth: u16,
155			/// `true` for `script` and `style` tags
156			/// TODO currently isn't handled at all
157			no_inner_tags_or_expressions: bool,
158			is_self_closing_tag: bool,
159		},
160		SingleLineComment,
161		MultiLineComment {
162			last_char_was_star: bool,
163		},
164		RegexLiteral {
165			escaped: bool,
166			/// aka on flags
167			after_last_slash: bool,
168			/// Forward slash while in `[...]` is allowed
169			in_set: bool,
170		},
171	}
172
173	// TODO WIP
174	const DEFAULT_JSX_LEXING_STATE: LexingState = LexingState::JSXLiteral {
175		interpolation_depth: 0,
176		tag_depth: 0,
177		state: JSXLexingState::ExpectingOpenChevron,
178		no_inner_tags_or_expressions: false,
179		is_self_closing_tag: false,
180	};
181	const FIRST_CHEVRON_JSX_LEXING_STATE: LexingState = LexingState::JSXLiteral {
182		interpolation_depth: 0,
183		tag_depth: 0,
184		state: JSXLexingState::TagName {
185			direction: JSXTagNameDirection::Opening,
186			lexed_start: false,
187		},
188		no_inner_tags_or_expressions: false,
189		is_self_closing_tag: false,
190	};
191
192	if script.len() > u32::MAX as usize {
193		return Err((LexingErrors::CannotLoadLargeFile(script.len()), source_map::Nullable::NULL));
194	}
195
196	let mut state: LexingState =
197		if options.top_level_html { DEFAULT_JSX_LEXING_STATE } else { LexingState::None };
198
199	// Used to go back to previous state if was in template literal or JSX literal
200	let mut state_stack: Vec<LexingState> = Vec::new();
201
202	// Used to index the slice (thus no offset)
203	let mut start: usize = 0;
204	let offset = offset.unwrap_or_default();
205
	// This is a sneaky technique for regex and JSX literals. It seems to be almost impossible to determine
	// whether the forward slash in: `/ x` should be a division symbol token or the start of a regex literal. It is
	// important to discern whether it is regex or division at this point, as a regex literal needs to be lexed as a
	// literal rather than a sequence of tokens. Similarly for JSX: is a `<` a less-than comparison or the start of a
	// tag? This variable should be set to true if the last pushed token was `=`, `return` etc, and otherwise set to
	// false.
	// TODO this doesn't work see #165
212	let mut expect_expression = true;
213
214	macro_rules! return_err {
215		($err:expr) => {{
216			sender.push(Token(TSXToken::EOS, TokenStart::new(script.len() as u32)));
217			return Err((
218				$err,
219				Span {
220					start: start as u32 + offset,
221					// TODO + 1
222					end: start as u32 + offset,
223					source: (),
224				},
225			));
226		}};
227	}
228
229	let mut characters = script.char_indices();
230	if script.starts_with("#!") {
231		for (idx, c) in characters.by_ref() {
232			if c == '\n' {
233				sender.push(Token(
234					TSXToken::HashBangComment(script[2..idx].to_owned()),
235					TokenStart::new(0),
236				));
237				break;
238			}
239		}
240	}
241
242	if options.top_level_html && script.starts_with("<!DOCTYPE html>") {
243		for (_idx, c) in characters.by_ref() {
244			if c == '>' {
245				sender.push(Token(TSXToken::DocTypeHTML, TokenStart::new(0)));
246				break;
247			}
248		}
249	}
250
251	for (idx, chr) in characters {
252		// dbg!(chr, &state);
253
254		// Sets current parser state and updates start track
255		macro_rules! set_state {
256			($s:expr) => {{
257				start = idx;
258				state = $s;
259				expect_expression = false;
260			}};
261
262			($s:expr, EXPECT_EXPRESSION: $v:expr) => {{
263				start = idx;
264				state = $s;
265				expect_expression = $v;
266			}};
267		}
268
269		// Pushes a new token
270		macro_rules! push_token {
271			($t:expr $(,)?) => {{
272				let res = sender.push(Token($t, TokenStart::new(start as u32 + offset)));
273				if !res {
274					return Ok(());
275				}
276			}};
277		}
278
279		match state {
280			LexingState::Number(ref mut literal_type) => {
281				match chr {
282					_ if matches!(literal_type, NumberLiteralType::BigInt) => {
283						if is_number_delimiter(chr) {
284							// Content already checked
285							push_token!(TSXToken::NumberLiteral(script[start..idx].to_owned()));
286							set_state!(LexingState::None);
287						} else {
288							return_err!(LexingErrors::UnexpectedEndToNumberLiteral)
289						}
290					}
291					// For binary/hexadecimal/octal literals
292					'b' | 'B' | 'x' | 'X' | 'o' | 'O' if start + 1 == idx => {
293						if script[start..].starts_with('0') {
294							*literal_type = match chr {
295								'b' | 'B' => NumberLiteralType::BinaryLiteral,
296								'o' | 'O' => NumberLiteralType::OctalLiteral,
297								'x' | 'X' => NumberLiteralType::HexadecimalLiteral,
298								_ => unreachable!(),
299							}
300						} else {
301							return_err!(
302								LexingErrors::NumberLiteralBaseSpecifierMustPrecededWithZero
303							);
304						}
305					}
306					'0'..='9' | 'a'..='f' | 'A'..='F' => match literal_type {
307						NumberLiteralType::BinaryLiteral => {
308							if !matches!(chr, '0' | '1') {
309								return_err!(LexingErrors::InvalidNumeralItemBecauseOfLiteralKind)
310							}
311						}
312						NumberLiteralType::OctalLiteral => {
313							if !matches!(chr, '0'..='7') {
314								return_err!(LexingErrors::InvalidNumeralItemBecauseOfLiteralKind)
315							}
316						}
317						// Handling for 'e' & 'E'
318						NumberLiteralType::Decimal { fractional } => {
319							if matches!(chr, 'e' | 'E')
320								&& !(*fractional || script[..idx].ends_with('_'))
321							{
322								*literal_type = NumberLiteralType::Exponent;
323							} else if !chr.is_ascii_digit() {
324								return_err!(LexingErrors::InvalidNumeralItemBecauseOfLiteralKind)
325							}
326						}
327						NumberLiteralType::Exponent => {
328							if !chr.is_ascii_digit() {
329								return_err!(LexingErrors::InvalidNumeralItemBecauseOfLiteralKind)
330							}
331						}
332						// all above allowed
333						NumberLiteralType::HexadecimalLiteral => {}
334						NumberLiteralType::BigInt => unreachable!(),
335					},
336					'.' => {
337						if let NumberLiteralType::Decimal { fractional } = literal_type {
338							if script[..idx].ends_with(['_']) {
339								return_err!(LexingErrors::InvalidUnderscore)
340							} else if *fractional {
341								// Catch for spread token `...`
342								if start + 1 == idx {
343									let automaton = TSXToken::new_automaton();
344									let derive_finite_automaton::GetNextResult::NewState(
345										dot_state_one,
346									) = automaton.get_next('.')
347									else {
348										unreachable!()
349									};
350									let derive_finite_automaton::GetNextResult::NewState(
351										dot_state_two,
352									) = dot_state_one.get_next('.')
353									else {
354										unreachable!()
355									};
356									state = LexingState::Symbol(dot_state_two);
357								} else {
358									return_err!(LexingErrors::SecondDecimalPoint);
359								}
360							} else {
361								*fractional = true;
362							}
363						} else {
364							return_err!(LexingErrors::NumberLiteralCannotHaveDecimalPoint);
365						}
366					}
367					'_' => {
368						let invalid = match literal_type {
369							NumberLiteralType::BinaryLiteral |
370							NumberLiteralType::OctalLiteral |
371							// Second `(idx - start) < 1` is for octal with prefix 0
372							NumberLiteralType::HexadecimalLiteral => {
373								if start + 2 == idx {
374									script[..idx].ends_with(['b', 'B', 'x', 'X', 'o' , 'O'])
375								} else {
376									false
377								}
378							},
379							NumberLiteralType::Decimal { .. } => script[..idx].ends_with('.') || &script[start..idx] == "0",
380							NumberLiteralType::Exponent => script[..idx].ends_with(['e', 'E']),
381							NumberLiteralType::BigInt => false
382						};
383						if invalid {
384							return_err!(LexingErrors::InvalidUnderscore);
385						}
386					}
387					'n' if matches!(
388						literal_type,
389						NumberLiteralType::Decimal { fractional: false }
390					) =>
391					{
392						*literal_type = NumberLiteralType::BigInt;
393					}
394					// `10e-5` is a valid literal
395					'-' if matches!(literal_type, NumberLiteralType::Exponent if script[..idx].ends_with(['e', 'E'])) =>
396						{}
397					chr => {
398						if is_number_delimiter(chr) {
399							// Note not = as don't want to include chr
400							let num_slice = &script[start..idx];
401							if num_slice.trim_end() == "."
402								|| num_slice.ends_with(['x', 'X', 'o', 'O', '_', '-'])
403								|| (!matches!(literal_type, NumberLiteralType::HexadecimalLiteral)
404									&& num_slice.ends_with(['e', 'E', 'b', 'B']))
405							{
406								return_err!(LexingErrors::UnexpectedEndToNumberLiteral)
407							}
408							push_token!(TSXToken::NumberLiteral(num_slice.to_owned()));
409							set_state!(LexingState::None);
410						} else {
411							return_err!(LexingErrors::UnexpectedEndToNumberLiteral)
412						}
413					}
414				}
415			}
416			LexingState::Symbol(symbol_state) => {
417				// TODO if number and state == first dot then do number parsing (should be
418				// done when derive finite automaton gets pattern support)
419				match symbol_state.get_next(chr) {
420					GetNextResult::Result { result, ate_character } => {
421						// Handle comments
422						match result {
423							TSXToken::Comment(_) => {
424								state = LexingState::SingleLineComment;
425								continue;
426							}
427							TSXToken::MultiLineComment(_) => {
428								state = LexingState::MultiLineComment { last_char_was_star: false };
429								continue;
430							}
431							_ => {}
432						}
433						state = LexingState::None;
434						expect_expression = result.is_expression_prefix();
435						if ate_character {
436							push_token!(result);
437							start = idx + chr.len_utf8();
438							continue;
439						}
440
441						push_token!(result);
442						start = idx;
443					}
444					GetNextResult::NewState(new_state) => {
445						state = LexingState::Symbol(new_state);
446					}
447					GetNextResult::InvalidCharacter(err) => {
448						return_err!(LexingErrors::UnexpectedCharacter(err));
449					}
450				}
451			}
452			LexingState::Identifier => match chr {
453				'A'..='Z' | 'a'..='z' | '0'..='9' | '_' | '$' => {}
454				_ => {
455					let token = TSXToken::from_slice(&script[start..idx]);
456					let is_expression_prefix = token.is_expression_prefix();
457					push_token!(token);
458					set_state!(LexingState::None, EXPECT_EXPRESSION: is_expression_prefix);
459				}
460			},
461			LexingState::String { ref mut double_quoted, ref mut escaped } => match chr {
462				'\n' => {
463					return_err!(LexingErrors::NewLineInStringLiteral);
464				}
465				'\'' if !*double_quoted && !*escaped => {
466					push_token!(TSXToken::StringLiteral(
467						script[(start + 1)..idx].to_owned(),
468						Quoted::Single
469					));
470					state = LexingState::None;
471					start = idx + 1;
472					expect_expression = false;
473					continue;
474				}
475				'"' if *double_quoted && !*escaped => {
476					push_token!(TSXToken::StringLiteral(
477						script[(start + 1)..idx].to_owned(),
478						Quoted::Double
479					));
480					state = LexingState::None;
481					start = idx + 1;
482					expect_expression = false;
483					continue;
484				}
485				'\\' if !*escaped => {
486					*escaped = true;
487				}
488				_ => {
489					*escaped = false;
490				}
491			},
492			LexingState::SingleLineComment => {
493				if let '\n' = chr {
494					let content = &script[(start + 2)..idx];
495					if options.comments.should_add_comment(content) {
496						push_token!(TSXToken::Comment(content.trim_end().to_owned()));
497					}
498					set_state!(LexingState::None);
499					continue;
500				}
501			}
502			LexingState::MultiLineComment { ref mut last_char_was_star } => match chr {
503				'/' if *last_char_was_star => {
504					let content = &script[(start + 2)..(idx - 1)];
505					if options.comments.should_add_comment(content) {
506						push_token!(TSXToken::MultiLineComment(content.to_owned()));
507					}
508					set_state!(LexingState::None);
509					continue;
510				}
511				chr => {
512					*last_char_was_star = chr == '*';
513				}
514			},
515			LexingState::RegexLiteral {
516				ref mut escaped,
517				ref mut after_last_slash,
518				ref mut in_set,
519			} => {
520				if *after_last_slash {
521					if !chr.is_alphabetic() {
522						if start != idx {
523							push_token!(TSXToken::RegexFlagLiteral(script[start..idx].to_owned()));
524						}
525						set_state!(LexingState::None);
526					}
527				} else {
528					match chr {
529						'/' if start + 1 == idx => {
530							state = LexingState::SingleLineComment;
531							continue;
532						}
533						'*' if start + 1 == idx => {
534							state = LexingState::MultiLineComment { last_char_was_star: false };
535							continue;
536						}
537						'/' if !*escaped && !*in_set => {
538							push_token!(TSXToken::RegexLiteral(
539								script[(start + 1)..idx].to_owned()
540							));
541							*after_last_slash = true;
542							start = idx + 1;
543						}
544						'\\' if !*escaped => {
545							*escaped = true;
546						}
547						'[' => {
548							*in_set = true;
549						}
550						']' if *in_set => {
551							*in_set = false;
552						}
553						'\n' => {
554							return_err!(LexingErrors::ExpectedEndToRegexLiteral);
555						}
556						_ => {
557							*escaped = false;
558						}
559					}
560				}
561			}
562			LexingState::TemplateLiteral {
563				ref mut last_char_was_dollar,
564				ref mut interpolation_depth,
565				ref mut escaped,
566			} => match chr {
567				'$' if !*escaped => *last_char_was_dollar = true,
568				'{' if *last_char_was_dollar => {
569					if idx > start + 1 {
570						push_token!(TSXToken::TemplateLiteralChunk(
571							script[start..(idx - 1)].to_owned()
572						));
573					}
574					start = idx - 1;
575					push_token!(TSXToken::TemplateLiteralExpressionStart);
576					*interpolation_depth += 1;
577					*last_char_was_dollar = false;
578					state_stack.push(state);
579
580					start = idx + 1;
581					state = LexingState::None;
582					expect_expression = true;
583					continue;
584				}
585				'`' if !*escaped => {
586					if idx > start {
587						push_token!(TSXToken::TemplateLiteralChunk(script[start..idx].to_owned()));
588					}
589					start = idx;
590					push_token!(TSXToken::TemplateLiteralEnd);
591					start = idx + 1;
592					state = LexingState::None;
593					expect_expression = false;
594					continue;
595				}
596				'\\' => {
597					*last_char_was_dollar = false;
598					*escaped = true;
599				}
600				_ => {
601					*last_char_was_dollar = false;
602					*escaped = false;
603				}
604			},
605			LexingState::JSXLiteral {
606				ref mut interpolation_depth,
607				ref mut tag_depth,
608				ref mut no_inner_tags_or_expressions,
609				ref mut is_self_closing_tag,
610				state: ref mut jsx_state,
611			} => {
612				match jsx_state {
613					JSXLexingState::ExpectingOpenChevron => {
614						if chr == '<' {
615							set_state!(FIRST_CHEVRON_JSX_LEXING_STATE);
616						} else if !chr.is_whitespace() {
617							dbg!(chr);
618							return_err!(LexingErrors::ExpectedOpenChevron);
619						}
620					}
621					JSXLexingState::TagName { ref mut direction, ref mut lexed_start } => match chr
622					{
623						// Closing tag
624						'>' if *direction == JSXTagNameDirection::Closing => {
625							*tag_depth = match tag_depth.checked_sub(1) {
626								Some(value) => value,
627								None => {
628									return_err!(LexingErrors::UnbalancedJSXClosingTags);
629								}
630							};
631							if *lexed_start {
632								push_token!(TSXToken::JSXClosingTagName(
633									script[start..idx].trim().to_owned()
634								));
635							} else {
636								push_token!(TSXToken::JSXFragmentEnd);
637							}
638							// If JSX literal range has ended
639							if *tag_depth == 0 {
640								set_state!(LexingState::None);
641								continue;
642							}
643
644							start = idx + 1;
645							*jsx_state = JSXLexingState::Content;
646						}
647						// Fragment start
648						'>' if !*lexed_start => {
649							push_token!(TSXToken::JSXFragmentStart);
650							*jsx_state = JSXLexingState::Content;
651							start = idx + 1;
652							*tag_depth += 1;
653							continue;
654						}
655						// Tag name characters:
656						'A'..='Z' | 'a'..='z' | '0'..='9' => {
657							// Add the opening tag here as know it is not closing
658							if !*lexed_start {
659								match direction {
660									JSXTagNameDirection::Opening => {
661										push_token!(TSXToken::JSXOpeningTagStart);
662										start += 1;
663									}
664									JSXTagNameDirection::Closing => {
665										push_token!(TSXToken::JSXClosingTagStart);
666										start += 2;
667									}
668								}
669								*lexed_start = true;
670							}
671						}
672						'-' => {
673							if start + 1 == idx {
674								// TODO this is really the position rather the character
675								return_err!(LexingErrors::InvalidCharacterInJSXTag('-'))
676							}
677						}
678						// Runs if closing tag </div>
679						'/' if start + 1 == idx => {
680							*direction = JSXTagNameDirection::Closing;
681						}
682						// HTML comments!!!
683						'!' if start + 1 == idx => {
684							*jsx_state = JSXLexingState::Comment;
685						}
686						// Non-tag name character
687						chr => {
688							if *direction == JSXTagNameDirection::Closing {
689								return_err!(LexingErrors::ExpectedJSXEndTag);
690							}
691							let tag_name = script[start..idx].trim();
692							*is_self_closing_tag = html_tag_is_self_closing(tag_name);
693							*no_inner_tags_or_expressions =
694								html_tag_contains_literal_content(tag_name);
695							push_token!(TSXToken::JSXTagName(tag_name.to_owned()));
696							start = idx;
697							*tag_depth += 1;
698							match chr {
699								'/' if *is_self_closing_tag => {
700									*jsx_state = JSXLexingState::SelfClosingTagClose;
701								}
702								'>' => {
703									push_token!(TSXToken::JSXOpeningTagEnd);
704									start = idx + 1;
705									*jsx_state = if *no_inner_tags_or_expressions {
706										JSXLexingState::LiteralContent {
707											last_char_was_open_chevron: false,
708										}
709									} else {
710										JSXLexingState::Content
711									};
712									continue;
713								}
714								chr if chr.is_whitespace() => {
715									*jsx_state = JSXLexingState::AttributeKey;
716								}
717								chr => {
718									return_err!(LexingErrors::InvalidCharacterInJSXTag(chr));
719								}
720							}
721							start = idx + chr.len_utf8();
722						}
723					},
724					JSXLexingState::SelfClosingTagClose => {
725						if chr == '>' {
726							*tag_depth = match tag_depth.checked_sub(1) {
727								Some(value) => value,
728								None => {
729									return_err!(LexingErrors::UnbalancedJSXClosingTags);
730								}
731							};
732							push_token!(TSXToken::JSXSelfClosingTag);
733							start = idx + 1;
734							// If JSX literal range has ended
735							if *tag_depth == 0 {
736								set_state!(LexingState::None);
737							} else {
738								*jsx_state = JSXLexingState::Content;
739							}
740							continue;
741						}
742						return_err!(LexingErrors::ExpectedClosingChevronAtEndOfSelfClosingTag);
743					}
744					JSXLexingState::AttributeKey => match chr {
745						'=' => {
746							if start >= idx {
747								return_err!(LexingErrors::EmptyAttributeName);
748							}
749							let key_slice = script[start..idx].trim();
750							if !key_slice.is_empty() {
751								push_token!(TSXToken::JSXAttributeKey(key_slice.to_owned()));
752							}
753							start = idx;
754							push_token!(TSXToken::JSXAttributeAssign);
755							*jsx_state = JSXLexingState::AttributeEqual;
756							start = idx + 1;
757						}
758						'{' => {
759							push_token!(TSXToken::JSXExpressionStart);
760							*interpolation_depth += 1;
761							state_stack.push(state);
762							set_state!(LexingState::None, EXPECT_EXPRESSION: true);
763							continue;
764						}
765						'/' => {
766							*jsx_state = JSXLexingState::SelfClosingTagClose;
767						}
768						'>' => {
769							// Accounts for <div hidden>
770							if start < idx {
771								push_token!(TSXToken::JSXAttributeKey(
772									script[start..idx].to_owned()
773								));
774							}
775							if *is_self_closing_tag {
776								*tag_depth = match tag_depth.checked_sub(1) {
777									Some(value) => value,
778									None => {
779										return_err!(LexingErrors::UnbalancedJSXClosingTags);
780									}
781								};
782								push_token!(TSXToken::JSXSelfClosingTag);
783								start = idx + 1;
784								// If JSX literal range has ended
785								if *tag_depth == 0 {
786									set_state!(LexingState::None);
787								} else {
788									*jsx_state = JSXLexingState::Content;
789									*is_self_closing_tag = false;
790								}
791							} else {
792								push_token!(TSXToken::JSXOpeningTagEnd);
793								start = idx + 1;
794								*jsx_state = if *no_inner_tags_or_expressions {
795									JSXLexingState::LiteralContent {
796										last_char_was_open_chevron: false,
797									}
798								} else {
799									JSXLexingState::Content
800								};
801							}
802							continue;
803						}
804						chr if chr.is_whitespace() => {
805							if start < idx {
806								push_token!(TSXToken::JSXAttributeKey(
807									script[start..idx].to_owned()
808								));
809							}
810							start = idx + chr.len_utf8();
811						}
812						chr => {
813							let character_allowed = chr.is_alphanumeric()
814								|| chr == '-' || (options
815								.allow_unsupported_characters_in_jsx_attribute_keys
816								&& matches!(
817									chr,
818									'@' | ':' | '.' | '[' | ']' | '+' | '$' | '*' | '%'
819								));
820							if !character_allowed {
821								return_err!(LexingErrors::InvalidCharacterInAttributeKey(chr));
822							}
823						}
824					},
825					JSXLexingState::AttributeEqual => {
826						let delimiter = match chr {
827							'{' if options.allow_expressions_in_jsx => {
828								push_token!(TSXToken::JSXExpressionStart);
829								*interpolation_depth += 1;
830								*jsx_state = JSXLexingState::AttributeKey;
831								state_stack.push(state);
832								set_state!(LexingState::None, EXPECT_EXPRESSION: true);
833								continue;
834							}
835							'"' => JSXAttributeValueDelimiter::DoubleQuote,
836							'\'' => JSXAttributeValueDelimiter::SingleQuote,
837							'>' => {
838								return_err!(LexingErrors::EmptyAttributeName);
839							}
840							_ => JSXAttributeValueDelimiter::None,
841						};
842						*jsx_state = JSXLexingState::AttributeValue(delimiter);
843					}
844					JSXLexingState::AttributeValue(delimiter) => match (delimiter, chr) {
845						(JSXAttributeValueDelimiter::DoubleQuote, '"')
846						| (JSXAttributeValueDelimiter::SingleQuote, '\'') => {
847							push_token!(TSXToken::JSXAttributeValue(
848								script[(start + 1)..idx].to_owned()
849							));
850							*jsx_state = JSXLexingState::AttributeKey;
851							start = idx + 1;
852							continue;
853						}
854						(JSXAttributeValueDelimiter::None, ' ') => {
855							push_token!(TSXToken::JSXAttributeValue(script[start..idx].to_owned()));
856							*jsx_state = JSXLexingState::AttributeKey;
857							start = idx;
858						}
859						(JSXAttributeValueDelimiter::None, '>') => {
860							push_token!(TSXToken::JSXAttributeValue(script[start..idx].to_owned()));
861							if *is_self_closing_tag {
862								*tag_depth = match tag_depth.checked_sub(1) {
863									Some(value) => value,
864									None => {
865										return_err!(LexingErrors::UnbalancedJSXClosingTags);
866									}
867								};
868								push_token!(TSXToken::JSXSelfClosingTag);
869								start = idx + 1;
870								// If JSX literal range has ended
871								if *tag_depth == 0 {
872									set_state!(LexingState::None);
873								} else {
874									*jsx_state = JSXLexingState::Content;
875									*is_self_closing_tag = false;
876								}
877							} else {
878								push_token!(TSXToken::JSXOpeningTagEnd);
879								start = idx + 1;
880								*jsx_state = if *no_inner_tags_or_expressions {
881									JSXLexingState::LiteralContent {
882										last_char_was_open_chevron: false,
883									}
884								} else {
885									JSXLexingState::Content
886								};
887							}
888							continue;
889						}
890						_ => {}
891					},
892					JSXLexingState::Content => {
893						match chr {
894							'<' => {
895								let content_slice = &script[start..idx];
896								if !content_slice.trim().is_empty() {
897									push_token!(TSXToken::JSXContent(content_slice.to_owned()));
898								}
899								*jsx_state = JSXLexingState::TagName {
900									direction: JSXTagNameDirection::Opening,
901									lexed_start: false,
902								};
903								start = idx;
904							}
905							'{' if options.allow_expressions_in_jsx => {
906								let content_slice = &script[start..idx];
907								if !content_slice.trim().is_empty() {
908									push_token!(TSXToken::JSXContent(content_slice.to_owned()));
909								}
910								push_token!(TSXToken::JSXExpressionStart);
911								*interpolation_depth += 1;
912								state_stack.push(state);
913								set_state!(LexingState::None, EXPECT_EXPRESSION: true);
914								continue;
915							}
916							'\n' => {
917								let source = script[start..idx].trim();
918								if !source.is_empty() {
919									push_token!(TSXToken::JSXContent(source.to_owned()));
920									start = idx;
921								}
922								push_token!(TSXToken::JSXContentLineBreak);
923								start = idx + 1;
924							}
925							// Any content
926							_ => {}
927						}
928					}
929					JSXLexingState::LiteralContent { ref mut last_char_was_open_chevron } => {
930						match chr {
931							'<' => {
932								*last_char_was_open_chevron = true;
933							}
934							'/' if *last_char_was_open_chevron => {
935								let end = idx - '<'.len_utf8();
936								let source = script[start..end].trim();
937								if !source.is_empty() {
938									push_token!(TSXToken::JSXContent(source.to_owned()));
939								}
940								start = end;
941								push_token!(TSXToken::JSXClosingTagStart);
942								start = idx + '/'.len_utf8();
943								*jsx_state = JSXLexingState::TagName {
944									direction: JSXTagNameDirection::Closing,
945									lexed_start: true,
946								};
947								*no_inner_tags_or_expressions = false;
948							}
949							_ => {
950								*last_char_was_open_chevron = false;
951							}
952						}
953					}
954					// TODO this will allow for <!--> as a valid comment
955					JSXLexingState::Comment => {
956						if idx - start < 4 {
957							if chr != '-' {
958								return_err!(LexingErrors::ExpectedDashInComment);
959							}
960						} else if chr == '>' && script[..idx].ends_with("--") {
961							push_token!(TSXToken::JSXComment(
962								script[(start + 4)..(idx - 2)].to_owned()
963							));
964							start = idx + 1;
965							if *tag_depth == 0 {
966								set_state!(if options.top_level_html {
967									DEFAULT_JSX_LEXING_STATE
968								} else {
969									LexingState::None
970								});
971							} else {
972								*jsx_state = JSXLexingState::Content;
973							}
974							continue;
975						}
976					}
977				}
978			}
979			LexingState::None => {}
980		}
981
982		// This is done later as state may have been set to none by the matching
983		if state == LexingState::None {
984			match chr {
985				'0' if matches!(script.as_bytes().get(idx + 1), Some(b'0'..=b'7')) => {
986					// strict mode should be done in the parser stage (as that is where context is)
987					set_state!(LexingState::Number(NumberLiteralType::OctalLiteral));
988				}
989				'0'..='9' => set_state!(LexingState::Number(Default::default())),
990				'"' => set_state!(LexingState::String { double_quoted: true, escaped: false }),
991				'\'' => set_state!(LexingState::String { double_quoted: false, escaped: false }),
992				'_' | '$' => {
993					set_state!(LexingState::Identifier);
994				}
995				chr if chr.is_alphabetic() => {
996					set_state!(LexingState::Identifier);
997				}
998				chr if chr.is_whitespace() => {
999					continue;
1000				}
1001				chr => {
1002					// Handles lexing in nested contexts, e.g. JSX and template literals
1003					match (chr, state_stack.last_mut()) {
1004						(
1005							'}',
1006							Some(LexingState::TemplateLiteral {
1007								ref mut interpolation_depth, ..
1008							}),
1009						) => {
1010							*interpolation_depth -= 1;
1011							if *interpolation_depth == 0 {
1012								push_token!(TSXToken::TemplateLiteralExpressionEnd);
1013								start = idx + '}'.len_utf8();
1014								state = state_stack.pop().unwrap();
1015								continue;
1016							}
1017						}
1018						(
1019							'}',
1020							Some(LexingState::JSXLiteral { ref mut interpolation_depth, .. }),
1021						) => {
1022							*interpolation_depth -= 1;
1023							if *interpolation_depth == 0 {
1024								push_token!(TSXToken::JSXExpressionEnd);
1025								start = idx + '}'.len_utf8();
1026								state = state_stack.pop().unwrap();
1027								continue;
1028							}
1029						}
1030						(
1031							'{',
1032							Some(
1033								LexingState::JSXLiteral { ref mut interpolation_depth, .. }
1034								| LexingState::TemplateLiteral {
1035									ref mut interpolation_depth, ..
1036								},
1037							),
1038						) => {
1039							// Handle for if '{' are in the interpolation
1040							*interpolation_depth += 1;
1041						}
1042						(_, _) => {}
1043					}
1044
1045					start = idx;
1046
1047					// Handle regex, JSX literals and template literals
1048					match (expect_expression, chr) {
1049						(_, '`') => {
1050							push_token!(TSXToken::TemplateLiteralStart);
1051							start = idx + 1;
1052							state = LexingState::TemplateLiteral {
1053								interpolation_depth: 0,
1054								last_char_was_dollar: false,
1055								escaped: false,
1056							};
1057						}
1058						(true, '<') if options.lex_jsx => {
1059							set_state!(FIRST_CHEVRON_JSX_LEXING_STATE);
1060						}
1061						(true, '/') => {
1062							state = LexingState::RegexLiteral {
1063								escaped: false,
1064								after_last_slash: false,
1065								in_set: false,
1066							};
1067						}
1068						(true, '.') => {
1069							state = LexingState::Number(NumberLiteralType::Decimal {
1070								fractional: true,
1071							});
1072						}
1073						(_, _) => {
1074							// Else try do a symbol
1075							let automaton = TSXToken::new_automaton();
1076							match automaton.get_next(chr) {
1077								GetNextResult::Result {
1078									result,
1079									ate_character: _, // Should always be true
1080								} => {
1081									expect_expression = result.is_expression_prefix();
1082									push_token!(result);
1083								}
1084								GetNextResult::NewState(new_state) => {
1085									state = LexingState::Symbol(new_state);
1086								}
1087								GetNextResult::InvalidCharacter(err) => {
1088									return_err!(LexingErrors::UnexpectedCharacter(err));
1089								}
1090							}
1091						}
1092					}
1093				}
1094			}
1095		}
1096	}
1097
1098	// If source ends while there is still a parsing state
1099	match state {
1100		LexingState::Number(literal_type) => {
1101			// Just `.` or ends with combination token
1102			if script[start..].trim_end() == "."
1103				|| script.ends_with(['x', 'X', 'o', 'O', '_', '-'])
1104				|| (!matches!(literal_type, NumberLiteralType::HexadecimalLiteral)
1105					&& script.ends_with(['e', 'E', 'b', 'B']))
1106			{
1107				return_err!(LexingErrors::UnexpectedEndToNumberLiteral)
1108			}
1109			sender.push(Token(
1110				TSXToken::NumberLiteral(script[start..].to_owned()),
1111				TokenStart::new(start as u32 + offset),
1112			));
1113		}
1114		LexingState::Identifier => {
1115			sender.push(Token(
1116				TSXToken::from_slice(&script[start..]),
1117				TokenStart::new(start as u32 + offset),
1118			));
1119		}
1120		LexingState::Symbol(symbol_state) => {
1121			// Uses 0 as char to prevent continued matches, this is okay as long as
1122			// there is no 0 char in the finite automata
1123			match symbol_state.get_next(0 as char) {
1124				GetNextResult::Result {
1125					result,
1126					ate_character: _, // Should always be true
1127				} => {
1128					sender.push(Token(result, TokenStart::new(start as u32 + offset)));
1129				}
1130				GetNextResult::NewState(_new_state) => unreachable!(),
1131				GetNextResult::InvalidCharacter(err) => {
1132					return_err!(LexingErrors::UnexpectedCharacter(err));
1133				}
1134			}
1135		}
1136		LexingState::SingleLineComment => {
1137			let content = &script[(start + 2)..];
1138			if options.comments.should_add_comment(content) {
1139				sender.push(Token(
1140					TSXToken::Comment(content.trim_end().to_owned()),
1141					TokenStart::new(start as u32 + offset),
1142				));
1143			}
1144		}
1145		LexingState::MultiLineComment { .. } => {
1146			return_err!(LexingErrors::ExpectedEndToMultilineComment);
1147		}
1148		LexingState::String { .. } => {
1149			return_err!(LexingErrors::ExpectedEndToStringLiteral);
1150		}
1151		// This is okay as the state is not cleared until it finds flags.
1152		LexingState::RegexLiteral { after_last_slash, .. } => {
1153			if after_last_slash {
1154				sender.push(Token(
1155					TSXToken::RegexFlagLiteral(script[start..].to_owned()),
1156					TokenStart::new(start as u32 + offset),
1157				));
1158				sender.push(Token(TSXToken::EOS, TokenStart::new(script.len() as u32)));
1159			} else {
1160				sender.push(Token(TSXToken::EOS, TokenStart::new(script.len() as u32)));
1161				return_err!(LexingErrors::ExpectedEndToRegexLiteral);
1162			}
1163		}
1164		LexingState::JSXLiteral { state, .. } => {
1165			if !matches!(state, JSXLexingState::ExpectingOpenChevron) {
1166				return_err!(LexingErrors::ExpectedEndToJSXLiteral);
1167			}
1168		}
1169		LexingState::TemplateLiteral { .. } => {
1170			return_err!(LexingErrors::ExpectedEndToTemplateLiteral);
1171		}
1172		LexingState::None => {}
1173	}
1174
1175	sender.push(Token(TSXToken::EOS, TokenStart::new(script.len() as u32)));
1176
1177	Ok(())
1178}
ezno_parser/lexer.rs

ezno_parser/
lexer.rs