1use super::{
6 error::{CompilationError, CompilationErrorKind as ErrorKind, ErrorHandler},
7 util::{non_whitespace, VStr},
8 Name, Position, SourceLocation,
9};
10use rustc_hash::FxHashSet;
11use std::{iter::FusedIterator, str::Bytes};
12
13#[cfg(feature = "serde")]
14use serde::Serialize;
15
/// A single attribute scanned from inside a tag.
#[cfg_attr(feature = "serde", derive(Serialize))]
pub struct Attribute<'a> {
    /// Attribute name as it appears in the source.
    pub name: Name<'a>,
    /// Attribute value; `None` when the scanner produced no value for it.
    pub value: Option<AttributeValue<'a>>,
    /// Span from the start of the attribute to the end of its name.
    pub name_loc: SourceLocation,
    /// Span from the start of the attribute to the end of the whole attribute.
    pub location: SourceLocation,
}
23
24impl<'a> Attribute<'a> {
25 pub fn has_empty_val(&self) -> bool {
26 self.value
27 .as_ref()
28 .map_or(true, |v| !v.content.contains(non_whitespace))
29 }
30}
31
/// The value part of an [`Attribute`].
#[cfg_attr(feature = "serde", derive(Serialize))]
pub struct AttributeValue<'a> {
    /// Value text after it has been passed through the scanner's text decoding.
    pub content: VStr<'a>,
    /// Span measured from where the value starts to the scanner position
    /// once the value has been consumed.
    pub location: SourceLocation,
}
37
/// A scanned tag: its name, its attributes and whether it ended with `/>`.
#[cfg_attr(feature = "serde", derive(Serialize))]
pub struct Tag<'a> {
    /// Tag name as written in the source.
    pub name: Name<'a>,
    /// Attributes in source order; duplicates by name are dropped by the scanner.
    pub attributes: Vec<Attribute<'a>>,
    /// `true` when the tag was closed with `/>` rather than `>`.
    pub self_closing: bool,
}
45
/// One lexical unit produced by the scanner.
#[cfg_attr(feature = "serde", derive(Serialize))]
pub enum Token<'a> {
    /// An opening tag together with its attributes.
    StartTag(Tag<'a>),
    /// A closing tag; only the tag name is kept.
    EndTag(Name<'a>),
    /// A run of text.
    Text(VStr<'a>),
    /// Comment text, also used for bogus comments such as `<!DOCTYPE ...>`.
    Comment(&'a str),
    /// The source between the opening and closing interpolation delimiters.
    Interpolation(&'a str),
}
60
61impl<'a> From<&'a str> for Token<'a> {
63 fn from(decoded: &'a str) -> Self {
64 Token::Text(VStr::raw(decoded))
65 }
66}
67
/// Configuration for a [`Scanner`].
#[derive(Clone)]
pub struct ScanOption {
    /// `(opening, closing)` delimiters that surround an interpolation.
    pub delimiters: (String, String),
    /// Called with the name of each scanned start tag; the returned mode is
    /// used to scan the content that follows that tag.
    pub get_text_mode: fn(&str) -> TextMode,
}
74
75impl Default for ScanOption {
76 fn default() -> Self {
77 Self {
78 delimiters: ("{{".into(), "}}".into()),
79 get_text_mode: |_| TextMode::Data,
80 }
81 }
82}
83
/// Lets a consumer tell the token source whether it is currently inside the
/// HTML namespace, which the `Tokens` implementation in this file uses to
/// decide how `<![CDATA[` is scanned.
pub trait FlagCDataNs {
    /// Records whether the scanner is currently in the HTML namespace.
    fn set_is_in_html(&mut self, flag: bool);
    /// Reports whether the flag is needed; the `Tokens` implementation returns
    /// `true` when the remaining source still contains `<![CDATA[`.
    fn need_flag_hint(&self) -> bool;
}
104
/// Access to source positions of a token source.
pub trait Locatable {
    /// Position the scanner has reached so far.
    fn current_position(&self) -> Position;
    /// Position saved before the most recent token was scanned
    /// (the `Tokens` implementation records it at the start of `next`).
    fn last_position(&self) -> Position;
    /// Builds a location spanning from `start` to the current position.
    fn get_location_from(&self, start: Position) -> SourceLocation;
}
113
/// Scanning mode that selects which routine produces the next token.
#[derive(PartialEq, Eq)]
pub enum TextMode {
    /// Regular content: tags, comments, interpolations and text are recognized.
    Data,
    /// Content where only interpolations and text are produced, up to the
    /// end tag matching the last start tag.
    RcData,
    /// Content returned verbatim up to the end tag matching the last start tag.
    RawText,
}
126
/// Reusable scanner configuration from which token streams are created.
pub struct Scanner {
    // Options cloned into every token stream created by `scan`.
    option: ScanOption,
    // First character of the opening delimiter, cached at construction time.
    delimiter_first_char: char,
}
131
132impl Scanner {
134 pub fn new(option: ScanOption) -> Self {
135 let delimiters = &option.delimiters;
136 let delimiter_first_char = delimiters
137 .0
138 .chars()
139 .next()
140 .expect("interpolation delimiter cannot be empty");
141 Self {
142 option,
143 delimiter_first_char,
144 }
145 }
146 pub fn scan<'a, E>(&self, source: &'a str, err_handle: E) -> impl TokenSource<'a>
147 where
148 E: ErrorHandler,
149 {
150 Tokens {
151 source,
152 err_handle,
153 position: Default::default(),
154 last_pos: Default::default(),
155 mode: TextMode::Data,
156 option: self.option.clone(),
157 last_start_tag_name: None,
158 is_in_html_namespace: true,
159 delimiter_first_char: self.delimiter_first_char,
160 }
161 }
162}
163
/// Token stream over a source string; yields [`Token`]s through `Iterator`.
pub struct Tokens<'a, E: ErrorHandler> {
    // Remaining, not yet consumed part of the input.
    source: &'a str,
    // Receiver for every error emitted during scanning.
    err_handle: E,
    // Position of the first unconsumed character.
    position: Position,
    // Position saved at the start of the most recent `next` call.
    last_pos: Position,
    // Mode used to scan the next token.
    mode: TextMode,
    /// Scanner options: delimiters and the text-mode callback.
    pub option: ScanOption,
    // Name of the last start tag that switched the mode away from `Data`;
    // used to find the matching end tag in RCDATA/RAWTEXT content.
    last_start_tag_name: Option<&'a str>,
    // When true, `<![CDATA[` is reported as an error and scanned as a bogus comment.
    is_in_html_namespace: bool,
    // First character of the opening delimiter.
    delimiter_first_char: char,
}
180
181impl<'a, C: ErrorHandler> Tokens<'a, C> {
187 fn scan_data(&mut self) -> Token<'a> {
190 debug_assert!(self.mode == TextMode::Data);
191 debug_assert!(!self.source.is_empty());
192 let d = self.delimiter_first_char;
193 let mut offset = 0;
194 while let Some(i) = self.source[offset..].find(&['<', d][..]) {
196 if i != 0 {
197 return self.scan_text(i);
199 } else if self.source.starts_with('<') {
200 return self.scan_tag_open();
201 } else if self.source.starts_with(&self.option.delimiters.0) {
202 return self.scan_interpolation();
203 } else {
204 offset = i + 1;
205 }
206 }
207 self.scan_text(self.source.len())
209 }
210
211 fn scan_text(&mut self, size: usize) -> Token<'a> {
213 debug_assert!(matches!(self.mode, TextMode::Data | TextMode::RcData));
214 debug_assert_ne!(size, 0);
215 let src = self.move_by(size);
216 Token::Text(self.decode_text(src, false))
217 }
218
219 fn scan_interpolation(&mut self) -> Token<'a> {
220 let delimiters = &self.option.delimiters;
221 debug_assert!(self.source.starts_with(&delimiters.0));
222 let index = self.source.find(&delimiters.1);
223 if index.is_none() {
224 let src = self.move_by(self.source.len());
225 self.emit_error(ErrorKind::MissingInterpolationEnd);
226 return Token::Interpolation(&src[2..]);
227 }
228 let src = &self.move_by(index.unwrap())[2..];
229 self.move_by(self.option.delimiters.1.len());
230 Token::Interpolation(src)
231 }
232
233 fn scan_tag_open(&mut self) -> Token<'a> {
235 let source = &self.source;
238 if source.starts_with("</") {
239 self.scan_end_tag_open()
240 } else if source.starts_with("<!") {
241 self.scan_comment_and_like()
242 } else if source.starts_with("<?") {
243 self.emit_error(ErrorKind::UnexpectedQuestionMarkInsteadOfTagName);
244 self.scan_bogus_comment()
245 } else if source.len() == 1 {
246 self.move_by(1);
247 self.emit_error(ErrorKind::EofBeforeTagName);
248 Token::from("<")
249 } else if !source[1..].starts_with(ascii_alpha) {
250 self.move_by(1);
253 self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
254 Token::from("<")
255 } else {
256 self.scan_start_tag()
257 }
258 }
259
260 fn scan_start_tag(&mut self) -> Token<'a> {
262 debug_assert!(self.source.starts_with('<'));
263 self.move_by(1);
264 let tag = self.scan_tag_name();
265 let parsing_algorithm = self.option.get_text_mode;
268 self.mode = parsing_algorithm(tag.name);
269 if self.mode != TextMode::Data {
270 self.last_start_tag_name.replace(tag.name);
271 }
272 Token::StartTag(tag)
273 }
274 fn scan_tag_name(&mut self) -> Tag<'a> {
275 debug_assert!(self.source.starts_with(ascii_alpha));
276 let bytes = self.source.bytes();
277 let l = scan_tag_name_length(bytes);
278 debug_assert!(l > 0);
279 let name = self.move_by(l);
280 let attributes = self.scan_attributes();
281 let self_closing = if self.source.is_empty() {
282 self.emit_error(ErrorKind::EofInTag);
283 false
284 } else {
285 self.scan_close_start_tag()
286 };
287 Tag {
288 name,
289 attributes,
290 self_closing,
291 }
292 }
293 fn scan_attributes(&mut self) -> Vec<Attribute<'a>> {
296 let mut attrs = vec![]; let mut set = FxHashSet::default();
298 loop {
299 self.skip_whitespace();
301 if self.is_about_to_close_tag() {
302 return attrs;
303 }
304 if self.did_skip_slash_in_tag() {
305 continue;
306 }
307 let attr = self.scan_attribute();
308 if set.contains(attr.name) {
309 self.emit_error(ErrorKind::DuplicateAttribute);
312 continue;
313 }
314 set.insert(attr.name);
315 attrs.push(attr);
316 }
317 }
318 fn scan_attribute(&mut self) -> Attribute<'a> {
320 debug_assert!(!self.source.is_empty());
321 let start = self.current_position();
322 let name = self.scan_attr_name();
323 let name_loc = self.get_location_from(start.clone());
324 self.skip_whitespace();
326 if self.is_about_to_close_tag()
327 || self.did_skip_slash_in_tag()
328 || !self.source.starts_with('=')
329 {
330 let location = self.get_location_from(start);
331 return Attribute {
332 name,
333 location,
334 name_loc,
335 value: None,
336 };
337 }
338 self.move_by(1); let value = self.scan_attr_value();
340 let location = self.get_location_from(start);
341 Attribute {
342 name,
343 value,
344 name_loc,
345 location,
346 }
347 }
348 fn is_about_to_close_tag(&self) -> bool {
349 let source = &self.source; source.is_empty() || source.starts_with("/>") || source.starts_with('>')
351 }
352 fn did_skip_slash_in_tag(&mut self) -> bool {
353 debug_assert!(!self.source.is_empty());
354 if self.source.starts_with('/') {
355 self.move_by(1);
356 self.emit_error(ErrorKind::UnexpectedSolidusInTag);
357 true
358 } else {
359 false
360 }
361 }
362 fn scan_attr_name(&mut self) -> &'a str {
364 debug_assert!(is_valid_name_char(self.source.as_bytes()[0]));
365 let offset = if self.source.starts_with('=') {
367 self.emit_error(ErrorKind::UnexpectedEqualsSignBeforeAttributeName);
368 1
369 } else {
370 0
371 };
372 let count = self.source[offset..]
373 .bytes()
374 .take_while(|&c| semi_valid_attr_name(c))
375 .count();
376 let src = self.move_by(count + offset);
377 if src.contains(&['<', '"', '\''][..]) {
378 self.emit_error(ErrorKind::UnexpectedCharacterInAttributeName);
379 }
380 src
381 }
382 fn scan_attr_value(&mut self) -> Option<AttributeValue<'a>> {
384 self.skip_whitespace();
385 let source = &self.source;
386 if source.starts_with('>') {
387 self.emit_error(ErrorKind::MissingAttributeValue);
388 return None;
389 }
390 let start = self.current_position();
391 let content = if self.source.starts_with(&['"', '\''][..]) {
392 let c = self.source.chars().next().unwrap();
393 self.scan_quoted_attr_value(c)?
394 } else {
395 self.scan_unquoted_attr_value()?
396 };
397 Some(AttributeValue {
398 content,
399 location: self.get_location_from(start),
400 })
401 }
402 fn scan_quoted_attr_value(&mut self, quote: char) -> Option<VStr<'a>> {
405 debug_assert!(self.source.starts_with(quote));
406 self.move_by(1);
407 let src = if let Some(i) = self.source.find(quote) {
408 let val = if i == 0 { "" } else { self.move_by(i) };
409 self.move_by(1); val
411 } else if !self.source.is_empty() {
412 self.move_by(self.source.len())
413 } else {
414 return None;
415 };
416 if !self.is_about_to_close_tag()
418 && !self.did_skip_slash_in_tag()
419 && self.skip_whitespace() == 0
420 {
421 self.emit_error(ErrorKind::MissingWhitespaceBetweenAttributes);
422 }
423 Some(self.decode_text(src, true))
424 }
425 fn scan_unquoted_attr_value(&mut self) -> Option<VStr<'a>> {
427 let val_len = self
428 .source
429 .bytes()
430 .take_while(semi_valid_unquoted_attr_value)
431 .count();
432 if val_len == 0 {
434 debug_assert!(self.source.is_empty());
437 return None;
438 }
439 let src = self.move_by(val_len);
440 if src.contains(&['"', '\'', '<', '=', '`'][..]) {
441 self.emit_error(ErrorKind::UnexpectedCharacterInUnquotedAttributeValue);
442 }
443 Some(self.decode_text(src, true))
444 }
445
446 fn scan_close_start_tag(&mut self) -> bool {
447 debug_assert!(!self.source.is_empty());
448 if self.source.starts_with("/>") {
449 self.move_by(2);
450 true
451 } else {
452 debug_assert!(self.source.starts_with('>'));
453 self.move_by(1);
454 false
455 }
456 }
457 fn scan_end_tag_open(&mut self) -> Token<'a> {
459 debug_assert!(self.source.starts_with("</"));
460 let source = &self.source;
461 if source.len() == 2 {
462 self.emit_error(ErrorKind::EofBeforeTagName);
463 Token::from(self.move_by(2))
464 } else if source.starts_with("</>") {
465 self.emit_error(ErrorKind::MissingEndTagName);
466 self.move_by(3);
467 Token::from("")
468 } else if !self.source[2..].starts_with(ascii_alpha) {
469 self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
470 self.scan_bogus_comment()
471 } else {
472 self.scan_end_tag()
473 }
474 }
475 fn scan_end_tag(&mut self) -> Token<'a> {
477 debug_assert!(self.source.starts_with("</"));
478 self.move_by(2);
479 let tag = self.scan_tag_name();
482 if !tag.attributes.is_empty() {
484 self.emit_error(ErrorKind::EndTagWithAttributes);
485 }
486 if tag.self_closing {
488 self.emit_error(ErrorKind::EndTagWithTrailingSolidus);
489 }
490 self.mode = TextMode::Data;
492 Token::EndTag(tag.name)
493 }
494
495 fn scan_comment_and_like(&mut self) -> Token<'a> {
497 let s = &self.source;
500 if s.starts_with("<!--") {
501 self.scan_comment()
502 } else if s.starts_with("<!DOCTYPE") {
503 self.scan_bogus_comment()
504 } else if s.starts_with("<![CDATA[") {
505 if self.is_in_html_namespace {
506 self.emit_error(ErrorKind::CDataInHtmlContent);
507 self.scan_bogus_comment()
508 } else {
509 self.scan_cdata()
510 }
511 } else {
512 self.emit_error(ErrorKind::IncorrectlyOpenedComment);
513 self.scan_bogus_comment()
514 }
515 }
516 fn scan_comment(&mut self) -> Token<'a> {
518 debug_assert!(self.source.starts_with("<!--"));
519 let comment_text = self.scan_comment_text();
520 if self.source.is_empty() {
521 self.emit_error(ErrorKind::EofInComment);
522 } else if self.source.starts_with("--!>") {
523 self.emit_error(ErrorKind::IncorrectlyClosedComment);
524 self.move_by(4);
525 } else {
526 debug_assert!(self.source.starts_with("-->"));
527 self.move_by(3);
528 };
529 Token::Comment(comment_text)
530 }
531 fn scan_comment_text(&mut self) -> &'a str {
532 debug_assert!(self.source.starts_with("<!--"));
533 let comment_end = self.source.find("-->").or_else(|| self.source.find("--!>"));
534 let text = if let Some(end) = comment_end {
536 debug_assert!(end >= 2, "first two chars must be <!");
537 if end <= 3 {
539 self.emit_error(ErrorKind::AbruptClosingOfEmptyComment);
540 self.move_by(end);
541 return "";
542 }
543 self.move_by(4); &self.source[..end - 4] } else {
546 self.move_by(4);
548 self.source
549 };
550
551 let mut s = text;
553 while let Some(i) = s.find("<!--") {
554 self.move_by(i + 4);
555 if !self.source.is_empty() {
558 self.emit_error(ErrorKind::NestedComment);
559 }
560 s = &s[i + 4..];
561 }
562 if !s.is_empty() {
564 self.move_by(s.len());
565 }
566 text
567 }
568 #[cold]
569 #[inline(never)]
570 fn scan_bogus_comment(&mut self) -> Token<'a> {
571 let s = &self.source;
581 debug_assert! {
582 s.starts_with("<!") || s.starts_with("<?") ||
583 (
584 s.starts_with("</") &&
585 s[2..].starts_with(|c| {
586 !matches!(c, 'a'..='z'|'A'..='Z'|'>')
587 })
588 )
589 };
590 let start = if s.starts_with("<?") { 1 } else { 2 };
591 let text = if let Some(end) = s.find('>') {
592 let t = &s[start..end];
593 self.move_by(end + 1);
594 t
595 } else {
596 let len = s.len();
597 &self.move_by(len)[start..]
598 };
599 Token::Comment(text)
600 }
601 #[cold]
602 #[inline(never)]
603 fn scan_cdata(&mut self) -> Token<'a> {
604 debug_assert!(self.source.starts_with("<![CDATA["));
605 self.move_by(9);
606 let i = self.source.find("]]>").unwrap_or_else(|| self.source.len());
607 let text = self.move_by(i); if self.source.is_empty() {
609 self.emit_error(ErrorKind::EofInCdata);
610 } else {
611 debug_assert!(self.source.starts_with("]]>"));
612 self.move_by(3);
613 }
614 Token::from(text)
616 }
617
618 fn scan_rawtext(&mut self) -> Token<'a> {
620 debug_assert!(self.mode == TextMode::RawText);
621 debug_assert!(!self.source.is_empty());
622 let end = self.find_appropriate_end();
623 let src = if end == 0 { "" } else { self.move_by(end) };
625 self.mode = TextMode::Data;
626 if src.is_empty() {
627 self.scan_data()
628 } else {
629 Token::from(src)
630 }
631 }
632
633 fn scan_rcdata(&mut self) -> Token<'a> {
634 debug_assert!(self.mode == TextMode::RcData);
635 debug_assert!(!self.source.is_empty());
636 let delimiter = &self.option.delimiters.0;
637 if self.source.starts_with(delimiter) {
638 return self.scan_interpolation();
639 }
640 let end = self.find_appropriate_end();
641 let interpolation_start = self.source.find(delimiter).unwrap_or(end);
642 if interpolation_start < end {
643 debug_assert_ne!(interpolation_start, 0);
644 return self.scan_text(interpolation_start);
645 }
646 self.mode = TextMode::Data;
648 if end > 0 {
649 self.scan_text(end)
650 } else {
651 self.scan_data()
652 }
653 }
654
655 fn find_appropriate_end(&self) -> usize {
657 let tag_name = self
658 .last_start_tag_name
659 .expect("RAWTEXT/RCDATA must appear inside a tag");
660 let len = tag_name.len();
661 let source = self.source; for (i, _) in source.match_indices("</") {
663 let e = i + 2 + len;
666 if e >= source.len() {
668 break;
669 }
670 let is_appropriate_end = source[i + 2..e].eq_ignore_ascii_case(tag_name);
672 let terminated = !is_valid_name_char(source.as_bytes()[e]);
674 if is_appropriate_end && terminated {
675 return i;
677 }
678 }
679 source.len()
680 }
681}
682
683impl<'a, C: ErrorHandler> Tokens<'a, C> {
685 fn emit_error(&self, error_kind: ErrorKind) {
686 let start = self.current_position();
687 let loc = self.get_location_from(start);
688 let err = CompilationError::new(error_kind).with_location(loc);
689 self.err_handle.on_error(err);
690 }
691
/// Wraps `src` with `VStr::raw` and returns the result of calling
/// `decode(is_attr)` on it. Callers in this file pass `true` for attribute
/// values and `false` for text content.
fn decode_text(&self, src: &'a str, is_attr: bool) -> VStr<'a> {
    *VStr::raw(src).decode(is_attr)
}
695
696 fn move_by(&mut self, size: usize) -> &'a str {
701 debug_assert!(size > 0, "scanner must move forward");
702 let mut lines = 0;
703 let mut last_new_line_pos = -1;
704 for (i, c) in self.source[..size].bytes().enumerate() {
705 if c == b'\n' {
706 lines += 1;
707 last_new_line_pos = i as i32;
708 }
709 }
710 let old_source = self.source;
711 self.source = &self.source[size..];
712 let ret = &old_source[..size];
713 let pos = &mut self.position;
715 let offset = ret.chars().count();
716 pos.offset += offset;
717 pos.line += lines;
718 pos.column = if last_new_line_pos == -1 {
719 pos.column + offset as u32
720 } else {
721 ret[last_new_line_pos as usize..].chars().count() as u32
722 };
724 ret
725 }
726
727 fn skip_whitespace(&mut self) -> usize {
728 let idx = self.source.find(non_whitespace);
729 let len = idx.unwrap_or_else(|| self.source.len());
730 if len != 0 {
731 self.move_by(len);
732 }
733 len
734 }
735}
736
/// Returns `true` for ASCII letters `a`-`z` and `A`-`Z`.
#[inline]
fn ascii_alpha(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z')
}
741
742#[inline]
745fn semi_valid_attr_name(c: u8) -> bool {
746 is_valid_name_char(c) && c != b'='
747}
748
/// Returns `true` when the byte may be part of an unquoted attribute value:
/// anything except ASCII whitespace and `>`.
#[inline]
fn semi_valid_unquoted_attr_value(&c: &u8) -> bool {
    !(c == b'>' || c.is_ascii_whitespace())
}
755
/// Returns `true` when the byte may be part of a tag or attribute name:
/// anything except ASCII whitespace, `/` and `>`.
#[inline]
fn is_valid_name_char(c: u8) -> bool {
    !(matches!(c, b'/' | b'>') || c.is_ascii_whitespace())
}
760
761fn scan_tag_name_length(mut bytes: Bytes<'_>) -> usize {
764 let first_char = bytes.next();
765 debug_assert!(first_char.is_some());
766 if !first_char.unwrap().is_ascii_alphabetic() {
767 return 0;
768 }
769 let l = bytes.take_while(|&c| is_valid_name_char(c)).count();
770 l + 1
771}
772
773impl<'a, C: ErrorHandler> Iterator for Tokens<'a, C> {
774 type Item = Token<'a>;
775 fn next(&mut self) -> Option<Self::Item> {
777 if self.source.is_empty() {
778 return None;
779 }
780 self.last_pos = self.current_position();
781 Some(match self.mode {
782 TextMode::Data => self.scan_data(),
783 TextMode::RcData => self.scan_rcdata(),
784 TextMode::RawText => self.scan_rawtext(),
785 })
786 }
787}
788
// `next` returns `None` whenever the remaining source is empty, and nothing
// in this file refills the source, so the iterator stays exhausted.
impl<'a, C: ErrorHandler> FusedIterator for Tokens<'a, C> {}
791
792impl<'a, C: ErrorHandler> FlagCDataNs for Tokens<'a, C> {
793 fn set_is_in_html(&mut self, in_html: bool) {
794 self.is_in_html_namespace = in_html;
795 }
796 fn need_flag_hint(&self) -> bool {
797 self.source.contains("<![CDATA[")
798 }
799}
800
801impl<'a, C: ErrorHandler> Locatable for Tokens<'a, C> {
802 fn current_position(&self) -> Position {
803 self.position.clone()
804 }
805 fn last_position(&self) -> Position {
806 debug_assert! {
807 self.position.offset == 0 ||
808 self.last_pos.offset < self.position.offset
809 };
810 self.last_pos.clone()
811 }
812 fn get_location_from(&self, start: Position) -> SourceLocation {
813 let end = self.current_position();
814 SourceLocation { start, end }
815 }
816}
817
/// Everything a consumer needs from a token stream: a fused iterator of
/// tokens that also exposes the CDATA namespace flag and source positions.
pub trait TokenSource<'a>: FusedIterator<Item = Token<'a>> + FlagCDataNs + Locatable {}
// `Tokens` satisfies all supertraits for any error handler.
impl<'a, C> TokenSource<'a> for Tokens<'a, C> where C: ErrorHandler {}
820
#[cfg(test)]
pub mod test {
    use super::{super::error::test::TestErrorHandler, *};
    // A lone `{` does not open an interpolation with the default `{{`
    // delimiter, so the whole input must come back as a single text token.
    #[test]
    fn test_single_delimiter() {
        let a: Vec<_> = base_scan("{ test }").collect();
        assert_eq!(a.len(), 1);
        assert!(matches!(
            a[0],
            Token::Text(VStr {
                raw: "{ test }",
                ..
            })
        ));
    }

    // Scans `s` with the given options, using `TestErrorHandler` for errors.
    fn scan_with_opt(s: &str, opt: ScanOption) -> impl TokenSource {
        let scanner = Scanner::new(opt);
        let ctx = TestErrorHandler;
        scanner.scan(s, ctx)
    }

    /// Scans `s` with the default options; public so it can be used outside
    /// this module.
    pub fn base_scan(s: &str) -> impl TokenSource {
        scan_with_opt(s, ScanOption::default())
    }
}