vue_compiler_core/
scanner.rs

1//! Vue template tokenization.
2//! The canonical parsing strategy should adhere to the spec below.
3//! https://html.spec.whatwg.org/multipage/parsing.html#tokenization
4
5use super::{
6    error::{CompilationError, CompilationErrorKind as ErrorKind, ErrorHandler},
7    util::{non_whitespace, VStr},
8    Name, Position, SourceLocation,
9};
10use rustc_hash::FxHashSet;
11use std::{iter::FusedIterator, str::Bytes};
12
13#[cfg(feature = "serde")]
14use serde::Serialize;
15
/// A single attribute scanned from a tag.
#[cfg_attr(feature = "serde", derive(Serialize))]
pub struct Attribute<'a> {
    /// Attribute name as it appears in the source.
    pub name: Name<'a>,
    /// Attribute value; `None` when no value was scanned for the attribute.
    pub value: Option<AttributeValue<'a>>,
    /// Location covering the attribute name only.
    pub name_loc: SourceLocation,
    /// Location covering the whole attribute, starting at its name.
    pub location: SourceLocation,
}
23
24impl<'a> Attribute<'a> {
25    pub fn has_empty_val(&self) -> bool {
26        self.value
27            .as_ref()
28            .map_or(true, |v| !v.content.contains(non_whitespace))
29    }
30}
31
/// The value part of an [`Attribute`].
#[cfg_attr(feature = "serde", derive(Serialize))]
pub struct AttributeValue<'a> {
    /// Value text produced by the scanner's `decode_text`.
    pub content: VStr<'a>,
    /// Location of the value in the source.
    pub location: SourceLocation,
}
37
/// Tag is used only for start tag since end tag is bare
#[cfg_attr(feature = "serde", derive(Serialize))]
pub struct Tag<'a> {
    /// Tag name as it appears in the source.
    pub name: Name<'a>,
    /// Attributes in source order; duplicated attribute names are dropped by the scanner.
    pub attributes: Vec<Attribute<'a>>,
    /// `true` when the tag was closed with `/>`.
    pub self_closing: bool,
}
45
/// html token definition is tailored for convenience.
/// https://html.spec.whatwg.org/multipage/parsing.html#tokenization
#[cfg_attr(feature = "serde", derive(Serialize))]
pub enum Token<'a> {
    /// A start tag together with its attributes and self-closing flag.
    StartTag(Tag<'a>),
    /// An end tag, reduced to its name.
    EndTag(Name<'a>), // with no attrs or self_closing flag
    // TODO: investigate if we can postpone decoding to codegen
    // 1. in SSR we don't need to output decoded entities
    // 2. in DOM we can output decoded text during transform
    // 3. parser/IRConverter does not read text content
    /// A run of text.
    Text(VStr<'a>), // merges chars to one str
    /// Comment content without the surrounding comment markers.
    Comment(&'a str),
    /// Content found between the interpolation delimiters.
    Interpolation(&'a str), // Vue specific token
}
60
61// NB: Token::from only takes decoded str
62impl<'a> From<&'a str> for Token<'a> {
63    fn from(decoded: &'a str) -> Self {
64        Token::Text(VStr::raw(decoded))
65    }
66}
67
/// ScanOption defines the configurable pieces used in scanning.
#[derive(Clone)]
pub struct ScanOption {
    /// Opening and closing interpolation delimiters, in that order.
    pub delimiters: (String, String),
    /// Called with a start tag's name to pick the [`TextMode`] used to scan what follows the tag.
    pub get_text_mode: fn(&str) -> TextMode,
}
74
75impl Default for ScanOption {
76    fn default() -> Self {
77        Self {
78            delimiters: ("{{".into(), "}}".into()),
79            get_text_mode: |_| TextMode::Data,
80        }
81    }
82}
83
/// A scanner needs to implement this trait to know if it is_in_html_namespace.
/// A parser tells scanner the current namespace through the trait's method.
// Because parsing CDATA requires scanner to know the parser's state.
// The trait decouples parser state from scanner state.
// The logic is somewhat convoluted in that the parser must handle logic belonging to
// scanner. A parser can skip flagging namespace if need_flag_hint returns false.
// An alternative is to wrap Parser in a RefCell to appease the Rust borrow checker
// minimal case https://play.rust-lang.org/?gist=c5cb2658afbebceacdfc6d387c72e1ab
// but it is either too hard to bypass the borrow checker or it needs too many Rc/RefCell
// Another alternative in Servo's parser:
// https://github.com/servo/html5ever/blob/57eb334c0ffccc6f88d563419f0fbeef6ff5741c/html5ever/src/tokenizer/interface.rs#L98
pub trait FlagCDataNs {
    /// Sets the scanner's is_in_html_namespace flag for CDATA.
    /// NB: Parser should call this method if necessary. See trait comment for details.
    /// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
    fn set_is_in_html(&mut self, flag: bool);
    /// Hints the parser whether flagging is needed. The hint must be conservative:
    /// a false alarm is acceptable but a missed detection is not.
    fn need_flag_hint(&self) -> bool;
}
104
/// This trait produces a compiler's current position and selects a range.
pub trait Locatable {
    /// Returns the scanner's current position in the source.
    fn current_position(&self) -> Position;
    /// Returns the previously recorded position.
    /// In `Tokens` this is the position saved right before the latest token was scanned.
    fn last_position(&self) -> Position;
    /// Returns the scanner's source location from the start position.
    fn get_location_from(&self, start: Position) -> SourceLocation;
}
113
/// TextMode represents different text scanning strategies.
/// e.g. Scanning in script/textarea/div are different.
#[derive(PartialEq, Eq)]
pub enum TextMode {
    //         | Elements | Entities | End sign              | Inside of
    // DATA    | ✔        | ✔        | End tags of ancestors |
    // RCDATA  | ✘        | ✔        | End tag of the parent | <textarea>
    // RAWTEXT | ✘        | ✘        | End tag of the parent | <style>,<script>
    /// Regular content: tags, interpolations and text are all scanned.
    Data,
    /// Text and interpolations up to the end tag matching the last start tag.
    RcData,
    /// Undecoded text up to the end tag matching the last start tag.
    RawText,
}
126
/// Reusable scanner configuration; `scan` creates a token stream for one source string.
pub struct Scanner {
    /// Options cloned into every token stream created by `scan`.
    option: ScanOption,
    /// First char of the opening delimiter, cached at construction time.
    delimiter_first_char: char,
}
131
132// builder methods
133impl Scanner {
134    pub fn new(option: ScanOption) -> Self {
135        let delimiters = &option.delimiters;
136        let delimiter_first_char = delimiters
137            .0
138            .chars()
139            .next()
140            .expect("interpolation delimiter cannot be empty");
141        Self {
142            option,
143            delimiter_first_char,
144        }
145    }
146    pub fn scan<'a, E>(&self, source: &'a str, err_handle: E) -> impl TokenSource<'a>
147    where
148        E: ErrorHandler,
149    {
150        Tokens {
151            source,
152            err_handle,
153            position: Default::default(),
154            last_pos: Default::default(),
155            mode: TextMode::Data,
156            option: self.option.clone(),
157            last_start_tag_name: None,
158            is_in_html_namespace: true,
159            delimiter_first_char: self.delimiter_first_char,
160        }
161    }
162}
163
/// Token stream over one source string; created by `Scanner::scan`.
pub struct Tokens<'a, E: ErrorHandler> {
    /// The part of the source that has not been consumed yet.
    source: &'a str,
    /// Receives every error emitted while scanning.
    err_handle: E,
    /// Position of the first unconsumed char.
    position: Position,
    /// Position saved right before the latest token was scanned.
    last_pos: Position,
    /// Text mode used to scan the next token.
    mode: TextMode,
    /// Scan options (delimiters and text mode selection).
    pub option: ScanOption,
    // following fields are implementation details

    //  appropriate end tag token needs last start tag, if any
    // https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token
    last_start_tag_name: Option<&'a str>,
    // this flag is for handling CDATA in non HTML namespace.
    is_in_html_namespace: bool,
    // first char of the opening delimiter, copied from the Scanner
    delimiter_first_char: char,
}
180
181// scanning methods
182// NB: When storing self.source to a name, prefer using a ref.
183// because Rust ownership can help us to prevent invalid state.
184// e.g. `let src = self.source` causes a stale src after [`move_by`].
185// while `let src= &self.source` forbids any src usage after a mut call.
186impl<'a, C: ErrorHandler> Tokens<'a, C> {
187    // https://html.spec.whatwg.org/multipage/parsing.html#data-state
188    // NB: & is not handled here but instead in `decode_entities`
189    fn scan_data(&mut self) -> Token<'a> {
190        debug_assert!(self.mode == TextMode::Data);
191        debug_assert!(!self.source.is_empty());
192        let d = self.delimiter_first_char;
193        let mut offset = 0;
194        // process html entity & later
195        while let Some(i) = self.source[offset..].find(&['<', d][..]) {
196            if i != 0 {
197                // found non empty text
198                return self.scan_text(i);
199            } else if self.source.starts_with('<') {
200                return self.scan_tag_open();
201            } else if self.source.starts_with(&self.option.delimiters.0) {
202                return self.scan_interpolation();
203            } else {
204                offset = i + 1;
205            }
206        }
207        // return text if no tag or interpolation found
208        self.scan_text(self.source.len())
209    }
210
211    // produces an entity_decoded Text token.
212    fn scan_text(&mut self, size: usize) -> Token<'a> {
213        debug_assert!(matches!(self.mode, TextMode::Data | TextMode::RcData));
214        debug_assert_ne!(size, 0);
215        let src = self.move_by(size);
216        Token::Text(self.decode_text(src, false))
217    }
218
219    fn scan_interpolation(&mut self) -> Token<'a> {
220        let delimiters = &self.option.delimiters;
221        debug_assert!(self.source.starts_with(&delimiters.0));
222        let index = self.source.find(&delimiters.1);
223        if index.is_none() {
224            let src = self.move_by(self.source.len());
225            self.emit_error(ErrorKind::MissingInterpolationEnd);
226            return Token::Interpolation(&src[2..]);
227        }
228        let src = &self.move_by(index.unwrap())[2..];
229        self.move_by(self.option.delimiters.1.len());
230        Token::Interpolation(src)
231    }
232
233    // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
234    fn scan_tag_open(&mut self) -> Token<'a> {
235        // use a ref to &str to ensure source is always valid
236        // that is, source cannot be used after move_by
237        let source = &self.source;
238        if source.starts_with("</") {
239            self.scan_end_tag_open()
240        } else if source.starts_with("<!") {
241            self.scan_comment_and_like()
242        } else if source.starts_with("<?") {
243            self.emit_error(ErrorKind::UnexpectedQuestionMarkInsteadOfTagName);
244            self.scan_bogus_comment()
245        } else if source.len() == 1 {
246            self.move_by(1);
247            self.emit_error(ErrorKind::EofBeforeTagName);
248            Token::from("<")
249        } else if !source[1..].starts_with(ascii_alpha) {
250            // we can indeed merge this standalone < char into surrounding text
251            // but optimization for error is not worth the candle
252            self.move_by(1);
253            self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
254            Token::from("<")
255        } else {
256            self.scan_start_tag()
257        }
258    }
259
260    // https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
261    fn scan_start_tag(&mut self) -> Token<'a> {
262        debug_assert!(self.source.starts_with('<'));
263        self.move_by(1);
264        let tag = self.scan_tag_name();
265        // https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text
266        // Parsing algorithms are always invoked in response to a start tag token.
267        let parsing_algorithm = self.option.get_text_mode;
268        self.mode = parsing_algorithm(tag.name);
269        if self.mode != TextMode::Data {
270            self.last_start_tag_name.replace(tag.name);
271        }
272        Token::StartTag(tag)
273    }
274    fn scan_tag_name(&mut self) -> Tag<'a> {
275        debug_assert!(self.source.starts_with(ascii_alpha));
276        let bytes = self.source.bytes();
277        let l = scan_tag_name_length(bytes);
278        debug_assert!(l > 0);
279        let name = self.move_by(l);
280        let attributes = self.scan_attributes();
281        let self_closing = if self.source.is_empty() {
282            self.emit_error(ErrorKind::EofInTag);
283            false
284        } else {
285            self.scan_close_start_tag()
286        };
287        Tag {
288            name,
289            attributes,
290            self_closing,
291        }
292    }
293    // return attributes and if the tag is self closing
294    // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
295    fn scan_attributes(&mut self) -> Vec<Attribute<'a>> {
296        let mut attrs = vec![]; // TODO: size hint?
297        let mut set = FxHashSet::default();
298        loop {
299            // TODO: forbid infinite loop
300            self.skip_whitespace();
301            if self.is_about_to_close_tag() {
302                return attrs;
303            }
304            if self.did_skip_slash_in_tag() {
305                continue;
306            }
307            let attr = self.scan_attribute();
308            if set.contains(attr.name) {
309                // new attribute must be removed from the token.
310                // NB: original vue compiler does not remove it.
311                self.emit_error(ErrorKind::DuplicateAttribute);
312                continue;
313            }
314            set.insert(attr.name);
315            attrs.push(attr);
316        }
317    }
318    // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
319    fn scan_attribute(&mut self) -> Attribute<'a> {
320        debug_assert!(!self.source.is_empty());
321        let start = self.current_position();
322        let name = self.scan_attr_name();
323        let name_loc = self.get_location_from(start.clone());
324        // 13.2.5.34 After attribute name state, ignore white spaces
325        self.skip_whitespace();
326        if self.is_about_to_close_tag()
327            || self.did_skip_slash_in_tag()
328            || !self.source.starts_with('=')
329        {
330            let location = self.get_location_from(start);
331            return Attribute {
332                name,
333                location,
334                name_loc,
335                value: None,
336            };
337        }
338        self.move_by(1); // equal sign
339        let value = self.scan_attr_value();
340        let location = self.get_location_from(start);
341        Attribute {
342            name,
343            value,
344            name_loc,
345            location,
346        }
347    }
348    fn is_about_to_close_tag(&self) -> bool {
349        let source = &self.source; // must get fresh source
350        source.is_empty() || source.starts_with("/>") || source.starts_with('>')
351    }
352    fn did_skip_slash_in_tag(&mut self) -> bool {
353        debug_assert!(!self.source.is_empty());
354        if self.source.starts_with('/') {
355            self.move_by(1);
356            self.emit_error(ErrorKind::UnexpectedSolidusInTag);
357            true
358        } else {
359            false
360        }
361    }
362    // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
363    fn scan_attr_name(&mut self) -> &'a str {
364        debug_assert!(is_valid_name_char(self.source.as_bytes()[0]));
365        // case like <tag =="value"/>
366        let offset = if self.source.starts_with('=') {
367            self.emit_error(ErrorKind::UnexpectedEqualsSignBeforeAttributeName);
368            1
369        } else {
370            0
371        };
372        let count = self.source[offset..]
373            .bytes()
374            .take_while(|&c| semi_valid_attr_name(c))
375            .count();
376        let src = self.move_by(count + offset);
377        if src.contains(&['<', '"', '\''][..]) {
378            self.emit_error(ErrorKind::UnexpectedCharacterInAttributeName);
379        }
380        src
381    }
382    // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
383    fn scan_attr_value(&mut self) -> Option<AttributeValue<'a>> {
384        self.skip_whitespace();
385        let source = &self.source;
386        if source.starts_with('>') {
387            self.emit_error(ErrorKind::MissingAttributeValue);
388            return None;
389        }
390        let start = self.current_position();
391        let content = if self.source.starts_with(&['"', '\''][..]) {
392            let c = self.source.chars().next().unwrap();
393            self.scan_quoted_attr_value(c)?
394        } else {
395            self.scan_unquoted_attr_value()?
396        };
397        Some(AttributeValue {
398            content,
399            location: self.get_location_from(start),
400        })
401    }
402    // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
403    // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
404    fn scan_quoted_attr_value(&mut self, quote: char) -> Option<VStr<'a>> {
405        debug_assert!(self.source.starts_with(quote));
406        self.move_by(1);
407        let src = if let Some(i) = self.source.find(quote) {
408            let val = if i == 0 { "" } else { self.move_by(i) };
409            self.move_by(1); // consume quote char
410            val
411        } else if !self.source.is_empty() {
412            self.move_by(self.source.len())
413        } else {
414            return None;
415        };
416        // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
417        if !self.is_about_to_close_tag()
418            && !self.did_skip_slash_in_tag()
419            && self.skip_whitespace() == 0
420        {
421            self.emit_error(ErrorKind::MissingWhitespaceBetweenAttributes);
422        }
423        Some(self.decode_text(src, /*is_attr*/ true))
424    }
425    // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
426    fn scan_unquoted_attr_value(&mut self) -> Option<VStr<'a>> {
427        let val_len = self
428            .source
429            .bytes()
430            .take_while(semi_valid_unquoted_attr_value)
431            .count();
432        // unexpected EOF: <tag attr=
433        if val_len == 0 {
434            // whitespace or > is precluded in scan_attribute
435            // so empty value must implies EOF
436            debug_assert!(self.source.is_empty());
437            return None;
438        }
439        let src = self.move_by(val_len);
440        if src.contains(&['"', '\'', '<', '=', '`'][..]) {
441            self.emit_error(ErrorKind::UnexpectedCharacterInUnquotedAttributeValue);
442        }
443        Some(self.decode_text(src, /* is_attr */ true))
444    }
445
446    fn scan_close_start_tag(&mut self) -> bool {
447        debug_assert!(!self.source.is_empty());
448        if self.source.starts_with("/>") {
449            self.move_by(2);
450            true
451        } else {
452            debug_assert!(self.source.starts_with('>'));
453            self.move_by(1);
454            false
455        }
456    }
457    // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
458    fn scan_end_tag_open(&mut self) -> Token<'a> {
459        debug_assert!(self.source.starts_with("</"));
460        let source = &self.source;
461        if source.len() == 2 {
462            self.emit_error(ErrorKind::EofBeforeTagName);
463            Token::from(self.move_by(2))
464        } else if source.starts_with("</>") {
465            self.emit_error(ErrorKind::MissingEndTagName);
466            self.move_by(3);
467            Token::from("")
468        } else if !self.source[2..].starts_with(ascii_alpha) {
469            self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
470            self.scan_bogus_comment()
471        } else {
472            self.scan_end_tag()
473        }
474    }
475    // errors emit here is defined at the top of the tokenization spec
476    fn scan_end_tag(&mut self) -> Token<'a> {
477        debug_assert!(self.source.starts_with("</"));
478        self.move_by(2);
479        // indeed in end tag collecting attributes is useless
480        // but, no, I don't want to opt for ill-formed input
481        let tag = self.scan_tag_name();
482        // When an end tag token is emitted with attributes
483        if !tag.attributes.is_empty() {
484            self.emit_error(ErrorKind::EndTagWithAttributes);
485        }
486        // When an end tag token is emitted with its self-closing flag set
487        if tag.self_closing {
488            self.emit_error(ErrorKind::EndTagWithTrailingSolidus);
489        }
490        // reset text mode after tag close
491        self.mode = TextMode::Data;
492        Token::EndTag(tag.name)
493    }
494
495    // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
496    fn scan_comment_and_like(&mut self) -> Token<'a> {
497        // TODO: investigate https://github.com/jneem/teddy
498        // for simd string pattern matching
499        let s = &self.source;
500        if s.starts_with("<!--") {
501            self.scan_comment()
502        } else if s.starts_with("<!DOCTYPE") {
503            self.scan_bogus_comment()
504        } else if s.starts_with("<![CDATA[") {
505            if self.is_in_html_namespace {
506                self.emit_error(ErrorKind::CDataInHtmlContent);
507                self.scan_bogus_comment()
508            } else {
509                self.scan_cdata()
510            }
511        } else {
512            self.emit_error(ErrorKind::IncorrectlyOpenedComment);
513            self.scan_bogus_comment()
514        }
515    }
516    // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
517    fn scan_comment(&mut self) -> Token<'a> {
518        debug_assert!(self.source.starts_with("<!--"));
519        let comment_text = self.scan_comment_text();
520        if self.source.is_empty() {
521            self.emit_error(ErrorKind::EofInComment);
522        } else if self.source.starts_with("--!>") {
523            self.emit_error(ErrorKind::IncorrectlyClosedComment);
524            self.move_by(4);
525        } else {
526            debug_assert!(self.source.starts_with("-->"));
527            self.move_by(3);
528        };
529        Token::Comment(comment_text)
530    }
    /// Consumes the comment opener and the comment body, and returns the body.
    /// The end marker (`-->` or `--!>`) is left in the source for the caller.
    /// Emits `AbruptClosingOfEmptyComment` for `<!-->` / `<!--->` and
    /// `NestedComment` for every `<!--` found inside the body.
    fn scan_comment_text(&mut self) -> &'a str {
        debug_assert!(self.source.starts_with("<!--"));
        // NOTE(review): `-->` is searched first in the whole source, so `--!>` only
        // ends the comment when no `-->` exists at all — confirm this is intended.
        let comment_end = self.source.find("-->").or_else(|| self.source.find("--!>"));
        // NB: we take &str here since we will call move_by later
        let text = if let Some(end) = comment_end {
            debug_assert!(end >= 2, "first two chars must be <!");
            // <!---> or <!-->
            if end <= 3 {
                self.emit_error(ErrorKind::AbruptClosingOfEmptyComment);
                // stop right in front of the `-->` so the caller can consume it
                self.move_by(end);
                return "";
            }
            self.move_by(4); // skip <!--
            &self.source[..end - 4] // must be exclusive
        } else {
            // no closing comment
            self.move_by(4);
            self.source
        };

        // report nested comment error
        // `s` is the part of `text` that has not been consumed yet
        let mut s = text;
        while let Some(i) = s.find("<!--") {
            self.move_by(i + 4);
            // spec does not emit the NestedComment error when EOF is met
            // #13.2.5.49 Comment less-than sign bang dash dash state
            if !self.source.is_empty() {
                self.emit_error(ErrorKind::NestedComment);
            }
            s = &s[i + 4..];
        }
        // consume remaining comment
        if !s.is_empty() {
            self.move_by(s.len());
        }
        text
    }
568    #[cold]
569    #[inline(never)]
570    fn scan_bogus_comment(&mut self) -> Token<'a> {
571        /* /^<(?:[\!\?]|\/[^a-z>])/i from Vue's parseBogusComment
572        ^            // starts with
573        <            // a < followed by
574        (?:          // a non-capturing group of
575         [\!\?]      // a char of ! or ?
576         |           // or
577         \/[^a-z>]   // a slash and non alpha or >
578        )
579        */
580        let s = &self.source;
581        debug_assert! {
582            s.starts_with("<!") || s.starts_with("<?") ||
583            (
584                s.starts_with("</") &&
585                s[2..].starts_with(|c| {
586                    !matches!(c, 'a'..='z'|'A'..='Z'|'>')
587                })
588            )
589        };
590        let start = if s.starts_with("<?") { 1 } else { 2 };
591        let text = if let Some(end) = s.find('>') {
592            let t = &s[start..end];
593            self.move_by(end + 1);
594            t
595        } else {
596            let len = s.len();
597            &self.move_by(len)[start..]
598        };
599        Token::Comment(text)
600    }
601    #[cold]
602    #[inline(never)]
603    fn scan_cdata(&mut self) -> Token<'a> {
604        debug_assert!(self.source.starts_with("<![CDATA["));
605        self.move_by(9);
606        let i = self.source.find("]]>").unwrap_or_else(|| self.source.len());
607        let text = self.move_by(i); // can be zero
608        if self.source.is_empty() {
609            self.emit_error(ErrorKind::EofInCdata);
610        } else {
611            debug_assert!(self.source.starts_with("]]>"));
612            self.move_by(3);
613        }
614        // don't call scan_text since CDATA decodes nothing
615        Token::from(text)
616    }
617
618    // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
619    fn scan_rawtext(&mut self) -> Token<'a> {
620        debug_assert!(self.mode == TextMode::RawText);
621        debug_assert!(!self.source.is_empty());
622        let end = self.find_appropriate_end();
623        // NOTE: rawtext decodes no entity. Don't call scan_text
624        let src = if end == 0 { "" } else { self.move_by(end) };
625        self.mode = TextMode::Data;
626        if src.is_empty() {
627            self.scan_data()
628        } else {
629            Token::from(src)
630        }
631    }
632
633    fn scan_rcdata(&mut self) -> Token<'a> {
634        debug_assert!(self.mode == TextMode::RcData);
635        debug_assert!(!self.source.is_empty());
636        let delimiter = &self.option.delimiters.0;
637        if self.source.starts_with(delimiter) {
638            return self.scan_interpolation();
639        }
640        let end = self.find_appropriate_end();
641        let interpolation_start = self.source.find(delimiter).unwrap_or(end);
642        if interpolation_start < end {
643            debug_assert_ne!(interpolation_start, 0);
644            return self.scan_text(interpolation_start);
645        }
646        // scan_text does not read mode so it's safe to put this ahead.
647        self.mode = TextMode::Data;
648        if end > 0 {
649            self.scan_text(end)
650        } else {
651            self.scan_data()
652        }
653    }
654
655    /// find first </{last_start_tag_name}
656    fn find_appropriate_end(&self) -> usize {
657        let tag_name = self
658            .last_start_tag_name
659            .expect("RAWTEXT/RCDATA must appear inside a tag");
660        let len = tag_name.len();
661        let source = self.source; // no mut self, need no &&str
662        for (i, _) in source.match_indices("</") {
663            //  match point
664            //      ￬   </  style
665            let e = i + 2 + len;
666            // emit text without error per spec
667            if e >= source.len() {
668                break;
669            }
670            // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
671            let is_appropriate_end = source[i + 2..e].eq_ignore_ascii_case(tag_name);
672            // equivalent to source[e..] does not start with valid_name_char
673            let terminated = !is_valid_name_char(source.as_bytes()[e]);
674            if is_appropriate_end && terminated {
675                // found!
676                return i;
677            }
678        }
679        source.len()
680    }
681}
682
683// utility methods
684impl<'a, C: ErrorHandler> Tokens<'a, C> {
685    fn emit_error(&self, error_kind: ErrorKind) {
686        let start = self.current_position();
687        let loc = self.get_location_from(start);
688        let err = CompilationError::new(error_kind).with_location(loc);
689        self.err_handle.on_error(err);
690    }
691
    /// Wraps `src` in a raw [`VStr`], calls `decode` on it with `is_attr`
    /// and returns the result.
    // NOTE(review): presumably `decode` handles entities differently for attribute
    // values — confirm against `util::VStr::decode`.
    fn decode_text(&self, src: &'a str, is_attr: bool) -> VStr<'a> {
        *VStr::raw(src).decode(is_attr)
    }
695
696    /// move scanner's internal position forward and return &str
697    /// scanner's line/column are also updated in the method
698    /// NB: it only moves forward, not backward
699    /// `advance_to` is a better name but it collides with iter
700    fn move_by(&mut self, size: usize) -> &'a str {
701        debug_assert!(size > 0, "scanner must move forward");
702        let mut lines = 0;
703        let mut last_new_line_pos = -1;
704        for (i, c) in self.source[..size].bytes().enumerate() {
705            if c == b'\n' {
706                lines += 1;
707                last_new_line_pos = i as i32;
708            }
709        }
710        let old_source = self.source;
711        self.source = &self.source[size..];
712        let ret = &old_source[..size];
713        // NB: pos is counted in char not u8
714        let pos = &mut self.position;
715        let offset = ret.chars().count();
716        pos.offset += offset;
717        pos.line += lines;
718        pos.column = if last_new_line_pos == -1 {
719            pos.column + offset as u32
720        } else {
721            ret[last_new_line_pos as usize..].chars().count() as u32
722            // size as u32 - last_new_line_pos as u32
723        };
724        ret
725    }
726
727    fn skip_whitespace(&mut self) -> usize {
728        let idx = self.source.find(non_whitespace);
729        let len = idx.unwrap_or_else(|| self.source.len());
730        if len != 0 {
731            self.move_by(len);
732        }
733        len
734    }
735}
736
/// Returns `true` for ASCII letters `a-z` and `A-Z`.
#[inline]
fn ascii_alpha(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z')
}
741
742// `< ' "` are not valid but counted as semi valid
743// to leniently recover from a parsing error
744#[inline]
745fn semi_valid_attr_name(c: u8) -> bool {
746    is_valid_name_char(c) && c != b'='
747}
748
// only whitespace and > terminate an unquoted attr value
// other special chars only emit an error
/// Returns `true` when the byte may be part of an unquoted attribute value.
#[inline]
fn semi_valid_unquoted_attr_value(byte: &u8) -> bool {
    !(*byte == b'>' || byte.is_ascii_whitespace())
}
755
/// Returns `true` when `c` may be part of a tag or attribute name,
/// i.e. it is neither ASCII whitespace nor `/` nor `>`.
#[inline]
fn is_valid_name_char(c: u8) -> bool {
    !matches!(c, b'/' | b'>') && !c.is_ascii_whitespace()
}
760
761// tag name should begin with [a-zA-Z]
762// followed by chars except whitespace, / or >
763fn scan_tag_name_length(mut bytes: Bytes<'_>) -> usize {
764    let first_char = bytes.next();
765    debug_assert!(first_char.is_some());
766    if !first_char.unwrap().is_ascii_alphabetic() {
767        return 0;
768    }
769    let l = bytes.take_while(|&c| is_valid_name_char(c)).count();
770    l + 1
771}
772
773impl<'a, C: ErrorHandler> Iterator for Tokens<'a, C> {
774    type Item = Token<'a>;
775    // https://html.spec.whatwg.org/multipage/parsing.html#concept-frag-parse-context
776    fn next(&mut self) -> Option<Self::Item> {
777        if self.source.is_empty() {
778            return None;
779        }
780        self.last_pos = self.current_position();
781        Some(match self.mode {
782            TextMode::Data => self.scan_data(),
783            TextMode::RcData => self.scan_rcdata(),
784            TextMode::RawText => self.scan_rawtext(),
785        })
786    }
787}
788
// Parser requires Tokens always yield None when exhausted.
// `next` returns `None` whenever the source is empty, and the source is only ever consumed.
impl<'a, C: ErrorHandler> FusedIterator for Tokens<'a, C> {}
791
792impl<'a, C: ErrorHandler> FlagCDataNs for Tokens<'a, C> {
793    fn set_is_in_html(&mut self, in_html: bool) {
794        self.is_in_html_namespace = in_html;
795    }
796    fn need_flag_hint(&self) -> bool {
797        self.source.contains("<![CDATA[")
798    }
799}
800
801impl<'a, C: ErrorHandler> Locatable for Tokens<'a, C> {
802    fn current_position(&self) -> Position {
803        self.position.clone()
804    }
805    fn last_position(&self) -> Position {
806        debug_assert! {
807            self.position.offset == 0 ||
808            self.last_pos.offset < self.position.offset
809        };
810        self.last_pos.clone()
811    }
812    fn get_location_from(&self, start: Position) -> SourceLocation {
813        let end = self.current_position();
814        SourceLocation { start, end }
815    }
816}
817
/// A fused token iterator that also reports locations and accepts the CDATA namespace flag.
pub trait TokenSource<'a>: FusedIterator<Item = Token<'a>> + FlagCDataNs + Locatable {}
// `Tokens` is a `TokenSource` for every error handler.
impl<'a, C> TokenSource<'a> for Tokens<'a, C> where C: ErrorHandler {}
820
#[cfg(test)]
pub mod test {
    use super::{super::error::test::TestErrorHandler, *};

    /// Scans `s` with the default scan options.
    pub fn base_scan(s: &str) -> impl TokenSource {
        scan_with_opt(s, ScanOption::default())
    }

    /// Scans `s` with the given options, using the test error handler.
    fn scan_with_opt(s: &str, opt: ScanOption) -> impl TokenSource {
        let scanner = Scanner::new(opt);
        let handler = TestErrorHandler;
        scanner.scan(s, handler)
    }

    // a single `{` does not open an interpolation and stays plain text
    #[test]
    fn test_single_delimiter() {
        let tokens: Vec<_> = base_scan("{ test }").collect();
        assert_eq!(tokens.len(), 1);
        let is_plain_text = matches!(
            tokens[0],
            Token::Text(VStr {
                raw: "{ test }",
                ..
            })
        );
        assert!(is_plain_text);
    }
}
vue_compiler_core/scanner.rs

vue_compiler_core/
scanner.rs