html_languageservice/parser/
html_scanner.rs

1use std::borrow::Cow;
2
3use lazy_static::lazy_static;
4use multi_line_stream::MultiLineStream;
5use regex::Regex;
6
7lazy_static! {
8    static ref REG_DOCTYPE: Regex = Regex::new(r"^!(?i)doctype").unwrap();
9    static ref REG_NON_SPECIAL_START: Regex = Regex::new(r#"^[^\s"'`=<>]+"#).unwrap();
10    static ref REG_SCRIPT_COMMENT: Regex = Regex::new(r"<!--|-->|<\/?script\s*\/?>?").unwrap();
11    static ref REG_ELEMENT_NAME: Regex = Regex::new(r"^[_:\w][_:\w\-.\d]*").unwrap();
12    static ref REG_NON_ELEMENT_NAME: Regex =
13        Regex::new(r#"^[^\s"'></=\x00-\x0F\x7F\x80-\x9F]+"#).unwrap();
14    static ref REG_STYLE: Regex = Regex::new(r"<\/style").unwrap();
15}
16
17/// Scan the input string with byte as the base unit to generate a token stream
18pub struct Scanner<'a> {
19    state: ScannerState,
20    token_type: TokenType,
21    token_offset: usize,
22    token_error: Option<&'static str>,
23    stream: MultiLineStream<'a>,
24
25    emit_pseudo_close_tags: bool,
26    has_space_after_tag: bool,
27    last_tag: Option<Cow<'a, str>>,
28    last_attribute_name: Option<Cow<'a, str>>,
29    last_type_value: Option<Cow<'a, str>>,
30    case_sensitive: bool,
31}
32
33impl<'a> Scanner<'a> {
34    pub fn new(
35        input: &'a str,
36        initial_offset: usize,
37        initial_state: ScannerState,
38        emit_pseudo_close_tags: bool,
39        case_sensitive: bool,
40    ) -> Scanner<'a> {
41        let stream = MultiLineStream::new(input, initial_offset);
42        let token_offset = 0;
43        let token_type = TokenType::Unknown;
44        Scanner {
45            state: initial_state,
46            token_type,
47            token_offset,
48            token_error: None,
49            stream,
50            emit_pseudo_close_tags,
51            has_space_after_tag: false,
52            last_tag: None,
53            last_attribute_name: None,
54            last_type_value: None,
55            case_sensitive,
56        }
57    }
58
59    pub fn scan(&mut self) -> TokenType {
60        let offset = self.stream.pos();
61        let old_state = &self.state.clone();
62        self.internal_scan();
63        if self.token_type != TokenType::EOS
64            && offset == self.stream.pos()
65            && !(self.emit_pseudo_close_tags
66                && [TokenType::StartTagClose, TokenType::EndTagClose].contains(&self.token_type))
67        {
68            eprintln!(
69                "Scanner.scan has not advanced at offset {}, state before: {:?} after: {:?}",
70                offset, old_state, self.state,
71            );
72            self.stream.advance(1);
73            return self.finish_token(offset, TokenType::Unknown, None);
74        }
75        self.token_type
76    }
77
78    pub fn get_token_type(&self) -> TokenType {
79        self.token_type
80    }
81
82    pub fn get_token_offset(&self) -> usize {
83        self.token_offset
84    }
85
86    pub fn get_token_length(&self) -> usize {
87        self.stream.pos() - self.token_offset
88    }
89
90    pub fn get_token_end(&self) -> usize {
91        self.stream.pos()
92    }
93
94    pub fn get_token_text(&self) -> &'a str {
95        &self.stream.source[self.get_token_offset()..self.get_token_end()]
96    }
97
98    pub fn get_scanner_state(&self) -> ScannerState {
99        self.state
100    }
101
102    pub fn get_token_error(&self) -> Option<&'static str> {
103        self.token_error
104    }
105
106    pub fn get_source_len(&self) -> usize {
107        self.stream.source.len()
108    }
109
110    fn internal_scan(&mut self) -> TokenType {
111        let offset = self.stream.pos();
112        if self.stream.eos() {
113            return self.finish_token(offset, TokenType::EOS, None);
114        }
115        let error_message;
116
117        match self.state {
118            ScannerState::WithinComment => {
119                if self.stream.advance_if_chars("-->") {
120                    // -->
121                    self.state = ScannerState::WithinContent;
122                    return self.finish_token(offset, TokenType::EndCommentTag, None);
123                }
124                self.stream.advance_until_chars("-->"); // -->
125                return self.finish_token(offset, TokenType::Comment, None);
126            }
127
128            ScannerState::WithinDoctype => {
129                if self.stream.advance_if_char(b'>') {
130                    self.state = ScannerState::WithinContent;
131                    return self.finish_token(offset, TokenType::EndDoctypeTag, None);
132                }
133                self.stream.advance_until_char(b'>'); // >
134                return self.finish_token(offset, TokenType::Doctype, None);
135            }
136
137            ScannerState::WithinContent => {
138                if self.stream.advance_if_char(b'<') {
139                    // <
140                    if !self.stream.eos() && self.stream.peek_char(0) == Some(b'!') {
141                        // !
142                        if self.stream.advance_if_chars("!--") {
143                            // <!--
144                            self.state = ScannerState::WithinComment;
145                            return self.finish_token(offset, TokenType::StartCommentTag, None);
146                        }
147                        if self.stream.advance_if_regexp(&REG_DOCTYPE).is_some() {
148                            self.state = ScannerState::WithinDoctype;
149                            return self.finish_token(offset, TokenType::StartDoctypeTag, None);
150                        }
151                    }
152                    if self.stream.advance_if_char(b'/') {
153                        // /
154                        self.state = ScannerState::AfterOpeningEndTag;
155                        return self.finish_token(offset, TokenType::EndTagOpen, None);
156                    }
157                    self.state = ScannerState::AfterOpeningStartTag;
158                    return self.finish_token(offset, TokenType::StartTagOpen, None);
159                }
160                self.stream.advance_until_char(b'<');
161                return self.finish_token(offset, TokenType::Content, None);
162            }
163
164            ScannerState::AfterOpeningEndTag => {
165                let tag_name = self.next_element_name();
166                if tag_name.is_some() {
167                    self.state = ScannerState::WithinEndTag;
168                    return self.finish_token(offset, TokenType::EndTag, None);
169                }
170                if self.stream.skip_whitespace() {
171                    // white space is not valid here
172                    return self.finish_token(
173                        offset,
174                        TokenType::Whitespace,
175                        Some("Tag name must directly follow the open bracket."),
176                    );
177                }
178                self.state = ScannerState::WithinEndTag;
179                self.stream.advance_until_char(b'>');
180                if offset < self.stream.pos() {
181                    return self.finish_token(
182                        offset,
183                        TokenType::Unknown,
184                        Some("End tag name expected."),
185                    );
186                }
187                return self.internal_scan();
188            }
189
190            ScannerState::WithinEndTag => {
191                if self.stream.skip_whitespace() {
192                    // white space is valid here
193                    return self.finish_token(offset, TokenType::Whitespace, None);
194                }
195                if self.stream.advance_if_char(b'>') {
196                    // >
197                    self.state = ScannerState::WithinContent;
198                    return self.finish_token(offset, TokenType::EndTagClose, None);
199                }
200                if self.emit_pseudo_close_tags && self.stream.peek_char(0) == Some(b'<') {
201                    // <
202                    self.state = ScannerState::WithinContent;
203                    return self.finish_token(
204                        offset,
205                        TokenType::EndTagClose,
206                        Some("Closing bracket missing."),
207                    );
208                }
209                error_message = Some("Closing bracket expected.");
210            }
211
212            ScannerState::AfterOpeningStartTag => {
213                self.last_tag = self.next_element_name();
214                self.last_type_value = None;
215                self.last_attribute_name = None;
216                if self.last_tag.is_some() {
217                    self.has_space_after_tag = false;
218                    self.state = ScannerState::WithinTag;
219                    return self.finish_token(offset, TokenType::StartTag, None);
220                }
221                if self.stream.skip_whitespace() {
222                    // white space is not valid here
223                    return self.finish_token(
224                        offset,
225                        TokenType::Whitespace,
226                        Some("Tag name must directly follow the open bracket."),
227                    );
228                }
229                self.state = ScannerState::WithinTag;
230                self.stream.advance_until_char(b'>');
231                if offset < self.stream.pos() {
232                    return self.finish_token(
233                        offset,
234                        TokenType::Unknown,
235                        Some("Start tag name expected."),
236                    );
237                }
238                return self.internal_scan();
239            }
240
241            ScannerState::WithinTag => {
242                if self.stream.skip_whitespace() {
243                    self.has_space_after_tag = true; // remember that we have seen a whitespace
244                    return self.finish_token(offset, TokenType::Whitespace, None);
245                }
246                if self.has_space_after_tag {
247                    self.last_attribute_name = self.next_attribute_name();
248                    if self.last_attribute_name.is_some() {
249                        self.state = ScannerState::AfterAttributeName;
250                        self.has_space_after_tag = false;
251                        return self.finish_token(offset, TokenType::AttributeName, None);
252                    }
253                    if self.stream.peek_char(0) == Some(b'=') {
254                        self.has_space_after_tag = false;
255                    }
256                }
257                if self.stream.advance_if_chars("/>") {
258                    // />
259                    self.state = ScannerState::WithinContent;
260                    return self.finish_token(offset, TokenType::StartTagSelfClose, None);
261                }
262                if self.stream.advance_if_char(b'>') {
263                    // >
264                    if self
265                        .last_tag
266                        .as_ref()
267                        .is_some_and(|v| v.as_ref() == "script")
268                    {
269                        if self.last_type_value.is_some() {
270                            // stay in html
271                            self.state = ScannerState::WithinContent;
272                        } else {
273                            self.state = ScannerState::WithinScriptContent;
274                        }
275                    } else if self
276                        .last_tag
277                        .as_ref()
278                        .is_some_and(|v| v.as_ref() == "style")
279                    {
280                        self.state = ScannerState::WithinStyleContent;
281                    } else {
282                        self.state = ScannerState::WithinContent;
283                    }
284                    return self.finish_token(offset, TokenType::StartTagClose, None);
285                }
286                if self.emit_pseudo_close_tags && self.stream.peek_char(0) == Some(b'<') {
287                    // <
288                    self.state = ScannerState::WithinContent;
289                    return self.finish_token(
290                        offset,
291                        TokenType::StartTagClose,
292                        Some("Closing bracket missing."),
293                    );
294                }
295                self.stream.advance(1);
296                return self.finish_token(
297                    offset,
298                    TokenType::Unknown,
299                    Some("Unexpected character in tag."),
300                );
301            }
302
303            ScannerState::AfterAttributeName => {
304                if self.stream.skip_whitespace() {
305                    self.has_space_after_tag = true;
306                    return self.finish_token(offset, TokenType::Whitespace, None);
307                }
308
309                if self.stream.advance_if_char(b'=') {
310                    self.state = ScannerState::BeforeAttributeValue;
311                    return self.finish_token(offset, TokenType::DelimiterAssign, None);
312                }
313                self.state = ScannerState::WithinTag;
314                return self.internal_scan(); // no advance yet - jump to WithinTag
315            }
316
317            ScannerState::BeforeAttributeValue => {
318                if self.stream.skip_whitespace() {
319                    self.state = ScannerState::WithinTag;
320                    self.has_space_after_tag = true;
321                    return self.finish_token(offset, TokenType::Whitespace, None);
322                }
323                let cur_char = self.stream.peek_char(0);
324                let prev_char = self.stream.peek_char(-1);
325                let attribute_value = self.stream.advance_if_regexp(&REG_NON_SPECIAL_START);
326                if let Some(mut attribute_value) = attribute_value {
327                    let mut is_go_back = false;
328                    if cur_char == Some(b'>') && prev_char == Some(b'/') {
329                        // <foo bar=http://foo/>
330                        is_go_back = true;
331                        attribute_value = &attribute_value[..attribute_value.len() - 1];
332                    }
333                    if self.stream.advance_if_char(b'\'') || self.stream.advance_if_char(b'"') {
334                        attribute_value = &self.stream.source
335                            [self.stream.pos() - attribute_value.len() - 1..self.stream.pos()]
336                    }
337                    if self
338                        .last_attribute_name
339                        .as_ref()
340                        .is_some_and(|v| v.as_ref() == "type")
341                    {
342                        let s = attribute_value;
343                        self.last_type_value = if s.len() != 0 {
344                            Some(Cow::Borrowed(s))
345                        } else {
346                            None
347                        };
348                    }
349                    let attribute_value_len = attribute_value.len();
350                    if is_go_back {
351                        self.stream.go_back(1);
352                    }
353                    if attribute_value_len > 0 {
354                        self.state = ScannerState::WithinTag;
355                        self.has_space_after_tag = false;
356                        return self.finish_token(offset, TokenType::AttributeValue, None);
357                    }
358                }
359                let ch = self.stream.peek_char(0);
360                if let Some(ch) = ch {
361                    if ch == b'\'' || ch == b'"' {
362                        self.stream.advance(1); // consume quote
363                        if self.stream.advance_until_char(ch) {
364                            self.stream.advance(1); // consume quote
365                        }
366                        if self
367                            .last_attribute_name
368                            .as_ref()
369                            .is_some_and(|v| v.as_ref() == "type")
370                        {
371                            let s = &self.stream.source[if offset + 1 > self.stream.pos() - 1 {
372                                self.stream.pos() - 1..offset + 1
373                            } else {
374                                offset + 1..self.stream.pos() - 1
375                            }];
376                            self.last_type_value = if s.len() != 0 {
377                                Some(Cow::Borrowed(s))
378                            } else {
379                                None
380                            }
381                        }
382                        self.state = ScannerState::WithinTag;
383                        self.has_space_after_tag = false;
384                        return self.finish_token(offset, TokenType::AttributeValue, None);
385                    }
386                }
387                self.state = ScannerState::WithinTag;
388                self.has_space_after_tag = false;
389                return self.internal_scan(); // no advance yet - jump to WithinTag
390            }
391
392            ScannerState::WithinScriptContent => {
393                // see http://stackoverflow.com/questions/14574471/how-do-browsers-parse-a-script-tag-exactly
394                let mut script_state: u8 = 1;
395                while !self.stream.eos() {
396                    let m = self.stream.advance_if_regexp(&REG_SCRIPT_COMMENT);
397                    if m.is_none() {
398                        self.stream.go_to_end();
399                        return self.finish_token(offset, TokenType::Script, None);
400                    } else if m == Some("<!--") {
401                        if script_state == 1 {
402                            script_state = 2;
403                        }
404                    } else if m == Some("-->") {
405                        script_state = 1;
406                    } else if m.is_some_and(|m| &m[1..2] != "/") {
407                        // <script
408                        if script_state == 2 {
409                            script_state = 3;
410                        }
411                    } else {
412                        // </script
413                        if script_state == 3 {
414                            script_state = 2;
415                        } else {
416                            let length = m.map(|v| v.len()).unwrap_or_default();
417                            self.stream.go_back(length); // to the beginning of the closing tag
418                            break;
419                        }
420                    }
421                }
422                self.state = ScannerState::WithinContent;
423                if offset < self.stream.pos() {
424                    return self.finish_token(offset, TokenType::Script, None);
425                }
426                return self.internal_scan(); // no advance yet - jump to content
427            }
428
429            ScannerState::WithinStyleContent => {
430                self.stream.advance_until_regexp(&REG_STYLE);
431                self.state = ScannerState::WithinContent;
432                if offset < self.stream.pos() {
433                    return self.finish_token(offset, TokenType::Styles, None);
434                }
435                return self.internal_scan(); // no advance yet - jump to content
436            }
437        }
438
439        self.stream.advance(1);
440        self.state = ScannerState::WithinContent;
441        return self.finish_token(offset, TokenType::Unknown, error_message);
442    }
443
444    fn finish_token(
445        &mut self,
446        offset: usize,
447        token_type: TokenType,
448        error_message: Option<&'static str>,
449    ) -> TokenType {
450        self.token_type = token_type;
451        self.token_offset = offset;
452        self.token_error = error_message;
453        self.token_type
454    }
455
456    fn next_element_name(&mut self) -> Option<Cow<'a, str>> {
457        if self.case_sensitive {
458            Some(Cow::Borrowed(
459                self.stream.advance_if_regexp(&REG_ELEMENT_NAME)?,
460            ))
461        } else {
462            Some(Cow::Owned(
463                self.stream
464                    .advance_if_regexp(&REG_ELEMENT_NAME)?
465                    .to_lowercase(),
466            ))
467        }
468    }
469
470    fn next_attribute_name(&mut self) -> Option<Cow<'a, str>> {
471        if self.case_sensitive {
472            Some(Cow::Borrowed(
473                self.stream.advance_if_regexp(&REG_NON_ELEMENT_NAME)?,
474            ))
475        } else {
476            Some(Cow::Owned(
477                self.stream
478                    .advance_if_regexp(&REG_NON_ELEMENT_NAME)?
479                    .to_lowercase(),
480            ))
481        }
482    }
483}
484
/// Kinds of token produced by the scanner.
#[derive(PartialEq, Eq, Hash, Debug, Copy, Clone)]
pub enum TokenType {
    /// `<!--`
    StartCommentTag,
    /// Text inside a comment, up to the closing `-->`.
    Comment,
    /// `-->`
    EndCommentTag,
    /// `<` that opens a start tag.
    StartTagOpen,
    /// `>` that ends a start tag (zero-length when emitted as a pseudo close tag).
    StartTagClose,
    /// `/>`
    StartTagSelfClose,
    /// Element name following `<`.
    StartTag,
    /// `</`
    EndTagOpen,
    /// `>` that ends an end tag (zero-length when emitted as a pseudo close tag).
    EndTagClose,
    /// Element name following `</`.
    EndTag,
    /// `=` between an attribute name and its value.
    DelimiterAssign,
    /// Attribute name.
    AttributeName,
    /// Attribute value, including its quotes when quoted.
    AttributeValue,
    /// `<!doctype`
    StartDoctypeTag,
    /// Text of a doctype declaration, up to the closing `>`.
    Doctype,
    /// `>` that ends a doctype declaration.
    EndDoctypeTag,
    /// Text between tags, up to the next `<`.
    Content,
    /// A run of whitespace inside a tag.
    Whitespace,
    /// Input that does not fit the current scanner state.
    Unknown,
    /// Body of a script element.
    Script,
    /// Body of a style element.
    Styles,
    /// End of the input.
    EOS,
}
510
/// Position of the scanner relative to the surrounding markup; decides how
/// the next token is recognised.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScannerState {
    /// Between tags, in ordinary text.
    WithinContent,
    /// Directly after the `<` of a start tag.
    AfterOpeningStartTag,
    /// Directly after the `</` of an end tag.
    AfterOpeningEndTag,
    /// Inside a doctype declaration, after `<!doctype`.
    WithinDoctype,
    /// Inside a start tag, after its name.
    WithinTag,
    /// Inside an end tag, after its name.
    WithinEndTag,
    /// Inside a comment, after `<!--`.
    WithinComment,
    /// Inside the body of a script element.
    WithinScriptContent,
    /// Inside the body of a style element.
    WithinStyleContent,
    /// Directly after an attribute name.
    AfterAttributeName,
    /// Directly after the `=` that follows an attribute name.
    BeforeAttributeValue,
}
525
#[cfg(test)]
mod tests {
    use super::*;

    /// A token as observed from the scanner, reduced to what the tests compare.
    #[derive(PartialEq, Debug)]
    struct Token {
        offset: usize,
        token_type: TokenType,
        content: Option<String>,
    }

    /// One input text together with the tokens it is expected to produce.
    struct TestItem {
        input: String,
        tokens: Vec<Token>,
    }

    /// Scans every item in turn and compares the produced tokens with the
    /// expected ones. The scanner state reached at the end of one item is the
    /// initial state for the next.
    fn assert_tokens(tests: Vec<TestItem>) {
        let mut scanner_state = ScannerState::WithinContent;

        for item in tests {
            let mut scanner = Scanner::new(&item.input, 0, scanner_state, false, false);
            let mut actual = Vec::new();
            loop {
                let token_type = scanner.scan();
                if token_type == TokenType::EOS {
                    break;
                }
                // Only tag names have their text checked.
                let content = match token_type {
                    TokenType::StartTag | TokenType::EndTag => Some(
                        item.input[scanner.get_token_offset()..scanner.get_token_end()]
                            .to_string(),
                    ),
                    _ => None,
                };
                actual.push(Token {
                    offset: scanner.get_token_offset(),
                    token_type,
                    content,
                });
            }
            assert_eq!(actual, item.tokens);
            scanner_state = scanner.get_scanner_state();
        }
    }

    /// Expected tokens for an unterminated start tag `<name`.
    fn open_tag_tokens(name: &str) -> Vec<Token> {
        vec![
            Token {
                offset: 0,
                token_type: TokenType::StartTagOpen,
                content: None,
            },
            Token {
                offset: 1,
                token_type: TokenType::StartTag,
                content: Some(name.to_string()),
            },
        ]
    }

    #[test]
    fn open_start_tag() {
        for name in ["abc", "input"] {
            assert_tokens(vec![TestItem {
                input: format!("<{}", name),
                tokens: open_tag_tokens(name),
            }]);
        }
    }
}