use std::borrow::Cow;

use lazy_static::lazy_static;
use multi_line_stream::MultiLineStream;
use regex::Regex;

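// Regexes shared by the scanner states below, compiled once via `lazy_static`.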
lazy_static! {
    static ref REG_DOCTYPE: Regex = Regex::new(r"^!(?i)doctype").unwrap();
    static ref REG_NON_SPECIAL_START: Regex = Regex::new(r#"^[^\s"'`=<>]+"#).unwrap();
    static ref REG_SCRIPT_COMMENT: Regex = Regex::new(r"<!--|-->|<\/?script\s*\/?>?").unwrap();
    static ref REG_ELEMENT_NAME: Regex = Regex::new(r"^[_:\w][_:\w\-.\d]*").unwrap();
    static ref REG_NON_ELEMENT_NAME: Regex =
        Regex::new(r#"^[^\s"'></=\x00-\x0F\x7F\x80-\x9F]+"#).unwrap();
    static ref REG_STYLE: Regex = Regex::new(r"<\/style").unwrap();
}

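/// A streaming HTML tokenizer. `scan` advances through the input one token at a
/// time; the scanner keeps just enough context (last tag name, last attribute
/// name, last `type` attribute value) to decide when to switch into the
/// script-content and style-content states.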
pub struct Scanner<'a> {
    state: ScannerState,
    token_type: TokenType,
    token_offset: usize,
    token_error: Option<&'static str>,
    stream: MultiLineStream<'a>,

    emit_pseudo_close_tags: bool,
    has_space_after_tag: bool,
    last_tag: Option<Cow<'a, str>>,
    last_attribute_name: Option<Cow<'a, str>>,
    last_type_value: Option<Cow<'a, str>>,
    case_sensitive: bool,
}

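// Typical driving loop (mirrors `assert_tokens` in the tests below):
//
//     let mut scanner = Scanner::new(input, 0, ScannerState::WithinContent, false, false);
//     while scanner.scan() != TokenType::EOS {
//         let text = scanner.get_token_text();
//         // inspect scanner.get_token_type(), scanner.get_token_offset(), ...
//     }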
impl<'a> Scanner<'a> {
    pub fn new(
        input: &'a str,
        initial_offset: usize,
        initial_state: ScannerState,
        emit_pseudo_close_tags: bool,
        case_sensitive: bool,
    ) -> Scanner<'a> {
        let stream = MultiLineStream::new(input, initial_offset);
        let token_offset = 0;
        let token_type = TokenType::Unknown;
        Scanner {
            state: initial_state,
            token_type,
            token_offset,
            token_error: None,
            stream,
            emit_pseudo_close_tags,
            has_space_after_tag: false,
            last_tag: None,
            last_attribute_name: None,
            last_type_value: None,
            case_sensitive,
        }
    }

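    /// Scans the next token. If the inner scan fails to advance (and the stall is
    /// not an intentional pseudo close tag), log it, skip one character, and emit
    /// an `Unknown` token so callers can never loop forever.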
    pub fn scan(&mut self) -> TokenType {
        let offset = self.stream.pos();
        let old_state = self.state;
        self.internal_scan();
        if self.token_type != TokenType::EOS
            && offset == self.stream.pos()
            && !(self.emit_pseudo_close_tags
                && [TokenType::StartTagClose, TokenType::EndTagClose].contains(&self.token_type))
        {
            eprintln!(
                "Scanner.scan has not advanced at offset {}, state before: {:?} after: {:?}",
                offset, old_state, self.state,
            );
            self.stream.advance(1);
            return self.finish_token(offset, TokenType::Unknown, None);
        }
        self.token_type
    }

    pub fn get_token_type(&self) -> TokenType {
        self.token_type
    }

    pub fn get_token_offset(&self) -> usize {
        self.token_offset
    }

    pub fn get_token_length(&self) -> usize {
        self.stream.pos() - self.token_offset
    }

    pub fn get_token_end(&self) -> usize {
        self.stream.pos()
    }

    pub fn get_token_text(&self) -> &'a str {
        &self.stream.source[self.get_token_offset()..self.get_token_end()]
    }

    pub fn get_scanner_state(&self) -> ScannerState {
        self.state
    }

    pub fn get_token_error(&self) -> Option<&'static str> {
        self.token_error
    }

    pub fn get_source_len(&self) -> usize {
        self.stream.source.len()
    }

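    /// Core tokenizer: one step of the state machine. Each arm either emits a
    /// token for the current state or transitions and re-enters via recursion.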
    fn internal_scan(&mut self) -> TokenType {
        let offset = self.stream.pos();
        if self.stream.eos() {
            return self.finish_token(offset, TokenType::EOS, None);
        }
        // Set only by match arms that fall through to the generic handling below.
        let error_message;

        match self.state {
            ScannerState::WithinComment => {
                if self.stream.advance_if_chars("-->") {
                    self.state = ScannerState::WithinContent;
                    return self.finish_token(offset, TokenType::EndCommentTag, None);
                }
                self.stream.advance_until_chars("-->");
                return self.finish_token(offset, TokenType::Comment, None);
            }

            ScannerState::WithinDoctype => {
                if self.stream.advance_if_char(b'>') {
                    self.state = ScannerState::WithinContent;
                    return self.finish_token(offset, TokenType::EndDoctypeTag, None);
                }
                self.stream.advance_until_char(b'>');
                return self.finish_token(offset, TokenType::Doctype, None);
            }

            ScannerState::WithinContent => {
                if self.stream.advance_if_char(b'<') {
                    if !self.stream.eos() && self.stream.peek_char(0) == Some(b'!') {
                        if self.stream.advance_if_chars("!--") {
                            self.state = ScannerState::WithinComment;
                            return self.finish_token(offset, TokenType::StartCommentTag, None);
                        }
                        if self.stream.advance_if_regexp(&REG_DOCTYPE).is_some() {
                            self.state = ScannerState::WithinDoctype;
                            return self.finish_token(offset, TokenType::StartDoctypeTag, None);
                        }
                    }
                    if self.stream.advance_if_char(b'/') {
                        self.state = ScannerState::AfterOpeningEndTag;
                        return self.finish_token(offset, TokenType::EndTagOpen, None);
                    }
                    self.state = ScannerState::AfterOpeningStartTag;
                    return self.finish_token(offset, TokenType::StartTagOpen, None);
                }
                self.stream.advance_until_char(b'<');
                return self.finish_token(offset, TokenType::Content, None);
            }

            ScannerState::AfterOpeningEndTag => {
                let tag_name = self.next_element_name();
                if tag_name.is_some() {
                    self.state = ScannerState::WithinEndTag;
                    return self.finish_token(offset, TokenType::EndTag, None);
                }
                if self.stream.skip_whitespace() {
                    return self.finish_token(
                        offset,
                        TokenType::Whitespace,
                        Some("Tag name must directly follow the open bracket."),
                    );
                }
                self.state = ScannerState::WithinEndTag;
                self.stream.advance_until_char(b'>');
                if offset < self.stream.pos() {
                    return self.finish_token(
                        offset,
                        TokenType::Unknown,
                        Some("End tag name expected."),
                    );
                }
                return self.internal_scan();
            }

            ScannerState::WithinEndTag => {
                if self.stream.skip_whitespace() {
                    return self.finish_token(offset, TokenType::Whitespace, None);
                }
                if self.stream.advance_if_char(b'>') {
                    self.state = ScannerState::WithinContent;
                    return self.finish_token(offset, TokenType::EndTagClose, None);
                }
                if self.emit_pseudo_close_tags && self.stream.peek_char(0) == Some(b'<') {
                    self.state = ScannerState::WithinContent;
                    return self.finish_token(
                        offset,
                        TokenType::EndTagClose,
                        Some("Closing bracket missing."),
                    );
                }
                error_message = Some("Closing bracket expected.");
            }

            ScannerState::AfterOpeningStartTag => {
                self.last_tag = self.next_element_name();
                self.last_type_value = None;
                self.last_attribute_name = None;
                if self.last_tag.is_some() {
                    self.has_space_after_tag = false;
                    self.state = ScannerState::WithinTag;
                    return self.finish_token(offset, TokenType::StartTag, None);
                }
                if self.stream.skip_whitespace() {
                    return self.finish_token(
                        offset,
                        TokenType::Whitespace,
                        Some("Tag name must directly follow the open bracket."),
                    );
                }
                self.state = ScannerState::WithinTag;
                self.stream.advance_until_char(b'>');
                if offset < self.stream.pos() {
                    return self.finish_token(
                        offset,
                        TokenType::Unknown,
                        Some("Start tag name expected."),
                    );
                }
                return self.internal_scan();
            }

            ScannerState::WithinTag => {
                if self.stream.skip_whitespace() {
                    self.has_space_after_tag = true;
                    return self.finish_token(offset, TokenType::Whitespace, None);
                }
                if self.has_space_after_tag {
                    self.last_attribute_name = self.next_attribute_name();
                    if self.last_attribute_name.is_some() {
                        self.state = ScannerState::AfterAttributeName;
                        self.has_space_after_tag = false;
                        return self.finish_token(offset, TokenType::AttributeName, None);
                    }
                    if self.stream.peek_char(0) == Some(b'=') {
                        self.has_space_after_tag = false;
                    }
                }
                if self.stream.advance_if_chars("/>") {
                    self.state = ScannerState::WithinContent;
                    return self.finish_token(offset, TokenType::StartTagSelfClose, None);
                }
                if self.stream.advance_if_char(b'>') {
                    if self
                        .last_tag
                        .as_ref()
                        .is_some_and(|v| v.as_ref() == "script")
                    {
                        // A <script> tag that carries a `type` attribute is scanned as
                        // plain content; otherwise its body is scanned as script.
                        if self.last_type_value.is_some() {
                            self.state = ScannerState::WithinContent;
                        } else {
                            self.state = ScannerState::WithinScriptContent;
                        }
                    } else if self
                        .last_tag
                        .as_ref()
                        .is_some_and(|v| v.as_ref() == "style")
                    {
                        self.state = ScannerState::WithinStyleContent;
                    } else {
                        self.state = ScannerState::WithinContent;
                    }
                    return self.finish_token(offset, TokenType::StartTagClose, None);
                }
                if self.emit_pseudo_close_tags && self.stream.peek_char(0) == Some(b'<') {
                    self.state = ScannerState::WithinContent;
                    return self.finish_token(
                        offset,
                        TokenType::StartTagClose,
                        Some("Closing bracket missing."),
                    );
                }
                self.stream.advance(1);
                return self.finish_token(
                    offset,
                    TokenType::Unknown,
                    Some("Unexpected character in tag."),
                );
            }

            ScannerState::AfterAttributeName => {
                if self.stream.skip_whitespace() {
                    self.has_space_after_tag = true;
                    return self.finish_token(offset, TokenType::Whitespace, None);
                }

                if self.stream.advance_if_char(b'=') {
                    self.state = ScannerState::BeforeAttributeValue;
                    return self.finish_token(offset, TokenType::DelimiterAssign, None);
                }
                self.state = ScannerState::WithinTag;
                return self.internal_scan();
            }

            ScannerState::BeforeAttributeValue => {
                if self.stream.skip_whitespace() {
                    self.state = ScannerState::WithinTag;
                    self.has_space_after_tag = true;
                    return self.finish_token(offset, TokenType::Whitespace, None);
                }
                let attribute_value = self.stream.advance_if_regexp(&REG_NON_SPECIAL_START);
                if let Some(mut attribute_value) = attribute_value {
                    // e.g. <foo bar=http://foo/> — the trailing '/' belongs to the
                    // self-close, not to the value: drop it and step back one char.
                    let cur_char = self.stream.peek_char(0);
                    let prev_char = self.stream.peek_char(-1);
                    let mut is_go_back = false;
                    if cur_char == Some(b'>') && prev_char == Some(b'/') {
                        is_go_back = true;
                        attribute_value = &attribute_value[..attribute_value.len() - 1];
                    }
                    if self.stream.advance_if_char(b'\'') || self.stream.advance_if_char(b'"') {
                        attribute_value = &self.stream.source
                            [self.stream.pos() - attribute_value.len() - 1..self.stream.pos()];
                    }
                    if self
                        .last_attribute_name
                        .as_ref()
                        .is_some_and(|v| v.as_ref() == "type")
                    {
                        let s = attribute_value;
                        self.last_type_value = if !s.is_empty() {
                            Some(Cow::Borrowed(s))
                        } else {
                            None
                        };
                    }
                    let attribute_value_len = attribute_value.len();
                    if is_go_back {
                        self.stream.go_back(1);
                    }
                    if attribute_value_len > 0 {
                        self.state = ScannerState::WithinTag;
                        self.has_space_after_tag = false;
                        return self.finish_token(offset, TokenType::AttributeValue, None);
                    }
                }
                let ch = self.stream.peek_char(0);
                if let Some(ch) = ch {
                    if ch == b'\'' || ch == b'"' {
                        // Quoted value: consume up to and including the closing quote.
                        self.stream.advance(1);
                        if self.stream.advance_until_char(ch) {
                            self.stream.advance(1);
                        }
                        if self
                            .last_attribute_name
                            .as_ref()
                            .is_some_and(|v| v.as_ref() == "type")
                        {
                            // Slice out the text between the quotes, guarding against a
                            // reversed range when the closing quote is missing.
                            let s = &self.stream.source[if offset + 1 > self.stream.pos() - 1 {
                                self.stream.pos() - 1..offset + 1
                            } else {
                                offset + 1..self.stream.pos() - 1
                            }];
                            self.last_type_value = if !s.is_empty() {
                                Some(Cow::Borrowed(s))
                            } else {
                                None
                            };
                        }
                        self.state = ScannerState::WithinTag;
                        self.has_space_after_tag = false;
                        return self.finish_token(offset, TokenType::AttributeValue, None);
                    }
                }
                self.state = ScannerState::WithinTag;
                self.has_space_after_tag = false;
                return self.internal_scan();
            }

            ScannerState::WithinScriptContent => {
                // 1 = outside an HTML comment, 2 = inside <!-- -->, 3 = inside a
                // nested <script> that itself sits inside the comment.
                let mut script_state: u8 = 1;
                while !self.stream.eos() {
                    let m = self.stream.advance_if_regexp(&REG_SCRIPT_COMMENT);
                    if m.is_none() {
                        self.stream.go_to_end();
                        return self.finish_token(offset, TokenType::Script, None);
                    } else if m == Some("<!--") {
                        if script_state == 1 {
                            script_state = 2;
                        }
                    } else if m == Some("-->") {
                        script_state = 1;
                    } else if m.is_some_and(|m| &m[1..2] != "/") {
                        // An opening <script> tag.
                        if script_state == 2 {
                            script_state = 3;
                        }
                    } else {
                        // A closing </script> tag.
                        if script_state == 3 {
                            script_state = 2;
                        } else {
                            let length = m.map(|v| v.len()).unwrap_or_default();
                            self.stream.go_back(length);
                            break;
                        }
                    }
                }
                self.state = ScannerState::WithinContent;
                if offset < self.stream.pos() {
                    return self.finish_token(offset, TokenType::Script, None);
                }
                return self.internal_scan();
            }

            ScannerState::WithinStyleContent => {
                self.stream.advance_until_regexp(&REG_STYLE);
                self.state = ScannerState::WithinContent;
                if offset < self.stream.pos() {
                    return self.finish_token(offset, TokenType::Styles, None);
                }
                return self.internal_scan();
            }
        }

        // Fall-through (currently only from WithinEndTag): consume one character
        // and report it as Unknown with the arm's error message.
        self.stream.advance(1);
        self.state = ScannerState::WithinContent;
        self.finish_token(offset, TokenType::Unknown, error_message)
    }

    fn finish_token(
        &mut self,
        offset: usize,
        token_type: TokenType,
        error_message: Option<&'static str>,
    ) -> TokenType {
        self.token_type = token_type;
        self.token_offset = offset;
        self.token_error = error_message;
        self.token_type
    }

    fn next_element_name(&mut self) -> Option<Cow<'a, str>> {
        if self.case_sensitive {
            Some(Cow::Borrowed(
                self.stream.advance_if_regexp(&REG_ELEMENT_NAME)?,
            ))
        } else {
            Some(Cow::Owned(
                self.stream
                    .advance_if_regexp(&REG_ELEMENT_NAME)?
                    .to_lowercase(),
            ))
        }
    }

    fn next_attribute_name(&mut self) -> Option<Cow<'a, str>> {
        if self.case_sensitive {
            Some(Cow::Borrowed(
                self.stream.advance_if_regexp(&REG_NON_ELEMENT_NAME)?,
            ))
        } else {
            Some(Cow::Owned(
                self.stream
                    .advance_if_regexp(&REG_NON_ELEMENT_NAME)?
                    .to_lowercase(),
            ))
        }
    }
}

#[derive(PartialEq, Debug, Copy, Clone)]
pub enum TokenType {
    StartCommentTag,
    Comment,
    EndCommentTag,
    StartTagOpen,
    StartTagClose,
    StartTagSelfClose,
    StartTag,
    EndTagOpen,
    EndTagClose,
    EndTag,
    DelimiterAssign,
    AttributeName,
    AttributeValue,
    StartDoctypeTag,
    Doctype,
    EndDoctypeTag,
    Content,
    Whitespace,
    Unknown,
    Script,
    Styles,
    EOS,
}

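/// The scanner's internal states; the tests below start scanning in `WithinContent`.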
#[derive(Debug, Clone, Copy)]
pub enum ScannerState {
    WithinContent,
    AfterOpeningStartTag,
    AfterOpeningEndTag,
    WithinDoctype,
    WithinTag,
    WithinEndTag,
    WithinComment,
    WithinScriptContent,
    WithinStyleContent,
    AfterAttributeName,
    BeforeAttributeValue,
}

#[cfg(test)]
mod tests {
    use super::*;

    fn assert_tokens(tests: Vec<TestItem>) {
        let mut scanner_state = ScannerState::WithinContent;

        for t in tests {
            let mut scanner = Scanner::new(&t.input, 0, scanner_state, false, false);
            let mut token_type = scanner.scan();
            let mut actual = vec![];
            while token_type != TokenType::EOS {
                let offset = scanner.get_token_offset();
                let mut actual_token = Token {
                    offset,
                    token_type,
                    content: None,
                };
                if [TokenType::StartTag, TokenType::EndTag].contains(&token_type) {
                    actual_token.content = Some(
                        t.input[scanner.get_token_offset()..scanner.get_token_end()].to_string(),
                    );
                }
                actual.push(actual_token);
                token_type = scanner.scan();
            }
            assert_eq!(actual, t.tokens);
            // Carry the final state into the next input so split test cases can
            // continue where the previous one stopped.
            scanner_state = scanner.get_scanner_state();
        }
    }

    #[test]
    fn open_start_tag() {
        assert_tokens(vec![TestItem {
            input: "<abc".to_string(),
            tokens: vec![
                Token {
                    offset: 0,
                    token_type: TokenType::StartTagOpen,
                    content: None,
                },
                Token {
                    offset: 1,
                    token_type: TokenType::StartTag,
                    content: Some("abc".to_string()),
                },
            ],
        }]);
        assert_tokens(vec![TestItem {
            input: "<input".to_string(),
            tokens: vec![
                Token {
                    offset: 0,
                    token_type: TokenType::StartTagOpen,
                    content: None,
                },
                Token {
                    offset: 1,
                    token_type: TokenType::StartTag,
                    content: Some("input".to_string()),
                },
            ],
        }]);
    }

    struct TestItem {
        input: String,
        tokens: Vec<Token>,
    }

    #[derive(PartialEq, Debug)]
    struct Token {
        offset: usize,
        token_type: TokenType,
        content: Option<String>,
    }
}