1use lazy_static::lazy_static;
2use regex::Regex;
3
lazy_static! {
    // Matches "!doctype" (case-insensitive; the `(?i)` flag applies from
    // that point on) right after the `<` has already been consumed.
    static ref REG_DOCTYPE: Regex = Regex::new(r"^!(?i)doctype").unwrap();
    // A run of characters that may start an unquoted attribute value:
    // anything except whitespace, quotes, backtick, `=`, `<`, `>`.
    static ref REG_NON_SPECIAL_START: Regex = Regex::new(r#"^[^\s"'`=<>]+"#).unwrap();
    // Inside <script> bodies: HTML comment delimiters or opening/closing
    // script tags (used to skip over commented-out nested scripts).
    static ref REG_SCRIPT_COMMENT: Regex = Regex::new(r"<!--|-->|<\/?script\s*\/?>?").unwrap();
    // An element (tag) name: letter/underscore/colon start, then word
    // characters, `-`, `.`, or digits.
    static ref REG_ELEMENT_NAME: Regex = Regex::new(r"^[_:\w][_:\w\-.\d]*").unwrap();
    // An attribute name: anything except whitespace, quotes, `>`, `<`, `/`,
    // `=`, and C0/C1 control characters. NOTE(review): `*` means this can
    // match the empty string; callers treat an empty match as "no name".
    static ref REG_NON_ELEMENT_NAME: Regex =
        Regex::new(r#"^[^\s"'></=\x00-\x0F\x7F\x80-\x9F]*"#).unwrap();
    // Start of a style end tag, used to find the end of CSS content.
    static ref REG_STYLE: Regex = Regex::new(r"<\/style").unwrap();
}
13
/// A streaming HTML tokenizer. Call [`Scanner::scan`] repeatedly to obtain
/// tokens until `TokenType::EOS` is returned.
pub struct Scanner<'a> {
    /// Current state of the tokenizer state machine.
    state: ScannerState,
    /// Type of the most recently produced token.
    token_type: TokenType,
    /// Byte offset where the most recent token starts.
    token_offset: usize,
    /// Optional diagnostic attached to the most recent token.
    token_error: Option<&'static str>,
    /// Underlying character stream over the input text.
    stream: MultiLineStream<'a>,

    /// When true, a `<` seen where `>` was expected is reported as a
    /// pseudo close-tag token instead of an unknown token.
    emit_pseudo_close_tags: bool,
    /// True right after whitespace inside a tag (attribute names are only
    /// recognized after whitespace).
    has_space_after_tag: bool,
    /// Lowercased name of the last opened tag (drives script/style handling).
    last_tag: Option<String>,
    /// Lowercased name of the last seen attribute.
    last_attribute_name: Option<String>,
    /// Value of the last `type` attribute, if any (affects `<script>` bodies).
    last_type_value: Option<String>,
}
28
29impl Scanner<'_> {
30 pub fn new<'a>(
31 input: &'a str,
32 initial_offset: usize,
33 initial_state: ScannerState,
34 emit_pseudo_close_tags: bool,
35 ) -> Scanner<'a> {
36 let stream = MultiLineStream::new(input, initial_offset);
37 let token_offset = 0;
38 let token_type = TokenType::Unknown;
39 Scanner {
40 state: initial_state,
41 token_type,
42 token_offset,
43 token_error: None,
44 stream,
45 emit_pseudo_close_tags,
46 has_space_after_tag: false,
47 last_tag: None,
48 last_attribute_name: None,
49 last_type_value: None,
50 }
51 }
52
53 pub fn scan(&mut self) -> TokenType {
54 let offset = self.stream.pos();
55 let old_state = &self.state.clone();
56 self.internal_scan();
57 if self.token_type != TokenType::EOS
58 && offset == self.stream.pos()
59 && !(self.emit_pseudo_close_tags
60 && [TokenType::StartTagClose, TokenType::EndTagClose].contains(&self.token_type))
61 {
62 eprintln!(
63 "Scanner.scan has not advanced at offset {}, state before: {:?} after: {:?}",
64 offset, old_state, self.state,
65 );
66 self.stream.advance(1);
67 return self.finish_token(offset, TokenType::Unknown, None);
68 }
69 self.token_type
70 }
71
72 pub fn get_token_type(&self) -> TokenType {
73 self.token_type
74 }
75
76 pub fn get_token_offset(&self) -> usize {
77 self.token_offset
78 }
79
80 pub fn get_token_length(&self) -> usize {
81 self.stream.pos() - self.token_offset
82 }
83
84 pub fn get_token_end(&self) -> usize {
85 self.stream.pos()
86 }
87
88 pub fn get_token_text(&self) -> &str {
89 &self.stream.source[self.get_token_offset()..self.get_token_end()]
90 }
91
92 pub fn get_scanner_state(&self) -> ScannerState {
93 self.state
94 }
95
96 pub fn get_token_error(&self) -> Option<&'static str> {
97 self.token_error
98 }
99
100 pub fn get_source_len(&self) -> usize {
101 self.stream.len
102 }
103
104 fn internal_scan(&mut self) -> TokenType {
105 let offset = self.stream.pos();
106 if self.stream.eos() {
107 return self.finish_token(offset, TokenType::EOS, None);
108 }
109 let error_message;
110
111 match self.state {
112 ScannerState::WithinComment => {
113 if self.stream.advance_if_chars("-->") {
114 self.state = ScannerState::WithinContent;
116 return self.finish_token(offset, TokenType::EndCommentTag, None);
117 }
118 self.stream.advance_until_chars("-->"); return self.finish_token(offset, TokenType::Comment, None);
120 }
121
122 ScannerState::WithinDoctype => {
123 if self.stream.advance_if_char(b'>') {
124 self.state = ScannerState::WithinContent;
125 return self.finish_token(offset, TokenType::EndDoctypeTag, None);
126 }
127 self.stream.advance_until_char(b'>'); return self.finish_token(offset, TokenType::Doctype, None);
129 }
130
131 ScannerState::WithinContent => {
132 if self.stream.advance_if_char(b'<') {
133 if !self.stream.eos() && self.stream.peek_char(0) == Some(b'!') {
135 if self.stream.advance_if_chars("!--") {
137 self.state = ScannerState::WithinComment;
139 return self.finish_token(offset, TokenType::StartCommentTag, None);
140 }
141 if self.stream.advance_if_regexp(®_DOCTYPE) != "" {
142 self.state = ScannerState::WithinDoctype;
143 return self.finish_token(offset, TokenType::StartDoctypeTag, None);
144 }
145 }
146 if self.stream.advance_if_char(b'/') {
147 self.state = ScannerState::AfterOpeningEndTag;
149 return self.finish_token(offset, TokenType::EndTagOpen, None);
150 }
151 self.state = ScannerState::AfterOpeningStartTag;
152 return self.finish_token(offset, TokenType::StartTagOpen, None);
153 }
154 self.stream.advance_until_char(b'<');
155 return self.finish_token(offset, TokenType::Content, None);
156 }
157
158 ScannerState::AfterOpeningEndTag => {
159 let tag_name = self.next_element_name();
160 if tag_name.is_some() {
161 self.state = ScannerState::WithinEndTag;
162 return self.finish_token(offset, TokenType::EndTag, None);
163 }
164 if self.stream.skip_whitespace() {
165 return self.finish_token(
167 offset,
168 TokenType::Whitespace,
169 Some("Tag name must directly follow the open bracket."),
170 );
171 }
172 self.state = ScannerState::WithinEndTag;
173 self.stream.advance_until_char(b'>');
174 if offset < self.stream.pos() {
175 return self.finish_token(
176 offset,
177 TokenType::Unknown,
178 Some("End tag name expected."),
179 );
180 }
181 return self.internal_scan();
182 }
183
184 ScannerState::WithinEndTag => {
185 if self.stream.skip_whitespace() {
186 return self.finish_token(offset, TokenType::Whitespace, None);
188 }
189 if self.stream.advance_if_char(b'>') {
190 self.state = ScannerState::WithinContent;
192 return self.finish_token(offset, TokenType::EndTagClose, None);
193 }
194 if self.emit_pseudo_close_tags && self.stream.peek_char(0) == Some(b'<') {
195 self.state = ScannerState::WithinContent;
197 return self.finish_token(
198 offset,
199 TokenType::EndTagClose,
200 Some("Closing bracket missing."),
201 );
202 }
203 error_message = Some("Closing bracket expected.");
204 }
205
206 ScannerState::AfterOpeningStartTag => {
207 self.last_tag = self.next_element_name();
208 self.last_type_value = None;
209 self.last_attribute_name = None;
210 if self.last_tag.is_some() {
211 self.has_space_after_tag = false;
212 self.state = ScannerState::WithinTag;
213 return self.finish_token(offset, TokenType::StartTag, None);
214 }
215 if self.stream.skip_whitespace() {
216 return self.finish_token(
218 offset,
219 TokenType::Whitespace,
220 Some("Tag name must directly follow the open bracket."),
221 );
222 }
223 self.state = ScannerState::WithinTag;
224 self.stream.advance_until_char(b'>');
225 if offset < self.stream.pos() {
226 return self.finish_token(
227 offset,
228 TokenType::Unknown,
229 Some("Start tag name expected."),
230 );
231 }
232 return self.internal_scan();
233 }
234
235 ScannerState::WithinTag => {
236 if self.stream.skip_whitespace() {
237 self.has_space_after_tag = true; return self.finish_token(offset, TokenType::Whitespace, None);
239 }
240 if self.has_space_after_tag {
241 self.last_attribute_name = self.next_attribute_name();
242 if self.last_attribute_name.is_some() {
243 self.state = ScannerState::AfterAttributeName;
244 self.has_space_after_tag = false;
245 return self.finish_token(offset, TokenType::AttributeName, None);
246 }
247 }
248 if self.stream.advance_if_chars("/>") {
249 self.state = ScannerState::WithinContent;
251 return self.finish_token(offset, TokenType::StartTagSelfClose, None);
252 }
253 if self.stream.advance_if_char(b'>') {
254 if self.last_tag == Some("script".to_string()) {
256 if self.last_type_value.is_some() {
257 self.state = ScannerState::WithinContent;
259 } else {
260 self.state = ScannerState::WithinScriptContent;
261 }
262 } else if self.last_tag == Some("style".to_string()) {
263 self.state = ScannerState::WithinStyleContent;
264 } else {
265 self.state = ScannerState::WithinContent;
266 }
267 return self.finish_token(offset, TokenType::StartTagClose, None);
268 }
269 if self.emit_pseudo_close_tags && self.stream.peek_char(0) == Some(b'<') {
270 self.state = ScannerState::WithinContent;
272 return self.finish_token(
273 offset,
274 TokenType::StartTagClose,
275 Some("Closing bracket missing."),
276 );
277 }
278 self.stream.advance(1);
279 return self.finish_token(
280 offset,
281 TokenType::Unknown,
282 Some("Unexpected character in tag."),
283 );
284 }
285
286 ScannerState::AfterAttributeName => {
287 if self.stream.skip_whitespace() {
288 self.has_space_after_tag = true;
289 return self.finish_token(offset, TokenType::Whitespace, None);
290 }
291
292 if self.stream.advance_if_char(b'=') {
293 self.state = ScannerState::BeforeAttributeValue;
294 return self.finish_token(offset, TokenType::DelimiterAssign, None);
295 }
296 self.state = ScannerState::WithinTag;
297 return self.internal_scan(); }
299
300 ScannerState::BeforeAttributeValue => {
301 if self.stream.skip_whitespace() {
302 return self.finish_token(offset, TokenType::Whitespace, None);
303 }
304 let cur_char = self.stream.peek_char(0);
305 let prev_char = self.stream.peek_char(-1);
306 let mut attribute_value = self.stream.advance_if_regexp(®_NON_SPECIAL_START);
307 if attribute_value.len() > 0 {
308 let mut is_go_back = false;
309 if cur_char == Some(b'>') && prev_char == Some(b'/') {
310 is_go_back = true;
312 attribute_value = &attribute_value[..attribute_value.len() - 1];
313 }
314 if self.last_attribute_name == Some("type".to_string()) {
315 let s = attribute_value.to_string();
316 self.last_type_value = if s.len() != 0 { Some(s) } else { None };
317 }
318 let attribute_value_len = attribute_value.len();
319 if is_go_back {
320 self.stream.go_back(1);
321 }
322 if attribute_value_len > 0 {
323 self.state = ScannerState::WithinTag;
324 self.has_space_after_tag = false;
325 return self.finish_token(offset, TokenType::AttributeValue, None);
326 }
327 }
328 let ch = self.stream.peek_char(0);
329 if let Some(ch) = ch {
330 if ch == b'\'' || ch == b'"' {
331 self.stream.advance(1); if self.stream.advance_until_char(ch) {
333 self.stream.advance(1); }
335 if self.last_attribute_name == Some("type".to_string()) {
336 let s =
337 self.stream.get_source()[if offset + 1 > self.stream.pos() - 1 {
338 self.stream.pos() - 1..offset + 1
339 } else {
340 offset + 1..self.stream.pos() - 1
341 }]
342 .to_string();
343 self.last_type_value = if s.len() != 0 { Some(s) } else { None }
344 }
345 self.state = ScannerState::WithinTag;
346 self.has_space_after_tag = false;
347 return self.finish_token(offset, TokenType::AttributeValue, None);
348 }
349 }
350 self.state = ScannerState::WithinTag;
351 self.has_space_after_tag = false;
352 return self.internal_scan(); }
354
355 ScannerState::WithinScriptContent => {
356 let mut script_state: u8 = 1;
358 while !self.stream.eos() {
359 let m = self.stream.advance_if_regexp(®_SCRIPT_COMMENT);
360 if m.len() == 0 {
361 self.stream.go_to_end();
362 return self.finish_token(offset, TokenType::Script, None);
363 } else if m == "<!--" {
364 if script_state == 1 {
365 script_state = 2;
366 }
367 } else if m == "-->" {
368 script_state = 1;
369 } else if &m[1..2] != "/" {
370 if script_state == 2 {
372 script_state = 3;
373 }
374 } else {
375 if script_state == 3 {
377 script_state = 2;
378 } else {
379 let length = m.len();
380 self.stream.go_back(length); break;
382 }
383 }
384 }
385 self.state = ScannerState::WithinContent;
386 if offset < self.stream.pos() {
387 return self.finish_token(offset, TokenType::Script, None);
388 }
389 return self.internal_scan(); }
391
392 ScannerState::WithinStyleContent => {
393 self.stream.advance_until_regexp(®_STYLE);
394 self.state = ScannerState::WithinContent;
395 if offset < self.stream.pos() {
396 return self.finish_token(offset, TokenType::Styles, None);
397 }
398 return self.internal_scan(); }
400 }
401
402 self.stream.advance(1);
403 self.state = ScannerState::WithinContent;
404 return self.finish_token(offset, TokenType::Unknown, error_message);
405 }
406
407 fn finish_token(
408 &mut self,
409 offset: usize,
410 token_type: TokenType,
411 error_message: Option<&'static str>,
412 ) -> TokenType {
413 self.token_type = token_type;
414 self.token_offset = offset;
415 self.token_error = error_message;
416 self.token_type
417 }
418
419 fn next_element_name(&mut self) -> Option<String> {
420 let s = self
421 .stream
422 .advance_if_regexp(®_ELEMENT_NAME)
423 .to_lowercase();
424 if s.len() != 0 {
425 Some(s)
426 } else {
427 None
428 }
429 }
430
431 fn next_attribute_name(&mut self) -> Option<String> {
432 let s = self
433 .stream
434 .advance_if_regexp(®_NON_ELEMENT_NAME)
435 .to_lowercase();
436 if s.len() != 0 {
437 Some(s)
438 } else {
439 None
440 }
441 }
442}
443
/// A byte-oriented cursor over the input text.
struct MultiLineStream<'a> {
    /// The full input.
    source: &'a str,
    /// Cached `source.len()` in bytes.
    len: usize,
    /// Current byte offset into `source`.
    position: usize,
}
449
450impl MultiLineStream<'_> {
451 pub fn new<'a>(source: &'a str, position: usize) -> MultiLineStream<'a> {
452 MultiLineStream {
453 source,
454 len: source.len(),
455 position,
456 }
457 }
458
459 pub fn eos(&self) -> bool {
460 self.len <= self.position
461 }
462
463 pub fn get_source(&self) -> &str {
464 self.source
465 }
466
467 pub fn pos(&self) -> usize {
468 self.position
469 }
470
471 pub fn go_back(&mut self, n: usize) {
472 self.position -= n;
473 }
474
475 pub fn advance(&mut self, n: usize) {
476 self.position += n;
477 }
478
479 pub fn go_to_end(&mut self) {
480 self.position = self.len;
481 }
482
483 pub fn peek_char(&self, n: isize) -> Option<u8> {
484 let index = if n >= 0 {
485 self.position + n as usize
486 } else {
487 self.position - (-n) as usize
488 };
489 Some(self.source.bytes().nth(index)?)
490 }
491
492 pub fn advance_if_char(&mut self, ch: u8) -> bool {
493 if let Some(char) = self.source.bytes().nth(self.position) {
494 if char == ch {
495 self.position += 1;
496 return true;
497 }
498 }
499 false
500 }
501
502 pub fn advance_if_chars(&mut self, ch: &str) -> bool {
503 if self.position + ch.len() > self.len {
504 return false;
505 }
506
507 if !self
508 .source
509 .get(self.position..self.position + ch.len())
510 .is_some_and(|v| v == ch)
511 {
512 return false;
513 }
514
515 self.advance(ch.len());
516 true
517 }
518
519 pub fn advance_if_regexp(&mut self, regexp: &Regex) -> &str {
520 let haystack = &self.source[self.position..];
521 if let Some(captures) = regexp.captures(haystack) {
522 let m = captures.get(0).unwrap();
523 self.position += m.end();
524 m.as_str()
525 } else {
526 ""
527 }
528 }
529
530 pub fn advance_until_regexp(&mut self, regexp: &Regex) -> &str {
531 let haystack = &self.source[self.position..];
532 if let Some(captures) = regexp.captures(haystack) {
533 let m = captures.get(0).unwrap();
534 self.position += m.start();
535 m.as_str()
536 } else {
537 self.go_to_end();
538 ""
539 }
540 }
541
542 pub fn advance_until_char(&mut self, ch: u8) -> bool {
543 while self.position < self.len {
544 if self.source.bytes().nth(self.position) == Some(ch) {
545 return true;
546 }
547 self.advance(1);
548 }
549 false
550 }
551
552 pub fn advance_until_chars(&mut self, ch: &str) -> bool {
553 while self.position + ch.len() <= self.len {
554 if self
555 .source
556 .get(self.position..self.position + ch.len())
557 .is_some_and(|v| v == ch)
558 {
559 return true;
560 }
561 self.advance(1);
562 }
563 self.go_to_end();
564 false
565 }
566
567 pub fn skip_whitespace(&mut self) -> bool {
568 let n = self.advance_while_char(|ch| vec![b' ', b'\t', b'\n', 12, b'\r'].contains(&ch));
569 n > 0
570 }
571
572 pub fn advance_while_char<F>(&mut self, condition: F) -> usize
573 where
574 F: Fn(u8) -> bool,
575 {
576 let pos_now = self.position;
577 while self.position < self.len && condition(self.source.bytes().nth(self.position).unwrap())
578 {
579 self.advance(1);
580 }
581 self.position - pos_now
582 }
583}
584
/// Kinds of tokens produced by [`Scanner::scan`].
#[derive(PartialEq, Debug, Copy, Clone)]
pub enum TokenType {
    /// `<!--`
    StartCommentTag,
    /// Text inside a comment.
    Comment,
    /// `-->`
    EndCommentTag,
    /// `<`
    StartTagOpen,
    /// `>` closing a start tag.
    StartTagClose,
    /// `/>`
    StartTagSelfClose,
    /// A start tag's name.
    StartTag,
    /// `</`
    EndTagOpen,
    /// `>` closing an end tag.
    EndTagClose,
    /// An end tag's name.
    EndTag,
    /// `=` between an attribute name and its value.
    DelimiterAssign,
    AttributeName,
    AttributeValue,
    /// `<!doctype`
    StartDoctypeTag,
    /// Text inside a doctype declaration.
    Doctype,
    /// `>` closing a doctype declaration.
    EndDoctypeTag,
    /// Plain text between tags.
    Content,
    Whitespace,
    /// Unrecognized input (usually carries a token error).
    Unknown,
    /// The body of a `<script>` element.
    Script,
    /// The body of a `<style>` element.
    Styles,
    /// End of source.
    EOS,
}
610
/// States of the tokenizer state machine. A scanner can be resumed from any
/// state via [`Scanner::new`]'s `initial_state` parameter.
#[derive(Debug, Clone, Copy)]
pub enum ScannerState {
    /// Between tags, reading plain text.
    WithinContent,
    /// Just consumed `<`; expecting a tag name.
    AfterOpeningStartTag,
    /// Just consumed `</`; expecting a tag name.
    AfterOpeningEndTag,
    /// Inside `<!doctype … >`.
    WithinDoctype,
    /// Inside a start tag, reading attributes.
    WithinTag,
    /// Inside an end tag, expecting `>`.
    WithinEndTag,
    /// Inside `<!-- … -->`.
    WithinComment,
    /// Inside a `<script>` body.
    WithinScriptContent,
    /// Inside a `<style>` body.
    WithinStyleContent,
    /// After an attribute name; `=` or another attribute may follow.
    AfterAttributeName,
    /// After `=`; expecting an attribute value.
    BeforeAttributeValue,
}
625
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans each item's input and compares the produced tokens against the
    /// expected ones. The scanner state is carried from one item to the
    /// next, so multi-item tests can span tag boundaries.
    fn assert_tokens(tests: Vec<TestItem>) {
        let mut scanner_state = ScannerState::WithinContent;

        for t in tests {
            let mut scanner = Scanner::new(&t.input, 0, scanner_state, false);
            let mut token_type = scanner.scan();
            let mut actual = vec![];
            while token_type != TokenType::EOS {
                let offset = scanner.get_token_offset();
                let mut actual_token = Token {
                    offset,
                    // Field shorthand (was `token_type: token_type`).
                    token_type,
                    content: None,
                };
                // Only tag-name tokens carry text, keeping expectations short.
                if [TokenType::StartTag, TokenType::EndTag].contains(&token_type) {
                    actual_token.content = Some(scanner.get_token_text().to_string());
                }
                actual.push(actual_token);
                token_type = scanner.scan();
            }
            assert_eq!(actual, t.tokens);
            scanner_state = scanner.get_scanner_state();
        }
    }

    #[test]
    fn open_start_tag() {
        assert_tokens(vec![TestItem {
            input: "<abc".to_string(),
            tokens: vec![
                Token {
                    offset: 0,
                    token_type: TokenType::StartTagOpen,
                    content: None,
                },
                Token {
                    offset: 1,
                    token_type: TokenType::StartTag,
                    content: Some("abc".to_string()),
                },
            ],
        }]);
        assert_tokens(vec![TestItem {
            input: "<input".to_string(),
            tokens: vec![
                Token {
                    offset: 0,
                    token_type: TokenType::StartTagOpen,
                    content: None,
                },
                Token {
                    offset: 1,
                    token_type: TokenType::StartTag,
                    content: Some("input".to_string()),
                },
            ],
        }]);
    }

    /// One scanner run: an input string and the tokens it should produce.
    struct TestItem {
        input: String,
        tokens: Vec<Token>,
    }

    /// Expected token: offset, type, and (for tag names) the token text.
    #[derive(PartialEq, Debug)]
    struct Token {
        offset: usize,
        token_type: TokenType,
        content: Option<String>,
    }
}