1use crate::common::{is_name_char, is_whitespace_char, is_xml10_char, is_xml11_char, Position, TextPosition};
6use crate::reader::error::SyntaxError;
7use crate::reader::{Error, ErrorKind};
8use crate::util::{CharReader, Encoding};
9use std::collections::VecDeque;
10use std::io::Read;
11use std::{fmt, result};
12
13use super::ParserConfig2;
14
/// A single low-level token produced by the lexer.
///
/// Tokens are either fixed markup delimiters (see [`Token::as_static_str`])
/// or a single [`Token::Character`]; the parser assembles them into events.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub(crate) enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// Any single character that is not part of a markup delimiter.
    Character(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
    /// `<!` followed by a declaration keyword (only inside a doctype).
    MarkupDeclarationStart,
    /// End of the input stream was reached.
    Eof,
}
58
59impl fmt::Display for Token {
60 #[cold]
61 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
62 match *self {
63 Token::Character(c) => c.fmt(f),
64 other => match other {
65 Token::OpeningTagStart => "<",
66 Token::ProcessingInstructionStart => "<?",
67 Token::DoctypeStart => "<!DOCTYPE",
68 Token::ClosingTagStart => "</",
69 Token::CommentStart => "<!--",
70 Token::CDataStart => "<![CDATA[",
71 Token::TagEnd => ">",
72 Token::EmptyTagEnd => "/>",
73 Token::ProcessingInstructionEnd => "?>",
74 Token::CommentEnd => "-->",
75 Token::CDataEnd => "]]>",
76 Token::ReferenceStart => "&",
77 Token::ReferenceEnd => ";",
78 Token::EqualsSign => "=",
79 Token::SingleQuote => "'",
80 Token::DoubleQuote => "\"",
81 Token::MarkupDeclarationStart => "<!",
82 Token::Eof | Token::Character(_) => {
83 debug_assert!(false);
84 ""
85 },
86 }.fmt(f),
87 }
88 }
89}
90
impl Token {
    /// Returns the literal markup text for fixed tokens, or `None` for
    /// `Character` and `Eof`, which have no static representation.
    pub const fn as_static_str(self) -> Option<&'static str> {
        match self {
            Self::OpeningTagStart => Some("<"),
            Self::ProcessingInstructionStart => Some("<?"),
            Self::DoctypeStart => Some("<!DOCTYPE"),
            Self::ClosingTagStart => Some("</"),
            Self::CommentStart => Some("<!--"),
            Self::CDataStart => Some("<![CDATA["),
            Self::TagEnd => Some(">"),
            Self::EmptyTagEnd => Some("/>"),
            Self::ProcessingInstructionEnd => Some("?>"),
            Self::CommentEnd => Some("-->"),
            Self::CDataEnd => Some("]]>"),
            Self::ReferenceStart => Some("&"),
            Self::ReferenceEnd => Some(";"),
            Self::EqualsSign => Some("="),
            Self::SingleQuote => Some("'"),
            Self::DoubleQuote => Some("\""),
            _ => None
        }
    }

    /// Appends this token's textual form to `target`: the static string for
    /// fixed tokens, the character itself for `Character`, nothing for `Eof`.
    pub fn push_to_string(self, target: &mut String) {
        match self {
            Self::Character(c) => {
                // The lexer should only ever emit characters valid in XML 1.0/1.1.
                debug_assert!(is_xml10_char(c) || is_xml11_char(c));
                target.push(c);
            },
            _ => if let Some(s) = self.as_static_str() {
                target.push_str(s);
            }
        }
    }
}
127
/// State of the lexer's character-level state machine.
#[derive(Copy, Clone)]
enum State {
    /// Default state: plain character data.
    Normal,
    /// `<` was seen; deciding between tag, PI, comment, CDATA and doctype.
    TagStarted,
    /// `<!` was seen.
    CommentOrCDataOrDoctypeStarted,
    /// `<!-` was seen; expecting the second `-`.
    CommentStarted,
    /// Matching the `<!DOCTYPE` keyword character by character.
    DoctypeStarted(DoctypeStartedSubstate),
    /// Inside `<!ELEMENT`-style markup declaration (doctype internal subset).
    InsideMarkupDeclaration,
    /// Inside `<!DOCTYPE ... >`.
    InsideDoctype,
    /// Matching the `<![CDATA[` prefix character by character.
    CDataStarted(CDataStartedSubstate),
    /// `?` was seen inside a processing instruction; expecting `>`.
    ProcessingInstructionClosing,
    /// `/` was seen; expecting `>` for `/>`.
    EmptyTagClosing,
    /// `-` (then `--`) was seen inside a comment; matching `-->`.
    CommentClosing(ClosingSubstate),
    /// `]` (then `]]`) was seen inside CDATA; matching `]]>`.
    CDataClosing(ClosingSubstate),
    /// `]` (then `]]`) was seen in character data; detecting a stray `]]>`.
    InvalidCDataClosing(ClosingSubstate),
    /// Inside `<!-- ... -->`.
    InsideComment,
    /// Inside `<![CDATA[ ... ]]>`.
    InsideCdata,
    /// Inside `<? ... ?>`.
    InsideProcessingInstruction,
    /// Inside a quoted string within a markup declaration.
    InsideMarkupDeclarationQuotedString(QuoteStyle),
}

/// Which quote character opened the current quoted string.
#[derive(Copy, Clone, Eq, PartialEq)]
enum QuoteStyle {
    Single, Double
}

/// Progress through a two-character closing sequence (`--`, `]]`).
#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}

/// Progress through the `DOCTYPE` keyword.
#[derive(Copy, Clone)]
#[allow(clippy::upper_case_acronyms)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}

/// Progress through the `[CDATA[` prefix (E = nothing matched yet).
#[derive(Copy, Clone)]
#[allow(clippy::upper_case_acronyms)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}
187
/// Lexer result; by default an optional token, where `None` means
/// "no token finished yet, keep feeding characters".
pub(crate) type Result<T = Option<Token>, E = Error> = result::Result<T, E>;

/// Drives a literal-prefix recognizer (e.g. `<![CDATA[`, `<!DOCTYPE`):
/// for each substate, the expected character advances to the next substate;
/// in the final substate the expected character runs `$e`; any other
/// character reports an error mentioning the chunk matched so far.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
            $st => match $c {
                $stc => Ok($_self.move_to($is($next_st))),
                _ => $_self.handle_error($chunk, $c)
            },
            )+
            $end_st => match $c {
                $end_c => $e,
                _ => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
211
/// Streaming XML lexer: turns a byte stream into a sequence of [`Token`]s.
pub(crate) struct Lexer {
    // Current state of the tokenizer state machine.
    st: State,
    // Decoding reader that yields `char`s from raw bytes.
    reader: CharReader,
    // Position of the start of the token currently being produced.
    pos: TextPosition,
    // Position of the read head (next character to be consumed).
    head_pos: TextPosition,
    // Characters pushed back ("unread") or injected by `reparse`;
    // drained before the reader is consulted again.
    char_queue: VecDeque<char>,
    // Resting state to return to after a construct ends: `Normal` at the
    // top level, `InsideDoctype` while within a doctype.
    normal_state: State,
    // True while in the middle of producing a multi-character token.
    inside_token: bool,
    // True once `Eof` has been reported, so it is not reported twice.
    eof_handled: bool,
    // Current entity re-parse nesting depth (guards against entity bombs).
    reparse_depth: u8,
    // Test hook: replay would-be errors as plain characters instead.
    #[cfg(test)]
    skip_errors: bool,

    // Limits copied from the parser config; enforced in `reparse`.
    max_entity_expansion_depth: u8,
    max_entity_expansion_length: usize,
}

impl Position for Lexer {
    /// Returns the position of the last token returned by `next_token`.
    #[inline]
    fn position(&self) -> TextPosition { self.pos }
}
244
245impl Lexer {
246 pub(crate) fn new(config: &ParserConfig2) -> Self {
248 Self {
249 reader: CharReader::new(),
250 pos: TextPosition::new(),
251 head_pos: TextPosition::new(),
252 char_queue: VecDeque::with_capacity(4), st: State::Normal,
254 normal_state: State::Normal,
255 inside_token: false,
256 eof_handled: false,
257 reparse_depth: 0,
258 #[cfg(test)]
259 skip_errors: false,
260
261 max_entity_expansion_depth: config.max_entity_expansion_depth,
262 max_entity_expansion_length: config.max_entity_expansion_length,
263 }
264 }
265
    /// Returns the encoding the underlying reader currently assumes.
    pub(crate) fn encoding(&self) -> Encoding {
        self.reader.encoding
    }

    /// Overrides the encoding used to decode the remaining input.
    pub(crate) fn set_encoding(&mut self, encoding: Encoding) {
        self.reader.encoding = encoding;
    }

    /// Test hook: downgrade lexing errors to plain character output.
    #[cfg(test)] fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Allows `Eof` to be reported again, e.g. after more input appears.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }
281
    /// Pulls the next token, first from queued (unread/reparsed) characters,
    /// then from the reader `b`.
    ///
    /// Characters are fed one at a time to the state machine; a `None`
    /// result means "more characters needed" and the loop continues.
    /// Returns `Ok(Token::Eof)` at end of input (only once, unless reset).
    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result<Token> {
        if self.eof_handled {
            return Ok(Token::Eof);
        }

        // Remember where this token starts, unless we are resuming one.
        if !self.inside_token {
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // Drain characters that were unread or injected by `reparse` first.
        while let Some(c) = self.char_queue.pop_front() {
            if let Some(t) = self.dispatch_char(c)? {
                self.inside_token = false;
                return Ok(t);
            }
        }
        // Queue exhausted: any entity expansion has been fully consumed,
        // so the expansion-depth guard can start over.
        self.reparse_depth = 0;
        while let Some(c) = self.reader.next_char_from(b)? {
            // Track the head position; `pos` is only updated at token starts.
            if c == '\n' {
                self.head_pos.new_line();
            } else {
                self.head_pos.advance(1);
            }

            if let Some(t) = self.dispatch_char(c)? {
                self.inside_token = false;
                return Ok(t);
            }
        }

        self.end_of_stream()
    }
326
    /// Decides what to report when input ends: an error for constructs that
    /// must be closed, a buffered literal character for "maybe-closing"
    /// states, or `Eof` in the normal state.
    #[inline(never)]
    fn end_of_stream(&mut self) -> Result<Token> {
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            State::InsideCdata | State::CDataClosing(_) => Err(self.error(SyntaxError::UnclosedCdata)),
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) |
            State::InsideComment | State::InsideMarkupDeclaration |
            State::InsideProcessingInstruction | State::ProcessingInstructionClosing |
            State::InsideDoctype | State::InsideMarkupDeclarationQuotedString(_) =>
                Err(self.error(SyntaxError::UnexpectedEof)),
            // A lone '/', '-' or ']' was buffered as a potential closer;
            // at EOF it turns out to be ordinary character data.
            State::EmptyTagClosing =>
                Ok(Token::Character('/')),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Token::Character('-')),
            State::InvalidCDataClosing(ClosingSubstate::First) =>
                Ok(Token::Character(']')),
            State::InvalidCDataClosing(ClosingSubstate::Second) => {
                // Two ']' buffered: emit one now and unread the other, so
                // the next call returns it before EOF is reported again.
                self.eof_handled = false;
                Ok(self.move_to_with_unread(State::Normal, &[']'], Token::Character(']')))
            },
            State::Normal => Ok(Token::Eof),
        }
    }

    /// Builds a syntax error annotated with the current token position.
    #[cold]
    #[allow(clippy::needless_pass_by_value)]
    fn error(&self, e: SyntaxError) -> Error {
        Error {
            pos: self.position(),
            kind: ErrorKind::Syntax(e.to_cow()),
        }
    }
363
    /// Feeds one character to the state machine. `Ok(Some(t))` completes a
    /// token; `Ok(None)` means more input is required for the current one.
    #[inline(never)]
    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal => Ok(self.normal(c)),
            State::TagStarted => self.tag_opened(c),
            State::EmptyTagClosing => Ok(Some(self.empty_element_closing(c))),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::InsideCdata => Ok(self.inside_cdata(c)),
            State::CDataStarted(s) => self.cdata_started(c, s),
            State::InsideComment => Ok(self.inside_comment_state(c)),
            State::CommentStarted => self.comment_started(c),
            State::InsideProcessingInstruction => Ok(self.inside_processing_instruction(c)),
            State::ProcessingInstructionClosing => Ok(Some(self.processing_instruction_closing(c))),
            State::CommentClosing(s) => self.comment_closing(c, s),
            State::CDataClosing(s) => Ok(self.cdata_closing(c, s)),
            State::InsideDoctype => Ok(self.inside_doctype(c)),
            State::DoctypeStarted(s) => self.doctype_started(c, s),
            State::InvalidCDataClosing(s) => Ok(self.invalid_cdata_closing(c, s)),
            State::InsideMarkupDeclaration => self.markup_declaration(c),
            State::InsideMarkupDeclarationQuotedString(q) => Ok(Some(self.markup_declaration_string(c, q))),
        }
    }
386
    /// Switches to `st` without finishing a token.
    #[inline]
    fn move_to(&mut self, st: State) -> Option<Token> {
        self.st = st;
        None
    }

    /// Switches to `st` and emits `token`.
    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Token {
        self.st = st;
        token
    }

    /// Switches to `st` and also makes it the new resting ("normal") state —
    /// used when entering and leaving a doctype.
    #[inline]
    fn move_to_and_reset_normal(&mut self, st: State, token: Token) -> Token {
        self.normal_state = st;
        self.st = st;
        token
    }

    /// Switches to `st` and emits `token`, pushing `cs` back onto the front
    /// of the queue (in original order) so they are re-processed next.
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Token {
        // Reverse iteration + push_front preserves the order of `cs`.
        for c in cs.iter().rev().copied() {
            self.char_queue.push_front(c);
        }
        self.move_to_with(st, token)
    }
412
    /// Injects `markup` (expanded entity text) to be lexed before the rest
    /// of the input.
    ///
    /// # Errors
    /// Returns `EntityTooBig` when expansion nesting depth or the queued
    /// length exceeds the configured limits.
    pub(crate) fn reparse(&mut self, markup: &str) -> Result<()> {
        if markup.is_empty() {
            return Ok(());
        }

        // Depth is reset by `next_token` once the queue drains, so this
        // counts *nested* expansions only.
        self.reparse_depth += 1;
        if self.reparse_depth > self.max_entity_expansion_depth || self.char_queue.len() > self.max_entity_expansion_length {
            return Err(self.error(SyntaxError::EntityTooBig));
        }

        // Entity text must be seen before already-buffered input.
        self.eof_handled = false;
        self.char_queue.reserve(markup.len());
        for c in markup.chars().rev() {
            self.char_queue.push_front(c);
        }

        Ok(())
    }

    /// Reports an "unexpected token" error for a partially matched `chunk`
    /// followed by `c`. With errors disabled (tests only), replays the chunk
    /// and `c` as plain characters instead.
    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        debug_assert!(!chunk.is_empty());

        #[cfg(test)]
        if self.skip_errors {
            let mut chars = chunk.chars();
            let first = chars.next().unwrap_or('\0');
            self.char_queue.extend(chars);
            self.char_queue.push_back(c);
            return Ok(Some(self.move_to_with(State::Normal, Token::Character(first))));
        }
        Err(self.error(SyntaxError::UnexpectedTokenBefore(chunk, c)))
    }
445
446 fn normal(&mut self, c: char) -> Option<Token> {
448 match c {
449 '<' => self.move_to(State::TagStarted),
450 '>' => Some(Token::TagEnd),
451 '/' => self.move_to(State::EmptyTagClosing),
452 '=' => Some(Token::EqualsSign),
453 '"' => Some(Token::DoubleQuote),
454 '\'' => Some(Token::SingleQuote),
455 ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::First)),
456 '&' => Some(Token::ReferenceStart),
457 ';' => Some(Token::ReferenceEnd),
458 _ => Some(Token::Character(c))
459 }
460 }
461
462 fn inside_cdata(&mut self, c: char) -> Option<Token> {
463 match c {
464 ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)),
465 _ => Some(Token::Character(c)),
466 }
467 }
468
469 fn inside_processing_instruction(&mut self, c: char) -> Option<Token> {
470 match c {
472 '?' => self.move_to(State::ProcessingInstructionClosing),
473 '<' => Some(Token::OpeningTagStart),
474 '>' => Some(Token::TagEnd),
475 '=' => Some(Token::EqualsSign),
476 '"' => Some(Token::DoubleQuote),
477 '\'' => Some(Token::SingleQuote),
478 '&' => Some(Token::ReferenceStart),
479 ';' => Some(Token::ReferenceEnd),
480 _ => Some(Token::Character(c))
481 }
482 }
483
484 fn inside_comment_state(&mut self, c: char) -> Option<Token> {
485 match c {
486 '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)),
487 _ => Some(Token::Character(c)),
488 }
489 }
490
491 fn tag_opened(&mut self, c: char) -> Result {
493 match c {
494 '?' => Ok(Some(self.move_to_with(State::InsideProcessingInstruction, Token::ProcessingInstructionStart))),
495 '/' => Ok(Some(self.move_to_with(self.normal_state, Token::ClosingTagStart))),
496 '!' => Ok(self.move_to(State::CommentOrCDataOrDoctypeStarted)),
497 _ if is_whitespace_char(c) => Ok(Some(self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart))),
498 _ if is_name_char(c) => Ok(Some(self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart))),
499 _ => self.handle_error("<", c)
500 }
501 }
502
    /// After `<!`: decide between comment, CDATA, doctype, or (inside a
    /// doctype) a markup declaration such as `<!ELEMENT`.
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => Ok(self.move_to(State::CommentStarted)),
            '[' => Ok(self.move_to(State::CDataStarted(CDataStartedSubstate::E))),
            'D' => Ok(self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D))),
            // `<!E...`, `<!A...`, `<!N...` (ELEMENT/ENTITY, ATTLIST,
            // NOTATION) are only recognized inside a doctype.
            'E' | 'A' | 'N' if matches!(self.normal_state, State::InsideDoctype) => {
                Ok(Some(self.move_to_with_unread(State::InsideMarkupDeclaration, &[c], Token::MarkupDeclarationStart)))
            },
            _ => self.handle_error("<!", c),
        }
    }

    /// After `<!-`: a second `-` completes the comment opener.
    fn comment_started(&mut self, c: char) -> Result {
        match c {
            '-' => Ok(Some(self.move_to_with(State::InsideComment, Token::CommentStart))),
            _ => self.handle_error("<!-", c),
        }
    }

    /// Matches the `<![CDATA[` prefix character by character.
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{C, CD, CDA, CDAT, CDATA, E};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E ; 'C' ; C ; "<![",
            C ; 'D' ; CD ; "<![C",
            CD ; 'A' ; CDA ; "<![CD",
            CDA ; 'T' ; CDAT ; "<![CDA",
            CDAT ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; Ok(Some(self.move_to_with(State::InsideCdata, Token::CDataStart)))
        )
    }
536
    /// Inside a markup declaration (`<!ELEMENT ...` etc.): tracks quoted
    /// strings and ends the declaration at `>`.
    fn markup_declaration(&mut self, c: char) -> Result {
        match c {
            // A nested `<` inside a declaration is not allowed.
            '<' => self.handle_error("<!", c),
            '>' => Ok(Some(self.move_to_with(self.normal_state, Token::TagEnd))),
            '&' => Ok(Some(Token::ReferenceStart)),
            ';' => Ok(Some(Token::ReferenceEnd)),
            '"' => Ok(Some(self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Double), Token::DoubleQuote))),
            '\'' => Ok(Some(self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Single), Token::SingleQuote))),
            _ => Ok(Some(Token::Character(c))),
        }
    }

    /// Inside a quoted string within a markup declaration: only the matching
    /// quote character ends the string; everything else is literal.
    fn markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Token {
        match c {
            '"' if q == QuoteStyle::Double => self.move_to_with(State::InsideMarkupDeclaration, Token::DoubleQuote),
            '\'' if q == QuoteStyle::Single => self.move_to_with(State::InsideMarkupDeclaration, Token::SingleQuote),
            _ => Token::Character(c),
        }
    }
557
    /// Matches the `<!DOCTYPE` keyword character by character; on success,
    /// `InsideDoctype` becomes the new resting state.
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D ; 'O' ; DO ; "<!D",
            DO ; 'C' ; DOC ; "<!DO",
            DOC ; 'T' ; DOCT ; "<!DOC",
            DOCT ; 'Y' ; DOCTY ; "<!DOCT",
            DOCTY ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; Ok(Some(self.move_to_and_reset_normal(State::InsideDoctype, Token::DoctypeStart)))
        )
    }

    /// Inside `<!DOCTYPE ... >`: `>` ends the doctype (restoring the normal
    /// resting state), `<` may start a nested markup declaration or PI.
    fn inside_doctype(&mut self, c: char) -> Option<Token> {
        match c {
            '>' => Some(self.move_to_and_reset_normal(State::Normal, Token::TagEnd)),
            '<' => self.move_to(State::TagStarted),
            '&' => Some(Token::ReferenceStart),
            ';' => Some(Token::ReferenceEnd),
            '"' => Some(Token::DoubleQuote),
            '\'' => Some(Token::SingleQuote),
            _ => Some(Token::Character(c)),
        }
    }
583
584 fn processing_instruction_closing(&mut self, c: char) -> Token {
586 match c {
587 '>' => self.move_to_with(self.normal_state, Token::ProcessingInstructionEnd),
588 _ => self.move_to_with_unread(State::InsideProcessingInstruction, &[c], Token::Character('?')),
589 }
590 }
591
592 fn empty_element_closing(&mut self, c: char) -> Token {
594 match c {
595 '>' => self.move_to_with(self.normal_state, Token::EmptyTagEnd),
596 _ => self.move_to_with_unread(self.normal_state, &[c], Token::Character('/')),
597 }
598 }
599
    /// After `-` inside a comment: matches `--` then requires `>`
    /// (`--` not followed by `>` is an error in comments).
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => Ok(self.move_to(State::CommentClosing(ClosingSubstate::Second))),
                // Lone '-': emit it and reprocess `c` as comment content.
                _ => Ok(Some(self.move_to_with_unread(State::InsideComment, &[c], Token::Character('-')))),
            },
            ClosingSubstate::Second => match c {
                '>' => Ok(Some(self.move_to_with(self.normal_state, Token::CommentEnd))),
                _ => self.handle_error("--", c),
            },
        }
    }

    /// After `]` inside CDATA: matches `]]` then `>`; partial matches are
    /// replayed as literal characters.
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Option<Token> {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                _ => Some(self.move_to_with_unread(State::InsideCdata, &[c], Token::Character(']'))),
            },
            ClosingSubstate::Second => match c {
                '>' => Some(self.move_to_with(State::Normal, Token::CDataEnd)),
                _ => Some(self.move_to_with_unread(State::InsideCdata, &[']', c], Token::Character(']'))),
            },
        }
    }

    /// After `]` in plain character data: a full `]]>` outside CDATA is
    /// still tokenized as `CDataEnd`; partial matches are replayed as
    /// literal `]` characters.
    fn invalid_cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Option<Token> {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::Second)),
                _ => Some(self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))),
            },
            ClosingSubstate::Second => match c {
                '>' => Some(self.move_to_with(self.normal_state, Token::CDataEnd)),
                _ => Some(self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))),
            },
        }
    }
642}
643
644#[cfg(test)]
645mod tests {
646 use crate::{common::Position, reader::ParserConfig2};
647 use std::io::{BufReader, Cursor};
648
649 use super::{Lexer, Token};
650
    // Asserts that the lexer yields exactly the given sequence of tokens.
    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
            assert_eq!(Ok($e), $lex.next_token(&mut $buf));
            )+
        })
    );

    // Asserts that the next token is an error at the given row/column.
    // NOTE(review): `$s` (the expected message) is accepted but never used
    // by the macro body.
    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
        })
    );

    // Asserts that the stream is exhausted (`Eof` is returned).
    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(Token::Eof), $lex.next_token(&mut $buf))
        )
    );

    // Builds a default-config lexer plus a buffered reader over `s`.
    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(&ParserConfig2::default()), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }
678
    // A PI body is only terminated by `?>`; `<!--` and `&` inside it are
    // lexed as ordinary tokens/characters.
    #[test]
    fn tricky_pi() {
        let (mut lex, mut buf) = make_lex_and_buf(r"<?x<!-- &??><x>");

        assert_oks!(for lex and buf ;
            Token::ProcessingInstructionStart
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('!')
            Token::Character('-')
            Token::Character('-')
            Token::Character(' ')
            Token::ReferenceStart
            Token::Character('?')
            Token::ProcessingInstructionEnd
            Token::OpeningTagStart
            Token::Character('x')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    // `reparse` injects expanded entity text which is lexed before EOF is
    // reported again.
    #[test]
    fn reparser() {
        let (mut lex, mut buf) = make_lex_and_buf(r"&a;");

        assert_oks!(for lex and buf ;
            Token::ReferenceStart
            Token::Character('a')
            Token::ReferenceEnd
        );
        lex.reparse("<hi/>").unwrap();
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('h')
            Token::Character('i')
            Token::EmptyTagEnd
        );
        assert_none!(for lex and buf);
    }
719
720 #[test]
721 fn simple_lexer_test() {
722 let (mut lex, mut buf) = make_lex_and_buf(
723 r#"<a p='q'> x<b z="y">d </b></a><p/> <?nm ?> <!-- a c --> "#
724 );
725
726 assert_oks!(for lex and buf ;
727 Token::OpeningTagStart
728 Token::Character('a')
729 Token::Character(' ')
730 Token::Character('p')
731 Token::EqualsSign
732 Token::SingleQuote
733 Token::Character('q')
734 Token::SingleQuote
735 Token::TagEnd
736 Token::Character(' ')
737 Token::Character('x')
738 Token::OpeningTagStart
739 Token::Character('b')
740 Token::Character(' ')
741 Token::Character('z')
742 Token::EqualsSign
743 Token::DoubleQuote
744 Token::Character('y')
745 Token::DoubleQuote
746 Token::TagEnd
747 Token::Character('d')
748 Token::Character('\t')
749 Token::ClosingTagStart
750 Token::Character('b')
751 Token::TagEnd
752 Token::ClosingTagStart
753 Token::Character('a')
754 Token::TagEnd
755 Token::OpeningTagStart
756 Token::Character('p')
757 Token::EmptyTagEnd
758 Token::Character(' ')
759 Token::ProcessingInstructionStart
760 Token::Character('n')
761 Token::Character('m')
762 Token::Character(' ')
763 Token::ProcessingInstructionEnd
764 Token::Character(' ')
765 Token::CommentStart
766 Token::Character(' ')
767 Token::Character('a')
768 Token::Character(' ')
769 Token::Character('c')
770 Token::Character(' ')
771 Token::CommentEnd
772 Token::Character(' ')
773 Token::ReferenceStart
774 Token::Character('n')
775 Token::Character('b')
776 Token::Character('s')
777 Token::Character('p')
778 Token::ReferenceEnd
779 );
780 assert_none!(for lex and buf);
781 }
782
    // Characters that start multi-char tokens (`?`, `/`, `-`, `]`) fall back
    // to plain characters when the follow-up never comes.
    #[test]
    fn special_chars_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"?x!+ // -| ]z]]"
        );

        assert_oks!(for lex and buf ;
            Token::Character('?')
            Token::Character('x')
            Token::Character('!')
            Token::Character('+')
            Token::Character(' ')
            Token::Character('/')
            Token::Character('/')
            Token::Character(' ')
            Token::Character('-')
            Token::Character('|')
            Token::Character(' ')
            Token::Character(']')
            Token::Character('z')
            Token::Character(']')
            Token::Character(']')
        );
        assert_none!(for lex and buf);
    }

    // Basic CDATA section: contents are plain characters, including '?'.
    #[test]
    fn cdata_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<a><![CDATA[x y ?]]> </a>"
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CDataStart
            Token::Character('x')
            Token::Character(' ')
            Token::Character('y')
            Token::Character(' ')
            Token::Character('?')
            Token::CDataEnd
            Token::Character(' ')
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    // Partial `]]>` sequences inside CDATA stay literal; a stray `]]` after
    // the section is emitted as two plain ']' characters.
    #[test]
    fn cdata_closers_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<![CDATA[] > ]> ]]><!---->]]<a>"
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character(']')
            Token::Character(' ')
            Token::Character('>')
            Token::Character(' ')
            Token::Character(']')
            Token::Character('>')
            Token::Character(' ')
            Token::CDataEnd
            Token::CommentStart
            Token::CommentEnd
            Token::Character(']')
            Token::Character(']')
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    // A doctype without an internal subset: content up to `>` is characters.
    #[test]
    fn doctype_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<a><!DOCTYPE ab xx z> "
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character('b')
            Token::Character(' ')
            Token::Character('x')
            Token::Character('x')
            Token::Character(' ')
            Token::Character('z')
            Token::TagEnd
            Token::Character(' ')
        );
        assert_none!(for lex and buf);
    }
884
    // `- >` inside a comment is not a closer; only `-->` ends the comment.
    #[test]
    fn tricky_comments() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<a><!-- C ->--></a>"
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CommentStart
            Token::Character(' ')
            Token::Character('C')
            Token::Character(' ')
            Token::Character('-')
            Token::Character('>')
            Token::CommentEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    // Inside the internal subset, `<!ELEMENT` is a markup declaration and
    // `>` within its quoted string does not end anything.
    #[test]
    fn doctype_with_internal_subset_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab[<!ELEMENT ba ">>>"> ]> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character('b')
            Token::Character('[')
            Token::MarkupDeclarationStart
            Token::Character('E')
            Token::Character('L')
            Token::Character('E')
            Token::Character('M')
            Token::Character('E')
            Token::Character('N')
            Token::Character('T')
            Token::Character(' ')
            Token::Character('b')
            Token::Character('a')
            Token::Character(' ')
            Token::DoubleQuote
            Token::Character('>')
            Token::Character('>')
            Token::Character('>')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character(' ')
            Token::Character(']')
            Token::TagEnd
            Token::Character(' ')
        );
        assert_none!(for lex and buf);
    }

    // Comments and PIs nested inside the doctype internal subset.
    #[test]
    fn doctype_internal_pi_comment() {
        let (mut lex, mut buf) = make_lex_and_buf(
            "<!DOCTYPE a [\n<!ELEMENT l ANY> <!-- <?non?>--> <?pi > ?> \n]>"
        );
        assert_oks!(for lex and buf ;
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character(' ')
            Token::Character('[')
            Token::Character('\n')
            Token::MarkupDeclarationStart
            Token::Character('E')
            Token::Character('L')
            Token::Character('E')
            Token::Character('M')
            Token::Character('E')
            Token::Character('N')
            Token::Character('T')
            Token::Character(' ')
            Token::Character('l')
            Token::Character(' ')
            Token::Character('A')
            Token::Character('N')
            Token::Character('Y')
            Token::TagEnd
            Token::Character(' ')
            Token::CommentStart
            Token::Character(' ')
            Token::Character('<')
            Token::Character('?')
            Token::Character('n')
            Token::Character('o')
            Token::Character('n')
            Token::Character('?')
            Token::Character('>')
            Token::CommentEnd
            Token::Character(' ')
            Token::ProcessingInstructionStart
            Token::Character('p')
            Token::Character('i')
            Token::Character(' ')
            Token::TagEnd
            Token::Character(' ')
            Token::ProcessingInstructionEnd
            Token::Character(' ')
            Token::Character('\n')
            Token::Character(']')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }
1001
    // At EOF, buffered potential-closer characters are emitted as literals.
    // NOTE(review): the last three checks are identical single-']' cases;
    // presumably earlier variants covered "]]" as well — confirm upstream.
    #[test]
    fn end_of_stream_handling_ok() {
        macro_rules! eof_check(
            ($data:expr ; $token:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_oks!(for lex and buf ; $token);
                assert_none!(for lex and buf);
            })
        );
        eof_check!("?" ; Token::Character('?'));
        eof_check!("/" ; Token::Character('/'));
        eof_check!("-" ; Token::Character('-'));
        eof_check!("]" ; Token::Character(']'));
        eof_check!("]" ; Token::Character(']'));
        eof_check!("]" ; Token::Character(']'));
    }

    // At EOF, incomplete markup constructs produce an error at the position
    // just past the consumed input.
    #[test]
    fn end_of_stream_handling_error() {
        macro_rules! eof_check(
            ($data:expr; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<" ; 0, 1);
        eof_check!("<!" ; 0, 2);
        eof_check!("<!-" ; 0, 3);
        eof_check!("<![" ; 0, 3);
        eof_check!("<![C" ; 0, 4);
        eof_check!("<![CD" ; 0, 5);
        eof_check!("<![CDA" ; 0, 6);
        eof_check!("<![CDAT" ; 0, 7);
        eof_check!("<![CDATA" ; 0, 8);
    }

    // With errors disabled, a bad `<!` prefix is replayed as characters.
    #[test]
    fn error_in_comment_or_cdata_prefix() {
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Character('<')
            Token::Character('!')
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }

    // Same fallback behavior for an incomplete comment opener `<!-`.
    #[test]
    fn error_in_comment_started() {
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!-' before '\t'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Character('<')
            Token::Character('!')
            Token::Character('-')
            Token::Character('\t')
        );
        assert_none!(for lex and buf);
    }

    // `--` not followed by `>` is an error inside a comment (state forced
    // directly), but ordinary character data outside of one.
    #[test]
    fn error_in_comment_two_dashes_not_at_end() {
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        lex.st = super::State::InsideComment;
        assert_err!(for lex and buf expect row 0; 0,
            "Unexpected token '--' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("--x");
        assert_oks!(for lex and buf ;
            Token::Character('-')
            Token::Character('-')
            Token::Character('x')
        );
    }
1089
    // Checks that `$data` errors at the given position, and that with errors
    // disabled the matched `$chunk` plus `$app` are replayed as characters.
    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            for c in $chunk.chars() {
                assert_eq!(Ok(Token::Character(c)), lex.next_token(&mut buf));
            }
            assert_oks!(for lex and buf ;
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );

    // Guards the memory footprint of the hot token/state types.
    #[test]
    fn token_size() {
        assert_eq!(4, std::mem::size_of::<Token>());
        assert_eq!(2, std::mem::size_of::<super::State>());
    }

    // Wrong character at each position of the `<![CDATA[` prefix.
    #[test]
    fn error_in_cdata_started() {
        check_case!("<![", '['; "<![[" ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C", '['; "<![C[" ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD", '['; "<![CD[" ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA", '['; "<![CDA[" ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT", '['; "<![CDAT[" ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }

    // Wrong character at each position of the `<!DOCTYPE` keyword.
    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D", 'a'; "<!Da" ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO", 'b'; "<!DOb" ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC", 'c'; "<!DOCc" ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT", 'd'; "<!DOCTd" ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY", 'e'; "<!DOCTYe" ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }

    // Regression test: CDATA content ending with ']' must not swallow the
    // final character before `]]>`.
    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<![CDATA[Foo [Bar]]]>"
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Character(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
1154}