xml_no_std/reader/lexer.rs
//! Contains a simple lexer for XML documents.
//!
//! This module is for internal use. Use the `xml_no_std::reader` module to do parsing.
extern crate alloc;

use alloc::collections::VecDeque;
use alloc::string::String;
use core::fmt;
use core::result;

use crate::common::{is_name_char, is_whitespace_char, is_xml10_char, is_xml11_char, Position, TextPosition};
use crate::reader::error::SyntaxError;
use crate::reader::{Error, ErrorKind};
use crate::util::{CharReader, Encoding};

use super::ParserConfig2;

/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
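///
/// For example, lexing the input `<a/>` yields, in order: `OpeningTagStart`,
/// `Character('a')`, `EmptyTagEnd`, and finally `Eof`.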
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub(crate) enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE…`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// Any non-special character, including whitespace.
    Character(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
    /// `<!` of markup declarations such as `ENTITY`
    MarkupDeclarationStart,
    /// End of file
    Eof,
}

impl fmt::Display for Token {
    #[cold]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            Token::Character(c) => c.fmt(f),
            other => match other {
                Token::OpeningTagStart => "<",
                Token::ProcessingInstructionStart => "<?",
                Token::DoctypeStart => "<!DOCTYPE",
                Token::ClosingTagStart => "</",
                Token::CommentStart => "<!--",
                Token::CDataStart => "<![CDATA[",
                Token::TagEnd => ">",
                Token::EmptyTagEnd => "/>",
                Token::ProcessingInstructionEnd => "?>",
                Token::CommentEnd => "-->",
                Token::CDataEnd => "]]>",
                Token::ReferenceStart => "&",
                Token::ReferenceEnd => ";",
                Token::EqualsSign => "=",
                Token::SingleQuote => "'",
                Token::DoubleQuote => "\"",
                Token::MarkupDeclarationStart => "<!",
                // `Character` is handled by the outer arm; `Eof` has no text form
                Token::Eof | Token::Character(_) => {
                    debug_assert!(false);
                    ""
                },
            }.fmt(f),
        }
    }
}

impl Token {
    pub fn as_static_str(self) -> Option<&'static str> {
        match self {
            Token::OpeningTagStart => Some("<"),
            Token::ProcessingInstructionStart => Some("<?"),
            Token::DoctypeStart => Some("<!DOCTYPE"),
            Token::ClosingTagStart => Some("</"),
            Token::CommentStart => Some("<!--"),
            Token::CDataStart => Some("<![CDATA["),
            Token::TagEnd => Some(">"),
            Token::EmptyTagEnd => Some("/>"),
            Token::ProcessingInstructionEnd => Some("?>"),
            Token::CommentEnd => Some("-->"),
            Token::CDataEnd => Some("]]>"),
            Token::ReferenceStart => Some("&"),
            Token::ReferenceEnd => Some(";"),
            Token::EqualsSign => Some("="),
            Token::SingleQuote => Some("'"),
            Token::DoubleQuote => Some("\""),
            _ => None,
        }
    }

    // using `target.push_str(&token.to_string())` is simply way too slow
    pub fn push_to_string(self, target: &mut String) {
        match self {
            Token::Character(c) => {
                debug_assert!(is_xml10_char(c) || is_xml11_char(c));
                target.push(c);
            },
            _ => if let Some(s) = self.as_static_str() {
                target.push_str(s);
            }
        }
    }
}
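
// A minimal sketch of how the helpers above compose (illustrative only):
//
//     let mut s = String::new();
//     for t in [Token::OpeningTagStart, Token::Character('a'), Token::TagEnd] {
//         t.push_to_string(&mut s);
//     }
//     assert_eq!(s, "<a>");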

#[derive(Copy, Clone)]
enum State {
    /// Default state
    Normal,
    /// Triggered on '<'
    TagStarted,
    /// Triggered on '<!'
    CommentOrCDataOrDoctypeStarted,
    /// Triggered on '<!-'
    CommentStarted,
    /// Triggered on '<!D' up to '<!DOCTYPE'
    DoctypeStarted(DoctypeStartedSubstate),
    /// Other items like `<!ELEMENT` in DTD
    InsideMarkupDeclaration,
    /// Triggered after `DoctypeStarted` to handle sub elements
    InsideDoctype,
    /// Triggered on '<![' up to '<![CDATA'
    CDataStarted(CDataStartedSubstate),
    /// Triggered on '?'
    ProcessingInstructionClosing,
    /// Triggered on '/'
    EmptyTagClosing,
    /// Triggered on '-' up to '--'
    CommentClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]' inside CDATA
    CDataClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]' outside CDATA
    InvalidCDataClosing(ClosingSubstate),
    /// After `<!--`
    InsideComment,
    /// After `<![CDATA[`
    InsideCdata,
    /// After `<?`
    InsideProcessingInstruction,
    /// `<!ENTITY "here">`
    InsideMarkupDeclarationQuotedString(QuoteStyle),
}
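
// For orientation, a typical transition chain (a sketch, not exhaustive):
// lexing `<!DOCTYPE` walks Normal -> TagStarted ->
// CommentOrCDataOrDoctypeStarted -> DoctypeStarted(D..DOCTYP) ->
// InsideDoctype, and the closing `>` returns to Normal.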

#[derive(Copy, Clone, Eq, PartialEq)]
enum QuoteStyle {
    Single, Double
}

#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}

#[derive(Copy, Clone)]
#[allow(clippy::upper_case_acronyms)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}

#[derive(Copy, Clone)]
#[allow(clippy::upper_case_acronyms)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}

/// `Result` represents a lexing result. It is either a token (or `None`) or an error.
pub(crate) type Result<T = Option<Token>, E = Error> = result::Result<T, E>;

/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
                $st => match $c {
                    $stc => Ok($_self.move_to($is($next_st))),
                    _ => $_self.handle_error($chunk, $c)
                },
            )+
            $end_st => match $c {
                $end_c => $e,
                _ => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
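
// For illustration, the `doctype_started` invocation of this macro below
// expands to (roughly) a nested match of the form:
//
//     match s {
//         D => match c {
//             'O' => Ok(self.move_to(State::DoctypeStarted(DO))),
//             _ => self.handle_error("<!D", c),
//         },
//         // ... one arm per substate ...
//         DOCTYP => match c {
//             'E' => Ok(Some(self.move_to_and_reset_normal(State::InsideDoctype, Token::DoctypeStart))),
//             _ => self.handle_error("<!DOCTYP", c),
//         },
//     }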

/// `Lexer` is a lexer for XML documents, which implements a pull API.
///
/// The main method is `next_token`, which accepts an iterator over input bytes
/// and tries to read the next lexeme from it.
///
/// When the test-only `skip_errors` flag is set (via `disable_errors`), invalid
/// lexemes are replayed as plain `Character` tokens. When it is not set (the
/// default), errors are reported as `Err` objects carrying a position and message.
pub(crate) struct Lexer {
    st: State,
    reader: CharReader,
    pos: TextPosition,
    head_pos: TextPosition,
    char_queue: VecDeque<char>,
    /// Default state to go back to after a tag end (may be `InsideDoctype`)
    normal_state: State,
    inside_token: bool,
    eof_handled: bool,
    reparse_depth: u8,
    #[cfg(test)]
    skip_errors: bool,

    max_entity_expansion_depth: u8,
    max_entity_expansion_length: usize,
}

impl Position for Lexer {
    /// Returns the position of the last token produced by the lexer.
    #[inline]
    fn position(&self) -> TextPosition { self.pos }
}

impl Lexer {
    /// Returns a new lexer with default state.
    pub(crate) fn new(config: &ParserConfig2) -> Lexer {
        Lexer {
            reader: CharReader::new(),
            pos: TextPosition::new(),
            head_pos: TextPosition::new(),
            char_queue: VecDeque::with_capacity(4), // TODO: check size
            st: State::Normal,
            normal_state: State::Normal,
            inside_token: false,
            eof_handled: false,
            reparse_depth: 0,
            #[cfg(test)]
            skip_errors: false,

            max_entity_expansion_depth: config.max_entity_expansion_depth,
            max_entity_expansion_length: config.max_entity_expansion_length,
        }
    }

    pub(crate) fn encoding(&mut self) -> Encoding {
        self.reader.encoding
    }

    pub(crate) fn set_encoding(&mut self, encoding: Encoding) {
        self.reader.encoding = encoding;
    }

    /// Disables error handling, so that `next_token` replays an invalid
    /// lexeme's characters as plain `Character` tokens instead of failing.
    #[cfg(test)] fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Resets the EOF-handled flag of the lexer.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }

    /// Tries to read the next token from the buffer.
    ///
    /// It is possible to pass a different byte source each time this method is
    /// called, but the resulting behavior is undefined in that case.
    ///
    /// Return value:
    /// * `Err(reason) where reason: reader::Error` - when an error occurs;
    /// * `Ok(Token::Eof)` - when the end of the stream is reached;
    /// * `Ok(token) where token: Token` - when a complete token has been read from the stream.
    pub fn next_token<'a, S: Iterator<Item = &'a u8>>(&mut self, b: &mut S) -> Result<Token> {
        // Already reached end of buffer
        if self.eof_handled {
            return Ok(Token::Eof);
        }

        if !self.inside_token {
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // Check if we have saved a char or two for ourselves
        while let Some(c) = self.char_queue.pop_front() {
            if let Some(t) = self.dispatch_char(c)? {
                self.inside_token = false;
                return Ok(t);
            }
        }
        // if char_queue is empty, all circular reparsing is done
        self.reparse_depth = 0;
        loop {
            let c = match self.reader.next_char_from(b)? {
                Some(c) => c,  // got next char
                None => break, // nothing left to read
            };

            if c == '\n' {
                self.head_pos.new_line();
            } else {
                self.head_pos.advance(1);
            }

            if let Some(t) = self.dispatch_char(c)? {
                self.inside_token = false;
                return Ok(t);
            }
        }

        self.end_of_stream()
    }
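
    // A minimal usage sketch (illustrative; it mirrors the commented-out tests
    // at the bottom of this file, which assume `ParserConfig2::default()`):
    //
    //     let mut lexer = Lexer::new(&ParserConfig2::default());
    //     let mut bytes = b"<a/>".iter();
    //     loop {
    //         match lexer.next_token(&mut bytes) {
    //             Ok(Token::Eof) => break,
    //             Ok(token) => { /* handle token */ },
    //             Err(_e) => { /* report the error */ break; },
    //         }
    //     }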

    #[inline(never)]
    fn end_of_stream(&mut self) -> Result<Token> {
        // Handle end of stream
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            State::InsideCdata | State::CDataClosing(_) => Err(self.error(SyntaxError::UnclosedCdata)),
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_) | State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) |
            State::InsideComment | State::InsideMarkupDeclaration |
            State::InsideProcessingInstruction | State::ProcessingInstructionClosing |
            State::InsideDoctype | State::InsideMarkupDeclarationQuotedString(_) =>
                Err(self.error(SyntaxError::UnexpectedEof)),
            State::EmptyTagClosing =>
                Ok(Token::Character('/')),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Token::Character('-')),
            State::InvalidCDataClosing(ClosingSubstate::First) =>
                Ok(Token::Character(']')),
            State::InvalidCDataClosing(ClosingSubstate::Second) => {
                // emit the first ']' now and replay the second one, so the
                // next call can still produce it before reporting `Eof`
                self.eof_handled = false;
                Ok(self.move_to_with_unread(State::Normal, &[']'], Token::Character(']')))
            },
            State::Normal => Ok(Token::Eof),
        }
    }

    #[cold]
    #[allow(clippy::needless_pass_by_value)]
    fn error(&self, e: SyntaxError) -> Error {
        Error {
            pos: self.position(),
            kind: ErrorKind::Syntax(e.to_cow()),
        }
    }

    #[inline(never)]
    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal => Ok(self.normal(c)),
            State::TagStarted => self.tag_opened(c),
            State::EmptyTagClosing => Ok(Some(self.empty_element_closing(c))),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::InsideCdata => Ok(self.inside_cdata(c)),
            State::CDataStarted(s) => self.cdata_started(c, s),
            State::InsideComment => Ok(self.inside_comment_state(c)),
            State::CommentStarted => self.comment_started(c),
            State::InsideProcessingInstruction => Ok(self.inside_processing_instruction(c)),
            State::ProcessingInstructionClosing => Ok(Some(self.processing_instruction_closing(c))),
            State::CommentClosing(s) => self.comment_closing(c, s),
            State::CDataClosing(s) => Ok(self.cdata_closing(c, s)),
            State::InsideDoctype => Ok(self.inside_doctype(c)),
            State::DoctypeStarted(s) => self.doctype_started(c, s),
            State::InvalidCDataClosing(s) => Ok(self.invalid_cdata_closing(c, s)),
            State::InsideMarkupDeclaration => self.markup_declaration(c),
            State::InsideMarkupDeclarationQuotedString(q) => Ok(Some(self.markup_declaration_string(c, q))),
        }
    }

    #[inline]
    fn move_to(&mut self, st: State) -> Option<Token> {
        self.st = st;
        None
    }

    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Token {
        self.st = st;
        token
    }

    #[inline]
    fn move_to_and_reset_normal(&mut self, st: State, token: Token) -> Token {
        self.normal_state = st;
        self.st = st;
        token
    }
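
    /// Moves to `st` and returns `token`, queueing `cs` to be re-read before
    /// the remaining input. E.g. after seeing `<a`, the lexer emits
    /// `OpeningTagStart` and unreads `'a'`, so the next call yields
    /// `Character('a')`.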
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Token {
        for c in cs.iter().rev().copied() {
            self.char_queue.push_front(c);
        }
        self.move_to_with(st, token)
    }
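
    /// Feeds expanded entity `markup` back into the lexer so that it is
    /// tokenized before the remaining input. The depth and length limits
    /// bound recursive entity expansion (e.g. "billion laughs" inputs).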
    pub(crate) fn reparse(&mut self, markup: &str) -> Result<()> {
        if markup.is_empty() {
            return Ok(());
        }

        self.reparse_depth += 1;
        if self.reparse_depth > self.max_entity_expansion_depth || self.char_queue.len() > self.max_entity_expansion_length {
            return Err(self.error(SyntaxError::EntityTooBig));
        }

        self.eof_handled = false;
        self.char_queue.reserve(markup.len());
        for c in markup.chars().rev() {
            self.char_queue.push_front(c);
        }

        Ok(())
    }
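
    /// Reports an unexpected character `c` after a partially matched literal
    /// `chunk` (e.g. `"<!-"` followed by a tab). In tests with `skip_errors`
    /// set, the chunk and `c` are replayed as plain `Character` tokens
    /// instead of failing.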
    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        debug_assert!(!chunk.is_empty());

        #[cfg(test)]
        if self.skip_errors {
            let mut chars = chunk.chars();
            let first = chars.next().unwrap_or('\0');
            self.char_queue.extend(chars);
            self.char_queue.push_back(c);
            return Ok(Some(self.move_to_with(State::Normal, Token::Character(first))));
        }
        Err(self.error(SyntaxError::UnexpectedTokenBefore(chunk, c)))
    }

    /// Encountered a character in the default state
    fn normal(&mut self, c: char) -> Option<Token> {
        match c {
            '<' => self.move_to(State::TagStarted),
            '>' => Some(Token::TagEnd),
            '/' => self.move_to(State::EmptyTagClosing),
            '=' => Some(Token::EqualsSign),
            '"' => Some(Token::DoubleQuote),
            '\'' => Some(Token::SingleQuote),
            ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::First)),
            '&' => Some(Token::ReferenceStart),
            ';' => Some(Token::ReferenceEnd),
            _ => Some(Token::Character(c)),
        }
    }

    fn inside_cdata(&mut self, c: char) -> Option<Token> {
        match c {
            ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)),
            _ => Some(Token::Character(c)),
        }
    }

    fn inside_processing_instruction(&mut self, c: char) -> Option<Token> {
        // These tokens are used by the `<?xml?>` parser
        match c {
            '?' => self.move_to(State::ProcessingInstructionClosing),
            '<' => Some(Token::OpeningTagStart),
            '>' => Some(Token::TagEnd),
            '/' => Some(Token::ClosingTagStart),
            '=' => Some(Token::EqualsSign),
            '"' => Some(Token::DoubleQuote),
            '\'' => Some(Token::SingleQuote),
            '&' => Some(Token::ReferenceStart),
            ';' => Some(Token::ReferenceEnd),
            _ => Some(Token::Character(c)),
        }
    }

    fn inside_comment_state(&mut self, c: char) -> Option<Token> {
        match c {
            '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)),
            _ => Some(Token::Character(c)),
        }
    }

    /// Encountered '<'
    fn tag_opened(&mut self, c: char) -> Result {
        match c {
            '?' => Ok(Some(self.move_to_with(State::InsideProcessingInstruction, Token::ProcessingInstructionStart))),
            '/' => Ok(Some(self.move_to_with(self.normal_state, Token::ClosingTagStart))),
            '!' => Ok(self.move_to(State::CommentOrCDataOrDoctypeStarted)),
            _ if is_whitespace_char(c) || is_name_char(c) =>
                Ok(Some(self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart))),
            _ => self.handle_error("<", c),
        }
    }

    /// Encountered '<!'
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => Ok(self.move_to(State::CommentStarted)),
            '[' => Ok(self.move_to(State::CDataStarted(CDataStartedSubstate::E))),
            'D' => Ok(self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D))),
            'E' | 'A' | 'N' if matches!(self.normal_state, State::InsideDoctype) => {
                Ok(Some(self.move_to_with_unread(State::InsideMarkupDeclaration, &[c], Token::MarkupDeclarationStart)))
            },
            _ => self.handle_error("<!", c),
        }
    }

    /// Encountered '<!-'
    fn comment_started(&mut self, c: char) -> Result {
        match c {
            '-' => Ok(Some(self.move_to_with(State::InsideComment, Token::CommentStart))),
            _ => self.handle_error("<!-", c),
        }
    }

    /// Encountered '<!['
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{C, CD, CDA, CDAT, CDATA, E};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; Ok(Some(self.move_to_with(State::InsideCdata, Token::CDataStart)))
        )
    }

    /// Encountered '<!…' that isn't DOCTYPE or CDATA
    fn markup_declaration(&mut self, c: char) -> Result {
        match c {
            '<' => self.handle_error("<!", c),
            '>' => Ok(Some(self.move_to_with(self.normal_state, Token::TagEnd))),
            '&' => Ok(Some(Token::ReferenceStart)),
            ';' => Ok(Some(Token::ReferenceEnd)),
            '"' => Ok(Some(self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Double), Token::DoubleQuote))),
            '\'' => Ok(Some(self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Single), Token::SingleQuote))),
            _ => Ok(Some(Token::Character(c))),
        }
    }

    fn markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Token {
        match c {
            '"' if q == QuoteStyle::Double => self.move_to_with(State::InsideMarkupDeclaration, Token::DoubleQuote),
            '\'' if q == QuoteStyle::Single => self.move_to_with(State::InsideMarkupDeclaration, Token::SingleQuote),
            _ => Token::Character(c),
        }
    }

    /// Encountered '<!D'
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; Ok(Some(self.move_to_and_reset_normal(State::InsideDoctype, Token::DoctypeStart)))
        )
    }

    /// State used while awaiting the closing bracket for the `<!DOCTYPE` tag
    fn inside_doctype(&mut self, c: char) -> Option<Token> {
        match c {
            '>' => Some(self.move_to_and_reset_normal(State::Normal, Token::TagEnd)),
            '<' => self.move_to(State::TagStarted),
            '&' => Some(Token::ReferenceStart),
            ';' => Some(Token::ReferenceEnd),
            '"' => Some(Token::DoubleQuote),
            '\'' => Some(Token::SingleQuote),
            _ => Some(Token::Character(c)),
        }
    }

    /// Encountered '?'
    fn processing_instruction_closing(&mut self, c: char) -> Token {
        match c {
            '>' => self.move_to_with(self.normal_state, Token::ProcessingInstructionEnd),
            _ => self.move_to_with_unread(State::InsideProcessingInstruction, &[c], Token::Character('?')),
        }
    }

    /// Encountered '/'
    fn empty_element_closing(&mut self, c: char) -> Token {
        match c {
            '>' => self.move_to_with(self.normal_state, Token::EmptyTagEnd),
            _ => self.move_to_with_unread(self.normal_state, &[c], Token::Character('/')),
        }
    }

    /// Encountered '-'
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => Ok(self.move_to(State::CommentClosing(ClosingSubstate::Second))),
                _ => Ok(Some(self.move_to_with_unread(State::InsideComment, &[c], Token::Character('-')))),
            },
            ClosingSubstate::Second => match c {
                '>' => Ok(Some(self.move_to_with(self.normal_state, Token::CommentEnd))),
                // a double dash not followed by a greater-than sign is a hard error inside a comment
                _ => self.handle_error("--", c),
            },
        }
    }

    /// Encountered ']' inside CDATA
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Option<Token> {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                _ => Some(self.move_to_with_unread(State::InsideCdata, &[c], Token::Character(']'))),
            },
            ClosingSubstate::Second => match c {
                '>' => Some(self.move_to_with(State::Normal, Token::CDataEnd)),
                _ => Some(self.move_to_with_unread(State::InsideCdata, &[']', c], Token::Character(']'))),
            },
        }
    }

    /// Encountered ']' outside CDATA
    fn invalid_cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Option<Token> {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::Second)),
                _ => Some(self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))),
            },
            ClosingSubstate::Second => match c {
                // `]]>` is not allowed in character data; it is still emitted
                // as `CDataEnd` here, leaving rejection to the parser
                '>' => Some(self.move_to_with(self.normal_state, Token::CDataEnd)),
                _ => Some(self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))),
            },
        }
    }
}

#[cfg(test)]
mod tests {
    // use crate::{common::Position, reader::ParserConfig2};
    // use std::io::{BufReader, Cursor};

    // use super::{Lexer, Token};

    // macro_rules! assert_oks(
    //     (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
    //         $(
    //             assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
    //         )+
    //     })
    // );

    // macro_rules! assert_err(
    //     (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
    //         let err = $lex.next_token(&mut $buf);
    //         assert!(err.is_err());
    //         let err = err.unwrap_err();
    //         assert_eq!($r as u64, err.position().row);
    //         assert_eq!($c as u64, err.position().column);
    //     })
    // );

    // macro_rules! assert_none(
    //     (for $lex:ident and $buf:ident) => (
    //         assert_eq!(Ok(None), $lex.next_token(&mut $buf))
    //     )
    // );

    // fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
    //     (Lexer::new(&ParserConfig2::default()), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    // }

    // #[test]
    // fn tricky_pi() {
    //     let (mut lex, mut buf) = make_lex_and_buf(r"<?x<!-- &??><x>");

    //     assert_oks!(for lex and buf ;
    //         Token::ProcessingInstructionStart
    //         Token::Character('x')
    //         Token::OpeningTagStart // processing of <?xml?> relies on the extra tokens
    //         Token::Character('!')
    //         Token::Character('-')
    //         Token::Character('-')
    //         Token::Character(' ')
    //         Token::ReferenceStart
    //         Token::Character('?')
    //         Token::ProcessingInstructionEnd
    //         Token::OpeningTagStart
    //         Token::Character('x')
    //         Token::TagEnd
    //     );
    //     assert_none!(for lex and buf);
    // }

    // #[test]
    // fn reparser() {
    //     let (mut lex, mut buf) = make_lex_and_buf(r"&a;");

    //     assert_oks!(for lex and buf ;
    //         Token::ReferenceStart
    //         Token::Character('a')
    //         Token::ReferenceEnd
    //     );
    //     lex.reparse("<hi/>").unwrap();
    //     assert_oks!(for lex and buf ;
    //         Token::OpeningTagStart
    //         Token::Character('h')
    //         Token::Character('i')
    //         Token::EmptyTagEnd
    //     );
    //     assert_none!(for lex and buf);
    // }

    // #[test]
    // fn simple_lexer_test() {
    //     let (mut lex, mut buf) = make_lex_and_buf(
    //         r#"<a p='q'> x<b z="y">d	</b></a><p/> <?nm ?> <!-- a c --> &nbsp;"#
    //     );

    //     assert_oks!(for lex and buf ;
    //         Token::OpeningTagStart
    //         Token::Character('a')
    //         Token::Character(' ')
    //         Token::Character('p')
    //         Token::EqualsSign
    //         Token::SingleQuote
    //         Token::Character('q')
    //         Token::SingleQuote
    //         Token::TagEnd
    //         Token::Character(' ')
    //         Token::Character('x')
    //         Token::OpeningTagStart
    //         Token::Character('b')
    //         Token::Character(' ')
    //         Token::Character('z')
    //         Token::EqualsSign
    //         Token::DoubleQuote
    //         Token::Character('y')
    //         Token::DoubleQuote
    //         Token::TagEnd
    //         Token::Character('d')
    //         Token::Character('\t')
    //         Token::ClosingTagStart
    //         Token::Character('b')
    //         Token::TagEnd
    //         Token::ClosingTagStart
    //         Token::Character('a')
    //         Token::TagEnd
    //         Token::OpeningTagStart
    //         Token::Character('p')
    //         Token::EmptyTagEnd
    //         Token::Character(' ')
    //         Token::ProcessingInstructionStart
    //         Token::Character('n')
    //         Token::Character('m')
    //         Token::Character(' ')
    //         Token::ProcessingInstructionEnd
    //         Token::Character(' ')
    //         Token::CommentStart
    //         Token::Character(' ')
    //         Token::Character('a')
    //         Token::Character(' ')
    //         Token::Character('c')
    //         Token::Character(' ')
    //         Token::CommentEnd
    //         Token::Character(' ')
    //         Token::ReferenceStart
    //         Token::Character('n')
    //         Token::Character('b')
    //         Token::Character('s')
    //         Token::Character('p')
    //         Token::ReferenceEnd
    //     );
    //     assert_none!(for lex and buf);
    // }

    // #[test]
    // fn special_chars_test() {
    //     let (mut lex, mut buf) = make_lex_and_buf(
    //         r"?x!+ // -| ]z]]"
    //     );

    //     assert_oks!(for lex and buf ;
    //         Token::Character('?')
    //         Token::Character('x')
    //         Token::Character('!')
    //         Token::Character('+')
    //         Token::Character(' ')
    //         Token::Character('/')
    //         Token::Character('/')
    //         Token::Character(' ')
    //         Token::Character('-')
    //         Token::Character('|')
    //         Token::Character(' ')
    //         Token::Character(']')
    //         Token::Character('z')
    //         Token::Character(']')
    //         Token::Character(']')
    //     );
    //     assert_none!(for lex and buf);
    // }

    // #[test]
    // fn cdata_test() {
    //     let (mut lex, mut buf) = make_lex_and_buf(
    //         r"<a><![CDATA[x y ?]]> </a>"
    //     );

    //     assert_oks!(for lex and buf ;
    //         Token::OpeningTagStart
    //         Token::Character('a')
    //         Token::TagEnd
    //         Token::CDataStart
    //         Token::Character('x')
    //         Token::Character(' ')
    //         Token::Character('y')
    //         Token::Character(' ')
    //         Token::Character('?')
    //         Token::CDataEnd
    //         Token::Character(' ')
    //         Token::ClosingTagStart
    //         Token::Character('a')
    //         Token::TagEnd
    //     );
    //     assert_none!(for lex and buf);
    // }

    // #[test]
    // fn cdata_closers_test() {
    //     let (mut lex, mut buf) = make_lex_and_buf(
    //         r"<![CDATA[] > ]> ]]><!---->]]<a>"
    //     );

    //     assert_oks!(for lex and buf ;
    //         Token::CDataStart
    //         Token::Character(']')
    //         Token::Character(' ')
    //         Token::Character('>')
    //         Token::Character(' ')
    //         Token::Character(']')
    //         Token::Character('>')
    //         Token::Character(' ')
    //         Token::CDataEnd
    //         Token::CommentStart
    //         Token::CommentEnd
    //         Token::Character(']')
    //         Token::Character(']')
    //         Token::OpeningTagStart
    //         Token::Character('a')
    //         Token::TagEnd
    //     );
    //     assert_none!(for lex and buf);
    // }

    // #[test]
    // fn doctype_test() {
    //     let (mut lex, mut buf) = make_lex_and_buf(
    //         r"<a><!DOCTYPE ab xx z> "
    //     );
    //     assert_oks!(for lex and buf ;
    //         Token::OpeningTagStart
    //         Token::Character('a')
    //         Token::TagEnd
    //         Token::DoctypeStart
    //         Token::Character(' ')
    //         Token::Character('a')
    //         Token::Character('b')
    //         Token::Character(' ')
    //         Token::Character('x')
    //         Token::Character('x')
    //         Token::Character(' ')
    //         Token::Character('z')
    //         Token::TagEnd
    //         Token::Character(' ')
    //     );
    //     assert_none!(for lex and buf);
    // }

    // #[test]
    // fn tricky_comments() {
    //     let (mut lex, mut buf) = make_lex_and_buf(
    //         r"<a><!-- C ->--></a>"
    //     );
    //     assert_oks!(for lex and buf ;
    //         Token::OpeningTagStart
    //         Token::Character('a')
    //         Token::TagEnd
    //         Token::CommentStart
    //         Token::Character(' ')
    //         Token::Character('C')
    //         Token::Character(' ')
    //         Token::Character('-')
    //         Token::Character('>')
    //         Token::CommentEnd
    //         Token::ClosingTagStart
    //         Token::Character('a')
    //         Token::TagEnd
    //     );
    //     assert_none!(for lex and buf);
    // }

    // #[test]
    // fn doctype_with_internal_subset_test() {
    //     let (mut lex, mut buf) = make_lex_and_buf(
    //         r#"<a><!DOCTYPE ab[<!ELEMENT ba ">>>"> ]> "#
    //     );
    //     assert_oks!(for lex and buf ;
    //         Token::OpeningTagStart
    //         Token::Character('a')
    //         Token::TagEnd
    //         Token::DoctypeStart
    //         Token::Character(' ')
    //         Token::Character('a')
    //         Token::Character('b')
    //         Token::Character('[')
    //         Token::MarkupDeclarationStart
    //         Token::Character('E')
    //         Token::Character('L')
    //         Token::Character('E')
    //         Token::Character('M')
    //         Token::Character('E')
    //         Token::Character('N')
    //         Token::Character('T')
    //         Token::Character(' ')
    //         Token::Character('b')
    //         Token::Character('a')
    //         Token::Character(' ')
    //         Token::DoubleQuote
    //         Token::Character('>')
    //         Token::Character('>')
    //         Token::Character('>')
    //         Token::DoubleQuote
    //         Token::TagEnd
    //         Token::Character(' ')
    //         Token::Character(']')
    //         Token::TagEnd
    //         Token::Character(' ')
    //     );
    //     assert_none!(for lex and buf);
    // }

    // #[test]
    // fn doctype_internal_pi_comment() {
    //     let (mut lex, mut buf) = make_lex_and_buf(
    //         "<!DOCTYPE a [\n<!ELEMENT l ANY> <!-- <?non?>--> <?pi > ?> \n]>"
    //     );
    //     assert_oks!(for lex and buf ;
    //         Token::DoctypeStart
    //         Token::Character(' ')
    //         Token::Character('a')
    //         Token::Character(' ')
    //         Token::Character('[')
    //         Token::Character('\n')
    //         Token::MarkupDeclarationStart
    //         Token::Character('E')
    //         Token::Character('L')
    //         Token::Character('E')
    //         Token::Character('M')
    //         Token::Character('E')
    //         Token::Character('N')
    //         Token::Character('T')
    //         Token::Character(' ')
    //         Token::Character('l')
    //         Token::Character(' ')
    //         Token::Character('A')
    //         Token::Character('N')
    //         Token::Character('Y')
    //         Token::TagEnd
    //         Token::Character(' ')
    //         Token::CommentStart
    //         Token::Character(' ')
    //         Token::Character('<')
    //         Token::Character('?')
    //         Token::Character('n')
    //         Token::Character('o')
    //         Token::Character('n')
    //         Token::Character('?')
    //         Token::Character('>')
    //         Token::CommentEnd
    //         Token::Character(' ')
    //         Token::ProcessingInstructionStart
    //         Token::Character('p')
    //         Token::Character('i')
    //         Token::Character(' ')
    //         Token::TagEnd // not really
    //         Token::Character(' ')
    //         Token::ProcessingInstructionEnd
    //         Token::Character(' ')
    //         Token::Character('\n')
    //         Token::Character(']')
    //         Token::TagEnd // DTD
    //     );
    //     assert_none!(for lex and buf);
    // }

    // #[test]
    // fn end_of_stream_handling_ok() {
    //     macro_rules! eof_check(
    //         ($data:expr ; $token:expr) => ({
    //             let (mut lex, mut buf) = make_lex_and_buf($data);
    //             assert_oks!(for lex and buf ; $token);
    //             assert_none!(for lex and buf);
    //         })
    //     );
    //     eof_check!("?" ; Token::Character('?'));
    //     eof_check!("/" ; Token::Character('/'));
    //     eof_check!("-" ; Token::Character('-'));
    //     eof_check!("]" ; Token::Character(']'));
    //     eof_check!("]" ; Token::Character(']'));
    //     eof_check!("]" ; Token::Character(']'));
    // }

    // #[test]
    // fn end_of_stream_handling_error() {
    //     macro_rules! eof_check(
    //         ($data:expr; $r:expr, $c:expr) => ({
    //             let (mut lex, mut buf) = make_lex_and_buf($data);
    //             assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
    //             assert_none!(for lex and buf);
    //         })
    //     );
    //     eof_check!("<" ; 0, 1);
    //     eof_check!("<!" ; 0, 2);
    //     eof_check!("<!-" ; 0, 3);
    //     eof_check!("<![" ; 0, 3);
    //     eof_check!("<![C" ; 0, 4);
    //     eof_check!("<![CD" ; 0, 5);
    //     eof_check!("<![CDA" ; 0, 6);
    //     eof_check!("<![CDAT" ; 0, 7);
    //     eof_check!("<![CDATA" ; 0, 8);
    // }

    // #[test]
    // fn error_in_comment_or_cdata_prefix() {
    //     let (mut lex, mut buf) = make_lex_and_buf("<!x");
    //     assert_err!(for lex and buf expect row 0 ; 0,
    //         "Unexpected token '<!' before 'x'"
    //     );

    //     let (mut lex, mut buf) = make_lex_and_buf("<!x");
    //     lex.disable_errors();
    //     assert_oks!(for lex and buf ;
    //         Token::Character('<')
    //         Token::Character('!')
    //         Token::Character('x')
    //     );
    //     assert_none!(for lex and buf);
    // }

    // #[test]
    // fn error_in_comment_started() {
    //     let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
    //     assert_err!(for lex and buf expect row 0 ; 0,
    //         "Unexpected token '<!-' before '\t'"
    //     );

    //     let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
    //     lex.disable_errors();
    //     assert_oks!(for lex and buf ;
    //         Token::Character('<')
    //         Token::Character('!')
    //         Token::Character('-')
    //         Token::Character('\t')
    //     );
    //     assert_none!(for lex and buf);
    // }

    // #[test]
    // fn error_in_comment_two_dashes_not_at_end() {
    //     let (mut lex, mut buf) = make_lex_and_buf("--x");
    //     lex.st = super::State::InsideComment;
    //     assert_err!(for lex and buf expect row 0; 0,
    //         "Unexpected token '--' before 'x'"
    //     );

    //     let (mut lex, mut buf) = make_lex_and_buf("--x");
    //     assert_oks!(for lex and buf ;
    //         Token::Character('-')
    //         Token::Character('-')
    //         Token::Character('x')
    //     );
    // }

    // macro_rules! check_case(
    //     ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
    //         let (mut lex, mut buf) = make_lex_and_buf($data);
    //         assert_err!(for lex and buf expect row $r ; $c, $s);

    //         let (mut lex, mut buf) = make_lex_and_buf($data);
    //         lex.disable_errors();
    //         for c in $chunk.chars() {
    //             assert_eq!(Ok(Some(Token::Character(c))), lex.next_token(&mut buf));
    //         }
    //         assert_oks!(for lex and buf ;
    //             Token::Character($app)
    //         );
    //         assert_none!(for lex and buf);
    //     })
    // );

    // #[test]
    // fn token_size() {
    //     assert_eq!(4, std::mem::size_of::<Token>());
    //     assert_eq!(2, std::mem::size_of::<super::State>());
    // }

    // #[test]
    // fn error_in_cdata_started() {
    //     check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
    //     check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
    //     check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
    //     check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
    //     check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
    //     check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    // }

    // #[test]
    // fn error_in_doctype_started() {
    //     check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
    //     check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
    //     check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
    //     check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
    //     check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
    //     check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    // }

    // #[test]
    // fn issue_98_cdata_ending_with_right_bracket() {
    //     let (mut lex, mut buf) = make_lex_and_buf(
    //         r"<![CDATA[Foo [Bar]]]>"
    //     );

    //     assert_oks!(for lex and buf ;
    //         Token::CDataStart
    //         Token::Character('F')
    //         Token::Character('o')
    //         Token::Character('o')
    //         Token::Character(' ')
    //         Token::Character('[')
    //         Token::Character('B')
    //         Token::Character('a')
    //         Token::Character('r')
    //         Token::Character(']')
    //         Token::CDataEnd
    //     );
    //     assert_none!(for lex and buf);
    // }
}
1167}