1extern crate alloc;
3
4use alloc::string::{String, ToString};
5use alloc::vec::Vec;
6
7use crate::attribute::OwnedAttribute;
8use crate::common::{is_xml10_char, is_xml11_char, is_xml11_char_not_restricted, is_name_char, is_name_start_char, is_whitespace_char};
9use crate::common::{Position, TextPosition, XmlVersion};
10use crate::name::OwnedName;
11use crate::namespace::NamespaceStack;
12use crate::reader::config::ParserConfig2;
13use crate::reader::error::SyntaxError;
14use crate::reader::events::XmlEvent;
15use crate::reader::lexer::{Lexer, Token};
16use super::{Error, ErrorKind};
17
18use alloc::collections::{BTreeMap, BTreeSet};
19
// Generates one "take" method on `MarkupData` per listed entry.
//
// Each entry has the form `field -> method, Type, default_expr`; the generated
// method moves the current value out of `field` and leaves `default_expr` there.
macro_rules! gen_takes {
    ($($slot:ident -> $taker:ident, $ty:ty, $empty:expr);+) => {
        $(
            impl MarkupData {
                #[inline]
                #[allow(clippy::mem_replace_option_with_none)]
                #[allow(clippy::mem_replace_with_default)]
                fn $taker(&mut self) -> $ty {
                    let slot = &mut self.$slot;
                    core::mem::replace(slot, $empty)
                }
            }
        )+
    };
}
34
// Instantiates the `take_*` methods on `MarkupData`.
// Each entry reads: field -> method name, field type, value left behind in the field.
gen_takes!(
    name -> take_name, String, String::new();
    ref_data -> take_ref_data, String, String::new();

    encoding -> take_encoding, Option<String>, None;

    element_name -> take_element_name, Option<OwnedName>, None;

    attr_name -> take_attr_name, Option<OwnedName>, None;
    attributes -> take_attributes, BTreeSet<OwnedAttribute>, BTreeSet::new()
);
46
// Submodules named after parser states.
// NOTE(review): presumably each one supplies the `PullParser` handler method that
// `dispatch_token` calls for the corresponding `State` — confirm in the submodules.
mod inside_cdata;
mod inside_closing_tag_name;
mod inside_comment;
mod inside_declaration;
mod inside_doctype;
mod inside_opening_tag;
mod inside_processing_instruction;
mod inside_reference;
mod outside_tag;
56
/// XML version reported in the `StartDocument` event synthesized by `set_encountered`.
static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10;
/// `standalone` value reported in the `StartDocument` event synthesized by `set_encountered`.
static DEFAULT_STANDALONE: Option<bool> = None;

/// Stack of the names of elements that are currently open.
type ElementStack = Vec<OwnedName>;
/// Outcome of one parsing step: an `XmlEvent` or a reader error.
pub type Result = super::Result<XmlEvent>;
62
/// Pull-based XML parser.
///
/// Each call to `next` pulls tokens from the lexer and feeds them to the handler
/// for the current state until an event or an error can be returned.
pub(crate) struct PullParser {
    /// Parser configuration (encoding override, length limits, end-of-stream policy).
    config: ParserConfig2,
    /// Token source; also supplies the positions used in errors and events.
    lexer: Lexer,
    /// Current state; selects the handler in `dispatch_token`.
    st: State,
    /// State saved before switching to `State::InsideReference`.
    /// NOTE(review): presumably restored by the reference handler — confirm there.
    state_after_reference: State,
    /// Accumulates the text of the name or attribute value currently being read.
    buf: String,

    /// NOTE(review): presumably entity name -> replacement text collected from the
    /// DOCTYPE; not used in this part of the file — confirm against the handlers.
    entities: BTreeMap<String, String>,

    /// Namespace scopes used to resolve element and attribute prefixes.
    nst: NamespaceStack,

    /// Scratch data for the markup item currently being parsed.
    data: MarkupData,
    /// Once set, every later call to `next` returns a clone of this result.
    final_result: Option<Result>,
    /// Event queued to be returned by the following call to `next`.
    next_event: Option<Result>,
    /// Names of the currently open elements; its length is the nesting depth.
    est: ElementStack,
    /// Queue of positions; the front entry is what `position()` reports.
    pos: Vec<TextPosition>,

    /// Furthest kind of document content seen so far.
    encountered: Encountered,
    /// NOTE(review): not read in this part of the file; presumably tracks whether
    /// the text seen so far is whitespace only — confirm in the text handlers.
    inside_whitespace: bool,
    /// Whether the qualified name being read already contains its `:` separator.
    read_prefix_separator: bool,
    /// When set, the next call to `next` pops one namespace scope before parsing on.
    pop_namespace: bool,
}
87
/// How far into the document the parser has progressed.
///
/// The declaration order is significant: the derived ordering is used with `<`
/// and `<=` (see `set_encountered` and `handle_eof`), so later variants compare
/// greater than earlier ones.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum Encountered {
    /// Nothing has been seen yet.
    None = 0,
    AnyChars,
    Declaration,
    Comment,
    Doctype,
    /// An element was seen; `handle_eof` requires this before a clean end of document.
    Element,
}
98
99impl PullParser {
100 #[inline]
102 pub fn new(config: impl Into<ParserConfig2>) -> PullParser {
103 let config = config.into();
104 Self::new_with_config2(config)
105 }
106
107 #[inline]
108 fn new_with_config2(config: ParserConfig2) -> PullParser {
109 let mut lexer = Lexer::new(&config);
110 if let Some(enc) = config.override_encoding {
111 lexer.set_encoding(enc);
112 }
113
114 let mut pos = Vec::with_capacity(16);
115 pos.push(TextPosition::new());
116
117 PullParser {
118 config,
119 lexer,
120 st: State::DocumentStart,
121 state_after_reference: State::OutsideTag,
122 buf: String::new(),
123 entities: BTreeMap::new(),
124 nst: NamespaceStack::default(),
125
126 data: MarkupData {
127 name: String::new(),
128 version: None,
129 encoding: None,
130 standalone: None,
131 ref_data: String::new(),
132 element_name: None,
133 quote: None,
134 attr_name: None,
135 attributes: BTreeSet::new(),
136 },
137 final_result: None,
138 next_event: None,
139 est: Vec::new(),
140 pos,
141
142 encountered: Encountered::None,
143 inside_whitespace: true,
144 read_prefix_separator: false,
145 pop_namespace: false,
146 }
147 }
148
149 pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.c.ignore_end_of_stream }
151
152 #[inline(never)]
153 fn set_encountered(&mut self, new_encounter: Encountered) -> Option<Result> {
154 if new_encounter <= self.encountered {
155 return None;
156 }
157 let prev_enc = self.encountered;
158 self.encountered = new_encounter;
159
160 if prev_enc == Encountered::None {
163 self.push_pos();
164 Some(Ok(XmlEvent::StartDocument {
165 version: DEFAULT_VERSION,
166 encoding: self.lexer.encoding().to_string(),
167 standalone: DEFAULT_STANDALONE,
168 }))
169 } else {
170 None
171 }
172 }
173}
174
175impl Position for PullParser {
176 #[inline]
178 fn position(&self) -> TextPosition {
179 self.pos.get(0).cloned().unwrap_or_else(TextPosition::new)
180 }
181}
182
/// Top-level parser state; `PullParser::dispatch_token` routes each token to the
/// handler for the current variant. Variants with a payload carry a finer-grained
/// sub-state that is passed on to that handler.
#[derive(Copy, Clone, PartialEq)]
pub enum State {
    OutsideTag,
    InsideOpeningTag(OpeningTagSubstate),
    InsideClosingTag(ClosingTagSubstate),
    InsideProcessingInstruction(ProcessingInstructionSubstate),
    InsideComment,
    InsideCData,
    InsideDeclaration(DeclarationSubstate),
    InsideDoctype(DoctypeSubstate),
    InsideReference,
    /// Initial state of a freshly constructed parser.
    DocumentStart,
}
196
/// Sub-states carried by `State::InsideDoctype`.
// NOTE(review): the individual variants are handled outside this part of the file;
// their meaning should be documented next to the doctype handler.
#[derive(Copy, Clone, PartialEq)]
pub enum DoctypeSubstate {
    Outside,
    String,
    InsideName,
    BeforeEntityName,
    EntityName,
    BeforeEntityValue,
    EntityValue,
    NumericReferenceStart,
    NumericReference,
    PEReferenceInValue,
    PEReferenceInDtd,
    PEReferenceDefinitionStart,
    PEReferenceDefinition,
    SkipDeclaration,
    Comment,
}
217
/// Sub-states carried by `State::InsideOpeningTag`.
#[derive(Copy, Clone, PartialEq)]
pub enum OpeningTagSubstate {
    InsideName,

    InsideTag,

    InsideAttributeName,
    AfterAttributeName,

    InsideAttributeValue,
    AfterAttributeValue,
}
230
/// Sub-states carried by `State::InsideClosingTag`.
#[derive(Copy, Clone, PartialEq)]
pub enum ClosingTagSubstate {
    CTInsideName,
    CTAfterName,
}
236
/// Sub-states carried by `State::InsideProcessingInstruction`.
#[derive(Copy, Clone, PartialEq)]
pub enum ProcessingInstructionSubstate {
    PIInsideName,
    PIInsideData,
}
242
/// Sub-states carried by `State::InsideDeclaration`.
///
/// The variants are grouped by the three parts they are named after:
/// version, encoding and the standalone declaration.
#[derive(Copy, Clone, PartialEq)]
pub enum DeclarationSubstate {
    BeforeVersion,
    InsideVersion,
    AfterVersion,

    InsideVersionValue,
    AfterVersionValue,

    BeforeEncoding,
    InsideEncoding,
    AfterEncoding,

    InsideEncodingValue,
    AfterEncodingValue,

    BeforeStandaloneDecl,
    InsideStandaloneDecl,
    AfterStandaloneDecl,

    InsideStandaloneDeclValue,
    AfterStandaloneDeclValue,
}
266
/// Tells `read_qualified_name` what kind of name is being read, which decides
/// the tokens that are allowed to terminate it.
#[derive(Copy, Clone, PartialEq)]
enum QualifiedNameTarget {
    /// May be terminated by `=`.
    AttributeNameTarget,
    /// May be terminated by the end of a tag or of an empty-element tag.
    OpeningTagNameTarget,
    /// May be terminated by the end of a tag.
    ClosingTagNameTarget,
}
273
/// The kind of quote that opened the attribute value currently being read.
#[derive(Copy, Clone, PartialEq, Eq)]
enum QuoteToken {
    SingleQuoteToken,
    DoubleQuoteToken,
}
279
280impl QuoteToken {
281 #[inline]
282 fn from_token(t: Token) -> Option<QuoteToken> {
283 match t {
284 Token::SingleQuote => Some(QuoteToken::SingleQuoteToken),
285 Token::DoubleQuote => Some(QuoteToken::DoubleQuoteToken),
286 _ => {
287 debug_assert!(false);
288 None
289 },
290 }
291 }
292
293 fn as_token(self) -> Token {
294 match self {
295 QuoteToken::SingleQuoteToken => Token::SingleQuote,
296 QuoteToken::DoubleQuoteToken => Token::DoubleQuote,
297 }
298 }
299}
300
/// Scratch storage for the markup item currently being parsed.
/// Several fields have a generated `take_*` method (see `gen_takes!`).
struct MarkupData {
    // NOTE(review): not read in this part of the file; presumably a name used by
    // the declaration / processing-instruction handlers — confirm there.
    name: String,
    // NOTE(review): presumably the text of the reference being read — confirm
    // in the reference handler.
    ref_data: String,
    /// XML version in effect; `Version11` switches the character validity checks
    /// to the XML 1.1 rules.
    version: Option<XmlVersion>,
    encoding: Option<String>,
    standalone: Option<bool>,
    /// Name of the element whose tag is being parsed; consumed when the
    /// start/end element event is emitted.
    element_name: Option<OwnedName>,
    /// Quote that opened the current attribute value; `None` outside a value.
    quote: Option<QuoteToken>,
    attr_name: Option<OwnedName>,
    /// Attributes collected for the current opening tag.
    attributes: BTreeSet<OwnedAttribute>,
}
315
316impl PullParser {
317 pub fn next<'a, S: Iterator<Item = &'a u8>>(&mut self, r: &mut S) -> Result {
322 if let Some(ref ev) = self.final_result {
323 return ev.clone();
324 }
325
326 if let Some(ev) = self.next_event.take() {
327 return ev;
328 }
329
330 if self.pop_namespace {
331 self.pop_namespace = false;
332 self.nst.pop();
333 }
334
335 loop {
336 debug_assert!(self.next_event.is_none());
337 debug_assert!(!self.pop_namespace);
338
339 match self.lexer.next_token(r) {
342 Ok(Token::Eof) => {
343 self.next_pos();
345 return self.handle_eof()
346 },
347 Ok(token) => {
348 match self.dispatch_token(token) {
349 None => continue,
350 Some(Ok(xml_event)) => {
351 self.next_pos();
352 return Ok(xml_event)
353 },
354 Some(Err(xml_error)) => {
355 self.next_pos();
356 return self.set_final_result(Err(xml_error))
357 },
358 }
359 },
360 Err(lexer_error) => {
361 self.next_pos();
362 return self.set_final_result(Err(lexer_error))
363 },
364 }
365 }
366 }
367
368 #[cold]
370 fn handle_eof(&mut self) -> core::result::Result<XmlEvent, super::Error> {
371 let ev = if self.depth() == 0 {
372 if self.encountered == Encountered::Element && self.st == State::OutsideTag { Ok(XmlEvent::EndDocument)
374 } else if self.encountered < Encountered::Element {
375 self.error(SyntaxError::NoRootElement)
376 } else { self.error(SyntaxError::UnexpectedEof) }
379 } else if self.config.c.ignore_end_of_stream {
380 self.final_result = None;
381 self.lexer.reset_eof_handled();
382 return self.error(SyntaxError::UnbalancedRootElement);
383 } else {
384 self.error(SyntaxError::UnbalancedRootElement)
385 };
386 self.set_final_result(ev)
387 }
388
389 #[inline]
392 fn set_final_result(&mut self, result: Result) -> Result {
393 self.final_result = Some(result.clone());
394 result
395 }
396
397 #[cold]
398 fn error(&self, e: SyntaxError) -> Result {
399 Err(Error {
400 pos: self.lexer.position(),
401 kind: ErrorKind::Syntax(e.to_cow()),
402 })
403 }
404
405 #[inline]
406 fn next_pos(&mut self) {
407 if !self.pos.is_empty() {
410 if self.pos.len() > 1 {
411 self.pos.remove(0);
412 } else {
413 self.pos[0] = self.lexer.position();
414 }
415 }
416 }
417
    /// Queues the lexer's current position for an event that is about to be emitted.
    ///
    /// The position is only pushed while the vector has spare capacity, so the
    /// queue never reallocates. When it is full, debug builds assert; release
    /// builds drop the oldest entry (if more than one is queued) and do not push.
    #[inline]
    #[track_caller]
    fn push_pos(&mut self) {
        debug_assert!(self.pos.len() != self.pos.capacity(), "You've found a bug in xml-rs, caused by calls to push_pos() in states that don't end up emitting events.
            This case is ignored in release mode, and merely causes document positions to be out of sync.
            Please file a bug and include the XML document that triggers this assert.");

        // Spare capacity left: record the position.
        if self.pos.len() != self.pos.capacity() {
            self.pos.push(self.lexer.position());
        } else if self.pos.len() > 1 {
            // Queue is full: discard the oldest entry instead of growing.
            self.pos.remove(0);
        }
    }
432
433 #[inline(never)]
434 fn dispatch_token(&mut self, t: Token) -> Option<Result> {
435 match self.st {
436 State::OutsideTag => self.outside_tag(t),
437 State::InsideOpeningTag(s) => self.inside_opening_tag(t, s),
438 State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s),
439 State::InsideReference => self.inside_reference(t),
440 State::InsideComment => self.inside_comment(t),
441 State::InsideCData => self.inside_cdata(t),
442 State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s),
443 State::InsideDoctype(s) => self.inside_doctype(t, s),
444 State::InsideDeclaration(s) => self.inside_declaration(t, s),
445 State::DocumentStart => self.document_start(t),
446 }
447 }
448
    /// Number of currently open elements (length of the element stack).
    #[inline]
    fn depth(&self) -> usize {
        self.est.len()
    }
453
    /// Returns `true` when the accumulation buffer is not empty.
    #[inline]
    fn buf_has_data(&self) -> bool {
        !self.buf.is_empty()
    }
458
    /// Moves the accumulated text out of the buffer, leaving an empty `String` behind.
    #[inline]
    fn take_buf(&mut self) -> String {
        core::mem::take(&mut self.buf)
    }
463
    /// Switches the parser to state `st` and passes `ev` through unchanged.
    #[inline]
    fn into_state(&mut self, st: State, ev: Option<Result>) -> Option<Result> {
        self.st = st;
        ev
    }
469
470 #[inline]
471 fn into_state_continue(&mut self, st: State) -> Option<Result> {
472 self.into_state(st, None)
473 }
474
475 #[inline]
476 fn into_state_emit(&mut self, st: State, ev: Result) -> Option<Result> {
477 self.into_state(st, Some(ev))
478 }
479
480 fn read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result>
487 where F: Fn(&mut PullParser, Token, OwnedName) -> Option<Result> {
488 if self.buf.len() <= 1 {
491 self.read_prefix_separator = false;
492 }
493
494 let invoke_callback = move |this: &mut PullParser, t| {
495 let name = this.take_buf();
496 match name.parse() {
497 Ok(name) => on_name(this, t, name),
498 Err(()) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))),
499 }
500 };
501
502 match t {
503 Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => {
505 self.buf.push(':');
506 self.read_prefix_separator = true;
507 None
508 }
509
510 Token::Character(c) if c != ':' && (self.buf.is_empty() && is_name_start_char(c) ||
511 self.buf_has_data() && is_name_char(c)) => {
512 if self.buf.len() > self.config.max_name_length {
513 return Some(self.error(SyntaxError::ExceededConfiguredLimit));
514 }
515 self.buf.push(c);
516 None
517 },
518
519 Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t),
520
521 Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t),
522
523 Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget ||
524 target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t),
525
526 Token::Character(c) if is_whitespace_char(c) => invoke_callback(self, t),
527
528 _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))),
529 }
530 }
531
532 fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result>
538 where F: Fn(&mut PullParser, String) -> Option<Result> {
539 match t {
540 Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, Token::DoubleQuote | Token::SingleQuote => match self.data.quote {
543 None => { self.data.quote = QuoteToken::from_token(t);
545 None
546 }
547 Some(q) if q.as_token() == t => {
548 self.data.quote = None;
549 let value = self.take_buf();
550 on_value(self, value)
551 }
552 _ => {
553 if let Token::Character(c) = t {
554 if !self.is_valid_xml_char_not_restricted(c) {
555 return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)));
556 }
557 }
558 if self.buf.len() > self.config.max_attribute_length {
559 return Some(self.error(SyntaxError::ExceededConfiguredLimit));
560 }
561 t.push_to_string(&mut self.buf);
562 None
563 }
564 },
565
566 Token::ReferenceStart if self.data.quote.is_some() => {
567 self.state_after_reference = self.st;
568 self.into_state_continue(State::InsideReference)
569 },
570
571 Token::OpeningTagStart => Some(self.error(SyntaxError::UnexpectedOpeningTag)),
572
573 Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => {
574 Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
575 },
576
577 _ if self.data.quote.is_some() => {
579 if self.buf.len() > self.config.max_attribute_length {
580 return Some(self.error(SyntaxError::ExceededConfiguredLimit));
581 }
582 t.push_to_string(&mut self.buf);
583 None
584 }
585
586 _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
587 }
588 }
589
590 fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> {
591 let mut name = self.data.take_element_name()?;
592 let mut attributes: Vec<OwnedAttribute> = self.data.take_attributes().into_iter().collect();
593
594 match self.nst.get(name.borrow().prefix_repr()) {
596 Some("") => name.namespace = None, Some(ns) => name.namespace = Some(ns.into()),
598 None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into())))
599 }
600
601 for attr in &mut attributes {
603 if let Some(ref pfx) = attr.name.prefix {
604 let new_ns = match self.nst.get(pfx) {
605 Some("") => None, Some(ns) => Some(ns.into()),
607 None => return Some(self.error(SyntaxError::UnboundAttribute(attr.name.to_string().into())))
608 };
609 attr.name.namespace = new_ns;
610 }
611 }
612
613 if emit_end_element {
614 self.pop_namespace = true;
615 self.next_event = Some(Ok(XmlEvent::EndElement {
616 name: name.clone()
617 }));
618 } else {
619 self.est.push(name.clone());
620 }
621 let namespace = self.nst.squash();
622 self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement {
623 name,
624 attributes,
625 namespace
626 }))
627 }
628
629 fn emit_end_element(&mut self) -> Option<Result> {
630 let mut name = self.data.take_element_name()?;
631
632 match self.nst.get(name.borrow().prefix_repr()) {
634 Some("") => name.namespace = None, Some(ns) => name.namespace = Some(ns.into()),
636 None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into())))
637 }
638
639 let op_name = self.est.pop()?;
640
641 if name == op_name {
642 self.pop_namespace = true;
643 self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name }))
644 } else {
645 Some(self.error(SyntaxError::UnexpectedClosingTag(alloc::format!("{name} != {op_name}").into())))
646 }
647 }
648
649 #[inline]
650 fn is_valid_xml_char(&self, c: char) -> bool {
651 if Some(XmlVersion::Version11) == self.data.version {
652 is_xml11_char(c)
653 } else {
654 is_xml10_char(c)
655 }
656 }
657
658 #[inline]
659 fn is_valid_xml_char_not_restricted(&self, c: char) -> bool {
660 if Some(XmlVersion::Version11) == self.data.version {
661 is_xml11_char_not_restricted(c)
662 } else {
663 is_xml10_char(c)
664 }
665 }
666}
667
// Test-only module for the pull parser; it contains no items in this view.
#[cfg(test)]
mod tests {
}