1use alloc::string::{String, ToString};
2use alloc::{vec, vec::Vec};
3use alloc::borrow::Cow;
4use core::ops::Range;
5use core::mem::take;
6use core::fmt;
7use memchr::{memchr, memchr2, memchr_iter};
8
9use crate::{
10 AttributeData, Document, ExpandedNameIndexed, NamespaceIdx, Namespaces, NodeData, NodeId,
11 NodeKind, ShortRange, StringStorage, TextPos, NS_XMLNS_URI, NS_XML_PREFIX, NS_XML_URI, PI,
12 XMLNS,
13};
14
15use crate::tokenizer::{self, Reference, StrSpan, Stream};
16
17type Result<T> = core::result::Result<T, Error>;
18
19#[derive(Clone, PartialEq, Eq, Hash, Debug)]
21pub enum Error {
22 InvalidXmlPrefixUri(TextPos),
24
25 UnexpectedXmlUri(TextPos),
27
28 UnexpectedXmlnsUri(TextPos),
30
31 InvalidElementNamePrefix(TextPos),
33
34 DuplicatedNamespace(String, TextPos),
36
37 UnknownNamespace(String, TextPos),
43
44 UnexpectedCloseTag(String, String, TextPos),
48
49 UnexpectedEntityCloseTag(TextPos),
57
58 UnknownEntityReference(String, TextPos),
60
61 MalformedEntityReference(TextPos),
66
67 EntityReferenceLoop(TextPos),
71
72 InvalidAttributeValue(TextPos),
74
75 DuplicatedAttribute(String, TextPos),
83
84 NoRootNode,
86
87 UnclosedRootNode,
89
90 UnexpectedDeclaration(TextPos),
93
94 DtdDetected,
98
99 NodesLimitReached,
101
102 AttributesLimitReached,
104
105 NamespacesLimitReached,
107
108 InvalidName(TextPos),
110
111 NonXmlChar(char, TextPos),
115
116 InvalidChar(u8, u8, TextPos),
120
121 InvalidChar2(&'static str, u8, TextPos),
125
126 InvalidString(&'static str, TextPos),
130
131 InvalidExternalID(TextPos),
133
134 EntityResolver(TextPos, String),
136
137 InvalidComment(TextPos),
139
140 InvalidCharacterData(TextPos),
144
145 UnknownToken(TextPos),
147
148 UnexpectedEndOfStream,
152}
153
154impl Error {
155 pub fn pos(&self) -> TextPos {
157 match *self {
158 Error::InvalidXmlPrefixUri(pos) => pos,
159 Error::UnexpectedXmlUri(pos) => pos,
160 Error::UnexpectedXmlnsUri(pos) => pos,
161 Error::InvalidElementNamePrefix(pos) => pos,
162 Error::DuplicatedNamespace(_, pos) => pos,
163 Error::UnknownNamespace(_, pos) => pos,
164 Error::UnexpectedCloseTag(_, _, pos) => pos,
165 Error::UnexpectedEntityCloseTag(pos) => pos,
166 Error::UnknownEntityReference(_, pos) => pos,
167 Error::MalformedEntityReference(pos) => pos,
168 Error::EntityReferenceLoop(pos) => pos,
169 Error::InvalidAttributeValue(pos) => pos,
170 Error::DuplicatedAttribute(_, pos) => pos,
171 Error::NoRootNode => TextPos::new(1, 1),
172 Error::UnclosedRootNode => TextPos::new(1, 1),
173 Error::UnexpectedDeclaration(pos) => pos,
174 Error::DtdDetected => TextPos::new(1, 1),
175 Error::NodesLimitReached => TextPos::new(1, 1),
176 Error::AttributesLimitReached => TextPos::new(1, 1),
177 Error::NamespacesLimitReached => TextPos::new(1, 1),
178 Error::InvalidName(pos) => pos,
179 Error::NonXmlChar(_, pos) => pos,
180 Error::InvalidChar(_, _, pos) => pos,
181 Error::InvalidChar2(_, _, pos) => pos,
182 Error::InvalidString(_, pos) => pos,
183 Error::InvalidExternalID(pos) => pos,
184 Error::EntityResolver(pos, _) => pos,
185 Error::InvalidComment(pos) => pos,
186 Error::InvalidCharacterData(pos) => pos,
187 Error::UnknownToken(pos) => pos,
188 Error::UnexpectedEndOfStream => TextPos::new(1, 1),
189 }
190 }
191}
192
193impl core::fmt::Display for Error {
194 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
195 match self {
196 Error::InvalidXmlPrefixUri(pos) => {
197 write!(f, "'xml' namespace prefix mapped to wrong URI at {}", pos)
198 }
199 Error::UnexpectedXmlUri(pos) => {
200 write!(
201 f,
202 "the 'xml' namespace URI is used for not 'xml' prefix at {}",
203 pos
204 )
205 }
206 Error::UnexpectedXmlnsUri(pos) => {
207 write!(
208 f,
209 "the 'xmlns' URI is used at {}, but it must not be declared",
210 pos
211 )
212 }
213 Error::InvalidElementNamePrefix(pos) => {
214 write!(
215 f,
216 "the 'xmlns' prefix is used at {}, but it must not be",
217 pos
218 )
219 }
220 Error::DuplicatedNamespace(ref name, pos) => {
221 write!(f, "namespace '{}' at {} is already defined", name, pos)
222 }
223 Error::UnknownNamespace(ref name, pos) => {
224 write!(f, "an unknown namespace prefix '{}' at {}", name, pos)
225 }
226 Error::UnexpectedCloseTag(ref expected, ref actual, pos) => {
227 write!(
228 f,
229 "expected '{}' tag, not '{}' at {}",
230 expected, actual, pos
231 )
232 }
233 Error::UnexpectedEntityCloseTag(pos) => {
234 write!(f, "unexpected close tag at {}", pos)
235 }
236 Error::MalformedEntityReference(pos) => {
237 write!(f, "malformed entity reference at {}", pos)
238 }
239 Error::UnknownEntityReference(ref name, pos) => {
240 write!(f, "unknown entity reference '{}' at {}", name, pos)
241 }
242 Error::EntityReferenceLoop(pos) => {
243 write!(f, "a possible entity reference loop is detected at {}", pos)
244 }
245 Error::InvalidAttributeValue(pos) => {
246 write!(f, "unescaped '<' found at {}", pos)
247 }
248 Error::DuplicatedAttribute(ref name, pos) => {
249 write!(f, "attribute '{}' at {} is already defined", name, pos)
250 }
251 Error::NoRootNode => {
252 write!(f, "the document does not have a root node")
253 }
254 Error::UnclosedRootNode => {
255 write!(f, "the root node was opened but never closed")
256 }
257 Error::UnexpectedDeclaration(pos) => {
258 write!(f, "unexpected XML declaration at {}", pos)
259 }
260 Error::DtdDetected => {
261 write!(f, "XML with DTD detected")
262 }
263 Error::NodesLimitReached => {
264 write!(f, "nodes limit reached")
265 }
266 Error::AttributesLimitReached => {
267 write!(f, "more than 2^32 attributes were parsed")
268 }
269 Error::NamespacesLimitReached => {
270 write!(f, "more than 2^16 unique namespaces were parsed")
271 }
272 Error::InvalidName(pos) => {
273 write!(f, "invalid name token at {}", pos)
274 }
275 Error::NonXmlChar(c, pos) => {
276 write!(f, "a non-XML character {:?} found at {}", c, pos)
277 }
278 Error::InvalidChar(expected, actual, pos) => {
279 write!(
280 f,
281 "expected '{}' not '{}' at {}",
282 *expected as char, *actual as char, pos
283 )
284 }
285 Error::InvalidChar2(expected, actual, pos) => {
286 write!(
287 f,
288 "expected {} not '{}' at {}",
289 expected, *actual as char, pos
290 )
291 }
292 Error::InvalidString(expected, pos) => {
293 write!(f, "expected '{}' at {}", expected, pos)
294 }
295 Error::InvalidExternalID(pos) => {
296 write!(f, "invalid ExternalID at {}", pos)
297 }
298 Error::EntityResolver(pos, msg) => {
299 write!(f, "entity resolver failed at {}: {}", pos, msg)
300 }
301 Error::InvalidComment(pos) => {
302 write!(f, "comment at {} contains '--'", pos)
303 }
304 Error::InvalidCharacterData(pos) => {
305 write!(f, "']]>' at {} is not allowed inside a character data", pos)
306 }
307 Error::UnknownToken(pos) => {
308 write!(f, "unknown token at {}", pos)
309 }
310 Error::UnexpectedEndOfStream => {
311 write!(f, "unexpected end of stream")
312 }
313 }
314 }
315}
316
// Only available with the `std` feature, since the trait lives in `std`.
#[cfg(feature = "std")]
impl std::error::Error for Error {
    /// Returns a fixed, generic description; use `Display` for the details.
    fn description(&self) -> &str {
        "an XML parsing error"
    }
}
323
/// Options that control how a document is parsed.
pub struct ParsingOptions<'input> {
    /// Forwarded to the tokenizer as its `allow_dtd` flag.
    ///
    /// NOTE(review): presumably a document containing a DTD is rejected with
    /// `Error::DtdDetected` when this is `false` — confirm against the tokenizer.
    ///
    /// Default: `false`
    pub allow_dtd: bool,

    /// Maximum number of nodes the document may contain, the root node included.
    ///
    /// Appending a node once this many nodes exist fails with
    /// `Error::NodesLimitReached`.
    ///
    /// Default: `u32::MAX`
    pub nodes_limit: u32,

    /// Optional callback used to resolve external entities.
    ///
    /// When `None`, every resolution request yields `Ok(None)`.
    ///
    /// Default: `None`
    pub entity_resolver: Option<&'input EntityResolver<'input>>,
}

/// Callback that resolves an external entity.
///
/// It is called with the optional public identifier and the URI of the entity.
/// It returns the replacement text, `Ok(None)` when there is none, or an error
/// message.
pub type EntityResolver<'input> =
    dyn Fn(Option<&str>, &str) -> core::result::Result<Option<&'input str>, String> + 'input;

impl Default for ParsingOptions<'_> {
    /// DTDs disallowed, no practical node limit, no entity resolver.
    fn default() -> Self {
        ParsingOptions {
            allow_dtd: false,
            nodes_limit: u32::MAX,
            entity_resolver: None,
        }
    }
}

impl fmt::Debug for ParsingOptions<'_> {
    /// Formats the options; the resolver is a closure, so only its presence is shown.
    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
        let entity_resolver = if self.entity_resolver.is_some() {
            "Some(..)"
        } else {
            "None"
        };

        fmt.debug_struct("ParsingOptions")
            .field("allow_dtd", &self.allow_dtd)
            .field("nodes_limit", &self.nodes_limit)
            // `format_args!` makes the placeholder print verbatim. Passing the
            // `&str` directly would go through `str`'s `Debug` impl and print
            // it wrapped in quotes, e.g. `entity_resolver: "None"`.
            .field("entity_resolver", &format_args!("{}", entity_resolver))
            .finish()
    }
}
396
397struct TempAttributeData<'input> {
398 prefix: &'input str,
399 local: &'input str,
400 value: StringStorage<'input>,
401 range: Range<usize>,
402 #[cfg(feature = "positions")]
403 qname_len: u16,
404 #[cfg(feature = "positions")]
405 eq_len: u8,
406}
407
408impl<'input> Document<'input> {
409 #[inline]
423 pub fn parse(text: &'input str) -> Result<Self> {
424 Self::parse_with_options(text, ParsingOptions::default())
425 }
426
427 #[inline]
440 pub fn parse_with_options(text: &'input str, opt: ParsingOptions<'input>) -> Result<Self> {
441 parse(text, opt)
442 }
443}
444
445struct Entity<'input> {
446 name: &'input str,
447 value: StrSpan<'input>,
448}
449
/// Name of the element whose start tag is currently being processed.
#[derive(Clone, Copy)]
struct TagNameSpan<'input> {
    /// Namespace prefix; empty when the name has none.
    prefix: &'input str,
    /// Local name; empty means "no start tag is being processed".
    name: &'input str,
    /// Offset at which the element starts.
    pos: usize,
    /// Offset used when reporting errors about the prefix.
    prefix_pos: usize,
}

impl<'input> TagNameSpan<'input> {
    /// Creates the placeholder value: empty names and zero offsets.
    #[inline]
    fn new_null() -> Self {
        TagNameSpan {
            pos: 0,
            prefix_pos: 0,
            prefix: "",
            name: "",
        }
    }
}
469
470#[derive(Default)]
500struct LoopDetector {
501 depth: u8,
503 references: u8,
505}
506
507impl LoopDetector {
508 #[inline]
509 fn inc_depth(&mut self, stream: &Stream) -> Result<()> {
510 if self.depth < 10 {
511 self.depth += 1;
512 Ok(())
513 } else {
514 Err(Error::EntityReferenceLoop(stream.gen_text_pos()))
515 }
516 }
517
518 #[inline]
519 fn dec_depth(&mut self) {
520 if self.depth > 0 {
521 self.depth -= 1;
522 }
523
524 if self.depth == 0 {
526 self.references = 0;
527 }
528 }
529
530 #[inline]
531 fn inc_references(&mut self, stream: &Stream) -> Result<()> {
532 if self.depth == 0 {
533 Ok(())
535 } else {
536 if self.references == u8::MAX {
537 return Err(Error::EntityReferenceLoop(stream.gen_text_pos()));
538 }
539
540 self.references += 1;
541 Ok(())
542 }
543 }
544}
545
546struct Context<'input> {
547 opt: ParsingOptions<'input>,
548 namespace_start_idx: usize,
549 current_attributes: Vec<TempAttributeData<'input>>,
550 awaiting_subtree: Vec<NodeId>,
551 parent_prefixes: Vec<&'input str>,
552 entities: Vec<Entity<'input>>,
553 after_text: Vec<Cow<'input, str>>,
554 parent_id: NodeId,
555 tag_name: TagNameSpan<'input>,
556 loop_detector: LoopDetector,
557 doc: Document<'input>,
558}
559
560impl<'input> Context<'input> {
561 fn append_node(&mut self, kind: NodeKind<'input>, range: Range<usize>) -> Result<NodeId> {
562 if self.doc.nodes.len() >= self.opt.nodes_limit as usize {
563 return Err(Error::NodesLimitReached);
564 }
565
566 #[cfg(not(feature = "positions"))]
567 let _ = range;
568
569 let new_child_id = NodeId::from(self.doc.nodes.len());
570
571 let appending_element = matches!(kind, NodeKind::Element { .. });
572 self.doc.nodes.push(NodeData {
573 parent: Some(self.parent_id),
574 prev_sibling: None,
575 next_subtree: None,
576 last_child: None,
577 kind,
578 #[cfg(feature = "positions")]
579 range,
580 });
581
582 let last_child_id = self.doc.nodes[self.parent_id.get_usize()].last_child;
583 self.doc.nodes[new_child_id.get_usize()].prev_sibling = last_child_id;
584 self.doc.nodes[self.parent_id.get_usize()].last_child = Some(new_child_id);
585
586 for id in &self.awaiting_subtree {
587 self.doc.nodes[id.get_usize()].next_subtree = Some(new_child_id);
588 }
589 self.awaiting_subtree.clear();
590
591 if !appending_element {
592 self.awaiting_subtree
593 .push(NodeId::from(self.doc.nodes.len() - 1));
594 }
595
596 Ok(new_child_id)
597 }
598
599 fn append_text(
600 &mut self,
601 text: Cow<'input, str>,
602 range: Range<usize>,
603 ) -> Result<()> {
604 if self.after_text.is_empty() {
605 let text = match &text {
606 Cow::Borrowed(text) => StringStorage::Borrowed(text),
607 Cow::Owned(text) => StringStorage::new_owned(text.as_str()),
608 };
609
610 self.append_node(NodeKind::Text(text), range)?;
611 }
612
613 self.after_text.push(text);
614 Ok(())
615 }
616
617 #[cold]
618 #[inline(never)]
619 fn merge_text(&mut self) {
620 let node = &mut self.doc.nodes.last_mut().unwrap();
621
622 let text = match &mut node.kind {
623 NodeKind::Text(text) => text,
624 _ => unreachable!(),
625 };
626
627 *text = StringStorage::new_owned(&self.after_text.join(""));
628 }
629
630 #[inline]
631 fn reset_after_text(&mut self) {
632 if self.after_text.is_empty() {
633 return;
634 }
635
636 if self.after_text.len() > 1 {
637 self.merge_text();
638 }
639
640 self.after_text.clear();
641 }
642}
643
644fn parse<'input>(text: &'input str, opt: ParsingOptions<'input>) -> Result<Document<'input>> {
645 let nodes_capacity = memchr_iter(b'<', text.as_bytes()).count();
647 let attributes_capacity = memchr_iter(b'=', text.as_bytes()).count();
648
649 let mut doc = Document {
651 text,
652 nodes: Vec::with_capacity(nodes_capacity),
653 attributes: Vec::with_capacity(attributes_capacity),
654 namespaces: Namespaces::default(),
655 };
656
657 doc.nodes.push(NodeData {
659 parent: None,
660 prev_sibling: None,
661 next_subtree: None,
662 last_child: None,
663 kind: NodeKind::Root,
664 #[cfg(feature = "positions")]
665 range: 0..text.len(),
666 });
667
668 doc.namespaces
669 .push_ns(Some(NS_XML_PREFIX), StringStorage::Borrowed(NS_XML_URI))?;
670
671 let allow_dtd = opt.allow_dtd;
672
673 let mut ctx = Context {
674 opt,
675 namespace_start_idx: 1,
676 current_attributes: Vec::with_capacity(16),
677 entities: Vec::new(),
678 awaiting_subtree: Vec::new(),
679 parent_prefixes: vec![""],
680 after_text: Vec::with_capacity(1),
681 parent_id: NodeId::new(0),
682 tag_name: TagNameSpan::new_null(),
683 loop_detector: LoopDetector::default(),
684 doc,
685 };
686
687 tokenizer::parse(text, allow_dtd, &mut ctx)?;
688
689 let mut doc = ctx.doc;
690 if !doc.root().children().any(|n| n.is_element()) {
691 return Err(Error::NoRootNode);
692 }
693
694 if ctx.parent_prefixes.len() > 1 {
695 return Err(Error::UnclosedRootNode);
696 }
697
698 doc.nodes.shrink_to_fit();
699 doc.attributes.shrink_to_fit();
700 doc.namespaces.shrink_to_fit();
701
702 Ok(doc)
703}
704
705impl<'input> tokenizer::XmlEvents<'input> for Context<'input> {
706 #[inline(always)]
707 fn token(&mut self, token: tokenizer::Token<'input>) -> Result<()> {
708 match token {
709 tokenizer::Token::ProcessingInstruction(target, value, range) => {
710 self.reset_after_text();
711 let pi = NodeKind::PI(PI { target, value });
712 self.append_node(pi, range)?;
713 }
714 tokenizer::Token::Comment(text, range) => {
715 self.reset_after_text();
716 self.append_node(NodeKind::Comment(StringStorage::Borrowed(text)), range)?;
717 }
718 tokenizer::Token::EntityDeclaration(name, definition) => {
719 self.entities.push(Entity {
720 name,
721 value: definition,
722 });
723 }
724 tokenizer::Token::ElementStart(prefix, local, start) => {
725 self.reset_after_text();
726
727 if prefix == XMLNS {
728 let pos = self.doc.text_pos_at(start + 1);
729 return Err(Error::InvalidElementNamePrefix(pos));
730 }
731
732 self.tag_name = TagNameSpan {
733 prefix,
734 name: local,
735 pos: start,
736 prefix_pos: start + 1,
737 };
738 }
739 tokenizer::Token::Attribute(range, qname_len, eq_len, prefix, local, value) => {
740 process_attribute(range, qname_len, eq_len, prefix, local, value, self)?;
741 }
742 tokenizer::Token::ElementEnd(end, range) => {
743 self.reset_after_text();
744 process_element(end, range, self)?;
745 }
746 tokenizer::Token::Text(text, range) => {
747 process_text(text, range, self)?;
748 }
749 tokenizer::Token::Cdata(text, range) => {
750 process_cdata(text, range, self)?;
751 }
752 }
753
754 Ok(())
755 }
756
757 fn resolve_entity(&mut self, pub_id: Option<&str>, uri: &str) -> core::result::Result<Option<&'input str>, String> {
758 match &mut self.opt.entity_resolver {
759 Some(entity_resolver) => entity_resolver(pub_id, uri),
760 None => Ok(None),
761 }
762 }
763
764}
765
766#[allow(clippy::too_many_arguments)]
767fn process_attribute<'input>(
768 range: Range<usize>,
769 qname_len: u16,
770 eq_len: u8,
771 prefix: &'input str,
772 local: &'input str,
773 value: StrSpan<'input>,
774 ctx: &mut Context<'input>,
775) -> Result<()> {
776 let value = normalize_attribute(value, ctx)?;
777
778 if prefix == XMLNS {
779 if value.as_str() == NS_XMLNS_URI {
781 let pos = ctx.doc.text_pos_at(range.start);
782 return Err(Error::UnexpectedXmlnsUri(pos));
783 }
784
785 let is_xml_ns_uri = value.as_str() == NS_XML_URI;
786
787 if local == NS_XML_PREFIX {
791 if !is_xml_ns_uri {
792 let pos = ctx.doc.text_pos_at(range.start);
793 return Err(Error::InvalidXmlPrefixUri(pos));
794 }
795 } else {
796 if is_xml_ns_uri {
798 let pos = ctx.doc.text_pos_at(range.start);
799 return Err(Error::UnexpectedXmlUri(pos));
800 }
801 }
802
803 if ctx
805 .doc
806 .namespaces
807 .exists(ctx.namespace_start_idx, Some(local))
808 {
809 let pos = ctx.doc.text_pos_at(range.start);
810 return Err(Error::DuplicatedNamespace(local.to_string(), pos));
811 }
812
813 if !is_xml_ns_uri {
815 ctx.doc.namespaces.push_ns(Some(local), value)?;
816 }
817 } else if local == XMLNS {
818 if value.as_str() == NS_XML_URI {
820 let pos = ctx.doc.text_pos_at(range.start);
821 return Err(Error::UnexpectedXmlUri(pos));
822 }
823
824 if value.as_str() == NS_XMLNS_URI {
826 let pos = ctx.doc.text_pos_at(range.start);
827 return Err(Error::UnexpectedXmlnsUri(pos));
828 }
829
830 ctx.doc.namespaces.push_ns(None, value)?;
831 } else {
832 #[cfg(not(feature = "positions"))]
833 let _ = (qname_len, eq_len);
834
835 ctx.current_attributes.push(TempAttributeData {
836 prefix,
837 local,
838 value,
839 range,
840 #[cfg(feature = "positions")]
841 qname_len,
842 #[cfg(feature = "positions")]
843 eq_len,
844 });
845 }
846
847 Ok(())
848}
849
850fn process_element<'input>(
851 end_token: tokenizer::ElementEnd<'input>,
852 token_range: Range<usize>,
853 ctx: &mut Context<'input>,
854) -> Result<()> {
855 if ctx.tag_name.name.is_empty() {
856 if let tokenizer::ElementEnd::Close(..) = end_token {
861 return Err(Error::UnexpectedEntityCloseTag(
862 ctx.doc.text_pos_at(token_range.start),
863 ));
864 } else {
865 unreachable!("should be already checked by the tokenizer");
866 }
867 }
868
869 let namespaces = ctx.resolve_namespaces();
870 ctx.namespace_start_idx = ctx.doc.namespaces.tree_order.len();
871
872 let attributes = resolve_attributes(namespaces, ctx)?;
873
874 match end_token {
875 tokenizer::ElementEnd::Empty => {
876 let tag_ns_idx = get_ns_idx_by_prefix(
877 namespaces,
878 ctx.tag_name.prefix_pos,
879 ctx.tag_name.prefix,
880 &ctx.doc,
881 )?;
882 let new_element_id = ctx.append_node(
883 NodeKind::Element {
884 tag_name: ExpandedNameIndexed {
885 namespace_idx: tag_ns_idx,
886 local_name: ctx.tag_name.name,
887 },
888 attributes,
889 namespaces,
890 },
891 ctx.tag_name.pos..token_range.end,
892 )?;
893 ctx.awaiting_subtree.push(new_element_id);
894 }
895 tokenizer::ElementEnd::Close(prefix, local) => {
896 let parent_node = &mut ctx.doc.nodes[ctx.parent_id.get_usize()];
897 let parent_prefix = *ctx.parent_prefixes.last().unwrap();
900
901 #[cfg(feature = "positions")]
902 {
903 parent_node.range.end = token_range.end;
904 }
905
906 if let NodeKind::Element { ref tag_name, .. } = parent_node.kind {
907 if prefix != parent_prefix || local != tag_name.local_name {
908 return Err(Error::UnexpectedCloseTag(
909 gen_qname_string(parent_prefix, tag_name.local_name),
910 gen_qname_string(prefix, local),
911 ctx.doc.text_pos_at(token_range.start),
912 ));
913 }
914 }
915 ctx.awaiting_subtree.push(ctx.parent_id);
916
917 if let Some(id) = parent_node.parent {
918 ctx.parent_id = id;
919 ctx.parent_prefixes.pop();
920 debug_assert!(!ctx.parent_prefixes.is_empty());
921 } else {
922 return Err(Error::UnexpectedEntityCloseTag(
927 ctx.doc.text_pos_at(token_range.start),
928 ));
929 }
930 }
931 tokenizer::ElementEnd::Open => {
932 let tag_ns_idx = get_ns_idx_by_prefix(
933 namespaces,
934 ctx.tag_name.prefix_pos,
935 ctx.tag_name.prefix,
936 &ctx.doc,
937 )?;
938 ctx.parent_id = ctx.append_node(
939 NodeKind::Element {
940 tag_name: ExpandedNameIndexed {
941 namespace_idx: tag_ns_idx,
942 local_name: ctx.tag_name.name,
943 },
944 attributes,
945 namespaces,
946 },
947 ctx.tag_name.pos..token_range.end,
948 )?;
949 ctx.parent_prefixes.push(ctx.tag_name.prefix);
950 }
951 }
952
953 Ok(())
954}
955
956impl Context<'_> {
957 fn resolve_namespaces(&mut self) -> ShortRange {
958 if let NodeKind::Element { ref namespaces, .. } =
959 self.doc.nodes[self.parent_id.get_usize()].kind
960 {
961 let parent_ns = *namespaces;
962 if self.namespace_start_idx == self.doc.namespaces.tree_order.len() {
963 return parent_ns;
964 }
965
966 for i in parent_ns.to_urange() {
967 if !self.doc.namespaces.exists(
968 self.namespace_start_idx,
969 self.doc
970 .namespaces
971 .get(self.doc.namespaces.tree_order[i])
972 .name,
973 ) {
974 self.doc.namespaces.push_ref(i);
975 }
976 }
977 }
978
979 (self.namespace_start_idx..self.doc.namespaces.tree_order.len()).into()
980 }
981}
982
983fn resolve_attributes(namespaces: ShortRange, ctx: &mut Context) -> Result<ShortRange> {
984 if ctx.current_attributes.is_empty() {
985 return Ok(ShortRange::new(0, 0));
986 }
987
988 if ctx.doc.attributes.len() + ctx.current_attributes.len() >= u32::MAX as usize {
989 return Err(Error::AttributesLimitReached);
990 }
991
992 let start_idx = ctx.doc.attributes.len();
993
994 for attr in ctx.current_attributes.drain(..) {
995 let namespace_idx = if attr.prefix == NS_XML_PREFIX {
996 Some(NamespaceIdx(0))
1000 } else if attr.prefix.is_empty() {
1001 None
1004 } else {
1005 get_ns_idx_by_prefix(namespaces, attr.range.start, attr.prefix, &ctx.doc)?
1006 };
1007
1008 let attr_name = ExpandedNameIndexed {
1009 namespace_idx,
1010 local_name: attr.local,
1011 };
1012
1013 if ctx.doc.attributes[start_idx..].iter().any(|attr| {
1015 attr.name.as_expanded_name(&ctx.doc) == attr_name.as_expanded_name(&ctx.doc)
1016 }) {
1017 let pos = ctx.doc.text_pos_at(attr.range.start);
1018 return Err(Error::DuplicatedAttribute(attr.local.to_string(), pos));
1019 }
1020
1021 ctx.doc.attributes.push(AttributeData {
1022 name: attr_name,
1023 value: attr.value,
1024 #[cfg(feature = "positions")]
1025 range: attr.range,
1026 #[cfg(feature = "positions")]
1027 qname_len: attr.qname_len,
1028 #[cfg(feature = "positions")]
1029 eq_len: attr.eq_len,
1030 });
1031 }
1032
1033 Ok((start_idx..ctx.doc.attributes.len()).into())
1034}
1035
1036fn process_text<'input>(
1037 text: &'input str,
1038 range: Range<usize>,
1039 ctx: &mut Context<'input>,
1040) -> Result<()> {
1041 if memchr2(b'&', b'\r', text.as_bytes()).is_none() {
1043 ctx.append_text(Cow::Borrowed(text), range)?;
1044 return Ok(());
1045 }
1046
1047 let mut text_buffer = TextBuffer::new();
1048 let mut is_as_is = false; let mut stream = Stream::from_substr(ctx.doc.text, range.clone());
1050 while !stream.at_end() {
1051 match parse_next_chunk(&mut stream, &ctx.entities)? {
1052 NextChunk::Byte(c) => {
1053 if is_as_is {
1054 text_buffer.push_raw(c);
1055 is_as_is = false;
1056 } else {
1057 text_buffer.push_from_text(c, stream.at_end());
1058 }
1059 }
1060 NextChunk::Char(c) => {
1061 for b in CharToBytes::new(c) {
1062 if ctx.loop_detector.depth > 0 {
1063 text_buffer.push_from_text(b, stream.at_end());
1064 } else {
1065 text_buffer.push_raw(b);
1068 is_as_is = true;
1069 }
1070 }
1071 }
1072 NextChunk::Text(fragment) => {
1073 is_as_is = false;
1074
1075 if !text_buffer.is_empty() {
1076 ctx.append_text(Cow::Owned(text_buffer.finish()), range.clone())?;
1077 }
1078
1079 ctx.loop_detector.inc_references(&stream)?;
1080 ctx.loop_detector.inc_depth(&stream)?;
1081
1082 let text = if fragment.range().start == 0 {
1083 fragment.as_str()
1084 } else {
1085 ctx.doc.text
1086 };
1087
1088 let mut stream = Stream::from_substr(text, fragment.range());
1089 let prev_tag_name = ctx.tag_name;
1090 ctx.tag_name = TagNameSpan::new_null();
1091 tokenizer::parse_content(&mut stream, ctx)?;
1092 ctx.tag_name = prev_tag_name;
1093 text_buffer.clear();
1094
1095 ctx.loop_detector.dec_depth();
1096 }
1097 }
1098 }
1099
1100 if !text_buffer.is_empty() {
1101 ctx.append_text(Cow::Owned(text_buffer.finish()), range)?;
1102 }
1103
1104 Ok(())
1105}
1106
1107fn process_cdata<'input>(
1110 mut text: &'input str,
1111 range: Range<usize>,
1112 ctx: &mut Context<'input>,
1113) -> Result<()> {
1114 let mut pos = memchr(b'\r', text.as_bytes());
1115
1116 if pos.is_none() {
1118 ctx.append_text(Cow::Borrowed(text), range)?;
1119 return Ok(());
1120 }
1121
1122 let mut buf = String::new();
1123
1124 while let Some(pos1) = pos {
1125 let (line, rest) = text.split_at(pos1);
1126
1127 buf.push_str(line);
1128 buf.push('\n');
1129
1130 text = if rest.as_bytes().get(1) == Some(&b'\n') {
1131 &rest[2..]
1132 } else {
1133 &rest[1..]
1134 };
1135
1136 pos = memchr(b'\r', text.as_bytes());
1137 }
1138
1139 buf.push_str(text);
1140
1141 ctx.append_text(Cow::Owned(buf), range)?;
1142 Ok(())
1143}
1144
1145enum NextChunk<'a> {
1146 Byte(u8),
1147 Char(char),
1148 Text(StrSpan<'a>),
1149}
1150
1151fn parse_next_chunk<'a>(stream: &mut Stream<'a>, entities: &[Entity<'a>]) -> Result<NextChunk<'a>> {
1152 debug_assert!(!stream.at_end());
1153
1154 let c = stream.curr_byte_unchecked();
1157
1158 if c == b'&' {
1160 let start = stream.pos();
1161 match stream.consume_reference() {
1162 Some(Reference::Char(ch)) => Ok(NextChunk::Char(ch)),
1163 Some(Reference::Entity(name)) => entities
1164 .iter()
1165 .find(|e| e.name == name)
1166 .map(|e| NextChunk::Text(e.value))
1167 .ok_or_else(|| {
1168 let pos = stream.gen_text_pos_from(start);
1169 Error::UnknownEntityReference(name.into(), pos)
1170 }),
1171 None => {
1172 let pos = stream.gen_text_pos_from(start);
1173 Err(Error::MalformedEntityReference(pos))
1174 }
1175 }
1176 } else {
1177 stream.advance(1);
1178 Ok(NextChunk::Byte(c))
1179 }
1180}
1181
1182fn normalize_attribute<'input>(
1184 text: StrSpan<'input>,
1185 ctx: &mut Context<'input>,
1186) -> Result<StringStorage<'input>> {
1187 if memchr2(b'&', b'\t', text.as_str().as_bytes()).is_some() || memchr2(b'\n', b'\r', text.as_str().as_bytes()).is_some() {
1190 let mut text_buffer = TextBuffer::new();
1191 _normalize_attribute(text, &mut text_buffer, ctx)?;
1192 Ok(StringStorage::new_owned(&text_buffer.finish()))
1193 } else {
1194 Ok(StringStorage::Borrowed(text.as_str()))
1195 }
1196}
1197
1198fn _normalize_attribute(text: StrSpan, buffer: &mut TextBuffer, ctx: &mut Context) -> Result<()> {
1199 let mut stream = Stream::from_substr(ctx.doc.text, text.range());
1200 while !stream.at_end() {
1201 let c = stream.curr_byte_unchecked();
1203
1204 if c != b'&' {
1205 stream.advance(1);
1206 buffer.push_from_attr(c, stream.curr_byte().ok());
1207 continue;
1208 }
1209
1210 let start = stream.pos();
1212 match stream.consume_reference() {
1213 Some(Reference::Char(ch)) => {
1214 for b in CharToBytes::new(ch) {
1215 if ctx.loop_detector.depth > 0 {
1216 if b == b'<' {
1219 return Err(Error::InvalidAttributeValue(
1220 stream.gen_text_pos_from(start),
1221 ));
1222 }
1223
1224 buffer.push_from_attr(b, None);
1225 } else {
1226 buffer.push_raw(b);
1229 }
1230 }
1231 }
1232 Some(Reference::Entity(name)) => match ctx.entities.iter().find(|e| e.name == name) {
1233 Some(entity) => {
1234 ctx.loop_detector.inc_references(&stream)?;
1235 ctx.loop_detector.inc_depth(&stream)?;
1236 _normalize_attribute(entity.value, buffer, ctx)?;
1237 ctx.loop_detector.dec_depth();
1238 }
1239 None => {
1240 let pos = stream.gen_text_pos_from(start);
1241 return Err(Error::UnknownEntityReference(name.into(), pos));
1242 }
1243 },
1244 None => {
1245 let pos = stream.gen_text_pos_from(start);
1246 return Err(Error::MalformedEntityReference(pos));
1247 }
1248 }
1249 }
1250
1251 Ok(())
1252}
1253
1254fn get_ns_idx_by_prefix(
1255 namespaces: ShortRange,
1256 prefix_pos: usize,
1257 prefix: &str,
1258 doc: &Document<'_>,
1259) -> Result<Option<NamespaceIdx>> {
1260 let prefix_opt = if prefix.is_empty() {
1265 None
1266 } else {
1267 Some(prefix)
1268 };
1269
1270 let idx = doc.namespaces.tree_order[namespaces.to_urange()]
1271 .iter()
1272 .find(|idx| doc.namespaces.get(**idx).name == prefix_opt);
1273
1274 match idx {
1275 Some(idx) => Ok(Some(*idx)),
1276 None => {
1277 if !prefix.is_empty() {
1278 let pos = doc.text_pos_at(prefix_pos);
1284 Err(Error::UnknownNamespace(prefix.to_string(), pos))
1285 } else {
1286 Ok(None)
1292 }
1293 }
1294 }
1295}
1296
/// Builds a qualified name: `prefix:local`, or just `local` when the prefix is empty.
fn gen_qname_string(prefix: &str, local: &str) -> String {
    let mut qname = String::with_capacity(prefix.len() + 1 + local.len());

    if !prefix.is_empty() {
        qname.push_str(prefix);
        qname.push(':');
    }
    qname.push_str(local);

    qname
}
1304
/// Iterator over the UTF-8 bytes of a single `char`.
struct CharToBytes {
    /// UTF-8 encoding of the character, padded with `0xFF`.
    buf: [u8; 4],
    /// Index of the next byte to yield; `4` once the iterator is exhausted.
    idx: u8,
}

impl CharToBytes {
    /// Padding byte; `0xFF` never occurs in valid UTF-8, so it marks the end.
    const PADDING: u8 = 0xFF;

    /// Creates an iterator over the UTF-8 encoding of `c`.
    #[inline]
    fn new(c: char) -> Self {
        let mut buf = [Self::PADDING; 4];
        c.encode_utf8(&mut buf);

        CharToBytes { buf, idx: 0 }
    }
}

impl Iterator for CharToBytes {
    type Item = u8;

    /// Yields the next byte of the encoding, or `None` once all were yielded.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Out of range means all four bytes have been yielded already.
        let byte = *self.buf.get(usize::from(self.idx))?;

        if byte == CharToBytes::PADDING {
            // The encoding was shorter than four bytes; stay exhausted from now on.
            self.idx = 4;
            return None;
        }

        self.idx += 1;
        Some(byte)
    }
}
1340
/// Byte buffer in which normalized text and attribute values are assembled.
struct TextBuffer {
    /// Bytes collected so far; `finish` requires them to be valid UTF-8.
    buffer: Vec<u8>,
}

impl TextBuffer {
    /// Creates an empty buffer with a small preallocated capacity.
    #[inline]
    fn new() -> Self {
        TextBuffer {
            buffer: Vec::with_capacity(32),
        }
    }

    /// Appends a byte without any normalization.
    #[inline]
    fn push_raw(&mut self, c: u8) {
        self.buffer.push(c);
    }

    /// Appends a byte of an attribute value, normalizing whitespace.
    ///
    /// `next` is the byte that follows `current` in the input, if any. A `\r`
    /// that is directly followed by `\n` is dropped. `\n`, `\r` and tab are
    /// each stored as a space.
    fn push_from_attr(&mut self, current: u8, next: Option<u8>) {
        // The `\n` of a `\r\n` pair produces the single space on its own.
        if current == b'\r' && next == Some(b'\n') {
            return;
        }

        let normalized = if matches!(current, b'\n' | b'\r' | b'\t') {
            b' '
        } else {
            current
        };

        self.buffer.push(normalized);
    }

    /// Appends a byte of text content, normalizing line ends.
    ///
    /// `\r\n` pairs and lone `\r` bytes each end up as a single `\n`. A `\r`
    /// is stored as is at first and rewritten when the following byte
    /// arrives; `at_end` tells that `c` is the last byte, so that a trailing
    /// `\r` can be rewritten right away.
    fn push_from_text(&mut self, c: u8, at_end: bool) {
        // A `\r` left by the previous call always becomes a `\n`.
        let completes_cr = match self.buffer.last_mut() {
            Some(last) if *last == b'\r' => {
                *last = b'\n';
                true
            }
            _ => false,
        };

        if at_end && c == b'\r' {
            // Nothing follows, so the trailing `\r` is converted immediately.
            self.buffer.push(b'\n');
        } else if !(completes_cr && c == b'\n') {
            // The `\n` of a `\r\n` pair is skipped: the rewritten `\r` covers it.
            self.buffer.push(c);
        }
    }

    /// Removes all collected bytes.
    #[inline]
    fn clear(&mut self) {
        self.buffer.clear();
    }

    /// Returns `true` when no bytes have been collected.
    #[inline]
    fn is_empty(&self) -> bool {
        self.buffer.is_empty()
    }

    /// Takes the collected bytes out as a `String`, leaving the buffer empty.
    ///
    /// # Panics
    ///
    /// Panics when the collected bytes are not valid UTF-8.
    #[inline]
    fn finish(&mut self) -> String {
        let bytes = take(&mut self.buffer);
        String::from_utf8(bytes).unwrap()
    }
}