1#![doc = include_str!("docs/crate_description.md")]
3pub mod attribute;
5pub mod config;
6mod debug;
7pub mod error;
8pub mod io;
9pub mod misc;
10pub mod namespaces;
11pub mod parse;
12pub mod processing_instruction;
13pub mod prolog;
14pub mod reference;
15pub mod tag;
16pub mod transcode;
17
18use crate::{
19 config::{check_config, Config, ExternalEntityParseConfig},
20 misc::{Misc, MiscState},
21 parse::Parse,
22 processing_instruction::ProcessingInstruction,
23 prolog::{
24 doctype::DocType,
25 subset::{
26 entity::{
27 entity_declaration::EntityDecl, entity_definition::EntityDefinition,
28 entity_value::EntityValue, EntitySource,
29 },
30 markup_declaration::MarkupDeclaration,
31 Subset,
32 },
33 xmldecl::XmlDecl,
34 },
35 reference::Reference,
36 tag::Tag,
37};
38
39use attribute::Attribute;
40
41use error::{ConvertNomError, Error};
42use io::parse_external_entity_file;
43use namespaces::ParseNamespace;
44use nom::{
45 branch::alt,
46 bytes::complete::{tag, take_till, take_until},
47 combinator::{cut, map, map_res, not, opt, value},
48 multi::{many0, many1, many_till},
49 sequence::{pair, preceded, tuple},
50};
51
52use prolog::{external_id::ExternalID, subset::entity::entity_declaration::EntityDeclaration};
53
54use std::{cell::RefCell, collections::HashMap, fmt, fs::File, rc::Rc};
55
56pub type IResult<I, O> = nom::IResult<I, O, Error>;
58
/// A name made of an optional prefix and a mandatory local part.
///
/// Two names are equal (and hash identically) only when both the prefix and
/// the local part are equal.
#[derive(Clone, Hash, Eq, PartialEq)]
pub struct Name {
    /// Prefix of the name, if one was supplied.
    pub prefix: Option<String>,
    /// Local part of the name.
    pub local_part: String,
}

impl Name {
    /// Builds a `Name` by copying the borrowed `prefix` and `local_part`
    /// into owned strings.
    pub fn new(prefix: Option<&str>, local_part: &str) -> Self {
        let prefix = prefix.map(String::from);
        let local_part = String::from(local_part);
        Self { prefix, local_part }
    }
}
84type PrologResult<'a> = IResult<
85 &'a str,
86 (
87 Option<Document>,
88 Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
89 ),
90>;
91
92#[derive(Clone, PartialEq, Eq)]
96pub enum Document {
97 Prolog {
98 xml_decl: Option<XmlDecl>,
99 misc: Option<Vec<Misc>>,
100 doc_type: Option<DocType>,
101 },
102 Element(Tag, Box<Document>, Tag),
103 Content(Option<String>), Nested(Vec<Document>),
105 Empty,
106 EmptyTag(Tag),
107 ProcessingInstruction(ProcessingInstruction),
108 Comment(String),
109 CDATA(String),
110}
111impl<'a> Parse<'a> for Document {
112 type Args = &'a Config;
113 type Output = IResult<&'a str, Self>;
114
115 fn parse(input: &'a str, args: Self::Args) -> Self::Output {
123 match check_config(args) {
124 Ok(_) => {
125 let entity_references = Rc::new(RefCell::new(HashMap::new()));
126 let (input, prolog_and_references) =
127 opt(|i| Self::parse_prolog(i, entity_references.clone(), args))(input)?;
128
129 let (prolog, new_entity_references) = match prolog_and_references {
130 Some((prolog, entity_references)) => (prolog, entity_references),
131 None => (None, entity_references.clone()),
132 };
133
134 let mut documents = Vec::new();
135
136 let mut current_input = input;
137 while !current_input.is_empty() {
138 let (input, mut start_tag) = opt(|i| {
139 Tag::parse_start_tag(
140 i,
141 new_entity_references.clone(),
142 EntitySource::Internal,
143 )
144 })(current_input)?;
145
146 let source = Self::determine_source_from_references(&new_entity_references); let (input, content) = Self::parse_content(
149 input,
150 &new_entity_references,
151 source, )?;
153
154 let (input, end_tag) = opt(Tag::parse_end_tag)(input)?;
155
156 let mut empty_tag = if let Document::EmptyTag(empty_tag) = &content {
157 Some(empty_tag.clone())
158 } else {
159 None
160 };
161
162 if let Some(Document::Prolog {
163 doc_type:
164 Some(DocType {
165 subset: Some(ref subset),
166 ..
167 }),
168 ..
169 }) = prolog
170 {
171 for subset in subset {
172 if let Subset::MarkupDecl(MarkupDeclaration::AttList {
173 name,
174 att_defs: Some(att_defs),
175 }) = subset
176 {
177 if let Some(start_tag) = &mut start_tag {
178 if start_tag.name == *name {
179 start_tag.merge_default_attributes(&att_defs.clone());
180 }
181 }
182 if let Some(empty_tag) = &mut empty_tag {
183 if empty_tag.name == *name {
184 empty_tag.merge_default_attributes(&att_defs.clone());
185 }
186 }
187 }
188 }
189 }
190
191 let (input, doc) = Self::construct_document_element(
192 input, start_tag, content, end_tag, empty_tag,
193 )?;
194 if let Document::Empty = &doc {
195 break;
196 }
197
198 documents.push(doc);
199 current_input = input;
200 }
201
202 let (input, documents) = Self::construct_document(input, prolog, documents)?;
203 Ok((input, documents))
204 }
205 Err(e) => Err(Error::from(e).into()),
206 }
207 }
208}
209
210impl Document {
211 fn determine_source_from_references(
212 refs: &Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
213 ) -> EntitySource {
214 let refs_borrow = refs.borrow();
215 if refs_borrow
216 .keys()
217 .any(|(_name, source)| *source == EntitySource::External)
218 {
219 EntitySource::External
220 } else if refs_borrow
221 .keys()
222 .any(|(_, source)| *source == EntitySource::Internal)
223 {
224 EntitySource::Internal
225 } else {
226 EntitySource::None
227 }
228 }
229
230 pub fn parse_prolog<'a>(
232 input: &'a str,
233 entity_references: Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
234 config: &'a Config,
235 ) -> PrologResult<'a> {
236 let (input, xml_decl) = opt(|i| XmlDecl::parse(i, ()))(input)?;
237 let (input, _) = Self::parse_multispace0(input)?;
238 let (input, misc_before) =
239 opt(|input| Misc::parse(input, MiscState::BeforeDoctype))(input)?;
240 let (input, doc_type) =
241 opt(|i| DocType::parse(i, (entity_references.clone(), config)))(input)?;
242 let (input, misc_after) = match &doc_type {
243 Some(_) => opt(|input| Misc::parse(input, MiscState::AfterDoctype))(input)?,
244 None => (input, None),
245 };
246 let updated_entity_references = match &doc_type {
247 Some(dt) => Self::collect_entity_references(dt, entity_references.clone()),
248 None => entity_references.clone(),
249 };
250 let miscs: Vec<Option<Misc>> = vec![misc_before, misc_after];
251 let miscs: Vec<Misc> = miscs.into_iter().flatten().collect();
252 let misc = if miscs.is_empty() { None } else { Some(miscs) };
253
254 let prolog = match (&xml_decl, &misc, &doc_type) {
255 (None, None, None) => None,
256 _ => Some(Document::Prolog {
257 xml_decl,
258 misc,
259 doc_type,
260 }),
261 };
262
263 Ok((input, (prolog, updated_entity_references)))
264 }
265
266 fn parse_char_data(input: &str) -> IResult<&str, String> {
268 map(
269 tuple((
270 take_till(|c: char| c == '<' || c == '&'),
271 not(tag::<&str, &str, nom::error::Error<&str>>("]]>")),
272 )),
273 |(data, _)| data.to_string(),
274 )(input)
275 .map_err(|e| e.convert_nom_error())
276 }
277
278 fn parse_cdata(input: &str) -> IResult<&str, String> {
280 map(
281 cut(|i| {
282 let original_input = i;
283 let (input, _) = many_till(Self::parse_char, tag("]]>"))(i)?;
284 let parsed_length = original_input.len() - input.len() - 3; let cdata_slice = &original_input[..parsed_length];
286 Ok((input, cdata_slice.to_string()))
287 }),
288 |s| s,
289 )(input)
290 }
291
292 fn parse_cdata_section(input: &str) -> IResult<&str, Document> {
296 map(
297 preceded(tag("<![CDATA["), Self::parse_cdata),
298 Document::CDATA,
299 )(input)
300 }
301
302 fn parse_element(
304 input: &str,
305 entity_references: Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
306 ) -> IResult<&str, Document> {
307 let (input, doc) = alt((
308 preceded(
309 Self::parse_multispace0, map(
311 |i| {
312 Tag::parse_empty_element_tag(
313 i,
314 entity_references.clone(),
315 EntitySource::None,
316 )
317 },
318 Document::EmptyTag,
319 ),
320 ),
321 map(
322 tuple((
323 Self::parse_multispace0, |i| Tag::parse_start_tag(i, entity_references.clone(), EntitySource::Internal),
325 |i| Self::parse_content(i, &entity_references, EntitySource::Internal),
326 Tag::parse_end_tag,
327 Self::parse_multispace0, )),
329 |(_whitespace1, start_tag, content, end_tag, _whitespace2)| {
330 Document::Element(start_tag, Box::new(content), end_tag)
331 },
332 ),
333 ))(input)?;
334
335 Ok((input, doc))
336 }
337
338 fn collect_entity_references(
339 doc_type: &DocType,
340 entity_references: Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
341 ) -> Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>> {
342 if let Some(entities) = doc_type.extract_entities() {
343 for boxed_entity in &entities {
344 if let Subset::MarkupDecl(MarkupDeclaration::Entity(entity_decl)) = &**boxed_entity
345 {
346 match entity_decl {
347 EntityDecl::General(decl) | EntityDecl::Parameter(decl) => {
348 match &decl.entity_def {
349 EntityDefinition::EntityValue(value) => {
350 let mut references = entity_references.borrow_mut();
351 references
352 .entry((decl.name.clone(), EntitySource::Internal))
353 .or_insert(value.clone());
354 }
355 EntityDefinition::External { .. } => {
356 let mut references = entity_references.borrow_mut();
357
358 references.entry((decl.name.clone(), EntitySource::External));
359 }
360 }
361 }
362 }
363 }
364 }
365 }
366
367 if entity_references.borrow().is_empty() {
368 Rc::new(RefCell::new(HashMap::new()))
369 } else {
370 entity_references
371 }
372 }
373
374 fn process_references(
375 entity_references: Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
376 ) -> impl Fn(Vec<Reference>) -> Document {
377 move |references| {
378 let mut contents: Vec<String> = Vec::new();
379 for reference in references.into_iter() {
380 match reference.normalize_entity(entity_references.clone()) {
381 EntityValue::Document(doc) => return doc,
382 EntityValue::Value(val) => contents.push(val),
383 _ => {}
384 }
385 }
386 let content = contents.concat();
387 Document::Content(Some(content))
388 }
389 }
390
391 fn parse_content<'a>(
394 input: &'a str,
395 entity_references: &Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
396 entity_source: EntitySource,
397 ) -> IResult<&'a str, Document> {
398 let (input, ((_whitespace, maybe_chardata), elements)) = tuple((
399 pair(
400 Self::parse_multispace0, opt(Self::parse_char_data),
402 ),
403 many0(alt((
404 pair(
405 map(
406 many1(|i| Reference::parse(i, entity_source.clone())),
407 Self::process_references(entity_references.clone()),
408 ),
409 pair(
410 Self::parse_multispace0, opt(Self::parse_char_data),
412 ),
413 ),
414 pair(
415 |i| Self::parse_element(i, entity_references.clone()),
416 pair(
417 Self::parse_multispace0, opt(Self::parse_char_data),
419 ),
420 ),
421 pair(
422 Self::parse_cdata_section,
423 pair(
424 Self::parse_multispace0, opt(Self::parse_char_data),
426 ),
427 ),
428 pair(
429 map(
430 |i| ProcessingInstruction::parse(i, ()),
431 Document::ProcessingInstruction,
432 ),
433 pair(
434 Self::parse_multispace0, opt(Self::parse_char_data),
436 ),
437 ),
438 pair(
439 Self::parse_comment,
440 pair(
441 Self::parse_multispace0, opt(Self::parse_char_data),
443 ),
444 ),
445 ))),
446 ))(input)?;
447
448 let mut content = elements
450 .into_iter()
451 .flat_map(|(doc, maybe_chardata)| {
452 let mut vec = Vec::new();
453
454 vec.push(doc);
455
456 if let (_, Some(chardata)) = maybe_chardata {
457 if !chardata.is_empty() {
458 vec.push(Document::Content(Some(chardata)));
459 }
460 }
461 vec
462 })
463 .collect::<Vec<_>>();
464
465 Ok((
466 input,
467 match maybe_chardata {
468 Some(chardata) if !chardata.is_empty() => {
469 let mut vec = Vec::new();
470
471 vec.push(Document::Content(Some(chardata)));
472
473 vec.append(&mut content);
474
475 match vec.as_slice() {
476 [doc] => doc.clone(),
477
478 _ => Document::Nested(vec),
479 }
480 }
481 _ => {
482 if content.is_empty() {
483 Document::Empty
484 } else {
485 match &content[..] {
486 [doc @ Document::Content(_)] => doc.clone(),
487 [doc @ Document::ProcessingInstruction(_)] => doc.clone(),
488 [doc @ Document::CDATA(_)] => doc.clone(),
489 [doc @ Document::Comment(_)] => doc.clone(),
490 [doc @ Document::EmptyTag(_)] => doc.clone(),
491 [doc @ Document::Empty] => doc.clone(),
492 [doc @ Document::Nested(_)] => doc.clone(),
493 _ => Document::Nested(content),
494 }
495 }
496 }
497 },
498 ))
499 }
500
501 fn parse_comment(input: &str) -> IResult<&str, Document> {
503 map_res(
504 pair(tag("<!--"), many_till(Self::parse_char, tag("-->"))),
505 |(_open_comment, (comment_content, _close_comment))| {
506 let comment_string: String = comment_content.into_iter().collect();
507 if comment_string.contains("--") {
508 Err(nom::Err::Failure(nom::error::Error::new(
509 format!("Failed to parse comment: {comment_string}. Content contains '--'"),
510 nom::error::ErrorKind::Verify,
511 )))
512 } else {
513 Ok(Document::Comment(comment_string))
514 }
515 },
516 )(input)
517 }
518
519 fn construct_document_element(
520 input: &str,
521 start_tag: Option<Tag>,
522 content: Document,
523 end_tag: Option<Tag>,
524 empty_tag: Option<Tag>,
525 ) -> IResult<&str, Document> {
526 match (start_tag, end_tag, content, empty_tag) {
527 (Some(start), Some(end), content, None) => {
528 if start.name != end.name {
529 return Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
530 format!(
531 "{start:?} != {end:?}\ninput:{input:#?}",
532 input = input.to_string()
533 ),
534 nom::error::ErrorKind::Verify,
535 ))));
536 }
537
538 let document = Document::Element(start, Box::new(content), end);
539
540 Ok((input, document))
541 }
542 (Some(start), Some(end), _, Some(empty_tag)) => {
543 if start.name != end.name {
544 return Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
545 format!(
546 "{start:?} != {end:?}\ninput:{input:#?}",
547 input = input.to_string()
548 ),
549 nom::error::ErrorKind::Verify,
550 ))));
551 }
552
553 let document =
554 Document::Element(start, Box::new(Document::EmptyTag(empty_tag)), end);
555
556 Ok((input, document))
557 }
558 (Some(_), None, Document::Element(start, inner_content, end), None) => {
559 if start.name != end.name {
560 return Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
561 format!(
562 "{start:?} != {end:?}\ninput:{input:#?}",
563 input = input.to_string()
564 ),
565 nom::error::ErrorKind::Verify,
566 ))));
567 }
568
569 let document = Document::Element(start, inner_content, end);
570
571 Ok((input, document))
572 }
573 (None, None, Document::Element(start, inner_content, end), None) => {
574 if start.name != end.name {
575 return Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
576 format!(
577 "{start:?} != {end:?}\ninput:{input:#?}",
578 input = input.to_string()
579 ),
580 nom::error::ErrorKind::Verify,
581 ))));
582 }
583
584 let document = Document::Element(start, inner_content, end);
585
586 Ok((input, document))
587 }
588 (None, None, _, Some(empty)) => {
589 let document = Document::EmptyTag(empty);
590
591 Ok((input, document))
592 }
593 (None, None, Document::Empty, None) => Ok((input, Document::Empty)),
594 (None, None, Document::ProcessingInstruction(processing_instruction), None) => {
595 let document = Document::ProcessingInstruction(processing_instruction);
596
597 Ok((input, document))
598 }
599 (None, None, Document::Comment(comment), None) => {
600 let document = Document::Comment(comment);
601
602 Ok((input, document))
603 }
604 _ => Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
605 format!(
606 "Error Constructing Document element with input: {:#?}",
607 input.to_string()
608 ),
609 nom::error::ErrorKind::Verify,
610 )))),
611 }
612 }
613
614 fn construct_document(
615 input: &str,
616 prolog: Option<Document>,
617 documents: Vec<Document>,
618 ) -> IResult<&str, Document> {
619 match documents.len() {
620 0 => Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
621 format!(
622 "Error Constructing the Document. Parsed length is 0 with input: {:#?}",
623 input.to_string()
624 ),
625 nom::error::ErrorKind::Verify,
626 )))),
627 1 => match prolog {
628 Some(prolog) => Ok((
629 input,
630 Document::Nested(vec![prolog, documents.into_iter().next().unwrap()]),
631 )),
632 None => Ok((input, documents.into_iter().next().unwrap())),
633 },
634 _ => match prolog {
635 Some(prolog) => {
636 let mut vec = vec![prolog];
637 vec.extend(documents);
638 Ok((input, Document::Nested(vec)))
639 }
640 None => Ok((input, Document::Nested(documents))),
641 },
642 }
643 }
644
645 pub(crate) fn process_external_entity_file(
646 file_path: String,
647 name: &Name,
648 config: &Config,
649 entity_references: Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
650 ) -> Result<Option<Vec<Subset>>, Box<dyn std::error::Error>> {
651 match File::open(file_path) {
652 Ok(mut file) => {
653 match parse_external_entity_file(&mut file, config, entity_references.clone()) {
654 Ok((entities, subsets)) => {
655 entities.iter().for_each(|entity| {
656 entity_references
657 .borrow_mut()
658 .insert((name.clone(), EntitySource::External), entity.clone());
659 });
660 Ok(subsets)
661 }
662 _ => Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
663 "Failed to match [entity] from `parse_external_entity_file`".to_string(),
664 nom::error::ErrorKind::Fail,
665 )))
666 .into()),
667 }
668 }
669 Err(e) => Err(Error::from(e).into()),
670 }
671 }
672
673 fn get_external_entity_from_declaration(
674 entity_declaration: EntityDecl,
675 entity_references: Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
676 config: &Config,
677 ) -> Result<Option<Vec<Subset>>, Box<dyn std::error::Error>> {
678 if let Config {
679 external_parse_config:
680 ExternalEntityParseConfig {
681 allow_ext_parse: true,
682 base_directory,
683 ..
684 },
685 } = &config
686 {
687 if let EntityDecl::Parameter(EntityDeclaration {
688 name,
689 entity_def:
690 EntityDefinition::External {
691 id: ExternalID::System(ent_file),
692 ..
693 },
694 })
695 | EntityDecl::General(EntityDeclaration {
696 name,
697 entity_def:
698 EntityDefinition::External {
699 id: ExternalID::System(ent_file),
700 ..
701 },
702 }) = &entity_declaration
703 {
704 let file_path = match base_directory {
705 Some(base) => format!("{}/{}", base, ent_file),
706 None => ent_file.clone(),
707 };
708 Self::process_external_entity_file(file_path, name, config, entity_references)
709 } else if let EntityDecl::General(EntityDeclaration {
710 name,
711 entity_def:
712 EntityDefinition::External {
713 id:
714 ExternalID::Public {
715 system_identifier, ..
716 },
717 ..
718 },
719 }) = entity_declaration
720 {
721 if let ExternalID::System(system_identifier) = *system_identifier {
722 let file_path = match base_directory {
723 Some(base) => format!("{}/{}", base, system_identifier),
724 None => system_identifier.clone(),
725 };
726 Document::process_external_entity_file(
727 file_path,
728 &name,
729 config,
730 entity_references,
731 )
732 } else {
733 Err(nom::Err::Error(nom::error::Error::new(
734 "Failed to match *system_identifier",
735 nom::error::ErrorKind::Fail,
736 ))
737 .into())
738 }
739 } else {
740 Err(nom::Err::Error(nom::error::Error::new(
741 "Failed to match ExternalID::Public",
742 nom::error::ErrorKind::Fail,
743 ))
744 .into())
745 }
746 } else {
747 Err(nom::Err::Error(nom::error::Error::new(
748 "Failed to match &entity_declaration",
749 nom::error::ErrorKind::Fail,
750 ))
751 .into())
752 }
753 }
754
755 pub fn parse_element_by_tag_name<'a>(
767 input: &'a str,
768 tag_name: &'a str,
769 attributes: &Option<Vec<Attribute>>,
770 ) -> IResult<&'a str, Document> {
771 let (input, _) = take_until(format!("<{}", tag_name).as_str())(input)?;
772 let entity_references = &Rc::new(RefCell::new(HashMap::new()));
773 let (input, doc) = alt((
774 preceded(
775 Self::parse_multispace0, map(
777 |i| {
778 Tag::parse_empty_element_tag_by_name(
779 i,
780 tag_name,
781 attributes,
782 entity_references,
783 EntitySource::None,
784 )
785 },
786 Document::EmptyTag,
787 ),
788 ),
789 map(
790 tuple((
791 Self::parse_multispace0, |i| {
793 Tag::parse_start_tag_by_name(
794 i,
795 tag_name,
796 attributes,
797 entity_references,
798 EntitySource::Internal,
799 )
800 },
801 |i| Self::parse_content(i, entity_references, EntitySource::Internal),
802 |i| Tag::parse_end_tag_by_name(i, tag_name),
803 Self::parse_multispace0, )),
805 |(_whitespace1, start_tag, content, end_tag, _whitespace2)| {
806 Document::Element(start_tag, Box::new(content), end_tag)
807 },
808 ),
809 ))(input)?;
810 Ok((input, doc))
811 }
812
813 pub fn parse_elements_by_tag_name<'a>(
821 input: &'a str,
822 tag_name: &'a str,
823 attributes: &Option<Vec<Attribute>>,
824 ) -> IResult<&'a str, Vec<Document>> {
825 warnln!("parse_elements_by_tag_name will parse all elements with the tag name `{tag_name}` no matter the nesting level", );
826 warnln!("parse_element_by_tag_name currently only parses start tags without attributes, in this case`<{tag_name}>`");
827
828 many1(|i| Self::parse_element_by_tag_name(i, tag_name, attributes))(input)
829 }
830
/// Experimental: parses the next `tag_name` element only if its children
/// match the children of `pattern`.
///
/// Both the candidate element and the pattern must be elements whose content
/// is `Nested`; otherwise a `Verify` error is returned. In non-strict mode
/// every child of the pattern must partially match at least one child of the
/// candidate.
///
/// NOTE(review): in strict mode the comparison result is discarded, so no
/// pattern child is ever marked as matched.
/// NOTE(review): `peek` is not among the `nom` imports visible in this file,
/// and `parse_element_by_tag_name` is called with the entity-reference table
/// where its signature expects `&Option<Vec<Attribute>>`; this looks stale -
/// confirm before enabling the `experimental` feature.
#[cfg(feature = "experimental")]
pub fn parse_element_from_pattern<'a>(
    input: &'a str,
    tag_name: &'a str,
    pattern: &'a Pattern,
    strict: bool,
    entity_references: &Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
) -> IResult<&'a str, Document> {
    let (_, _pattern_doc) = Self::parse_element(pattern.xml, entity_references.clone())?;

    let pattern = pattern
        .parse(entity_references)
        .map_err(|e| nom::Err::Error(Error::from(e)))?;
    let (input, doc) =
        peek(|input| Self::parse_element_by_tag_name(input, tag_name, entity_references))(input)?;

    // Error reported for every kind of mismatch.
    let verify_failure = || {
        nom::Err::Error(Error::NomError(nom::error::Error::new(
            input.to_string(),
            nom::error::ErrorKind::Verify,
        )))
    };

    let (inner_docs, pattern_inner_docs) = match (&doc, &pattern.doc) {
        (Document::Element(_, inner_element, _), Document::Element(_, pattern_inner_element, _)) => {
            match (&**inner_element, &**pattern_inner_element) {
                (Document::Nested(inner_docs), Document::Nested(pattern_inner_docs)) => {
                    (inner_docs, pattern_inner_docs)
                }
                _ => return Err(verify_failure()),
            }
        }
        _ => return Err(verify_failure()),
    };

    let mut doc_matches = vec![false; pattern_inner_docs.len()];
    for (counter, pattern_doc) in pattern_inner_docs.iter().enumerate() {
        for inner in inner_docs.iter() {
            if strict {
                // Result intentionally unused, exactly as before.
                let _ = Self::compare_documents(inner, pattern.clone(), ComparisonMethod::Strict);
            } else if Self::compare_documents(
                inner,
                Pattern::new("", pattern_doc.clone()),
                ComparisonMethod::Partial,
            ) {
                doc_matches[counter] = true;
            }
        }
    }

    if doc_matches.iter().all(|&matched| matched) {
        Self::parse_element_by_tag_name(input, tag_name, entity_references)
    } else {
        Err(verify_failure())
    }
}
/// Experimental: compares `doc1` with `pattern` using the given method.
#[cfg(feature = "experimental")]
fn compare_documents(doc1: &Document, pattern: Pattern, method: ComparisonMethod) -> bool {
    DynamicEquality::equals(doc1, pattern, method)
}
903}
904
905impl Document {}
906
907impl Document {
908 pub fn iter_with_depth(&self, max_level: usize) -> DocumentIterator {
914 DocumentIterator::new(self, Some(max_level))
915 }
916}
917
918impl<'a> IntoIterator for &'a Document {
919 type Item = &'a Document;
920 type IntoIter = DocumentIterator<'a>;
921
922 fn into_iter(self) -> Self::IntoIter {
923 DocumentIterator::new(self, None)
924 }
925}
926
927pub trait DocumentIteratorExt {
928 fn iter_with_depth(&self, max_level: usize) -> DocumentIterator;
929}
930
931impl DocumentIteratorExt for Vec<Document> {
932 fn iter_with_depth(&self, max_level: usize) -> DocumentIterator {
933 DocumentIterator::new_from_slice(self, Some(max_level))
934 }
935}
936
937impl<'a> DocumentIteratorExt for &'a [Document] {
938 fn iter_with_depth(&self, max_level: usize) -> DocumentIterator {
939 DocumentIterator::new_from_slice(self, Some(max_level))
940 }
941}
942
943#[derive(Clone, Debug, PartialEq, Eq)]
944pub struct DocumentIterator<'a> {
945 stack: Vec<(&'a Document, usize)>,
946 max_depth: Option<usize>,
947}
948
949impl<'a> DocumentIterator<'a> {
950 fn new(doc: &'a Document, max_depth: Option<usize>) -> Self {
951 let stack = vec![(doc, 0)];
952 DocumentIterator { stack, max_depth }
953 }
954
955 fn new_from_slice(docs: &'a [Document], max_depth: Option<usize>) -> Self {
956 let stack = docs.iter().map(|d| (d, 0)).collect();
957 DocumentIterator { stack, max_depth }
958 }
959}
960
961impl<'a> Iterator for DocumentIterator<'a> {
962 type Item = &'a Document;
963
964 fn next(&mut self) -> Option<Self::Item> {
965 while let Some((doc, level)) = self.stack.pop() {
966 if self.max_depth.map_or(true, |max| level < max) {
967 match doc {
968 Document::Nested(docs) => {
969 for d in docs.iter().rev() {
970 self.stack.push((d, level + 1));
971 }
972
973 continue;
974 }
975 Document::Element(_, inner_doc, _) => {
976 self.stack.push((inner_doc, level + 1));
978
979 continue;
980 }
981 _ => {}
982 }
983 }
984
985 return Some(doc);
986 }
987 None
988 }
989}
990
991#[derive(Clone, Debug, PartialEq, Eq)]
992pub enum ConditionalState {
993 None,
994 Optional,
995 ZeroOrMore,
996 OneOrMore,
997}
998impl<'a> Parse<'a> for ConditionalState {
999 type Args = ();
1000 type Output = IResult<&'a str, Self>;
1001 fn parse(input: &'a str, _args: Self::Args) -> Self::Output {
1002 alt((
1003 value(ConditionalState::Optional, tag("?")),
1004 value(ConditionalState::ZeroOrMore, tag("*")),
1005 value(ConditionalState::OneOrMore, tag("+")),
1006 ))(input)
1007 }
1008}
1009
/// Errors raised while extracting data from a parsed document.
///
/// The type is a plain unit-variant enum, so it is cheap to clone and can be
/// compared directly, like the other public enums in this file.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum DocumentError {
    /// No document matched during extraction.
    NoMatchingDocuments,
    /// A nested document was expected but another variant was found.
    ExpectedNestedDocument,
}

impl fmt::Display for DocumentError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            DocumentError::NoMatchingDocuments => "No matching documents found during extraction",
            DocumentError::ExpectedNestedDocument => {
                "Expected a nested document, but found another variant"
            }
        };
        f.write_str(message)
    }
}

impl std::error::Error for DocumentError {}
1031#[derive(Clone, Debug, PartialEq, Eq)]
1032pub struct Pattern<'a> {
1033 pub xml: &'a str,
1034 pub doc: Document,
1035}
1036impl<'a> Pattern<'a> {
1037 pub fn new(xml: &'a str, doc: Document) -> Self {
1038 Self { xml, doc }
1039 }
1040 pub fn parse(
1041 &self,
1042 entity_references: &Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
1043 ) -> Result<Pattern, Box<dyn std::error::Error>> {
1044 let (_, doc) = Document::parse_element(self.xml, entity_references.clone())?;
1045
1046 Ok(Self { xml: self.xml, doc })
1047 }
1048}
1049pub(crate) trait PartialEqCustom {
1050 fn partial_eq(&self, pattern: Pattern) -> bool;
1051}
1052
1053impl PartialEqCustom for Document {
1054 fn partial_eq(&self, pattern: Pattern) -> bool {
1055 match (self, &pattern.doc) {
1056 (
1057 Document::Prolog {
1058 xml_decl: a_xml_decl,
1059 misc: a_misc,
1060 doc_type: a_doc_type,
1061 },
1062 Document::Prolog {
1063 xml_decl: pattern_xml_decl,
1064 misc: pattern_misc,
1065 doc_type: pattern_doc_type,
1066 },
1067 ) => {
1068 a_xml_decl == pattern_xml_decl
1069 && a_misc == pattern_misc
1070 && a_doc_type == pattern_doc_type
1071 }
1072
1073 (
1074 Document::Element(a_start_tag, a_docs, a_end_tag),
1075 Document::Element(pattern_start_tag, pattern_docs, pattern_end_tag),
1076 ) if a_start_tag == pattern_start_tag && a_end_tag == pattern_end_tag => {
1077 match (&**a_docs, &**pattern_docs) {
1078 (Document::Nested(a_docs), Document::Nested(pattern_docs)) => a_docs
1079 .iter()
1080 .zip(pattern_docs.iter())
1081 .all(|(pattern_doc, a_doc)| {
1082 a_doc.partial_eq(Pattern::new("", pattern_doc.clone()))
1083 }),
1084 (Document::Content(_), Document::Content(_)) => true,
1085 _ => panic!("Mismatched types"),
1086 }
1087 }
1088
1089 (Document::Content(a_content), Document::Content(pattern_content)) => {
1090 a_content == pattern_content
1091 }
1092
1093 (Document::Nested(a_docs), Document::Nested(pattern_docs)) => a_docs == pattern_docs,
1094
1095 (Document::Empty, Document::Empty) => true,
1096 (Document::EmptyTag(a_tag), Document::EmptyTag(pattern_tag)) => a_tag == pattern_tag,
1097 (
1098 Document::ProcessingInstruction(a_pi),
1099 Document::ProcessingInstruction(pattern_pi),
1100 ) => a_pi == pattern_pi,
1101 (Document::Comment(a_comment), Document::Comment(pattern_comment)) => {
1102 a_comment == pattern_comment
1103 }
1104 (Document::CDATA(a_cdata), Document::CDATA(pattern_cdata)) => a_cdata == pattern_cdata,
1105
1106 _ => false,
1107 }
1108 }
1109}
1110impl<'a> ParseNamespace<'a> for Document {}
1111pub(crate) trait StrictEq {
1112 fn strict_eq(&self, pattern: Pattern) -> bool;
1113}
1114impl StrictEq for Document {
1115 fn strict_eq(&self, pattern: Pattern) -> bool {
1116 self == &pattern.doc
1117 }
1118}
1119pub trait DynamicEquality {
1120 fn equals(&self, pattern: Pattern, method: ComparisonMethod) -> bool;
1121}
1122
/// How a document is compared against a pattern.
///
/// The enum is a plain selector, so it is `Copy` and can be compared.
#[derive(Clone, Copy, PartialEq, Eq)]
pub enum ComparisonMethod {
    /// Lenient comparison.
    Partial,
    /// Exact equality.
    Strict,
}
1127
1128impl DynamicEquality for Document {
1129 fn equals(&self, pattern: Pattern, method: ComparisonMethod) -> bool {
1130 match method {
1131 ComparisonMethod::Partial => self.partial_eq(pattern),
1132
1133 ComparisonMethod::Strict => self.strict_eq(pattern),
1134 }
1135 }
1136}
1137
1138pub trait UpdateFields {
1139 fn update_fields(&mut self, doc: &Document) -> Result<(), Box<dyn std::error::Error>>
1140 where
1141 Self: std::fmt::Debug,
1142 {
1143 match doc {
1144 Document::Element(tag, nested_doc, _) => {
1145 self.update_attribute_fields(tag)?;
1146 if let Document::Nested(elements) = nested_doc.as_ref() {
1147 elements
1148 .iter_with_depth(0)
1149 .filter_map(|element| {
1150 if let Document::Element(tag, inner_doc, _) = element {
1151 Some((tag, inner_doc))
1152 } else {
1153 None
1154 }
1155 })
1156 .try_for_each(|(tag, inner_doc)| self.update_field(tag, inner_doc))
1157 } else {
1158 self.update_fields(nested_doc)
1159 }
1160 }
1161 Document::Nested(elements) => elements
1162 .iter_with_depth(0)
1163 .filter_map(|element| {
1164 if let Document::Element(tag, inner_doc, _) = element {
1165 Some((tag, inner_doc))
1166 } else {
1167 None
1168 }
1169 })
1170 .try_for_each(|(tag, inner_doc)| self.update_field(tag, inner_doc)),
1171 _ => Ok(()),
1172 }
1173 }
1174 fn update_field(&mut self, tag: &Tag, doc: &Document)
1175 -> Result<(), Box<dyn std::error::Error>>;
1176 fn update_attribute_fields(&mut self, _tag: &Tag) -> Result<(), Box<dyn std::error::Error>> {
1177 Ok(())
1178 }
1179}
1180
1181impl<T> UpdateFields for Option<T>
1182where
1183 T: UpdateFields + Default + std::fmt::Debug,
1184{
1185 fn update_fields(&mut self, doc: &Document) -> Result<(), Box<dyn std::error::Error>> {
1186 match self {
1187 Some(value) => value.update_fields(doc),
1188 None => {
1189 let mut new_value = T::default();
1190 new_value.update_fields(doc)?;
1191 *self = Some(new_value);
1192 Ok(())
1193 }
1194 }
1195 }
1196
1197 fn update_field(
1198 &mut self,
1199 tag: &Tag,
1200 doc: &Document,
1201 ) -> Result<(), Box<dyn std::error::Error>> {
1202 match self {
1203 Some(value) => value.update_field(tag, doc),
1204 None => {
1205 let mut new_value = T::default();
1206 new_value.update_field(tag, doc)?;
1207 *self = Some(new_value);
1208 Ok(())
1209 }
1210 }
1211 }
1212
1213 fn update_attribute_fields(&mut self, tag: &Tag) -> Result<(), Box<dyn std::error::Error>> {
1214 match self {
1215 Some(value) => value.update_attribute_fields(tag),
1216 None => {
1217 let mut new_value = T::default();
1218 new_value.update_attribute_fields(tag)?;
1219 *self = Some(new_value);
1220 Ok(())
1221 }
1222 }
1223 }
1224}