nom_xml/
lib.rs

1//!
2#![doc = include_str!("docs/crate_description.md")]
3//!
4pub mod attribute;
5pub mod config;
6mod debug;
7pub mod error;
8pub mod io;
9pub mod misc;
10pub mod namespaces;
11pub mod parse;
12pub mod processing_instruction;
13pub mod prolog;
14pub mod reference;
15pub mod tag;
16pub mod transcode;
17
18use crate::{
19    config::{check_config, Config, ExternalEntityParseConfig},
20    misc::{Misc, MiscState},
21    parse::Parse,
22    processing_instruction::ProcessingInstruction,
23    prolog::{
24        doctype::DocType,
25        subset::{
26            entity::{
27                entity_declaration::EntityDecl, entity_definition::EntityDefinition,
28                entity_value::EntityValue, EntitySource,
29            },
30            markup_declaration::MarkupDeclaration,
31            Subset,
32        },
33        xmldecl::XmlDecl,
34    },
35    reference::Reference,
36    tag::Tag,
37};
38
39use attribute::Attribute;
40
41use error::{ConvertNomError, Error};
42use io::parse_external_entity_file;
43use namespaces::ParseNamespace;
44use nom::{
45    branch::alt,
46    bytes::complete::{tag, take_till, take_until},
47    combinator::{cut, map, map_res, not, opt, value},
48    multi::{many0, many1, many_till},
49    sequence::{pair, preceded, tuple},
50};
51
52use prolog::{external_id::ExternalID, subset::entity::entity_declaration::EntityDeclaration};
53
54use std::{cell::RefCell, collections::HashMap, fmt, fs::File, rc::Rc};
55
56// pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
/// Crate-wide parser result type: [`nom::IResult`] with the error type fixed to this crate's [`Error`].
pub type IResult<I, O> = nom::IResult<I, O, Error>;
58
/// A name made of an optional prefix and a local part.
///
/// The type is hashable and comparable, so it can be used as (part of) a map key.
#[derive(Clone, Hash, Eq, PartialEq)]
pub struct Name {
    pub prefix: Option<String>,
    pub local_part: String,
}

impl Name {
    /// Builds a `Name` from borrowed string slices, copying them into owned `String`s.
    ///
    /// ```rust
    /// use nom_xml::Name;
    /// // A name without a prefix
    /// let name = Name::new(None, "actual name");
    ///
    /// // A name with a prefix
    /// let prefixed_name = Name::new(Some("prefix"), "actual name");
    /// ```
    ///
    pub fn new(prefix: Option<&str>, local_part: &str) -> Self {
        let prefix = prefix.map(String::from);
        let local_part = String::from(local_part);
        Self { prefix, local_part }
    }
}
/// Result of [`Document::parse_prolog`]: the optional prolog document paired with the
/// shared map of entity values keyed by `(Name, EntitySource)`.
type PrologResult<'a> = IResult<
    &'a str,
    (
        Option<Document>,
        Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
    ),
>;
91
/// Main entry point for parsing XML documents
///
/// This enum encapsulates all of the top level types that comprise an XML document. The core variant is the `Element(Tag,Box<Document>,Tag)` type which allows recursive parsing of nested tags and their content.
#[derive(Clone, PartialEq, Eq)]
pub enum Document {
    /// Everything parsed by `parse_prolog`: the XML declaration, the miscellaneous
    /// items found before/after the doctype, and the doctype itself.
    Prolog {
        xml_decl: Option<XmlDecl>,
        misc: Option<Vec<Misc>>,
        doc_type: Option<DocType>,
    },
    /// A start tag, the content between the tags, and the end tag.
    Element(Tag, Box<Document>, Tag),
    /// Character data.
    Content(Option<String>), //TODO: Investigate if content can ever be None. I think Empty handles this case. If so, remove the Option
    /// A sequence of documents.
    Nested(Vec<Document>),
    /// Produced when content parsing finds neither character data nor items.
    Empty,
    /// An empty-element tag.
    EmptyTag(Tag),
    /// A processing instruction.
    ProcessingInstruction(ProcessingInstruction),
    /// The text between `<!--` and `-->`.
    Comment(String),
    /// The text between `<![CDATA[` and `]]>`.
    CDATA(String),
}
111impl<'a> Parse<'a> for Document {
112    type Args = &'a Config;
113    type Output = IResult<&'a str, Self>;
114
115    /// ```rust
116    /// use nom_xml::{parse::Parse, config::Config, Document};
117    ///
118    /// let xml = "<root><child>Content</child></root>";
119    /// let (_, doc) = Document::parse(xml, &Config::default()).unwrap();
120    /// println!("{doc:?}");
121    /// ```
122    fn parse(input: &'a str, args: Self::Args) -> Self::Output {
123        match check_config(args) {
124            Ok(_) => {
125                let entity_references = Rc::new(RefCell::new(HashMap::new()));
126                let (input, prolog_and_references) =
127                    opt(|i| Self::parse_prolog(i, entity_references.clone(), args))(input)?;
128
129                let (prolog, new_entity_references) = match prolog_and_references {
130                    Some((prolog, entity_references)) => (prolog, entity_references),
131                    None => (None, entity_references.clone()),
132                };
133
134                let mut documents = Vec::new();
135
136                let mut current_input = input;
137                while !current_input.is_empty() {
138                    let (input, mut start_tag) = opt(|i| {
139                        Tag::parse_start_tag(
140                            i,
141                            new_entity_references.clone(),
142                            EntitySource::Internal,
143                        )
144                    })(current_input)?;
145
146                    let source = Self::determine_source_from_references(&new_entity_references); //THIS IS THE ISSUE
147
148                    let (input, content) = Self::parse_content(
149                        input,
150                        &new_entity_references,
151                        source, //TODO Investigate how to handle both internal and external
152                    )?;
153
154                    let (input, end_tag) = opt(Tag::parse_end_tag)(input)?;
155
156                    let mut empty_tag = if let Document::EmptyTag(empty_tag) = &content {
157                        Some(empty_tag.clone())
158                    } else {
159                        None
160                    };
161
162                    if let Some(Document::Prolog {
163                        doc_type:
164                            Some(DocType {
165                                subset: Some(ref subset),
166                                ..
167                            }),
168                        ..
169                    }) = prolog
170                    {
171                        for subset in subset {
172                            if let Subset::MarkupDecl(MarkupDeclaration::AttList {
173                                name,
174                                att_defs: Some(att_defs),
175                            }) = subset
176                            {
177                                if let Some(start_tag) = &mut start_tag {
178                                    if start_tag.name == *name {
179                                        start_tag.merge_default_attributes(&att_defs.clone());
180                                    }
181                                }
182                                if let Some(empty_tag) = &mut empty_tag {
183                                    if empty_tag.name == *name {
184                                        empty_tag.merge_default_attributes(&att_defs.clone());
185                                    }
186                                }
187                            }
188                        }
189                    }
190
191                    let (input, doc) = Self::construct_document_element(
192                        input, start_tag, content, end_tag, empty_tag,
193                    )?;
194                    if let Document::Empty = &doc {
195                        break;
196                    }
197
198                    documents.push(doc);
199                    current_input = input;
200                }
201
202                let (input, documents) = Self::construct_document(input, prolog, documents)?;
203                Ok((input, documents))
204            }
205            Err(e) => Err(Error::from(e).into()),
206        }
207    }
208}
209
210impl Document {
211    fn determine_source_from_references(
212        refs: &Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
213    ) -> EntitySource {
214        let refs_borrow = refs.borrow();
215        if refs_borrow
216            .keys()
217            .any(|(_name, source)| *source == EntitySource::External)
218        {
219            EntitySource::External
220        } else if refs_borrow
221            .keys()
222            .any(|(_, source)| *source == EntitySource::Internal)
223        {
224            EntitySource::Internal
225        } else {
226            EntitySource::None
227        }
228    }
229
230    //[22 prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
231    pub fn parse_prolog<'a>(
232        input: &'a str,
233        entity_references: Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
234        config: &'a Config,
235    ) -> PrologResult<'a> {
236        let (input, xml_decl) = opt(|i| XmlDecl::parse(i, ()))(input)?;
237        let (input, _) = Self::parse_multispace0(input)?;
238        let (input, misc_before) =
239            opt(|input| Misc::parse(input, MiscState::BeforeDoctype))(input)?;
240        let (input, doc_type) =
241            opt(|i| DocType::parse(i, (entity_references.clone(), config)))(input)?;
242        let (input, misc_after) = match &doc_type {
243            Some(_) => opt(|input| Misc::parse(input, MiscState::AfterDoctype))(input)?,
244            None => (input, None),
245        };
246        let updated_entity_references = match &doc_type {
247            Some(dt) => Self::collect_entity_references(dt, entity_references.clone()),
248            None => entity_references.clone(),
249        };
250        let miscs: Vec<Option<Misc>> = vec![misc_before, misc_after];
251        let miscs: Vec<Misc> = miscs.into_iter().flatten().collect();
252        let misc = if miscs.is_empty() { None } else { Some(miscs) };
253
254        let prolog = match (&xml_decl, &misc, &doc_type) {
255            (None, None, None) => None,
256            _ => Some(Document::Prolog {
257                xml_decl,
258                misc,
259                doc_type,
260            }),
261        };
262
263        Ok((input, (prolog, updated_entity_references)))
264    }
265
266    // [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
267    fn parse_char_data(input: &str) -> IResult<&str, String> {
268        map(
269            tuple((
270                take_till(|c: char| c == '<' || c == '&'),
271                not(tag::<&str, &str, nom::error::Error<&str>>("]]>")),
272            )),
273            |(data, _)| data.to_string(),
274        )(input)
275        .map_err(|e| e.convert_nom_error())
276    }
277
278    // [20] CData ::= (Char* - (Char* ']]>' Char*))
279    fn parse_cdata(input: &str) -> IResult<&str, String> {
280        map(
281            cut(|i| {
282                let original_input = i;
283                let (input, _) = many_till(Self::parse_char, tag("]]>"))(i)?;
284                let parsed_length = original_input.len() - input.len() - 3; // subtract 3 for ']]>'
285                let cdata_slice = &original_input[..parsed_length];
286                Ok((input, cdata_slice.to_string()))
287            }),
288            |s| s,
289        )(input)
290    }
291
292    // [18] CDSect ::= CDStart CData CDEnd
293    // [19] CDStart ::= '<![CDATA['
294    //[21] CDEnd ::= ']]>'
295    fn parse_cdata_section(input: &str) -> IResult<&str, Document> {
296        map(
297            preceded(tag("<![CDATA["), Self::parse_cdata),
298            Document::CDATA,
299        )(input)
300    }
301
302    // [39] element	::= EmptyElemTag | STag content ETag
303    fn parse_element(
304        input: &str,
305        entity_references: Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
306    ) -> IResult<&str, Document> {
307        let (input, doc) = alt((
308            preceded(
309                Self::parse_multispace0, // this is not adhering strictly to the spec, but handles the case where there is whitespace before the start tag for human readability
310                map(
311                    |i| {
312                        Tag::parse_empty_element_tag(
313                            i,
314                            entity_references.clone(),
315                            EntitySource::None,
316                        )
317                    },
318                    Document::EmptyTag,
319                ),
320            ),
321            map(
322                tuple((
323                    Self::parse_multispace0, // this is not adhering strictly to the spec, but handles the case where there is whitespace before the start tag for human readability
324                    |i| Tag::parse_start_tag(i, entity_references.clone(), EntitySource::Internal),
325                    |i| Self::parse_content(i, &entity_references, EntitySource::Internal),
326                    Tag::parse_end_tag,
327                    Self::parse_multispace0, // this is not adhering strictly to the spec, but handles the case where there is whitespace after the start tag for human readability
328                )),
329                |(_whitespace1, start_tag, content, end_tag, _whitespace2)| {
330                    Document::Element(start_tag, Box::new(content), end_tag)
331                },
332            ),
333        ))(input)?;
334
335        Ok((input, doc))
336    }
337
338    fn collect_entity_references(
339        doc_type: &DocType,
340        entity_references: Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
341    ) -> Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>> {
342        if let Some(entities) = doc_type.extract_entities() {
343            for boxed_entity in &entities {
344                if let Subset::MarkupDecl(MarkupDeclaration::Entity(entity_decl)) = &**boxed_entity
345                {
346                    match entity_decl {
347                        EntityDecl::General(decl) | EntityDecl::Parameter(decl) => {
348                            match &decl.entity_def {
349                                EntityDefinition::EntityValue(value) => {
350                                    let mut references = entity_references.borrow_mut();
351                                    references
352                                        .entry((decl.name.clone(), EntitySource::Internal))
353                                        .or_insert(value.clone());
354                                }
355                                EntityDefinition::External { .. } => {
356                                    let mut references = entity_references.borrow_mut();
357
358                                    references.entry((decl.name.clone(), EntitySource::External));
359                                }
360                            }
361                        }
362                    }
363                }
364            }
365        }
366
367        if entity_references.borrow().is_empty() {
368            Rc::new(RefCell::new(HashMap::new()))
369        } else {
370            entity_references
371        }
372    }
373
374    fn process_references(
375        entity_references: Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
376    ) -> impl Fn(Vec<Reference>) -> Document {
377        move |references| {
378            let mut contents: Vec<String> = Vec::new();
379            for reference in references.into_iter() {
380                match reference.normalize_entity(entity_references.clone()) {
381                    EntityValue::Document(doc) => return doc,
382                    EntityValue::Value(val) => contents.push(val),
383                    _ => {}
384                }
385            }
386            let content = contents.concat();
387            Document::Content(Some(content))
388        }
389    }
390
391    // TODO: add validation for elements using the ConditionalState in the ContentParticle from the prolog
392    // [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
393    fn parse_content<'a>(
394        input: &'a str,
395        entity_references: &Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
396        entity_source: EntitySource,
397    ) -> IResult<&'a str, Document> {
398        let (input, ((_whitespace, maybe_chardata), elements)) = tuple((
399            pair(
400                Self::parse_multispace0, // this is not strictly adhering to the standard; however, it prevents the first Nested element from being Nested([Content(" ")])
401                opt(Self::parse_char_data),
402            ),
403            many0(alt((
404                pair(
405                    map(
406                        many1(|i| Reference::parse(i, entity_source.clone())),
407                        Self::process_references(entity_references.clone()),
408                    ),
409                    pair(
410                        Self::parse_multispace0, // this is not strictly adhering to the standard; however, it prevents the first Nested element from being Nested([Content(" ")])
411                        opt(Self::parse_char_data),
412                    ),
413                ),
414                pair(
415                    |i| Self::parse_element(i, entity_references.clone()),
416                    pair(
417                        Self::parse_multispace0, // this is not strictly adhering to the standard; however, it prevents the first Nested element from being Nested([Content(" ")])
418                        opt(Self::parse_char_data),
419                    ),
420                ),
421                pair(
422                    Self::parse_cdata_section,
423                    pair(
424                        Self::parse_multispace0, // this is not strictly adhering to the standard; however, it prevents the first Nested element from being Nested([Content(" ")])
425                        opt(Self::parse_char_data),
426                    ),
427                ),
428                pair(
429                    map(
430                        |i| ProcessingInstruction::parse(i, ()),
431                        Document::ProcessingInstruction,
432                    ),
433                    pair(
434                        Self::parse_multispace0, // this is not strictly adhering to the standard; however, it prevents the first Nested element from being Nested([Content(" ")])
435                        opt(Self::parse_char_data),
436                    ),
437                ),
438                pair(
439                    Self::parse_comment,
440                    pair(
441                        Self::parse_multispace0, // this is not strictly adhering to the standard; however, it prevents the first Nested element from being Nested([Content(" ")])
442                        opt(Self::parse_char_data),
443                    ),
444                ),
445            ))),
446        ))(input)?;
447
448        // Check if maybe_chardata contains a comma
449        let mut content = elements
450            .into_iter()
451            .flat_map(|(doc, maybe_chardata)| {
452                let mut vec = Vec::new();
453
454                vec.push(doc);
455
456                if let (_, Some(chardata)) = maybe_chardata {
457                    if !chardata.is_empty() {
458                        vec.push(Document::Content(Some(chardata)));
459                    }
460                }
461                vec
462            })
463            .collect::<Vec<_>>();
464
465        Ok((
466            input,
467            match maybe_chardata {
468                Some(chardata) if !chardata.is_empty() => {
469                    let mut vec = Vec::new();
470
471                    vec.push(Document::Content(Some(chardata)));
472
473                    vec.append(&mut content);
474
475                    match vec.as_slice() {
476                        [doc] => doc.clone(),
477
478                        _ => Document::Nested(vec),
479                    }
480                }
481                _ => {
482                    if content.is_empty() {
483                        Document::Empty
484                    } else {
485                        match &content[..] {
486                            [doc @ Document::Content(_)] => doc.clone(),
487                            [doc @ Document::ProcessingInstruction(_)] => doc.clone(),
488                            [doc @ Document::CDATA(_)] => doc.clone(),
489                            [doc @ Document::Comment(_)] => doc.clone(),
490                            [doc @ Document::EmptyTag(_)] => doc.clone(),
491                            [doc @ Document::Empty] => doc.clone(),
492                            [doc @ Document::Nested(_)] => doc.clone(),
493                            _ => Document::Nested(content),
494                        }
495                    }
496                }
497            },
498        ))
499    }
500
501    // [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
502    fn parse_comment(input: &str) -> IResult<&str, Document> {
503        map_res(
504            pair(tag("<!--"), many_till(Self::parse_char, tag("-->"))),
505            |(_open_comment, (comment_content, _close_comment))| {
506                let comment_string: String = comment_content.into_iter().collect();
507                if comment_string.contains("--") {
508                    Err(nom::Err::Failure(nom::error::Error::new(
509                        format!("Failed to parse comment: {comment_string}. Content contains '--'"),
510                        nom::error::ErrorKind::Verify,
511                    )))
512                } else {
513                    Ok(Document::Comment(comment_string))
514                }
515            },
516        )(input)
517    }
518
519    fn construct_document_element(
520        input: &str,
521        start_tag: Option<Tag>,
522        content: Document,
523        end_tag: Option<Tag>,
524        empty_tag: Option<Tag>,
525    ) -> IResult<&str, Document> {
526        match (start_tag, end_tag, content, empty_tag) {
527            (Some(start), Some(end), content, None) => {
528                if start.name != end.name {
529                    return Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
530                        format!(
531                            "{start:?} != {end:?}\ninput:{input:#?}",
532                            input = input.to_string()
533                        ),
534                        nom::error::ErrorKind::Verify,
535                    ))));
536                }
537
538                let document = Document::Element(start, Box::new(content), end);
539
540                Ok((input, document))
541            }
542            (Some(start), Some(end), _, Some(empty_tag)) => {
543                if start.name != end.name {
544                    return Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
545                        format!(
546                            "{start:?} != {end:?}\ninput:{input:#?}",
547                            input = input.to_string()
548                        ),
549                        nom::error::ErrorKind::Verify,
550                    ))));
551                }
552
553                let document =
554                    Document::Element(start, Box::new(Document::EmptyTag(empty_tag)), end);
555
556                Ok((input, document))
557            }
558            (Some(_), None, Document::Element(start, inner_content, end), None) => {
559                if start.name != end.name {
560                    return Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
561                        format!(
562                            "{start:?} != {end:?}\ninput:{input:#?}",
563                            input = input.to_string()
564                        ),
565                        nom::error::ErrorKind::Verify,
566                    ))));
567                }
568
569                let document = Document::Element(start, inner_content, end);
570
571                Ok((input, document))
572            }
573            (None, None, Document::Element(start, inner_content, end), None) => {
574                if start.name != end.name {
575                    return Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
576                        format!(
577                            "{start:?} != {end:?}\ninput:{input:#?}",
578                            input = input.to_string()
579                        ),
580                        nom::error::ErrorKind::Verify,
581                    ))));
582                }
583
584                let document = Document::Element(start, inner_content, end);
585
586                Ok((input, document))
587            }
588            (None, None, _, Some(empty)) => {
589                let document = Document::EmptyTag(empty);
590
591                Ok((input, document))
592            }
593            (None, None, Document::Empty, None) => Ok((input, Document::Empty)),
594            (None, None, Document::ProcessingInstruction(processing_instruction), None) => {
595                let document = Document::ProcessingInstruction(processing_instruction);
596
597                Ok((input, document))
598            }
599            (None, None, Document::Comment(comment), None) => {
600                let document = Document::Comment(comment);
601
602                Ok((input, document))
603            }
604            _ => Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
605                format!(
606                    "Error Constructing Document element with input: {:#?}",
607                    input.to_string()
608                ),
609                nom::error::ErrorKind::Verify,
610            )))),
611        }
612    }
613
614    fn construct_document(
615        input: &str,
616        prolog: Option<Document>,
617        documents: Vec<Document>,
618    ) -> IResult<&str, Document> {
619        match documents.len() {
620            0 => Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
621                format!(
622                    "Error Constructing the Document. Parsed length is 0 with input: {:#?}",
623                    input.to_string()
624                ),
625                nom::error::ErrorKind::Verify,
626            )))),
627            1 => match prolog {
628                Some(prolog) => Ok((
629                    input,
630                    Document::Nested(vec![prolog, documents.into_iter().next().unwrap()]),
631                )),
632                None => Ok((input, documents.into_iter().next().unwrap())),
633            },
634            _ => match prolog {
635                Some(prolog) => {
636                    let mut vec = vec![prolog];
637                    vec.extend(documents);
638                    Ok((input, Document::Nested(vec)))
639                }
640                None => Ok((input, Document::Nested(documents))),
641            },
642        }
643    }
644
645    pub(crate) fn process_external_entity_file(
646        file_path: String,
647        name: &Name,
648        config: &Config,
649        entity_references: Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
650    ) -> Result<Option<Vec<Subset>>, Box<dyn std::error::Error>> {
651        match File::open(file_path) {
652            Ok(mut file) => {
653                match parse_external_entity_file(&mut file, config, entity_references.clone()) {
654                    Ok((entities, subsets)) => {
655                        entities.iter().for_each(|entity| {
656                            entity_references
657                                .borrow_mut()
658                                .insert((name.clone(), EntitySource::External), entity.clone());
659                        });
660                        Ok(subsets)
661                    }
662                    _ => Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
663                        "Failed to match [entity] from `parse_external_entity_file`".to_string(),
664                        nom::error::ErrorKind::Fail,
665                    )))
666                    .into()),
667                }
668            }
669            Err(e) => Err(Error::from(e).into()),
670        }
671    }
672
673    fn get_external_entity_from_declaration(
674        entity_declaration: EntityDecl,
675        entity_references: Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
676        config: &Config,
677    ) -> Result<Option<Vec<Subset>>, Box<dyn std::error::Error>> {
678        if let Config {
679            external_parse_config:
680                ExternalEntityParseConfig {
681                    allow_ext_parse: true,
682                    base_directory,
683                    ..
684                },
685        } = &config
686        {
687            if let EntityDecl::Parameter(EntityDeclaration {
688                name,
689                entity_def:
690                    EntityDefinition::External {
691                        id: ExternalID::System(ent_file),
692                        ..
693                    },
694            })
695            | EntityDecl::General(EntityDeclaration {
696                name,
697                entity_def:
698                    EntityDefinition::External {
699                        id: ExternalID::System(ent_file),
700                        ..
701                    },
702            }) = &entity_declaration
703            {
704                let file_path = match base_directory {
705                    Some(base) => format!("{}/{}", base, ent_file),
706                    None => ent_file.clone(),
707                };
708                Self::process_external_entity_file(file_path, name, config, entity_references)
709            } else if let EntityDecl::General(EntityDeclaration {
710                name,
711                entity_def:
712                    EntityDefinition::External {
713                        id:
714                            ExternalID::Public {
715                                system_identifier, ..
716                            },
717                        ..
718                    },
719            }) = entity_declaration
720            {
721                if let ExternalID::System(system_identifier) = *system_identifier {
722                    let file_path = match base_directory {
723                        Some(base) => format!("{}/{}", base, system_identifier),
724                        None => system_identifier.clone(),
725                    };
726                    Document::process_external_entity_file(
727                        file_path,
728                        &name,
729                        config,
730                        entity_references,
731                    )
732                } else {
733                    Err(nom::Err::Error(nom::error::Error::new(
734                        "Failed to match *system_identifier",
735                        nom::error::ErrorKind::Fail,
736                    ))
737                    .into())
738                }
739            } else {
740                Err(nom::Err::Error(nom::error::Error::new(
741                    "Failed to match ExternalID::Public",
742                    nom::error::ErrorKind::Fail,
743                ))
744                .into())
745            }
746        } else {
747            Err(nom::Err::Error(nom::error::Error::new(
748                "Failed to match &entity_declaration",
749                nom::error::ErrorKind::Fail,
750            ))
751            .into())
752        }
753    }
754
755    /// The main interface for parsing the first element that matches criteria
756    ///
757    /// See the [`parse_first_matching_element`](https://github.com/RodogInfinite/NomExML/blob/main/examples/parse_first_matching_element.rs) example for more information
758    ///
759    /// Run with `cargo run --example parse_first_matching_element`
760    ///
761    /// Also see the [`parse_element_with_specific_attribute_value`](https://github.com/RodogInfinite/NomExML/blob/main/examples/parse_element_with_specific_attribute_value.rs) example
762    ///
763    /// Run with `cargo run --example parse_element_with_specific_attribute_value`
764    ///
765    // [39] element	::= EmptyElemTag | STag content ETag
766    pub fn parse_element_by_tag_name<'a>(
767        input: &'a str,
768        tag_name: &'a str,
769        attributes: &Option<Vec<Attribute>>,
770    ) -> IResult<&'a str, Document> {
771        let (input, _) = take_until(format!("<{}", tag_name).as_str())(input)?;
772        let entity_references = &Rc::new(RefCell::new(HashMap::new()));
773        let (input, doc) = alt((
774            preceded(
775                Self::parse_multispace0, // this is not adhering strictly to the spec, but handles the case where there is whitespace before the start tag for human readability
776                map(
777                    |i| {
778                        Tag::parse_empty_element_tag_by_name(
779                            i,
780                            tag_name,
781                            attributes,
782                            entity_references,
783                            EntitySource::None,
784                        )
785                    },
786                    Document::EmptyTag,
787                ),
788            ),
789            map(
790                tuple((
791                    Self::parse_multispace0, // this is not adhering strictly to the spec, but handles the case where there is whitespace before the start tag for human readability
792                    |i| {
793                        Tag::parse_start_tag_by_name(
794                            i,
795                            tag_name,
796                            attributes,
797                            entity_references,
798                            EntitySource::Internal,
799                        )
800                    },
801                    |i| Self::parse_content(i, entity_references, EntitySource::Internal),
802                    |i| Tag::parse_end_tag_by_name(i, tag_name),
803                    Self::parse_multispace0, // this is not adhering strictly to the spec, but handles the case where there is whitespace after the start tag for human readability
804                )),
805                |(_whitespace1, start_tag, content, end_tag, _whitespace2)| {
806                    Document::Element(start_tag, Box::new(content), end_tag)
807                },
808            ),
809        ))(input)?;
810        Ok((input, doc))
811    }
812
813    /// The main interface for parsing many elements with the same tag name
814    ///
815    /// See the [`parse_all_of_specific_tag`](https://github.com/RodogInfinite/NomExML/blob/main/examples/parse_all_of_specific_tag.rs) example for more information
816    ///
817    /// Run with `cargo run --example parse_all_of_specific_tag`
818    ///
819    // [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
820    pub fn parse_elements_by_tag_name<'a>(
821        input: &'a str,
822        tag_name: &'a str,
823        attributes: &Option<Vec<Attribute>>,
824    ) -> IResult<&'a str, Vec<Document>> {
825        warnln!("parse_elements_by_tag_name will parse all elements with the tag name `{tag_name}` no matter the nesting level", );
826        warnln!("parse_element_by_tag_name currently only parses start tags without attributes, in this case`<{tag_name}>`");
827
828        many1(|i| Self::parse_element_by_tag_name(i, tag_name, attributes))(input)
829    }
830
831    #[cfg(feature = "experimental")]
832    pub fn parse_element_from_pattern<'a>(
833        input: &'a str,
834        tag_name: &'a str,
835        pattern: &'a Pattern,
836        strict: bool,
837        entity_references: &Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
838    ) -> IResult<&'a str, Document> {
839        let (_, _pattern_doc) = Self::parse_element(pattern.xml, entity_references.clone())?;
840
841        let pattern = pattern
842            .parse(entity_references)
843            .map_err(|e| nom::Err::Error(Error::from(e)))?;
844        let (input, doc) =
845            peek(|input| Self::parse_element_by_tag_name(input, tag_name, entity_references))(
846                input,
847            )?;
848        match (&doc, &pattern.doc) {
849            (
850                Document::Element(_, inner_element, _),
851                Document::Element(_, pattern_inner_element, _),
852            ) => {
853                if let (Document::Nested(inner_docs), Document::Nested(pattern_inner_docs)) =
854                    (&**inner_element, &**pattern_inner_element)
855                {
856                    let mut doc_matches = vec![false; pattern_inner_docs.len()];
857
858                    for (counter, pattern_doc) in pattern_inner_docs.iter().enumerate() {
859                        for inner in inner_docs.iter() {
860                            if strict {
861                                if Self::compare_documents(
862                                    inner,
863                                    pattern.clone(),
864                                    ComparisonMethod::Strict,
865                                ) {}
866                            } else if Self::compare_documents(
867                                inner,
868                                Pattern::new("", pattern_doc.clone()),
869                                ComparisonMethod::Partial,
870                            ) {
871                                doc_matches[counter] = true;
872                            }
873                        }
874                    }
875
876                    if doc_matches.iter().all(|&vals| vals) {
877                        let (input, doc) =
878                            Self::parse_element_by_tag_name(input, tag_name, entity_references)?;
879                        Ok((input, doc))
880                    } else {
881                        Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
882                            input.to_string(),
883                            nom::error::ErrorKind::Verify,
884                        ))))
885                    }
886                } else {
887                    Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
888                        input.to_string(),
889                        nom::error::ErrorKind::Verify,
890                    ))))
891                }
892            }
893            _ => Err(nom::Err::Error(Error::NomError(nom::error::Error::new(
894                input.to_string(),
895                nom::error::ErrorKind::Verify,
896            )))),
897        }
898    }
899    #[cfg(feature = "experimental")]
900    fn compare_documents(doc1: &Document, pattern: Pattern, method: ComparisonMethod) -> bool {
901        doc1.equals(pattern, method)
902    }
903}
904
905impl Document {}
906
907impl Document {
908    // pub fn iter(&self) -> DocumentIterator {
909    //     DocumentIterator::new(self, None)
910    // }
911    /// The main interface for exracting content from the Document tree
912    /// See the  [`extract_information_manual`](https://github.com/RodogInfinite/NomExML/blob/main/examples/extract_information_manual.rs) example for more information
913    pub fn iter_with_depth(&self, max_level: usize) -> DocumentIterator {
914        DocumentIterator::new(self, Some(max_level))
915    }
916}
917
918impl<'a> IntoIterator for &'a Document {
919    type Item = &'a Document;
920    type IntoIter = DocumentIterator<'a>;
921
922    fn into_iter(self) -> Self::IntoIter {
923        DocumentIterator::new(self, None)
924    }
925}
926
927pub trait DocumentIteratorExt {
928    fn iter_with_depth(&self, max_level: usize) -> DocumentIterator;
929}
930
931impl DocumentIteratorExt for Vec<Document> {
932    fn iter_with_depth(&self, max_level: usize) -> DocumentIterator {
933        DocumentIterator::new_from_slice(self, Some(max_level))
934    }
935}
936
937impl<'a> DocumentIteratorExt for &'a [Document] {
938    fn iter_with_depth(&self, max_level: usize) -> DocumentIterator {
939        DocumentIterator::new_from_slice(self, Some(max_level))
940    }
941}
942
943#[derive(Clone, Debug, PartialEq, Eq)]
944pub struct DocumentIterator<'a> {
945    stack: Vec<(&'a Document, usize)>,
946    max_depth: Option<usize>,
947}
948
949impl<'a> DocumentIterator<'a> {
950    fn new(doc: &'a Document, max_depth: Option<usize>) -> Self {
951        let stack = vec![(doc, 0)];
952        DocumentIterator { stack, max_depth }
953    }
954
955    fn new_from_slice(docs: &'a [Document], max_depth: Option<usize>) -> Self {
956        let stack = docs.iter().map(|d| (d, 0)).collect();
957        DocumentIterator { stack, max_depth }
958    }
959}
960
961impl<'a> Iterator for DocumentIterator<'a> {
962    type Item = &'a Document;
963
964    fn next(&mut self) -> Option<Self::Item> {
965        while let Some((doc, level)) = self.stack.pop() {
966            if self.max_depth.map_or(true, |max| level < max) {
967                match doc {
968                    Document::Nested(docs) => {
969                        for d in docs.iter().rev() {
970                            self.stack.push((d, level + 1));
971                        }
972
973                        continue;
974                    }
975                    Document::Element(_, inner_doc, _) => {
976                        // Add the inner document of an element
977                        self.stack.push((inner_doc, level + 1));
978
979                        continue;
980                    }
981                    _ => {}
982                }
983            }
984
985            return Some(doc);
986        }
987        None
988    }
989}
990
991#[derive(Clone, Debug, PartialEq, Eq)]
992pub enum ConditionalState {
993    None,
994    Optional,
995    ZeroOrMore,
996    OneOrMore,
997}
998impl<'a> Parse<'a> for ConditionalState {
999    type Args = ();
1000    type Output = IResult<&'a str, Self>;
1001    fn parse(input: &'a str, _args: Self::Args) -> Self::Output {
1002        alt((
1003            value(ConditionalState::Optional, tag("?")),
1004            value(ConditionalState::ZeroOrMore, tag("*")),
1005            value(ConditionalState::OneOrMore, tag("+")),
1006        ))(input)
1007    }
1008}
1009
// TODO: migrate this to error.rs possibly combine with CustomError
/// Errors reported while extracting data from a parsed document.
#[derive(Debug)]
pub enum DocumentError {
    /// Extraction found no documents that matched.
    NoMatchingDocuments,
    /// A nested document was required but a different variant was found.
    ExpectedNestedDocument,
}

impl fmt::Display for DocumentError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let message = match self {
            DocumentError::NoMatchingDocuments => {
                "No matching documents found during extraction"
            }
            DocumentError::ExpectedNestedDocument => {
                "Expected a nested document, but found another variant"
            }
        };
        f.write_str(message)
    }
}

impl std::error::Error for DocumentError {}
1031#[derive(Clone, Debug, PartialEq, Eq)]
1032pub struct Pattern<'a> {
1033    pub xml: &'a str,
1034    pub doc: Document,
1035}
1036impl<'a> Pattern<'a> {
1037    pub fn new(xml: &'a str, doc: Document) -> Self {
1038        Self { xml, doc }
1039    }
1040    pub fn parse(
1041        &self,
1042        entity_references: &Rc<RefCell<HashMap<(Name, EntitySource), EntityValue>>>,
1043    ) -> Result<Pattern, Box<dyn std::error::Error>> {
1044        let (_, doc) = Document::parse_element(self.xml, entity_references.clone())?;
1045
1046        Ok(Self { xml: self.xml, doc })
1047    }
1048}
1049pub(crate) trait PartialEqCustom {
1050    fn partial_eq(&self, pattern: Pattern) -> bool;
1051}
1052
1053impl PartialEqCustom for Document {
1054    fn partial_eq(&self, pattern: Pattern) -> bool {
1055        match (self, &pattern.doc) {
1056            (
1057                Document::Prolog {
1058                    xml_decl: a_xml_decl,
1059                    misc: a_misc,
1060                    doc_type: a_doc_type,
1061                },
1062                Document::Prolog {
1063                    xml_decl: pattern_xml_decl,
1064                    misc: pattern_misc,
1065                    doc_type: pattern_doc_type,
1066                },
1067            ) => {
1068                a_xml_decl == pattern_xml_decl
1069                    && a_misc == pattern_misc
1070                    && a_doc_type == pattern_doc_type
1071            }
1072
1073            (
1074                Document::Element(a_start_tag, a_docs, a_end_tag),
1075                Document::Element(pattern_start_tag, pattern_docs, pattern_end_tag),
1076            ) if a_start_tag == pattern_start_tag && a_end_tag == pattern_end_tag => {
1077                match (&**a_docs, &**pattern_docs) {
1078                    (Document::Nested(a_docs), Document::Nested(pattern_docs)) => a_docs
1079                        .iter()
1080                        .zip(pattern_docs.iter())
1081                        .all(|(pattern_doc, a_doc)| {
1082                            a_doc.partial_eq(Pattern::new("", pattern_doc.clone()))
1083                        }),
1084                    (Document::Content(_), Document::Content(_)) => true,
1085                    _ => panic!("Mismatched types"),
1086                }
1087            }
1088
1089            (Document::Content(a_content), Document::Content(pattern_content)) => {
1090                a_content == pattern_content
1091            }
1092
1093            (Document::Nested(a_docs), Document::Nested(pattern_docs)) => a_docs == pattern_docs,
1094
1095            (Document::Empty, Document::Empty) => true,
1096            (Document::EmptyTag(a_tag), Document::EmptyTag(pattern_tag)) => a_tag == pattern_tag,
1097            (
1098                Document::ProcessingInstruction(a_pi),
1099                Document::ProcessingInstruction(pattern_pi),
1100            ) => a_pi == pattern_pi,
1101            (Document::Comment(a_comment), Document::Comment(pattern_comment)) => {
1102                a_comment == pattern_comment
1103            }
1104            (Document::CDATA(a_cdata), Document::CDATA(pattern_cdata)) => a_cdata == pattern_cdata,
1105
1106            _ => false,
1107        }
1108    }
1109}
// Empty impl: `Document` overrides nothing and relies entirely on whatever default items
// `ParseNamespace` provides.
impl<'a> ParseNamespace<'a> for Document {}
1111pub(crate) trait StrictEq {
1112    fn strict_eq(&self, pattern: Pattern) -> bool;
1113}
1114impl StrictEq for Document {
1115    fn strict_eq(&self, pattern: Pattern) -> bool {
1116        self == &pattern.doc
1117    }
1118}
1119pub trait DynamicEquality {
1120    fn equals(&self, pattern: Pattern, method: ComparisonMethod) -> bool;
1121}
1122
1123pub enum ComparisonMethod {
1124    Partial,
1125    Strict,
1126}
1127
1128impl DynamicEquality for Document {
1129    fn equals(&self, pattern: Pattern, method: ComparisonMethod) -> bool {
1130        match method {
1131            ComparisonMethod::Partial => self.partial_eq(pattern),
1132
1133            ComparisonMethod::Strict => self.strict_eq(pattern),
1134        }
1135    }
1136}
1137
1138pub trait UpdateFields {
1139    fn update_fields(&mut self, doc: &Document) -> Result<(), Box<dyn std::error::Error>>
1140    where
1141        Self: std::fmt::Debug,
1142    {
1143        match doc {
1144            Document::Element(tag, nested_doc, _) => {
1145                self.update_attribute_fields(tag)?;
1146                if let Document::Nested(elements) = nested_doc.as_ref() {
1147                    elements
1148                        .iter_with_depth(0)
1149                        .filter_map(|element| {
1150                            if let Document::Element(tag, inner_doc, _) = element {
1151                                Some((tag, inner_doc))
1152                            } else {
1153                                None
1154                            }
1155                        })
1156                        .try_for_each(|(tag, inner_doc)| self.update_field(tag, inner_doc))
1157                } else {
1158                    self.update_fields(nested_doc)
1159                }
1160            }
1161            Document::Nested(elements) => elements
1162                .iter_with_depth(0)
1163                .filter_map(|element| {
1164                    if let Document::Element(tag, inner_doc, _) = element {
1165                        Some((tag, inner_doc))
1166                    } else {
1167                        None
1168                    }
1169                })
1170                .try_for_each(|(tag, inner_doc)| self.update_field(tag, inner_doc)),
1171            _ => Ok(()),
1172        }
1173    }
1174    fn update_field(&mut self, tag: &Tag, doc: &Document)
1175        -> Result<(), Box<dyn std::error::Error>>;
1176    fn update_attribute_fields(&mut self, _tag: &Tag) -> Result<(), Box<dyn std::error::Error>> {
1177        Ok(())
1178    }
1179}
1180
1181impl<T> UpdateFields for Option<T>
1182where
1183    T: UpdateFields + Default + std::fmt::Debug,
1184{
1185    fn update_fields(&mut self, doc: &Document) -> Result<(), Box<dyn std::error::Error>> {
1186        match self {
1187            Some(value) => value.update_fields(doc),
1188            None => {
1189                let mut new_value = T::default();
1190                new_value.update_fields(doc)?;
1191                *self = Some(new_value);
1192                Ok(())
1193            }
1194        }
1195    }
1196
1197    fn update_field(
1198        &mut self,
1199        tag: &Tag,
1200        doc: &Document,
1201    ) -> Result<(), Box<dyn std::error::Error>> {
1202        match self {
1203            Some(value) => value.update_field(tag, doc),
1204            None => {
1205                let mut new_value = T::default();
1206                new_value.update_field(tag, doc)?;
1207                *self = Some(new_value);
1208                Ok(())
1209            }
1210        }
1211    }
1212
1213    fn update_attribute_fields(&mut self, tag: &Tag) -> Result<(), Box<dyn std::error::Error>> {
1214        match self {
1215            Some(value) => value.update_attribute_fields(tag),
1216            None => {
1217                let mut new_value = T::default();
1218                new_value.update_attribute_fields(tag)?;
1219                *self = Some(new_value);
1220                Ok(())
1221            }
1222        }
1223    }
1224}