oxirs_core/rdfxml/
parser.rs

1use crate::model::literal::LanguageTag;
2use crate::model::term::{Object, Predicate, Subject};
3use crate::model::{BlankNode, Literal, NamedNode, NamedOrBlankNode, Term, Triple};
4use crate::rdfxml::error::{RdfXmlParseError, RdfXmlSyntaxError};
5use crate::rdfxml::utils::*;
6use oxiri::{Iri, IriParseError};
7use quick_xml::escape::{resolve_xml_entity, unescape_with};
8use quick_xml::events::attributes::Attribute;
9use quick_xml::events::*;
10use quick_xml::name::{LocalName, PrefixDeclaration, PrefixIter, QName, ResolveResult};
11use quick_xml::{Decoder, Error, NsReader, Writer};
12use std::borrow::Cow;
13use std::collections::{HashMap, HashSet};
14use std::io::{BufReader, Read};
15use std::str;
16#[cfg(feature = "async-tokio")]
17use tokio::io::{AsyncRead, BufReader as AsyncBufReader};
18
19impl From<NamedOrBlankNode> for Term {
20    fn from(node: NamedOrBlankNode) -> Self {
21        match node {
22            NamedOrBlankNode::NamedNode(n) => Term::NamedNode(n),
23            NamedOrBlankNode::BlankNode(n) => Term::BlankNode(n),
24        }
25    }
26}
27
28/// A [RDF/XML](https://www.w3.org/TR/rdf-syntax-grammar/) streaming parser.
29///
30/// It reads the file in streaming.
31/// It does not keep data in memory except a stack for handling nested XML tags, and a set of all
32/// seen `rdf:ID`s to detect duplicate ids and fail according to the specification.
33///
/// Its performance is not optimized yet and could hopefully be significantly improved by reducing the
/// number of allocations and copies done by the parser.
36///
37/// Count the number of people:
38/// ```
/// use oxirs_core::model::{NamedNode, Term};
40/// use oxirs_core::rdfxml::RdfXmlParser;
41///
42/// let file = br#"<?xml version="1.0"?>
43/// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
44///  <rdf:Description rdf:about="http://example.com/foo">
45///    <rdf:type rdf:resource="http://schema.org/Person" />
46///    <schema:name>Foo</schema:name>
47///  </rdf:Description>
48///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
49/// </rdf:RDF>"#;
50///
51/// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
52/// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
53/// let mut count = 0;
54/// for triple in RdfXmlParser::new().for_reader(file.as_ref()) {
55///     let triple = triple.unwrap();
56///     if triple.predicate == Term::NamedNode(rdf_type.clone()) && triple.object == Term::NamedNode(schema_person.clone()) {
57///         count += 1;
58///     }
59/// }
60/// assert_eq!(2, count);
61/// ```
#[derive(Default, Clone)]
#[must_use]
pub struct RdfXmlParser {
    /// When `true`, some validations are skipped (switched on by [`RdfXmlParser::lenient`]).
    lenient: bool,
    /// Base IRI supplied through [`RdfXmlParser::with_base_iri`], if any.
    base: Option<Iri<String>>,
}
68
69impl RdfXmlParser {
70    /// Builds a new [`RdfXmlParser`].
71    #[inline]
72    pub fn new() -> Self {
73        Self::default()
74    }
75
76    /// Assumes the file is valid to make parsing faster.
77    ///
78    /// It will skip some validations.
79    ///
80    /// Note that if the file is actually not valid, the parser might emit broken RDF.
81    #[inline]
82    pub fn lenient(mut self) -> Self {
83        self.lenient = true;
84        self
85    }
86
87    #[deprecated(note = "Use `lenient()` instead", since = "0.2.0")]
88    #[inline]
89    pub fn unchecked(self) -> Self {
90        self.lenient()
91    }
92
93    #[inline]
94    pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
95        self.base = Some(Iri::parse(base_iri.into())?);
96        Ok(self)
97    }
98
99    /// Parses a RDF/XML file from a [`Read`] implementation.
100    ///
101    /// Count the number of people:
102    /// ```
    /// use oxirs_core::model::{NamedNode, Term};
104    /// use oxirs_core::rdfxml::RdfXmlParser;
105    ///
106    /// let file = br#"<?xml version="1.0"?>
107    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
108    ///  <rdf:Description rdf:about="http://example.com/foo">
109    ///    <rdf:type rdf:resource="http://schema.org/Person" />
110    ///    <schema:name>Foo</schema:name>
111    ///  </rdf:Description>
112    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
113    /// </rdf:RDF>"#;
114    ///
115    /// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
116    /// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
117    /// let mut count = 0;
118    /// for triple in RdfXmlParser::new().for_reader(file.as_ref()) {
119    ///     let triple = triple.unwrap();
120    ///     if triple.predicate == Term::NamedNode(rdf_type.clone()) && triple.object == Term::NamedNode(schema_person.clone()) {
121    ///         count += 1;
122    ///     }
123    /// }
124    /// assert_eq!(2, count);
125    /// ```
126    pub fn for_reader<R: Read>(self, reader: R) -> ReaderRdfXmlParser<R> {
127        ReaderRdfXmlParser {
128            results: Vec::new(),
129            parser: self.into_internal(BufReader::new(reader)),
130            reader_buffer: Vec::default(),
131        }
132    }
133
134    /// Parses a RDF/XML file from a [`AsyncRead`] implementation.
135    ///
136    /// Count the number of people:
137    /// ```
138    /// # #[tokio::main(flavor = "current_thread")]
139    /// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// use oxirs_core::model::{NamedNode, Term};
141    /// use oxirs_core::rdfxml::RdfXmlParser;
142    ///
143    /// let file = br#"<?xml version="1.0"?>
144    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
145    ///   <rdf:Description rdf:about="http://example.com/foo">
146    ///     <rdf:type rdf:resource="http://schema.org/Person" />
147    ///     <schema:name>Foo</schema:name>
148    ///   </rdf:Description>
149    ///   <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
150    /// </rdf:RDF>"#;
151    ///
152    /// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
153    /// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
154    /// let mut count = 0;
155    /// let mut parser = RdfXmlParser::new().for_tokio_async_reader(file.as_ref());
156    /// while let Some(triple) = parser.next().await {
157    ///     let triple = triple.unwrap();
158    ///     if triple.predicate == Term::NamedNode(rdf_type.clone()) && triple.object == Term::NamedNode(schema_person.clone()) {
159    ///         count += 1;
160    ///     }
161    /// }
162    /// assert_eq!(2, count);
163    /// # Ok(())
164    /// # }
165    /// ```
166    #[cfg(feature = "async-tokio")]
167    pub fn for_tokio_async_reader<R: AsyncRead + Unpin>(
168        self,
169        reader: R,
170    ) -> TokioAsyncReaderRdfXmlParser<R> {
171        TokioAsyncReaderRdfXmlParser {
172            results: Vec::new(),
173            parser: self.into_internal(AsyncBufReader::new(reader)),
174            reader_buffer: Vec::default(),
175        }
176    }
177
178    /// Parses a RDF/XML file from a byte slice.
179    ///
180    /// Count the number of people:
181    /// ```
    /// use oxirs_core::model::{NamedNode, Term};
183    /// use oxirs_core::rdfxml::RdfXmlParser;
184    ///
185    /// let file = br#"<?xml version="1.0"?>
186    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
187    ///  <rdf:Description rdf:about="http://example.com/foo">
188    ///    <rdf:type rdf:resource="http://schema.org/Person" />
189    ///    <schema:name>Foo</schema:name>
190    ///  </rdf:Description>
191    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
192    /// </rdf:RDF>"#;
193    ///
194    /// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
195    /// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
196    /// let mut count = 0;
197    /// for triple in RdfXmlParser::new().for_slice(file) {
198    ///     let triple = triple.unwrap();
199    ///     if triple.predicate == Term::NamedNode(rdf_type.clone()) && triple.object == Term::NamedNode(schema_person.clone()) {
200    ///         count += 1;
201    ///     }
202    /// }
203    /// assert_eq!(2, count);
204    /// ```
205    pub fn for_slice(self, slice: &[u8]) -> SliceRdfXmlParser<'_> {
206        SliceRdfXmlParser {
207            results: Vec::new(),
208            parser: self.into_internal(slice),
209            reader_buffer: Vec::default(),
210        }
211    }
212
213    fn into_internal<T>(self, reader: T) -> InternalRdfXmlParser<T> {
214        let mut reader = NsReader::from_reader(reader);
215        reader.config_mut().expand_empty_elements = true;
216        InternalRdfXmlParser {
217            reader,
218            state: vec![RdfXmlState::Doc {
219                base_iri: self.base.clone(),
220            }],
221            custom_entities: HashMap::new(),
222            in_literal_depth: 0,
223            known_rdf_id: HashSet::default(),
224            is_end: false,
225            lenient: self.lenient,
226        }
227    }
228}
229
230/// Parses a RDF/XML file from a [`Read`] implementation.
231///
232/// Can be built using [`RdfXmlParser::for_reader`].
233///
234/// Count the number of people:
235/// ```
/// use oxirs_core::model::{NamedNode, Term};
237/// use oxirs_core::rdfxml::RdfXmlParser;
238///
239/// let file = br#"<?xml version="1.0"?>
240/// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
241///  <rdf:Description rdf:about="http://example.com/foo">
242///    <rdf:type rdf:resource="http://schema.org/Person" />
243///    <schema:name>Foo</schema:name>
244///  </rdf:Description>
245///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
246/// </rdf:RDF>"#;
247///
248/// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
249/// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
250/// let mut count = 0;
251/// for triple in RdfXmlParser::new().for_reader(file.as_ref()) {
252///     let triple = triple.unwrap();
253///     if triple.predicate == Term::NamedNode(rdf_type.clone()) && triple.object == Term::NamedNode(schema_person.clone()) {
254///         count += 1;
255///     }
256/// }
257/// assert_eq!(2, count);
258/// ```
#[must_use]
pub struct ReaderRdfXmlParser<R: Read> {
    /// Triples produced by already-processed events, waiting to be returned (popped from the end).
    results: Vec<Triple>,
    /// Internal parser reading from the buffered input.
    parser: InternalRdfXmlParser<BufReader<R>>,
    /// Scratch buffer that is cleared and refilled for each XML event read.
    reader_buffer: Vec<u8>,
}
265
266impl<R: Read> Iterator for ReaderRdfXmlParser<R> {
267    type Item = Result<Triple, RdfXmlParseError>;
268
269    fn next(&mut self) -> Option<Self::Item> {
270        loop {
271            if let Some(triple) = self.results.pop() {
272                return Some(Ok(triple));
273            } else if self.parser.is_end {
274                return None;
275            }
276            if let Err(e) = self.parse_step() {
277                return Some(Err(e));
278            }
279        }
280    }
281}
282
283impl<R: Read> ReaderRdfXmlParser<R> {
284    /// The list of IRI prefixes considered at the current step of the parsing.
285    ///
286    /// This method returns (prefix name, prefix value) tuples.
287    /// It is empty at the beginning of the parsing and gets updated when prefixes are encountered.
288    /// It should be full at the end of the parsing (but if a prefix is overridden, only the latest version will be returned).
289    ///
290    /// ```
291    /// use oxirs_core::rdfxml::RdfXmlParser;
292    ///
293    /// let file = br#"<?xml version="1.0"?>
294    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
295    ///  <rdf:Description rdf:about="http://example.com/foo">
296    ///    <rdf:type rdf:resource="http://schema.org/Person" />
297    ///    <schema:name>Foo</schema:name>
298    ///  </rdf:Description>
299    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
300    /// </rdf:RDF>"#;
301    ///
302    /// let mut parser = RdfXmlParser::new().for_reader(file.as_ref());
303    /// assert_eq!(parser.prefixes().collect::<Vec<_>>(), []); // No prefix at the beginning
304    ///
305    /// parser.next().unwrap().unwrap(); // We read the first triple
306    /// assert_eq!(
307    ///     parser.prefixes().collect::<Vec<_>>(),
308    ///     [
309    ///         ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
310    ///         ("schema", "http://schema.org/")
311    ///     ]
312    /// ); // There are now prefixes
313    /// ```
314    pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
315        RdfXmlPrefixesIter {
316            inner: self.parser.reader.prefixes(),
317            decoder: self.parser.reader.decoder(),
318            lenient: self.parser.lenient,
319        }
320    }
321
322    /// The base IRI considered at the current step of the parsing.
323    ///
324    /// ```
325    /// use oxirs_core::rdfxml::RdfXmlParser;
326    ///
327    /// let file = br#"<?xml version="1.0"?>
328    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xml:base="http://example.com/">
329    ///  <rdf:Description rdf:about="foo">
330    ///    <rdf:type rdf:resource="http://schema.org/Person" />
331    ///  </rdf:Description>
332    /// </rdf:RDF>"#;
333    ///
334    /// let mut parser = RdfXmlParser::new().for_reader(file.as_ref());
335    /// assert!(parser.base_iri().is_none()); // No base at the beginning because none has been given to the parser.
336    ///
337    /// parser.next().unwrap().unwrap(); // We read the first triple
338    /// assert_eq!(parser.base_iri(), Some("http://example.com/")); // There is now a base IRI.
339    /// ```
340    pub fn base_iri(&self) -> Option<&str> {
341        Some(self.parser.current_base_iri()?.as_str())
342    }
343
344    /// The current byte position in the input data.
345    pub fn buffer_position(&self) -> u64 {
346        self.parser.reader.buffer_position()
347    }
348
349    fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
350        self.reader_buffer.clear();
351        let event = self
352            .parser
353            .reader
354            .read_event_into(&mut self.reader_buffer)?;
355        self.parser.parse_event(event, &mut self.results)
356    }
357}
358
359/// Parses a RDF/XML file from a [`AsyncRead`] implementation.
360///
361/// Can be built using [`RdfXmlParser::for_tokio_async_reader`].
362///
363/// Count the number of people:
364/// ```
365/// # #[tokio::main(flavor = "current_thread")]
366/// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
/// use oxirs_core::model::{NamedNode, Term};
368/// use oxirs_core::rdfxml::RdfXmlParser;
369///
370/// let file = br#"<?xml version="1.0"?>
371/// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
372///   <rdf:Description rdf:about="http://example.com/foo">
373///     <rdf:type rdf:resource="http://schema.org/Person" />
374///     <schema:name>Foo</schema:name>
375///   </rdf:Description>
376///   <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
377/// </rdf:RDF>"#;
378///
379/// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
380/// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
381/// let mut count = 0;
382/// let mut parser = RdfXmlParser::new().for_tokio_async_reader(file.as_ref());
383/// while let Some(triple) = parser.next().await {
384///     let triple = triple.unwrap();
385///     if triple.predicate == Term::NamedNode(rdf_type.clone()) && triple.object == Term::NamedNode(schema_person.clone()) {
386///         count += 1;
387///     }
388/// }
389/// assert_eq!(2, count);
390/// # Ok(())
391/// # }
392/// ```
#[cfg(feature = "async-tokio")]
#[must_use]
pub struct TokioAsyncReaderRdfXmlParser<R: AsyncRead + Unpin> {
    /// Triples produced by already-processed events, waiting to be returned (popped from the end).
    results: Vec<Triple>,
    /// Internal parser reading from the buffered asynchronous input.
    parser: InternalRdfXmlParser<AsyncBufReader<R>>,
    /// Scratch buffer that is cleared and refilled for each XML event read.
    reader_buffer: Vec<u8>,
}
400
401#[cfg(feature = "async-tokio")]
402impl<R: AsyncRead + Unpin> TokioAsyncReaderRdfXmlParser<R> {
403    /// Reads the next triple or returns `None` if the file is finished.
404    pub async fn next(&mut self) -> Option<Result<Triple, RdfXmlParseError>> {
405        loop {
406            if let Some(triple) = self.results.pop() {
407                return Some(Ok(triple));
408            } else if self.parser.is_end {
409                return None;
410            }
411            if let Err(e) = self.parse_step().await {
412                return Some(Err(e));
413            }
414        }
415    }
416
417    /// The list of IRI prefixes considered at the current step of the parsing.
418    ///
419    /// This method returns (prefix name, prefix value) tuples.
420    /// It is empty at the beginning of the parsing and gets updated when prefixes are encountered.
421    /// It should be full at the end of the parsing (but if a prefix is overridden, only the latest version will be returned).
422    ///
423    /// ```
424    /// # #[tokio::main(flavor = "current_thread")]
425    /// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
426    /// use oxirs_core::rdfxml::RdfXmlParser;
427    ///
428    /// let file = br#"<?xml version="1.0"?>
429    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
430    ///  <rdf:Description rdf:about="http://example.com/foo">
431    ///    <rdf:type rdf:resource="http://schema.org/Person" />
432    ///    <schema:name>Foo</schema:name>
433    ///  </rdf:Description>
434    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
435    /// </rdf:RDF>"#;
436    ///
437    /// let mut parser = RdfXmlParser::new().for_tokio_async_reader(file.as_ref());
438    /// assert_eq!(parser.prefixes().collect::<Vec<_>>(), []); // No prefix at the beginning
439    ///
440    /// parser.next().await.unwrap().unwrap(); // We read the first triple
441    /// assert_eq!(
442    ///     parser.prefixes().collect::<Vec<_>>(),
443    ///     [
444    ///         ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
445    ///         ("schema", "http://schema.org/")
446    ///     ]
447    /// ); // There are now prefixes
448    /// //
449    /// # Ok(())
450    /// # }
451    /// ```
452    pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
453        RdfXmlPrefixesIter {
454            inner: self.parser.reader.prefixes(),
455            decoder: self.parser.reader.decoder(),
456            lenient: self.parser.lenient,
457        }
458    }
459
460    /// The base IRI considered at the current step of the parsing.
461    ///
462    /// ```
463    /// # #[tokio::main(flavor = "current_thread")]
464    /// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
465    /// use oxirs_core::rdfxml::RdfXmlParser;
466    ///
467    /// let file = br#"<?xml version="1.0"?>
468    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xml:base="http://example.com/">
469    ///  <rdf:Description rdf:about="foo">
470    ///    <rdf:type rdf:resource="http://schema.org/Person" />
471    ///  </rdf:Description>
472    /// </rdf:RDF>"#;
473    ///
474    /// let mut parser = RdfXmlParser::new().for_tokio_async_reader(file.as_ref());
475    /// assert!(parser.base_iri().is_none()); // No base at the beginning because none has been given to the parser.
476    ///
477    /// parser.next().await.unwrap().unwrap(); // We read the first triple
478    /// assert_eq!(parser.base_iri(), Some("http://example.com/")); // There is now a base IRI.
479    /// # Ok(())
480    /// # }
481    /// ```
482    pub fn base_iri(&self) -> Option<&str> {
483        Some(self.parser.current_base_iri()?.as_str())
484    }
485
486    /// The current byte position in the input data.
487    pub fn buffer_position(&self) -> u64 {
488        self.parser.reader.buffer_position()
489    }
490
491    async fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
492        self.reader_buffer.clear();
493        let event = self
494            .parser
495            .reader
496            .read_event_into_async(&mut self.reader_buffer)
497            .await?;
498        self.parser.parse_event(event, &mut self.results)
499    }
500}
501
502/// Parses a RDF/XML file from a byte slice.
503///
504/// Can be built using [`RdfXmlParser::for_slice`].
505///
506/// Count the number of people:
507/// ```
/// use oxirs_core::model::{NamedNode, Term};
509/// use oxirs_core::rdfxml::RdfXmlParser;
510///
511/// let file = br#"<?xml version="1.0"?>
512/// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
513///  <rdf:Description rdf:about="http://example.com/foo">
514///    <rdf:type rdf:resource="http://schema.org/Person" />
515///    <schema:name>Foo</schema:name>
516///  </rdf:Description>
517///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
518/// </rdf:RDF>"#;
519///
520/// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
521/// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
522/// let mut count = 0;
523/// for triple in RdfXmlParser::new().for_slice(file) {
524///     let triple = triple.unwrap();
525///     if triple.predicate == Term::NamedNode(rdf_type.clone()) && triple.object == Term::NamedNode(schema_person.clone()) {
526///         count += 1;
527///     }
528/// }
529/// assert_eq!(2, count);
530/// ```
#[must_use]
pub struct SliceRdfXmlParser<'a> {
    /// Triples produced by already-processed events, waiting to be returned (popped from the end).
    results: Vec<Triple>,
    /// Internal parser reading directly from the borrowed byte slice.
    parser: InternalRdfXmlParser<&'a [u8]>,
    /// Scratch buffer that is cleared and refilled for each XML event read.
    reader_buffer: Vec<u8>,
}
537
538impl Iterator for SliceRdfXmlParser<'_> {
539    type Item = Result<Triple, RdfXmlSyntaxError>;
540
541    fn next(&mut self) -> Option<Self::Item> {
542        loop {
543            if let Some(triple) = self.results.pop() {
544                return Some(Ok(triple));
545            } else if self.parser.is_end {
546                return None;
547            }
548            if let Err(RdfXmlParseError::Syntax(e)) = self.parse_step() {
549                // I/O errors can't happen
550                return Some(Err(e));
551            }
552        }
553    }
554}
555
556impl SliceRdfXmlParser<'_> {
557    /// The list of IRI prefixes considered at the current step of the parsing.
558    ///
559    /// This method returns (prefix name, prefix value) tuples.
560    /// It is empty at the beginning of the parsing and gets updated when prefixes are encountered.
561    /// It should be full at the end of the parsing (but if a prefix is overridden, only the latest version will be returned).
562    ///
563    /// ```
564    /// use oxirs_core::rdfxml::RdfXmlParser;
565    ///
566    /// let file = br#"<?xml version="1.0"?>
567    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
568    ///  <rdf:Description rdf:about="http://example.com/foo">
569    ///    <rdf:type rdf:resource="http://schema.org/Person" />
570    ///    <schema:name>Foo</schema:name>
571    ///  </rdf:Description>
572    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
573    /// </rdf:RDF>"#;
574    ///
575    /// let mut parser = RdfXmlParser::new().for_slice(file);
576    /// assert_eq!(parser.prefixes().collect::<Vec<_>>(), []); // No prefix at the beginning
577    ///
578    /// parser.next().unwrap().unwrap(); // We read the first triple
579    /// assert_eq!(
580    ///     parser.prefixes().collect::<Vec<_>>(),
581    ///     [
582    ///         ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
583    ///         ("schema", "http://schema.org/")
584    ///     ]
585    /// ); // There are now prefixes
586    /// ```
587    pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
588        RdfXmlPrefixesIter {
589            inner: self.parser.reader.prefixes(),
590            decoder: self.parser.reader.decoder(),
591            lenient: self.parser.lenient,
592        }
593    }
594
595    /// The base IRI considered at the current step of the parsing.
596    ///
597    /// ```
598    /// use oxirs_core::rdfxml::RdfXmlParser;
599    ///
600    /// let file = br#"<?xml version="1.0"?>
601    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xml:base="http://example.com/">
602    ///  <rdf:Description rdf:about="foo">
603    ///    <rdf:type rdf:resource="http://schema.org/Person" />
604    ///  </rdf:Description>
605    /// </rdf:RDF>"#;
606    ///
607    /// let mut parser = RdfXmlParser::new().for_slice(file);
608    /// assert!(parser.base_iri().is_none()); // No base at the beginning because none has been given to the parser.
609    ///
610    /// parser.next().unwrap().unwrap(); // We read the first triple
611    /// assert_eq!(parser.base_iri(), Some("http://example.com/")); // There is now a base IRI.
612    /// ```
613    pub fn base_iri(&self) -> Option<&str> {
614        Some(self.parser.current_base_iri()?.as_str())
615    }
616
617    /// The current byte position in the input data.
618    pub fn buffer_position(&self) -> u64 {
619        self.parser.reader.buffer_position()
620    }
621
622    fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
623        self.reader_buffer.clear();
624        let event = self
625            .parser
626            .reader
627            .read_event_into(&mut self.reader_buffer)?;
628        self.parser.parse_event(event, &mut self.results)
629    }
630}
631
/// Iterator on the file prefixes.
///
/// Yields `(prefix name, prefix value)` pairs; the default namespace is reported with an empty name.
///
/// See [`ReaderRdfXmlParser::prefixes`].
pub struct RdfXmlPrefixesIter<'a> {
    /// Prefix declarations currently known to the XML reader.
    inner: PrefixIter<'a>,
    /// Decoder taken from the XML reader, used to turn raw bytes into `str`.
    decoder: Decoder,
    /// When `false`, prefixes with an invalid name or an invalid IRI are not returned.
    lenient: bool,
}
640
641impl<'a> Iterator for RdfXmlPrefixesIter<'a> {
642    type Item = (&'a str, &'a str);
643
644    #[inline]
645    fn next(&mut self) -> Option<Self::Item> {
646        loop {
647            let (key, value) = self.inner.next()?;
648            return Some((
649                match key {
650                    PrefixDeclaration::Default => "",
651                    PrefixDeclaration::Named(name) => {
652                        let Ok(Cow::Borrowed(name)) = self.decoder.decode(name) else {
653                            continue;
654                        };
655                        let Ok(Cow::Borrowed(name)) = unescape_with(name, |_| None) else {
656                            continue;
657                        };
658                        if !self.lenient && !is_nc_name(name) {
659                            continue; // We don't return invalid prefixes
660                        }
661                        name
662                    }
663                },
664                {
665                    let Ok(Cow::Borrowed(value)) = self.decoder.decode(value.0) else {
666                        continue;
667                    };
668                    let Ok(Cow::Borrowed(value)) = unescape_with(value, |_| None) else {
669                        continue;
670                    };
671                    if !self.lenient && Iri::parse(value).is_err() {
672                        continue; // We don't return invalid prefixes
673                    }
674                    value
675                },
676            ));
677        }
678    }
679
680    #[inline]
681    fn size_hint(&self) -> (usize, Option<usize>) {
682        self.inner.size_hint()
683    }
684}
685
// IRIs in the RDF namespace for names with a syntactic role in RDF/XML
// (for example `rdf:about`, `rdf:ID` or `rdf:parseType`).
const RDF_ABOUT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about";
const RDF_ABOUT_EACH: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach";
const RDF_ABOUT_EACH_PREFIX: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix";
const RDF_BAG_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID";
const RDF_DATATYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype";
const RDF_DESCRIPTION: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description";
const RDF_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID";
const RDF_LI: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#li";
const RDF_NODE_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID";
const RDF_PARSE_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#parseType";
const RDF_RDF: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF";
const RDF_RESOURCE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource";

// RDF vocabulary constants (IRIs of RDF classes, properties and datatypes)
const RDF_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
const RDF_NIL: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil";
const RDF_FIRST: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first";
const RDF_REST: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest";
const RDF_STATEMENT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement";
const RDF_SUBJECT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject";
const RDF_PREDICATE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate";
const RDF_OBJECT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object";
const RDF_XML_LITERAL: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral";
709
// RDF names treated as reserved for XML element names.
// NOTE(review): the code consulting this list is outside the visible chunk; presumably an element
// whose IRI is listed here is rejected as a node/property element name. Confirm at the call site.
const RESERVED_RDF_ELEMENTS: [&str; 11] = [
    RDF_ABOUT,
    RDF_ABOUT_EACH,
    RDF_ABOUT_EACH_PREFIX,
    RDF_BAG_ID,
    RDF_DATATYPE,
    RDF_ID,
    RDF_LI,
    RDF_NODE_ID,
    RDF_PARSE_TYPE,
    RDF_RDF,
    RDF_RESOURCE,
];
// RDF names treated as reserved for XML attribute names.
// NOTE(review): usage is outside the visible chunk; presumably an attribute whose IRI is listed
// here cannot be used as a property attribute. Confirm at the call site.
const RESERVED_RDF_ATTRIBUTES: [&str; 5] = [
    RDF_ABOUT_EACH,
    RDF_ABOUT_EACH_PREFIX,
    RDF_LI,
    RDF_RDF,
    RDF_RESOURCE,
];
730
/// Value held in the `object` slot of a `RdfXmlState::PropertyElt`: either a node or text.
#[derive(Clone, Debug)]
enum NodeOrText {
    /// A named or blank node.
    Node(NamedOrBlankNode),
    /// Text content. NOTE(review): presumably the accumulated literal text; confirm in the text handler.
    Text(String),
}
736
/// One frame of the parser state stack (`InternalRdfXmlParser::state`).
///
/// NOTE(review): the variant names appear to mirror the RDF/XML grammar productions; the
/// transitions between them are implemented in handlers outside the visible chunk.
enum RdfXmlState {
    /// Initial frame, created when the internal parser is built.
    Doc {
        // Base IRI in scope; initially the one configured on `RdfXmlParser`, if any.
        base_iri: Option<Iri<String>>,
    },
    /// NOTE(review): presumably the frame for the `rdf:RDF` root element — confirm.
    Rdf {
        base_iri: Option<Iri<String>>,
        language: Option<String>,
    },
    /// Frame for a node element.
    NodeElt {
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        // Node described by this element.
        subject: NamedOrBlankNode,
        // NOTE(review): presumably counts `rdf:li` children to number them — confirm.
        li_counter: u64,
    },
    /// Frame for a property element.
    PropertyElt {
        // Resource, Literal or Empty property element
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        // Object seen so far for this property, if any.
        object: Option<NodeOrText>,
        id_attr: Option<NamedNode>,
        datatype_attr: Option<NamedNode>,
    },
    /// NOTE(review): presumably a property element with `rdf:parseType="Collection"` — confirm.
    ParseTypeCollectionPropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        // Nodes collected so far for this property.
        objects: Vec<NamedOrBlankNode>,
        id_attr: Option<NamedNode>,
    },
    /// NOTE(review): presumably a property element with `rdf:parseType="Literal"` — confirm.
    ParseTypeLiteralPropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        // XML writer accumulating bytes in memory.
        writer: Writer<Vec<u8>>,
        id_attr: Option<NamedNode>,
        emit: bool, // false for parseTypeOtherPropertyElt support
    },
}
779
/// Parsing state shared by the reader, async reader and slice front-ends.
struct InternalRdfXmlParser<R> {
    /// Namespace-aware XML reader producing the events to process.
    reader: NsReader<R>,
    /// Stack of parser states; it starts with a single `RdfXmlState::Doc` frame.
    state: Vec<RdfXmlState>,
    /// Entity name to value map.
    /// NOTE(review): presumably filled from `<!ENTITY` declarations found in the DOCTYPE — confirm.
    custom_entities: HashMap<String, String>,
    /// NOTE(review): presumably the XML nesting depth inside a literal being captured — confirm.
    in_literal_depth: usize,
    /// `rdf:ID` values already seen, kept to detect duplicate ids.
    known_rdf_id: HashSet<String>,
    /// Set to `true` once the end of the input has been reached.
    is_end: bool,
    /// Copy of the `lenient` option of `RdfXmlParser`.
    lenient: bool,
}
789
/// Attributes for a node element
///
/// NOTE(review): the code that fills this struct is outside the visible chunk; the field notes
/// below are inferred from the field names and types and should be confirmed there.
struct NodeElementAttributes {
    // Presumably built from `rdf:ID`.
    id_attr: Option<NamedNode>,
    // Presumably built from `rdf:nodeID`.
    node_id_attr: Option<BlankNode>,
    // Presumably built from `rdf:about`.
    about_attr: Option<NamedNode>,
    // Presumably built from an `rdf:type` attribute.
    type_attr: Option<NamedNode>,
    // Remaining attributes as (property IRI, value) pairs.
    property_attrs: Vec<(NamedNode, String)>,
}
798
799impl<R> InternalRdfXmlParser<R> {
800    fn parse_event(
801        &mut self,
802        event: Event<'_>,
803        results: &mut Vec<Triple>,
804    ) -> Result<(), RdfXmlParseError> {
805        match event {
806            Event::Start(event) => self.parse_start_event(&event, results),
807            Event::End(event) => self.parse_end_event(&event, results),
808            Event::Empty(_) => Err(RdfXmlSyntaxError::msg(
809                "The expand_empty_elements option must be enabled",
810            )
811            .into()),
812            Event::Text(event) => self.parse_text_event(&event),
813            Event::CData(event) => self.parse_text_event(&event.escape()?),
814            Event::Comment(_) | Event::PI(_) | Event::GeneralRef(_) => Ok(()),
815            Event::Decl(decl) => {
816                if let Some(encoding) = decl.encoding() {
817                    if !is_utf8(&encoding?) {
818                        return Err(RdfXmlSyntaxError::msg(
819                            "Only UTF-8 is supported by the RDF/XML parser",
820                        )
821                        .into());
822                    }
823                }
824                Ok(())
825            }
826            Event::DocType(dt) => self.parse_doctype(&dt),
827            Event::Eof => {
828                self.is_end = true;
829                Ok(())
830            }
831        }
832    }
833
834    fn parse_doctype(&mut self, dt: &BytesText<'_>) -> Result<(), RdfXmlParseError> {
835        // we extract entities
836        for input in self
837            .reader
838            .decoder()
839            .decode(dt.as_ref())?
840            .split('<')
841            .skip(1)
842        {
843            if let Some(input) = input.strip_prefix("!ENTITY") {
844                let input = input.trim_start().strip_prefix('%').unwrap_or(input);
845                let (entity_name, input) = input.trim_start().split_once(|c: char| c.is_ascii_whitespace()).ok_or_else(|| {
846                    RdfXmlSyntaxError::msg(
847                        "<!ENTITY declarations should contain both an entity name and an entity value",
848                    )
849                })?;
850                let input = input.trim_start().strip_prefix('\"').ok_or_else(|| {
851                    RdfXmlSyntaxError::msg("<!ENTITY values should be enclosed in double quotes")
852                })?;
853                let (entity_value, input) = input.split_once('"').ok_or_else(|| {
854                    RdfXmlSyntaxError::msg(
855                        "<!ENTITY declarations values should be enclosed in double quotes",
856                    )
857                })?;
858                input.trim_start().strip_prefix('>').ok_or_else(|| {
859                    RdfXmlSyntaxError::msg("<!ENTITY declarations values should end with >")
860                })?;
861
862                // Resolves custom entities within the current entity definition.
863                let entity_value =
864                    unescape_with(entity_value, |e| self.resolve_entity(e)).map_err(Error::from)?;
865                self.custom_entities
866                    .insert(entity_name.to_owned(), entity_value.to_string());
867            }
868        }
869        Ok(())
870    }
871
872    fn parse_start_event(
873        &mut self,
874        event: &BytesStart<'_>,
875        results: &mut Vec<Triple>,
876    ) -> Result<(), RdfXmlParseError> {
877        #[derive(PartialEq, Eq)]
878        enum RdfXmlParseType {
879            Default,
880            Collection,
881            Literal,
882            Resource,
883            Other,
884        }
885
886        #[derive(PartialEq, Eq)]
887        enum RdfXmlNextProduction {
888            Rdf,
889            NodeElt,
890            PropertyElt { subject: NamedOrBlankNode },
891        }
892
893        // Literal case
894        if let Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) = self.state.last_mut()
895        {
896            let mut clean_event = BytesStart::new(
897                self.reader
898                    .decoder()
899                    .decode(event.name().as_ref())?
900                    .to_string(),
901            );
902            for attr in event.attributes() {
903                clean_event.push_attribute(attr.map_err(Error::InvalidAttr)?);
904            }
905            writer.write_event(Event::Start(clean_event))?;
906            self.in_literal_depth += 1;
907            return Ok(());
908        }
909
910        let tag_name = self.resolve_tag_name(event.name())?;
911
912        // We read attributes
913        let mut language = None;
914        let mut base_iri = None;
915        let mut id_attr = None;
916        let mut node_id_attr = None;
917        let mut about_attr = None;
918        let mut property_attrs = Vec::default();
919        let mut resource_attr = None;
920        let mut datatype_attr = None;
921        let mut parse_type = RdfXmlParseType::Default;
922        let mut type_attr = None;
923
924        for attribute in event.attributes() {
925            let attribute = attribute.map_err(Error::InvalidAttr)?;
926            if attribute.key.as_ref().starts_with(b"xml") {
927                if attribute.key.as_ref() == b"xml:lang" {
928                    let tag = self.convert_attribute(&attribute)?.to_ascii_lowercase();
929                    language = Some(if self.lenient {
930                        tag
931                    } else {
932                        LanguageTag::parse(tag.to_ascii_lowercase())
933                            .map_err(|error| RdfXmlSyntaxError::invalid_language_tag(tag, error))?
934                            .into_inner()
935                    });
936                } else if attribute.key.as_ref() == b"xml:base" {
937                    let iri = self.convert_attribute(&attribute)?;
938                    base_iri = Some(if self.lenient {
939                        Iri::parse_unchecked(iri.clone())
940                    } else {
941                        Iri::parse(iri.clone())
942                            .map_err(|error| RdfXmlSyntaxError::invalid_iri(iri, error))?
943                    })
944                } else {
945                    // We ignore other xml attributes
946                }
947            } else {
948                let attribute_url = self.resolve_attribute_name(attribute.key)?;
949                if *attribute_url == *RDF_ID {
950                    let mut id = self.convert_attribute(&attribute)?;
951                    if !is_nc_name(&id) {
952                        return Err(RdfXmlSyntaxError::msg(format!(
953                            "{id} is not a valid rdf:ID value"
954                        ))
955                        .into());
956                    }
957                    id.insert(0, '#');
958                    id_attr = Some(id);
959                } else if *attribute_url == *RDF_BAG_ID {
960                    let bag_id = self.convert_attribute(&attribute)?;
961                    if !is_nc_name(&bag_id) {
962                        return Err(RdfXmlSyntaxError::msg(format!(
963                            "{bag_id} is not a valid rdf:bagID value"
964                        ))
965                        .into());
966                    }
967                } else if *attribute_url == *RDF_NODE_ID {
968                    let id = self.convert_attribute(&attribute)?;
969                    if !is_nc_name(&id) {
970                        return Err(RdfXmlSyntaxError::msg(format!(
971                            "{id} is not a valid rdf:nodeID value"
972                        ))
973                        .into());
974                    }
975                    node_id_attr = Some(BlankNode::new_unchecked(id));
976                } else if *attribute_url == *RDF_ABOUT {
977                    about_attr = Some(attribute);
978                } else if *attribute_url == *RDF_RESOURCE {
979                    resource_attr = Some(attribute);
980                } else if *attribute_url == *RDF_DATATYPE {
981                    datatype_attr = Some(attribute);
982                } else if *attribute_url == *RDF_PARSE_TYPE {
983                    parse_type = match attribute.value.as_ref() {
984                        b"Collection" => RdfXmlParseType::Collection,
985                        b"Literal" => RdfXmlParseType::Literal,
986                        b"Resource" => RdfXmlParseType::Resource,
987                        _ => RdfXmlParseType::Other,
988                    };
989                } else if attribute_url == RDF_TYPE {
990                    type_attr = Some(attribute);
991                } else if RESERVED_RDF_ATTRIBUTES.contains(&&*attribute_url) {
992                    return Err(RdfXmlSyntaxError::msg(format!(
993                        "{attribute_url} is not a valid attribute"
994                    ))
995                    .into());
996                } else {
997                    property_attrs.push((
998                        self.parse_iri(attribute_url)?,
999                        self.convert_attribute(&attribute)?,
1000                    ));
1001                }
1002            }
1003        }
1004
1005        // Parsing with the base URI
1006        let id_attr = match id_attr {
1007            Some(iri) => {
1008                let iri = self.resolve_iri(base_iri.as_ref(), iri)?;
1009                if !self.lenient {
1010                    if self.known_rdf_id.contains(iri.as_str()) {
1011                        return Err(RdfXmlSyntaxError::msg(format!(
1012                            "{iri} has already been used as rdf:ID value"
1013                        ))
1014                        .into());
1015                    }
1016                    self.known_rdf_id.insert(iri.as_str().into());
1017                }
1018                Some(iri)
1019            }
1020            None => None,
1021        };
1022        let about_attr = match about_attr {
1023            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1024            None => None,
1025        };
1026        let resource_attr = match resource_attr {
1027            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1028            None => None,
1029        };
1030        let datatype_attr = match datatype_attr {
1031            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1032            None => None,
1033        };
1034        let type_attr = match type_attr {
1035            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1036            None => None,
1037        };
1038
1039        let expected_production = match self.state.last() {
1040            Some(RdfXmlState::Doc { .. }) => RdfXmlNextProduction::Rdf,
1041            Some(
1042                RdfXmlState::Rdf { .. }
1043                | RdfXmlState::PropertyElt { .. }
1044                | RdfXmlState::ParseTypeCollectionPropertyElt { .. },
1045            ) => RdfXmlNextProduction::NodeElt,
1046            Some(RdfXmlState::NodeElt { subject, .. }) => RdfXmlNextProduction::PropertyElt {
1047                subject: subject.clone(),
1048            },
1049            Some(RdfXmlState::ParseTypeLiteralPropertyElt { .. }) => {
1050                return Err(
1051                    RdfXmlSyntaxError::msg("ParseTypeLiteralPropertyElt production children should never be considered as a RDF/XML content").into()
1052                );
1053            }
1054            None => {
1055                return Err(RdfXmlSyntaxError::msg(
1056                    "No state in the stack: the XML is not balanced",
1057                )
1058                .into());
1059            }
1060        };
1061
1062        let new_state = match expected_production {
1063            RdfXmlNextProduction::Rdf => {
1064                if *tag_name == *RDF_RDF {
1065                    RdfXmlState::Rdf { base_iri, language }
1066                } else if RESERVED_RDF_ELEMENTS.contains(&&*tag_name) {
1067                    return Err(RdfXmlSyntaxError::msg(format!(
1068                        "Invalid node element tag name: {tag_name}"
1069                    ))
1070                    .into());
1071                } else {
1072                    self.build_node_elt(
1073                        self.parse_iri(tag_name)?,
1074                        base_iri,
1075                        language,
1076                        NodeElementAttributes {
1077                            id_attr,
1078                            node_id_attr,
1079                            about_attr,
1080                            type_attr,
1081                            property_attrs,
1082                        },
1083                        results,
1084                    )?
1085                }
1086            }
1087            RdfXmlNextProduction::NodeElt => {
1088                if RESERVED_RDF_ELEMENTS.contains(&&*tag_name) {
1089                    return Err(RdfXmlSyntaxError::msg(format!(
1090                        "Invalid property element tag name: {tag_name}"
1091                    ))
1092                    .into());
1093                }
1094                self.build_node_elt(
1095                    self.parse_iri(tag_name)?,
1096                    base_iri,
1097                    language,
1098                    NodeElementAttributes {
1099                        id_attr,
1100                        node_id_attr,
1101                        about_attr,
1102                        type_attr,
1103                        property_attrs,
1104                    },
1105                    results,
1106                )?
1107            }
1108            RdfXmlNextProduction::PropertyElt { subject } => {
1109                let iri = if *tag_name == *RDF_LI {
1110                    let Some(RdfXmlState::NodeElt { li_counter, .. }) = self.state.last_mut()
1111                    else {
1112                        return Err(RdfXmlSyntaxError::msg(format!(
1113                            "Invalid property element tag name: {tag_name}"
1114                        ))
1115                        .into());
1116                    };
1117                    *li_counter += 1;
1118                    NamedNode::new_unchecked(format!(
1119                        "http://www.w3.org/1999/02/22-rdf-syntax-ns#_{li_counter}"
1120                    ))
1121                } else if RESERVED_RDF_ELEMENTS.contains(&&*tag_name)
1122                    || *tag_name == *RDF_DESCRIPTION
1123                {
1124                    return Err(RdfXmlSyntaxError::msg(format!(
1125                        "Invalid property element tag name: {tag_name}"
1126                    ))
1127                    .into());
1128                } else {
1129                    self.parse_iri(tag_name)?
1130                };
1131                match parse_type {
1132                    RdfXmlParseType::Default => {
1133                        if resource_attr.is_some()
1134                            || node_id_attr.is_some()
1135                            || !property_attrs.is_empty()
1136                        {
1137                            let object = match (resource_attr, node_id_attr)
1138                            {
1139                                (Some(resource_attr), None) => NamedOrBlankNode::from(resource_attr),
1140                                (None, Some(node_id_attr)) => node_id_attr.into(),
1141                                (None, None) => BlankNode::default().into(),
1142                                (Some(_), Some(_)) => return Err(RdfXmlSyntaxError::msg("Not both rdf:resource and rdf:nodeID could be set at the same time").into())
1143                            };
1144                            self.emit_property_attrs(
1145                                &object,
1146                                property_attrs,
1147                                language.as_deref(),
1148                                results,
1149                            );
1150                            if let Some(type_attr) = type_attr {
1151                                results.push(Triple::new(
1152                                    crate::model::term::Subject::from(object.clone()),
1153                                    NamedNode::new_unchecked(RDF_TYPE),
1154                                    type_attr,
1155                                ));
1156                            }
1157                            RdfXmlState::PropertyElt {
1158                                iri,
1159                                base_iri,
1160                                language,
1161                                subject,
1162                                object: Some(NodeOrText::Node(object)),
1163                                id_attr,
1164                                datatype_attr,
1165                            }
1166                        } else {
1167                            RdfXmlState::PropertyElt {
1168                                iri,
1169                                base_iri,
1170                                language,
1171                                subject,
1172                                object: None,
1173                                id_attr,
1174                                datatype_attr,
1175                            }
1176                        }
1177                    }
1178                    RdfXmlParseType::Literal => RdfXmlState::ParseTypeLiteralPropertyElt {
1179                        iri,
1180                        base_iri,
1181                        language,
1182                        subject,
1183                        writer: Writer::new(Vec::default()),
1184                        id_attr,
1185                        emit: true,
1186                    },
1187                    RdfXmlParseType::Resource => Self::build_parse_type_resource_property_elt(
1188                        iri, base_iri, language, subject, id_attr, results,
1189                    ),
1190                    RdfXmlParseType::Collection => RdfXmlState::ParseTypeCollectionPropertyElt {
1191                        iri,
1192                        base_iri,
1193                        language,
1194                        subject,
1195                        objects: Vec::default(),
1196                        id_attr,
1197                    },
1198                    RdfXmlParseType::Other => RdfXmlState::ParseTypeLiteralPropertyElt {
1199                        iri,
1200                        base_iri,
1201                        language,
1202                        subject,
1203                        writer: Writer::new(Vec::default()),
1204                        id_attr,
1205                        emit: false,
1206                    },
1207                }
1208            }
1209        };
1210        self.state.push(new_state);
1211        Ok(())
1212    }
1213
1214    fn parse_end_event(
1215        &mut self,
1216        event: &BytesEnd<'_>,
1217        results: &mut Vec<Triple>,
1218    ) -> Result<(), RdfXmlParseError> {
1219        // Literal case
1220        if self.in_literal_depth > 0 {
1221            if let Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) =
1222                self.state.last_mut()
1223            {
1224                writer.write_event(Event::End(BytesEnd::new(
1225                    self.reader.decoder().decode(event.name().as_ref())?,
1226                )))?;
1227                self.in_literal_depth -= 1;
1228                return Ok(());
1229            }
1230        }
1231
1232        if let Some(current_state) = self.state.pop() {
1233            self.end_state(current_state, results)?;
1234        }
1235        Ok(())
1236    }
1237
1238    fn parse_text_event(&mut self, event: &BytesText<'_>) -> Result<(), RdfXmlParseError> {
1239        let text =
1240            unescape_with(std::str::from_utf8(event)?, |e| self.resolve_entity(e))?.to_string();
1241        match self.state.last_mut() {
1242            Some(RdfXmlState::PropertyElt { object, .. }) => {
1243                if is_object_defined(object) {
1244                    if text.bytes().all(is_whitespace) {
1245                        Ok(()) // whitespace anyway, we ignore
1246                    } else {
1247                        Err(
1248                            RdfXmlSyntaxError::msg(format!("Unexpected text event: '{text}'"))
1249                                .into(),
1250                        )
1251                    }
1252                } else {
1253                    *object = Some(NodeOrText::Text(text));
1254                    Ok(())
1255                }
1256            }
1257            Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) => {
1258                writer.write_event(Event::Text(BytesText::new(&text)))?;
1259                Ok(())
1260            }
1261            _ => {
1262                if text.bytes().all(is_whitespace) {
1263                    Ok(())
1264                } else {
1265                    Err(RdfXmlSyntaxError::msg(format!("Unexpected text event: '{text}'")).into())
1266                }
1267            }
1268        }
1269    }
1270
1271    fn resolve_tag_name(&self, qname: QName<'_>) -> Result<String, RdfXmlParseError> {
1272        let (namespace, local_name) = self.reader.resolve_element(qname);
1273        self.resolve_ns_name(namespace, local_name)
1274    }
1275
1276    fn resolve_attribute_name(&self, qname: QName<'_>) -> Result<String, RdfXmlParseError> {
1277        let (namespace, local_name) = self.reader.resolve_attribute(qname);
1278        self.resolve_ns_name(namespace, local_name)
1279    }
1280
1281    fn resolve_ns_name(
1282        &self,
1283        namespace: ResolveResult<'_>,
1284        local_name: LocalName<'_>,
1285    ) -> Result<String, RdfXmlParseError> {
1286        match namespace {
1287            ResolveResult::Bound(ns) => {
1288                let mut value = Vec::with_capacity(ns.as_ref().len() + local_name.as_ref().len());
1289                value.extend_from_slice(ns.as_ref());
1290                value.extend_from_slice(local_name.as_ref());
1291                Ok(unescape_with(&self.reader.decoder().decode(&value)?, |e| {
1292                    self.resolve_entity(e)
1293                })
1294                .map_err(Error::from)?
1295                .to_string())
1296            }
1297            ResolveResult::Unbound => {
1298                Err(RdfXmlSyntaxError::msg("XML namespaces are required in RDF/XML").into())
1299            }
1300            ResolveResult::Unknown(v) => Err(RdfXmlSyntaxError::msg(format!(
1301                "Unknown prefix {}:",
1302                self.reader.decoder().decode(&v)?
1303            ))
1304            .into()),
1305        }
1306    }
1307
1308    fn build_node_elt(
1309        &self,
1310        iri: NamedNode,
1311        base_iri: Option<Iri<String>>,
1312        language: Option<String>,
1313        attrs: NodeElementAttributes,
1314        results: &mut Vec<Triple>,
1315    ) -> Result<RdfXmlState, RdfXmlSyntaxError> {
1316        let subject = match (attrs.id_attr, attrs.node_id_attr, attrs.about_attr) {
1317            (Some(id_attr), None, None) => NamedOrBlankNode::from(id_attr),
1318            (None, Some(node_id_attr), None) => node_id_attr.into(),
1319            (None, None, Some(about_attr)) => about_attr.into(),
1320            (None, None, None) => BlankNode::default().into(),
1321            (Some(_), Some(_), _) => {
1322                return Err(RdfXmlSyntaxError::msg(
1323                    "Not both rdf:ID and rdf:nodeID could be set at the same time",
1324                ));
1325            }
1326            (_, Some(_), Some(_)) => {
1327                return Err(RdfXmlSyntaxError::msg(
1328                    "Not both rdf:nodeID and rdf:resource could be set at the same time",
1329                ));
1330            }
1331            (Some(_), _, Some(_)) => {
1332                return Err(RdfXmlSyntaxError::msg(
1333                    "Not both rdf:ID and rdf:resource could be set at the same time",
1334                ));
1335            }
1336        };
1337
1338        self.emit_property_attrs(&subject, attrs.property_attrs, language.as_deref(), results);
1339
1340        if let Some(type_attr) = attrs.type_attr {
1341            results.push(Triple::new(
1342                crate::model::term::Subject::from(subject.clone()),
1343                NamedNode::new_unchecked(RDF_TYPE),
1344                type_attr,
1345            ));
1346        }
1347
1348        if iri.as_str() != RDF_DESCRIPTION {
1349            results.push(Triple::new(
1350                crate::model::term::Subject::from(subject.clone()),
1351                NamedNode::new_unchecked(RDF_TYPE),
1352                iri,
1353            ));
1354        }
1355        Ok(RdfXmlState::NodeElt {
1356            base_iri,
1357            language,
1358            subject,
1359            li_counter: 0,
1360        })
1361    }
1362
1363    fn build_parse_type_resource_property_elt(
1364        iri: NamedNode,
1365        base_iri: Option<Iri<String>>,
1366        language: Option<String>,
1367        subject: NamedOrBlankNode,
1368        id_attr: Option<NamedNode>,
1369        results: &mut Vec<Triple>,
1370    ) -> RdfXmlState {
1371        let object = BlankNode::default();
1372        let triple = Triple::new(
1373            crate::model::term::Subject::from(subject),
1374            iri,
1375            object.clone(),
1376        );
1377        if let Some(id_attr) = id_attr {
1378            Self::reify(triple.clone(), id_attr, results);
1379        }
1380        results.push(triple);
1381        RdfXmlState::NodeElt {
1382            base_iri,
1383            language,
1384            subject: object.into(),
1385            li_counter: 0,
1386        }
1387    }
1388
1389    fn end_state(
1390        &mut self,
1391        state: RdfXmlState,
1392        results: &mut Vec<Triple>,
1393    ) -> Result<(), RdfXmlSyntaxError> {
1394        match state {
1395            RdfXmlState::PropertyElt {
1396                iri,
1397                language,
1398                subject,
1399                id_attr,
1400                datatype_attr,
1401                object,
1402                ..
1403            } => {
1404                let object = match object {
1405                    Some(NodeOrText::Node(node)) => match node {
1406                        NamedOrBlankNode::NamedNode(n) => Object::NamedNode(n),
1407                        NamedOrBlankNode::BlankNode(b) => Object::BlankNode(b),
1408                    },
1409                    Some(NodeOrText::Text(text)) => {
1410                        Object::Literal(self.new_literal(text, language, datatype_attr))
1411                    }
1412                    None => {
1413                        Object::Literal(self.new_literal(String::new(), language, datatype_attr))
1414                    }
1415                };
1416                let triple = Triple::new(crate::model::term::Subject::from(subject), iri, object);
1417                if let Some(id_attr) = id_attr {
1418                    Self::reify(triple.clone(), id_attr, results);
1419                }
1420                results.push(triple);
1421            }
1422            RdfXmlState::ParseTypeCollectionPropertyElt {
1423                iri,
1424                subject,
1425                id_attr,
1426                objects,
1427                ..
1428            } => {
1429                let mut current_node = NamedOrBlankNode::from(NamedNode::new_unchecked(RDF_NIL));
1430                for object in objects.into_iter().rev() {
1431                    let subject = NamedOrBlankNode::from(BlankNode::default());
1432                    results.push(Triple::new(
1433                        crate::model::term::Subject::from(subject.clone()),
1434                        NamedNode::new_unchecked(RDF_FIRST),
1435                        object,
1436                    ));
1437                    results.push(Triple::new(
1438                        crate::model::term::Subject::from(subject.clone()),
1439                        NamedNode::new_unchecked(RDF_REST),
1440                        crate::model::term::Object::from(current_node.clone()),
1441                    ));
1442                    current_node = subject;
1443                }
1444                let triple = Triple::new(
1445                    crate::model::term::Subject::from(subject),
1446                    iri,
1447                    crate::model::term::Object::from(current_node),
1448                );
1449                if let Some(id_attr) = id_attr {
1450                    Self::reify(triple.clone(), id_attr, results);
1451                }
1452                results.push(triple);
1453            }
1454            RdfXmlState::ParseTypeLiteralPropertyElt {
1455                iri,
1456                subject,
1457                id_attr,
1458                writer,
1459                emit,
1460                ..
1461            } => {
1462                if emit {
1463                    let object = writer.into_inner();
1464                    if object.is_empty() {
1465                        return Err(RdfXmlSyntaxError::msg(format!(
1466                            "No value found for rdf:XMLLiteral value of property {iri}"
1467                        )));
1468                    }
1469                    let triple = Triple::new(
1470                        crate::model::term::Subject::from(subject),
1471                        iri,
1472                        Literal::new_typed_literal(
1473                            str::from_utf8(&object).map_err(|_| {
1474                                RdfXmlSyntaxError::msg(
1475                                    "The XML literal is not in valid UTF-8".to_owned(),
1476                                )
1477                            })?,
1478                            NamedNode::new_unchecked(RDF_XML_LITERAL),
1479                        ),
1480                    );
1481                    if let Some(id_attr) = id_attr {
1482                        Self::reify(triple.clone(), id_attr, results);
1483                    }
1484                    results.push(triple);
1485                }
1486            }
1487            RdfXmlState::NodeElt { subject, .. } => match self.state.last_mut() {
1488                Some(RdfXmlState::PropertyElt { object, .. }) => {
1489                    if is_object_defined(object) {
1490                        return Err(RdfXmlSyntaxError::msg(
1491                            "Unexpected node, a text value is already present",
1492                        ));
1493                    }
1494                    *object = Some(NodeOrText::Node(subject))
1495                }
1496                Some(RdfXmlState::ParseTypeCollectionPropertyElt { objects, .. }) => {
1497                    objects.push(subject)
1498                }
1499                _ => (),
1500            },
1501            _ => (),
1502        }
1503        Ok(())
1504    }
1505
1506    fn new_literal(
1507        &self,
1508        value: String,
1509        language: Option<String>,
1510        datatype: Option<NamedNode>,
1511    ) -> Literal {
1512        if let Some(datatype) = datatype {
1513            Literal::new_typed_literal(value, datatype)
1514        } else if let Some(language) =
1515            language.or_else(|| self.current_language().map(ToOwned::to_owned))
1516        {
1517            Literal::new_language_tagged_literal_unchecked(value, language)
1518        } else {
1519            Literal::new_simple_literal(value)
1520        }
1521    }
1522
1523    fn reify(triple: Triple, statement_id: NamedNode, results: &mut Vec<Triple>) {
1524        results.push(Triple::new(
1525            statement_id.clone(),
1526            NamedNode::new_unchecked(RDF_TYPE),
1527            NamedNode::new_unchecked(RDF_STATEMENT),
1528        ));
1529        results.push(Triple::new(
1530            statement_id.clone(),
1531            NamedNode::new_unchecked(RDF_SUBJECT),
1532            match triple.subject() {
1533                Subject::NamedNode(n) => Object::NamedNode(n.clone()),
1534                Subject::BlankNode(b) => Object::BlankNode(b.clone()),
1535                Subject::Variable(v) => Object::Variable(v.clone()),
1536                Subject::QuotedTriple(qt) => Object::QuotedTriple(qt.clone()),
1537            },
1538        ));
1539        results.push(Triple::new(
1540            statement_id.clone(),
1541            NamedNode::new_unchecked(RDF_PREDICATE),
1542            match triple.predicate() {
1543                Predicate::NamedNode(n) => Object::NamedNode(n.clone()),
1544                Predicate::Variable(v) => Object::Variable(v.clone()),
1545            },
1546        ));
1547        results.push(Triple::new(
1548            statement_id,
1549            NamedNode::new_unchecked(RDF_OBJECT),
1550            triple.object().clone(),
1551        ));
1552    }
1553
1554    fn emit_property_attrs(
1555        &self,
1556        subject: &NamedOrBlankNode,
1557        literal_attributes: Vec<(NamedNode, String)>,
1558        language: Option<&str>,
1559        results: &mut Vec<Triple>,
1560    ) {
1561        for (literal_predicate, literal_value) in literal_attributes {
1562            results.push(Triple::new(
1563                crate::model::term::Subject::from(subject.clone()),
1564                literal_predicate,
1565                if let Some(language) = language.or_else(|| self.current_language()) {
1566                    Literal::new_lang(&literal_value, language)
1567                        .unwrap_or_else(|_| Literal::new(literal_value))
1568                } else {
1569                    Literal::new(literal_value)
1570                },
1571            ));
1572        }
1573    }
1574
1575    fn convert_attribute(&self, attribute: &Attribute<'_>) -> Result<String, RdfXmlParseError> {
1576        Ok(attribute
1577            .decode_and_unescape_value_with(self.reader.decoder(), |e| self.resolve_entity(e))?
1578            .into_owned())
1579    }
1580
1581    fn convert_iri_attribute(
1582        &self,
1583        base_iri: Option<&Iri<String>>,
1584        attribute: &Attribute<'_>,
1585    ) -> Result<NamedNode, RdfXmlParseError> {
1586        let converted = self.convert_attribute(attribute)?;
1587        self.resolve_iri(base_iri, converted)
1588            .map_err(RdfXmlParseError::Syntax)
1589    }
1590
1591    fn resolve_iri(
1592        &self,
1593        base_iri: Option<&Iri<String>>,
1594        relative_iri: String,
1595    ) -> Result<NamedNode, RdfXmlSyntaxError> {
1596        if let Some(base_iri) = base_iri.or_else(|| self.current_base_iri()) {
1597            Ok(NamedNode::new_unchecked(if self.lenient {
1598                base_iri.resolve_unchecked(&relative_iri).into_inner()
1599            } else {
1600                base_iri
1601                    .resolve(&relative_iri)
1602                    .map_err(|error| RdfXmlSyntaxError::invalid_iri(relative_iri, error))?
1603                    .into_inner()
1604            }))
1605        } else {
1606            self.parse_iri(relative_iri)
1607        }
1608    }
1609
1610    fn parse_iri(&self, relative_iri: String) -> Result<NamedNode, RdfXmlSyntaxError> {
1611        Ok(NamedNode::new_unchecked(if self.lenient {
1612            relative_iri
1613        } else {
1614            Iri::parse(relative_iri.clone())
1615                .map_err(|error| RdfXmlSyntaxError::invalid_iri(relative_iri, error))?
1616                .into_inner()
1617        }))
1618    }
1619
1620    fn current_language(&self) -> Option<&str> {
1621        for state in self.state.iter().rev() {
1622            match state {
1623                RdfXmlState::Doc { .. } => (),
1624                RdfXmlState::Rdf { language, .. }
1625                | RdfXmlState::NodeElt { language, .. }
1626                | RdfXmlState::PropertyElt { language, .. }
1627                | RdfXmlState::ParseTypeCollectionPropertyElt { language, .. }
1628                | RdfXmlState::ParseTypeLiteralPropertyElt { language, .. } => {
1629                    if let Some(language) = language {
1630                        return Some(language);
1631                    }
1632                }
1633            }
1634        }
1635        None
1636    }
1637
1638    fn current_base_iri(&self) -> Option<&Iri<String>> {
1639        for state in self.state.iter().rev() {
1640            match state {
1641                RdfXmlState::Doc { base_iri }
1642                | RdfXmlState::Rdf { base_iri, .. }
1643                | RdfXmlState::NodeElt { base_iri, .. }
1644                | RdfXmlState::PropertyElt { base_iri, .. }
1645                | RdfXmlState::ParseTypeCollectionPropertyElt { base_iri, .. }
1646                | RdfXmlState::ParseTypeLiteralPropertyElt { base_iri, .. } => {
1647                    if let Some(base_iri) = base_iri {
1648                        return Some(base_iri);
1649                    }
1650                }
1651            }
1652        }
1653        None
1654    }
1655
1656    fn resolve_entity(&self, e: &str) -> Option<&str> {
1657        resolve_xml_entity(e).or_else(|| self.custom_entities.get(e).map(String::as_str))
1658    }
1659}
1660
1661fn is_object_defined(object: &Option<NodeOrText>) -> bool {
1662    match object {
1663        Some(NodeOrText::Node(_)) => true,
1664        Some(NodeOrText::Text(t)) => !t.bytes().all(is_whitespace),
1665        None => false,
1666    }
1667}
1668
/// Returns `true` if `c` is a space, tab, line feed or carriage return byte.
fn is_whitespace(c: u8) -> bool {
    b" \t\n\r".contains(&c)
}
1672
/// Returns `true` if `encoding` is one of the recognized UTF-8 encoding
/// labels, compared ASCII case-insensitively.
fn is_utf8(encoding: &[u8]) -> bool {
    // Labels are stored in lowercase; the comparison ignores ASCII case.
    const UTF8_LABELS: [&[u8]; 6] = [
        b"unicode-1-1-utf-8",
        b"unicode11utf8",
        b"unicode20utf8",
        b"utf-8",
        b"utf8",
        b"x-unicode20utf8",
    ];
    UTF8_LABELS
        .iter()
        .any(|label| encoding.eq_ignore_ascii_case(label))
}