Skip to main content

oxirs_core/rdfxml/
parser.rs

1use crate::model::literal::LanguageTag;
2use crate::model::term::{Object, Predicate, Subject};
3use crate::model::{BlankNode, Literal, NamedNode, NamedOrBlankNode, Term, Triple};
4use crate::rdfxml::error::{RdfXmlParseError, RdfXmlSyntaxError};
5use crate::rdfxml::utils::*;
6use oxiri::{Iri, IriParseError};
7use quick_xml::escape::{resolve_xml_entity, unescape_with};
8use quick_xml::events::attributes::Attribute;
9use quick_xml::events::*;
10use quick_xml::name::{LocalName, NamespaceBindingsIter, PrefixDeclaration, QName, ResolveResult};
11use quick_xml::{Decoder, Error, NsReader, Writer};
12use std::borrow::Cow;
13use std::collections::{HashMap, HashSet};
14use std::io::{BufReader, Read};
15use std::str;
16#[cfg(feature = "async-tokio")]
17use tokio::io::{AsyncRead, BufReader as AsyncBufReader};
18
19impl From<NamedOrBlankNode> for Term {
20    fn from(node: NamedOrBlankNode) -> Self {
21        match node {
22            NamedOrBlankNode::NamedNode(n) => Term::NamedNode(n),
23            NamedOrBlankNode::BlankNode(n) => Term::BlankNode(n),
24        }
25    }
26}
27
28/// A [RDF/XML](https://www.w3.org/TR/rdf-syntax-grammar/) streaming parser.
29///
30/// It reads the file in streaming.
31/// It does not keep data in memory except a stack for handling nested XML tags, and a set of all
32/// seen `rdf:ID`s to detect duplicate ids and fail according to the specification.
33///
/// Its performance is not optimized yet and could hopefully be significantly improved by reducing the
/// number of allocations and copies done by the parser.
36///
37/// Count the number of people:
38/// ```
39/// use oxirs_core::model::NamedNode;
40/// use oxirs_core::{Predicate, Object};
41/// use oxirs_core::rdfxml::RdfXmlParser;
42///
43/// let file = br#"<?xml version="1.0"?>
44/// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
45///  <rdf:Description rdf:about="http://example.com/foo">
46///    <rdf:type rdf:resource="http://schema.org/Person" />
47///    <schema:name>Foo</schema:name>
48///  </rdf:Description>
49///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
50/// </rdf:RDF>"#;
51///
52/// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
53/// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
54/// let mut count = 0;
55/// for triple in RdfXmlParser::new().for_reader(file.as_ref()) {
56///     let triple = triple.unwrap();
57///     if matches!(triple.predicate(), oxirs_core::Predicate::NamedNode(n) if n == &rdf_type) && matches!(triple.object(), oxirs_core::Object::NamedNode(n) if n == &schema_person) {
58///         count += 1;
59///     }
60/// }
61/// assert_eq!(2, count);
62/// ```
#[derive(Default, Clone)]
#[must_use]
pub struct RdfXmlParser {
    // Enabled by `lenient()`; forwarded to the internal parser and to the prefixes iterator.
    lenient: bool,
    // Base IRI set by `with_base_iri()`; becomes the base of the initial `Doc` state.
    base: Option<Iri<String>>,
}
69
70impl RdfXmlParser {
71    /// Builds a new [`RdfXmlParser`].
72    #[inline]
73    pub fn new() -> Self {
74        Self::default()
75    }
76
77    /// Assumes the file is valid to make parsing faster.
78    ///
79    /// It will skip some validations.
80    ///
81    /// Note that if the file is actually not valid, the parser might emit broken RDF.
82    #[inline]
83    pub fn lenient(mut self) -> Self {
84        self.lenient = true;
85        self
86    }
87
88    #[deprecated(note = "Use `lenient()` instead", since = "0.2.0")]
89    #[inline]
90    pub fn unchecked(self) -> Self {
91        self.lenient()
92    }
93
94    #[inline]
95    pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
96        self.base = Some(Iri::parse(base_iri.into())?);
97        Ok(self)
98    }
99
100    /// Parses a RDF/XML file from a [`Read`] implementation.
101    ///
102    /// Count the number of people:
103    /// ```
104    /// use oxirs_core::model::NamedNode;
105    /// use oxirs_core::{Predicate, Object};
106    /// use oxirs_core::rdfxml::RdfXmlParser;
107    ///
108    /// let file = br#"<?xml version="1.0"?>
109    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
110    ///  <rdf:Description rdf:about="http://example.com/foo">
111    ///    <rdf:type rdf:resource="http://schema.org/Person" />
112    ///    <schema:name>Foo</schema:name>
113    ///  </rdf:Description>
114    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
115    /// </rdf:RDF>"#;
116    ///
117    /// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
118    /// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
119    /// let mut count = 0;
120    /// for triple in RdfXmlParser::new().for_reader(file.as_ref()) {
121    ///     let triple = triple.unwrap();
122    ///     if matches!(triple.predicate(), oxirs_core::Predicate::NamedNode(n) if n == &rdf_type) && matches!(triple.object(), oxirs_core::Object::NamedNode(n) if n == &schema_person) {
123    ///         count += 1;
124    ///     }
125    /// }
126    /// assert_eq!(2, count);
127    /// ```
128    pub fn for_reader<R: Read>(self, reader: R) -> ReaderRdfXmlParser<R> {
129        ReaderRdfXmlParser {
130            results: Vec::new(),
131            parser: self.into_internal(BufReader::new(reader)),
132            reader_buffer: Vec::default(),
133        }
134    }
135
136    /// Parses a RDF/XML file from a [`AsyncRead`] implementation.
137    ///
138    /// Count the number of people:
139    /// ```
140    /// # #[tokio::main(flavor = "current_thread")]
141    /// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
142    /// use oxirs_core::model::NamedNode;
143    /// use oxirs_core::{Predicate, Object};
144    /// use oxirs_core::rdfxml::RdfXmlParser;
145    ///
146    /// let file = br#"<?xml version="1.0"?>
147    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
148    ///   <rdf:Description rdf:about="http://example.com/foo">
149    ///     <rdf:type rdf:resource="http://schema.org/Person" />
150    ///     <schema:name>Foo</schema:name>
151    ///   </rdf:Description>
152    ///   <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
153    /// </rdf:RDF>"#;
154    ///
155    /// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
156    /// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
157    /// let mut count = 0;
158    /// let mut parser = RdfXmlParser::new().for_tokio_async_reader(file.as_ref());
159    /// while let Some(triple) = parser.next().await {
160    ///     let triple = triple.unwrap();
161    ///     if matches!(triple.predicate(), oxirs_core::Predicate::NamedNode(n) if n == &rdf_type) && matches!(triple.object(), oxirs_core::Object::NamedNode(n) if n == &schema_person) {
162    ///         count += 1;
163    ///     }
164    /// }
165    /// assert_eq!(2, count);
166    /// # Ok(())
167    /// # }
168    /// ```
169    #[cfg(feature = "async-tokio")]
170    pub fn for_tokio_async_reader<R: AsyncRead + Unpin>(
171        self,
172        reader: R,
173    ) -> TokioAsyncReaderRdfXmlParser<R> {
174        TokioAsyncReaderRdfXmlParser {
175            results: Vec::new(),
176            parser: self.into_internal(AsyncBufReader::new(reader)),
177            reader_buffer: Vec::default(),
178        }
179    }
180
181    /// Parses a RDF/XML file from a byte slice.
182    ///
183    /// Count the number of people:
184    /// ```
185    /// use oxirs_core::model::NamedNode;
186    /// use oxirs_core::{Predicate, Object};
187    /// use oxirs_core::rdfxml::RdfXmlParser;
188    ///
189    /// let file = br#"<?xml version="1.0"?>
190    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
191    ///  <rdf:Description rdf:about="http://example.com/foo">
192    ///    <rdf:type rdf:resource="http://schema.org/Person" />
193    ///    <schema:name>Foo</schema:name>
194    ///  </rdf:Description>
195    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
196    /// </rdf:RDF>"#;
197    ///
198    /// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
199    /// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
200    /// let mut count = 0;
201    /// for triple in RdfXmlParser::new().for_slice(file) {
202    ///     let triple = triple.unwrap();
203    ///     if matches!(triple.predicate(), oxirs_core::Predicate::NamedNode(n) if n == &rdf_type) && matches!(triple.object(), oxirs_core::Object::NamedNode(n) if n == &schema_person) {
204    ///         count += 1;
205    ///     }
206    /// }
207    /// assert_eq!(2, count);
208    /// ```
209    pub fn for_slice(self, slice: &[u8]) -> SliceRdfXmlParser<'_> {
210        SliceRdfXmlParser {
211            results: Vec::new(),
212            parser: self.into_internal(slice),
213            reader_buffer: Vec::default(),
214        }
215    }
216
217    fn into_internal<T>(self, reader: T) -> InternalRdfXmlParser<T> {
218        let mut reader = NsReader::from_reader(reader);
219        reader.config_mut().expand_empty_elements = true;
220        InternalRdfXmlParser {
221            reader,
222            state: vec![RdfXmlState::Doc {
223                base_iri: self.base.clone(),
224            }],
225            custom_entities: HashMap::new(),
226            in_literal_depth: 0,
227            known_rdf_id: HashSet::default(),
228            is_end: false,
229            lenient: self.lenient,
230        }
231    }
232}
233
234/// Parses a RDF/XML file from a [`Read`] implementation.
235///
236/// Can be built using [`RdfXmlParser::for_reader`].
237///
238/// Count the number of people:
239/// ```
240/// use oxirs_core::model::NamedNode;
241/// use oxirs_core::{Predicate, Object};
242/// use oxirs_core::rdfxml::RdfXmlParser;
243///
244/// let file = br#"<?xml version="1.0"?>
245/// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
246///  <rdf:Description rdf:about="http://example.com/foo">
247///    <rdf:type rdf:resource="http://schema.org/Person" />
248///    <schema:name>Foo</schema:name>
249///  </rdf:Description>
250///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
251/// </rdf:RDF>"#;
252///
253/// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
254/// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
255/// let mut count = 0;
256/// for triple in RdfXmlParser::new().for_reader(file.as_ref()) {
257///     let triple = triple.unwrap();
258///     if matches!(triple.predicate(), oxirs_core::Predicate::NamedNode(n) if n == &rdf_type) && matches!(triple.object(), oxirs_core::Object::NamedNode(n) if n == &schema_person) {
259///         count += 1;
260///     }
261/// }
262/// assert_eq!(2, count);
263/// ```
#[must_use]
pub struct ReaderRdfXmlParser<R: Read> {
    // Triples already produced by the parser but not yet returned by `next`.
    results: Vec<Triple>,
    // Parser state machine, reading from the buffered input.
    parser: InternalRdfXmlParser<BufReader<R>>,
    // Scratch buffer, cleared and handed to `read_event_into` on every parse step.
    reader_buffer: Vec<u8>,
}
270
271impl<R: Read> Iterator for ReaderRdfXmlParser<R> {
272    type Item = Result<Triple, RdfXmlParseError>;
273
274    fn next(&mut self) -> Option<Self::Item> {
275        loop {
276            if let Some(triple) = self.results.pop() {
277                return Some(Ok(triple));
278            } else if self.parser.is_end {
279                return None;
280            }
281            if let Err(e) = self.parse_step() {
282                return Some(Err(e));
283            }
284        }
285    }
286}
287
288impl<R: Read> ReaderRdfXmlParser<R> {
289    /// The list of IRI prefixes considered at the current step of the parsing.
290    ///
291    /// This method returns (prefix name, prefix value) tuples.
292    /// It is empty at the beginning of the parsing and gets updated when prefixes are encountered.
293    /// It should be full at the end of the parsing (but if a prefix is overridden, only the latest version will be returned).
294    ///
295    /// ```
296    /// use oxirs_core::rdfxml::RdfXmlParser;
297    ///
298    /// let file = br#"<?xml version="1.0"?>
299    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
300    ///  <rdf:Description rdf:about="http://example.com/foo">
301    ///    <rdf:type rdf:resource="http://schema.org/Person" />
302    ///    <schema:name>Foo</schema:name>
303    ///  </rdf:Description>
304    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
305    /// </rdf:RDF>"#;
306    ///
307    /// let mut parser = RdfXmlParser::new().for_reader(file.as_ref());
308    /// assert_eq!(parser.prefixes().collect::<Vec<_>>(), []); // No prefix at the beginning
309    ///
310    /// parser.next().unwrap().unwrap(); // We read the first triple
311    /// assert_eq!(
312    ///     parser.prefixes().collect::<Vec<_>>(),
313    ///     [
314    ///         ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
315    ///         ("schema", "http://schema.org/")
316    ///     ]
317    /// ); // There are now prefixes
318    /// ```
319    pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
320        RdfXmlPrefixesIter {
321            inner: self.parser.reader.resolver().bindings(),
322            decoder: self.parser.reader.decoder(),
323            lenient: self.parser.lenient,
324        }
325    }
326
327    /// The base IRI considered at the current step of the parsing.
328    ///
329    /// ```
330    /// use oxirs_core::rdfxml::RdfXmlParser;
331    ///
332    /// let file = br#"<?xml version="1.0"?>
333    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xml:base="http://example.com/">
334    ///  <rdf:Description rdf:about="foo">
335    ///    <rdf:type rdf:resource="http://schema.org/Person" />
336    ///  </rdf:Description>
337    /// </rdf:RDF>"#;
338    ///
339    /// let mut parser = RdfXmlParser::new().for_reader(file.as_ref());
340    /// assert!(parser.base_iri().is_none()); // No base at the beginning because none has been given to the parser.
341    ///
342    /// parser.next().unwrap().unwrap(); // We read the first triple
343    /// assert_eq!(parser.base_iri(), Some("http://example.com/")); // There is now a base IRI.
344    /// ```
345    pub fn base_iri(&self) -> Option<&str> {
346        Some(self.parser.current_base_iri()?.as_str())
347    }
348
349    /// The current byte position in the input data.
350    pub fn buffer_position(&self) -> u64 {
351        self.parser.reader.buffer_position()
352    }
353
354    fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
355        self.reader_buffer.clear();
356        let event = self
357            .parser
358            .reader
359            .read_event_into(&mut self.reader_buffer)?;
360        self.parser.parse_event(event, &mut self.results)
361    }
362}
363
364/// Parses a RDF/XML file from a [`AsyncRead`] implementation.
365///
366/// Can be built using [`RdfXmlParser::for_tokio_async_reader`].
367///
368/// Count the number of people:
369/// ```
370/// # #[tokio::main(flavor = "current_thread")]
371/// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
372/// use oxirs_core::model::NamedNode;
373/// use oxirs_core::{Predicate, Object};
374/// use oxirs_core::rdfxml::RdfXmlParser;
375///
376/// let file = br#"<?xml version="1.0"?>
377/// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
378///   <rdf:Description rdf:about="http://example.com/foo">
379///     <rdf:type rdf:resource="http://schema.org/Person" />
380///     <schema:name>Foo</schema:name>
381///   </rdf:Description>
382///   <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
383/// </rdf:RDF>"#;
384///
385/// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
386/// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
387/// let mut count = 0;
388/// let mut parser = RdfXmlParser::new().for_tokio_async_reader(file.as_ref());
389/// while let Some(triple) = parser.next().await {
390///     let triple = triple.unwrap();
391///     if matches!(triple.predicate(), oxirs_core::Predicate::NamedNode(n) if n == &rdf_type) && matches!(triple.object(), oxirs_core::Object::NamedNode(n) if n == &schema_person) {
392///         count += 1;
393///     }
394/// }
395/// assert_eq!(2, count);
396/// # Ok(())
397/// # }
398/// ```
#[cfg(feature = "async-tokio")]
#[must_use]
pub struct TokioAsyncReaderRdfXmlParser<R: AsyncRead + Unpin> {
    // Triples already produced by the parser but not yet returned by `next`.
    results: Vec<Triple>,
    // Parser state machine, reading from the buffered asynchronous input.
    parser: InternalRdfXmlParser<AsyncBufReader<R>>,
    // Scratch buffer, cleared and handed to `read_event_into_async` on every parse step.
    reader_buffer: Vec<u8>,
}
406
407#[cfg(feature = "async-tokio")]
408impl<R: AsyncRead + Unpin> TokioAsyncReaderRdfXmlParser<R> {
409    /// Reads the next triple or returns `None` if the file is finished.
410    pub async fn next(&mut self) -> Option<Result<Triple, RdfXmlParseError>> {
411        loop {
412            if let Some(triple) = self.results.pop() {
413                return Some(Ok(triple));
414            } else if self.parser.is_end {
415                return None;
416            }
417            if let Err(e) = self.parse_step().await {
418                return Some(Err(e));
419            }
420        }
421    }
422
423    /// The list of IRI prefixes considered at the current step of the parsing.
424    ///
425    /// This method returns (prefix name, prefix value) tuples.
426    /// It is empty at the beginning of the parsing and gets updated when prefixes are encountered.
427    /// It should be full at the end of the parsing (but if a prefix is overridden, only the latest version will be returned).
428    ///
429    /// ```
430    /// # #[tokio::main(flavor = "current_thread")]
431    /// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
432    /// use oxirs_core::rdfxml::RdfXmlParser;
433    ///
434    /// let file = br#"<?xml version="1.0"?>
435    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
436    ///  <rdf:Description rdf:about="http://example.com/foo">
437    ///    <rdf:type rdf:resource="http://schema.org/Person" />
438    ///    <schema:name>Foo</schema:name>
439    ///  </rdf:Description>
440    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
441    /// </rdf:RDF>"#;
442    ///
443    /// let mut parser = RdfXmlParser::new().for_tokio_async_reader(file.as_ref());
444    /// assert_eq!(parser.prefixes().collect::<Vec<_>>(), []); // No prefix at the beginning
445    ///
446    /// parser.next().await.unwrap().unwrap(); // We read the first triple
447    /// assert_eq!(
448    ///     parser.prefixes().collect::<Vec<_>>(),
449    ///     [
450    ///         ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
451    ///         ("schema", "http://schema.org/")
452    ///     ]
453    /// ); // There are now prefixes
454    /// //
455    /// # Ok(())
456    /// # }
457    /// ```
458    pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
459        RdfXmlPrefixesIter {
460            inner: self.parser.reader.resolver().bindings(),
461            decoder: self.parser.reader.decoder(),
462            lenient: self.parser.lenient,
463        }
464    }
465
466    /// The base IRI considered at the current step of the parsing.
467    ///
468    /// ```
469    /// # #[tokio::main(flavor = "current_thread")]
470    /// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
471    /// use oxirs_core::rdfxml::RdfXmlParser;
472    ///
473    /// let file = br#"<?xml version="1.0"?>
474    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xml:base="http://example.com/">
475    ///  <rdf:Description rdf:about="foo">
476    ///    <rdf:type rdf:resource="http://schema.org/Person" />
477    ///  </rdf:Description>
478    /// </rdf:RDF>"#;
479    ///
480    /// let mut parser = RdfXmlParser::new().for_tokio_async_reader(file.as_ref());
481    /// assert!(parser.base_iri().is_none()); // No base at the beginning because none has been given to the parser.
482    ///
483    /// parser.next().await.unwrap().unwrap(); // We read the first triple
484    /// assert_eq!(parser.base_iri(), Some("http://example.com/")); // There is now a base IRI.
485    /// # Ok(())
486    /// # }
487    /// ```
488    pub fn base_iri(&self) -> Option<&str> {
489        Some(self.parser.current_base_iri()?.as_str())
490    }
491
492    /// The current byte position in the input data.
493    pub fn buffer_position(&self) -> u64 {
494        self.parser.reader.buffer_position()
495    }
496
497    async fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
498        self.reader_buffer.clear();
499        let event = self
500            .parser
501            .reader
502            .read_event_into_async(&mut self.reader_buffer)
503            .await?;
504        self.parser.parse_event(event, &mut self.results)
505    }
506}
507
508/// Parses a RDF/XML file from a byte slice.
509///
510/// Can be built using [`RdfXmlParser::for_slice`].
511///
512/// Count the number of people:
513/// ```
514/// use oxirs_core::model::NamedNode;
515/// use oxirs_core::{Predicate, Object};
516/// use oxirs_core::rdfxml::RdfXmlParser;
517///
518/// let file = br#"<?xml version="1.0"?>
519/// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
520///  <rdf:Description rdf:about="http://example.com/foo">
521///    <rdf:type rdf:resource="http://schema.org/Person" />
522///    <schema:name>Foo</schema:name>
523///  </rdf:Description>
524///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
525/// </rdf:RDF>"#;
526///
527/// let schema_person = NamedNode::new("http://schema.org/Person").unwrap();
528/// let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
529/// let mut count = 0;
530/// for triple in RdfXmlParser::new().for_slice(file) {
531///     let triple = triple.unwrap();
532///     if matches!(triple.predicate(), oxirs_core::Predicate::NamedNode(n) if n == &rdf_type) && matches!(triple.object(), oxirs_core::Object::NamedNode(n) if n == &schema_person) {
533///         count += 1;
534///     }
535/// }
536/// assert_eq!(2, count);
537/// ```
#[must_use]
pub struct SliceRdfXmlParser<'a> {
    // Triples already produced by the parser but not yet returned by `next`.
    results: Vec<Triple>,
    // Parser state machine, reading directly from the borrowed byte slice.
    parser: InternalRdfXmlParser<&'a [u8]>,
    // Scratch buffer, cleared and handed to `read_event_into` on every parse step.
    reader_buffer: Vec<u8>,
}
544
545impl Iterator for SliceRdfXmlParser<'_> {
546    type Item = Result<Triple, RdfXmlSyntaxError>;
547
548    fn next(&mut self) -> Option<Self::Item> {
549        loop {
550            if let Some(triple) = self.results.pop() {
551                return Some(Ok(triple));
552            } else if self.parser.is_end {
553                return None;
554            }
555            if let Err(RdfXmlParseError::Syntax(e)) = self.parse_step() {
556                // I/O errors can't happen
557                return Some(Err(e));
558            }
559        }
560    }
561}
562
563impl SliceRdfXmlParser<'_> {
564    /// The list of IRI prefixes considered at the current step of the parsing.
565    ///
566    /// This method returns (prefix name, prefix value) tuples.
567    /// It is empty at the beginning of the parsing and gets updated when prefixes are encountered.
568    /// It should be full at the end of the parsing (but if a prefix is overridden, only the latest version will be returned).
569    ///
570    /// ```
571    /// use oxirs_core::rdfxml::RdfXmlParser;
572    ///
573    /// let file = br#"<?xml version="1.0"?>
574    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
575    ///  <rdf:Description rdf:about="http://example.com/foo">
576    ///    <rdf:type rdf:resource="http://schema.org/Person" />
577    ///    <schema:name>Foo</schema:name>
578    ///  </rdf:Description>
579    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
580    /// </rdf:RDF>"#;
581    ///
582    /// let mut parser = RdfXmlParser::new().for_slice(file);
583    /// assert_eq!(parser.prefixes().collect::<Vec<_>>(), []); // No prefix at the beginning
584    ///
585    /// parser.next().unwrap().unwrap(); // We read the first triple
586    /// assert_eq!(
587    ///     parser.prefixes().collect::<Vec<_>>(),
588    ///     [
589    ///         ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
590    ///         ("schema", "http://schema.org/")
591    ///     ]
592    /// ); // There are now prefixes
593    /// ```
594    pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
595        RdfXmlPrefixesIter {
596            inner: self.parser.reader.resolver().bindings(),
597            decoder: self.parser.reader.decoder(),
598            lenient: self.parser.lenient,
599        }
600    }
601
602    /// The base IRI considered at the current step of the parsing.
603    ///
604    /// ```
605    /// use oxirs_core::rdfxml::RdfXmlParser;
606    ///
607    /// let file = br#"<?xml version="1.0"?>
608    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xml:base="http://example.com/">
609    ///  <rdf:Description rdf:about="foo">
610    ///    <rdf:type rdf:resource="http://schema.org/Person" />
611    ///  </rdf:Description>
612    /// </rdf:RDF>"#;
613    ///
614    /// let mut parser = RdfXmlParser::new().for_slice(file);
615    /// assert!(parser.base_iri().is_none()); // No base at the beginning because none has been given to the parser.
616    ///
617    /// parser.next().unwrap().unwrap(); // We read the first triple
618    /// assert_eq!(parser.base_iri(), Some("http://example.com/")); // There is now a base IRI.
619    /// ```
620    pub fn base_iri(&self) -> Option<&str> {
621        Some(self.parser.current_base_iri()?.as_str())
622    }
623
624    /// The current byte position in the input data.
625    pub fn buffer_position(&self) -> u64 {
626        self.parser.reader.buffer_position()
627    }
628
629    fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
630        self.reader_buffer.clear();
631        let event = self
632            .parser
633            .reader
634            .read_event_into(&mut self.reader_buffer)?;
635        self.parser.parse_event(event, &mut self.results)
636    }
637}
638
639/// Iterator on the file prefixes.
640///
641/// See [`ReaderRdfXmlParser::prefixes`].
pub struct RdfXmlPrefixesIter<'a> {
    // Namespace bindings currently known to the XML reader's resolver.
    inner: NamespaceBindingsIter<'a>,
    // Decoder of the XML reader, used to turn raw binding bytes into `str`.
    decoder: Decoder,
    // When true, prefix names and namespace IRIs are returned without validation.
    lenient: bool,
}
647
648impl<'a> Iterator for RdfXmlPrefixesIter<'a> {
649    type Item = (&'a str, &'a str);
650
651    #[inline]
652    fn next(&mut self) -> Option<Self::Item> {
653        loop {
654            let (key, value) = self.inner.next()?;
655            return Some((
656                match key {
657                    PrefixDeclaration::Default => "",
658                    PrefixDeclaration::Named(name) => {
659                        let Ok(Cow::Borrowed(name)) = self.decoder.decode(name) else {
660                            continue;
661                        };
662                        let Ok(Cow::Borrowed(name)) = unescape_with(name, |_| None) else {
663                            continue;
664                        };
665                        if !self.lenient && !is_nc_name(name) {
666                            continue; // We don't return invalid prefixes
667                        }
668                        name
669                    }
670                },
671                {
672                    let Ok(Cow::Borrowed(value)) = self.decoder.decode(value.0) else {
673                        continue;
674                    };
675                    let Ok(Cow::Borrowed(value)) = unescape_with(value, |_| None) else {
676                        continue;
677                    };
678                    if !self.lenient && Iri::parse(value).is_err() {
679                        continue; // We don't return invalid prefixes
680                    }
681                    value
682                },
683            ));
684        }
685    }
686
687    #[inline]
688    fn size_hint(&self) -> (usize, Option<usize>) {
689        self.inner.size_hint()
690    }
691}
692
// IRIs of RDF namespace (`http://www.w3.org/1999/02/22-rdf-syntax-ns#`) terms
// that belong to the RDF/XML syntax itself (attribute and element names).
const RDF_ABOUT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about";
const RDF_ABOUT_EACH: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach";
const RDF_ABOUT_EACH_PREFIX: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix";
const RDF_BAG_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID";
const RDF_DATATYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype";
const RDF_DESCRIPTION: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description";
const RDF_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID";
const RDF_LI: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#li";
const RDF_NODE_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID";
const RDF_PARSE_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#parseType";
const RDF_RDF: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF";
const RDF_RESOURCE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource";

// RDF vocabulary constants
// NOTE(review): their uses are outside this chunk — presumably they name the predicates,
// classes and datatype of triples generated by the parser; confirm against the usage.
const RDF_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
const RDF_NIL: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil";
const RDF_FIRST: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first";
const RDF_REST: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest";
const RDF_STATEMENT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement";
const RDF_SUBJECT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject";
const RDF_PREDICATE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate";
const RDF_OBJECT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object";
const RDF_XML_LITERAL: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral";
716
// RDF namespace IRIs grouped as "reserved" for elements.
// NOTE(review): the code consuming this list is outside this chunk — presumably these
// IRIs are rejected when used as element names; confirm against the usage.
const RESERVED_RDF_ELEMENTS: [&str; 11] = [
    RDF_ABOUT,
    RDF_ABOUT_EACH,
    RDF_ABOUT_EACH_PREFIX,
    RDF_BAG_ID,
    RDF_DATATYPE,
    RDF_ID,
    RDF_LI,
    RDF_NODE_ID,
    RDF_PARSE_TYPE,
    RDF_RDF,
    RDF_RESOURCE,
];
// RDF namespace IRIs grouped as "reserved" for attributes.
// NOTE(review): presumably these IRIs are rejected when used as property attribute
// names; the consuming code is outside this chunk — confirm.
const RESERVED_RDF_ATTRIBUTES: [&str; 5] = [
    RDF_ABOUT_EACH,
    RDF_ABOUT_EACH_PREFIX,
    RDF_LI,
    RDF_RDF,
    RDF_RESOURCE,
];
737
// Value held by `RdfXmlState::PropertyElt::object`: either a node or a piece of text.
#[derive(Clone, Debug)]
enum NodeOrText {
    // A named or blank node.
    Node(NamedOrBlankNode),
    // Owned text content.
    Text(String),
}
743
// One entry of the parser's state stack (`InternalRdfXmlParser::state`).
// The stack is created with a single `Doc` entry (see `RdfXmlParser::into_internal`).
// NOTE(review): the transitions between states are implemented outside this chunk;
// the per-variant notes below are inferred from the names — confirm against the
// event handlers.
enum RdfXmlState {
    // Initial state; carries the base IRI configured on the parser, if any.
    Doc {
        base_iri: Option<Iri<String>>,
    },
    // Presumably the state inside the `rdf:RDF` root element.
    Rdf {
        base_iri: Option<Iri<String>>,
        language: Option<String>,
    },
    // Presumably the state inside a node element describing `subject`.
    NodeElt {
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        // Presumably counts the `rdf:li` children seen so far — confirm.
        li_counter: u64,
    },
    PropertyElt {
        // Resource, Literal or Empty property element
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        // Node or text value collected for the property so far, if any.
        object: Option<NodeOrText>,
        id_attr: Option<NamedNode>,
        datatype_attr: Option<NamedNode>,
    },
    // Presumably a property element with `rdf:parseType="Collection"`.
    ParseTypeCollectionPropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        // Nodes collected for the property.
        objects: Vec<NamedOrBlankNode>,
        id_attr: Option<NamedNode>,
    },
    // Presumably a property element with `rdf:parseType="Literal"`.
    ParseTypeLiteralPropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        // XML writer backed by an in-memory byte buffer.
        writer: Writer<Vec<u8>>,
        id_attr: Option<NamedNode>,
        emit: bool, // false for parseTypeOtherPropertyElt support
    },
}
786
/// Event-driven core of the RDF/XML parser, generic over the underlying reader `R`.
struct InternalRdfXmlParser<R> {
    // Namespace-aware XML reader; also provides the decoder and the namespace resolver.
    reader: NsReader<R>,
    // Stack of the currently open RDF/XML productions, innermost last.
    state: Vec<RdfXmlState>,
    // Entities declared with `<!ENTITY …>` in the DOCTYPE, keyed by entity name.
    custom_entities: HashMap<String, String>,
    // Number of currently open elements nested inside an XML literal property element.
    in_literal_depth: usize,
    // Resolved `rdf:ID` IRIs seen so far, used to reject duplicates when not lenient.
    known_rdf_id: HashSet<String>,
    // Set once the `Eof` event has been received.
    is_end: bool,
    // When true, language tags and base IRIs are not validated and duplicate `rdf:ID`s are allowed.
    lenient: bool,
}
796
/// Attributes for a node element
///
/// Groups the already-resolved attribute values handed to `build_node_elt`.
struct NodeElementAttributes {
    // `rdf:ID`, resolved against the base IRI.
    id_attr: Option<NamedNode>,
    // `rdf:nodeID`, as a blank node.
    node_id_attr: Option<BlankNode>,
    // `rdf:about`, resolved against the base IRI.
    about_attr: Option<NamedNode>,
    // `rdf:type` given as an attribute.
    type_attr: Option<NamedNode>,
    // Remaining attributes as (predicate, literal value) pairs.
    property_attrs: Vec<(NamedNode, String)>,
}
805
806impl<R> InternalRdfXmlParser<R> {
807    fn parse_event(
808        &mut self,
809        event: Event<'_>,
810        results: &mut Vec<Triple>,
811    ) -> Result<(), RdfXmlParseError> {
812        match event {
813            Event::Start(event) => self.parse_start_event(&event, results),
814            Event::End(event) => self.parse_end_event(&event, results),
815            Event::Empty(_) => Err(RdfXmlSyntaxError::msg(
816                "The expand_empty_elements option must be enabled",
817            )
818            .into()),
819            Event::Text(event) => self.parse_text_event(&event),
820            Event::CData(event) => self.parse_text_event(&event.escape()?),
821            Event::Comment(_) | Event::PI(_) | Event::GeneralRef(_) => Ok(()),
822            Event::Decl(decl) => {
823                if let Some(encoding) = decl.encoding() {
824                    if !is_utf8(&encoding?) {
825                        return Err(RdfXmlSyntaxError::msg(
826                            "Only UTF-8 is supported by the RDF/XML parser",
827                        )
828                        .into());
829                    }
830                }
831                Ok(())
832            }
833            Event::DocType(dt) => self.parse_doctype(&dt),
834            Event::Eof => {
835                self.is_end = true;
836                Ok(())
837            }
838        }
839    }
840
841    fn parse_doctype(&mut self, dt: &BytesText<'_>) -> Result<(), RdfXmlParseError> {
842        // we extract entities
843        for input in self
844            .reader
845            .decoder()
846            .decode(dt.as_ref())?
847            .split('<')
848            .skip(1)
849        {
850            if let Some(input) = input.strip_prefix("!ENTITY") {
851                let input = input.trim_start().strip_prefix('%').unwrap_or(input);
852                let (entity_name, input) = input.trim_start().split_once(|c: char| c.is_ascii_whitespace()).ok_or_else(|| {
853                    RdfXmlSyntaxError::msg(
854                        "<!ENTITY declarations should contain both an entity name and an entity value",
855                    )
856                })?;
857                let input = input.trim_start().strip_prefix('\"').ok_or_else(|| {
858                    RdfXmlSyntaxError::msg("<!ENTITY values should be enclosed in double quotes")
859                })?;
860                let (entity_value, input) = input.split_once('"').ok_or_else(|| {
861                    RdfXmlSyntaxError::msg(
862                        "<!ENTITY declarations values should be enclosed in double quotes",
863                    )
864                })?;
865                input.trim_start().strip_prefix('>').ok_or_else(|| {
866                    RdfXmlSyntaxError::msg("<!ENTITY declarations values should end with >")
867                })?;
868
869                // Resolves custom entities within the current entity definition.
870                let entity_value =
871                    unescape_with(entity_value, |e| self.resolve_entity(e)).map_err(Error::from)?;
872                self.custom_entities
873                    .insert(entity_name.to_owned(), entity_value.to_string());
874            }
875        }
876        Ok(())
877    }
878
879    fn parse_start_event(
880        &mut self,
881        event: &BytesStart<'_>,
882        results: &mut Vec<Triple>,
883    ) -> Result<(), RdfXmlParseError> {
884        #[derive(PartialEq, Eq)]
885        enum RdfXmlParseType {
886            Default,
887            Collection,
888            Literal,
889            Resource,
890            Other,
891        }
892
893        #[derive(PartialEq, Eq)]
894        enum RdfXmlNextProduction {
895            Rdf,
896            NodeElt,
897            PropertyElt { subject: NamedOrBlankNode },
898        }
899
900        // Literal case
901        if let Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) = self.state.last_mut()
902        {
903            let mut clean_event = BytesStart::new(
904                self.reader
905                    .decoder()
906                    .decode(event.name().as_ref())?
907                    .to_string(),
908            );
909            for attr in event.attributes() {
910                clean_event.push_attribute(attr.map_err(Error::InvalidAttr)?);
911            }
912            writer.write_event(Event::Start(clean_event))?;
913            self.in_literal_depth += 1;
914            return Ok(());
915        }
916
917        let tag_name = self.resolve_tag_name(event.name())?;
918
919        // We read attributes
920        let mut language = None;
921        let mut base_iri = None;
922        let mut id_attr = None;
923        let mut node_id_attr = None;
924        let mut about_attr = None;
925        let mut property_attrs = Vec::default();
926        let mut resource_attr = None;
927        let mut datatype_attr = None;
928        let mut parse_type = RdfXmlParseType::Default;
929        let mut type_attr = None;
930
931        for attribute in event.attributes() {
932            let attribute = attribute.map_err(Error::InvalidAttr)?;
933            if attribute.key.as_ref().starts_with(b"xml") {
934                if attribute.key.as_ref() == b"xml:lang" {
935                    let tag = self.convert_attribute(&attribute)?.to_ascii_lowercase();
936                    language = Some(if self.lenient {
937                        tag
938                    } else {
939                        LanguageTag::parse(tag.to_ascii_lowercase())
940                            .map_err(|error| RdfXmlSyntaxError::invalid_language_tag(tag, error))?
941                            .into_inner()
942                    });
943                } else if attribute.key.as_ref() == b"xml:base" {
944                    let iri = self.convert_attribute(&attribute)?;
945                    base_iri = Some(if self.lenient {
946                        Iri::parse_unchecked(iri.clone())
947                    } else {
948                        Iri::parse(iri.clone())
949                            .map_err(|error| RdfXmlSyntaxError::invalid_iri(iri, error))?
950                    })
951                } else {
952                    // We ignore other xml attributes
953                }
954            } else {
955                let attribute_url = self.resolve_attribute_name(attribute.key)?;
956                if *attribute_url == *RDF_ID {
957                    let mut id = self.convert_attribute(&attribute)?;
958                    if !is_nc_name(&id) {
959                        return Err(RdfXmlSyntaxError::msg(format!(
960                            "{id} is not a valid rdf:ID value"
961                        ))
962                        .into());
963                    }
964                    id.insert(0, '#');
965                    id_attr = Some(id);
966                } else if *attribute_url == *RDF_BAG_ID {
967                    let bag_id = self.convert_attribute(&attribute)?;
968                    if !is_nc_name(&bag_id) {
969                        return Err(RdfXmlSyntaxError::msg(format!(
970                            "{bag_id} is not a valid rdf:bagID value"
971                        ))
972                        .into());
973                    }
974                } else if *attribute_url == *RDF_NODE_ID {
975                    let id = self.convert_attribute(&attribute)?;
976                    if !is_nc_name(&id) {
977                        return Err(RdfXmlSyntaxError::msg(format!(
978                            "{id} is not a valid rdf:nodeID value"
979                        ))
980                        .into());
981                    }
982                    node_id_attr = Some(BlankNode::new_unchecked(id));
983                } else if *attribute_url == *RDF_ABOUT {
984                    about_attr = Some(attribute);
985                } else if *attribute_url == *RDF_RESOURCE {
986                    resource_attr = Some(attribute);
987                } else if *attribute_url == *RDF_DATATYPE {
988                    datatype_attr = Some(attribute);
989                } else if *attribute_url == *RDF_PARSE_TYPE {
990                    parse_type = match attribute.value.as_ref() {
991                        b"Collection" => RdfXmlParseType::Collection,
992                        b"Literal" => RdfXmlParseType::Literal,
993                        b"Resource" => RdfXmlParseType::Resource,
994                        _ => RdfXmlParseType::Other,
995                    };
996                } else if attribute_url == RDF_TYPE {
997                    type_attr = Some(attribute);
998                } else if RESERVED_RDF_ATTRIBUTES.contains(&&*attribute_url) {
999                    return Err(RdfXmlSyntaxError::msg(format!(
1000                        "{attribute_url} is not a valid attribute"
1001                    ))
1002                    .into());
1003                } else {
1004                    property_attrs.push((
1005                        self.parse_iri(attribute_url)?,
1006                        self.convert_attribute(&attribute)?,
1007                    ));
1008                }
1009            }
1010        }
1011
1012        // Parsing with the base URI
1013        let id_attr = match id_attr {
1014            Some(iri) => {
1015                let iri = self.resolve_iri(base_iri.as_ref(), iri)?;
1016                if !self.lenient {
1017                    if self.known_rdf_id.contains(iri.as_str()) {
1018                        return Err(RdfXmlSyntaxError::msg(format!(
1019                            "{iri} has already been used as rdf:ID value"
1020                        ))
1021                        .into());
1022                    }
1023                    self.known_rdf_id.insert(iri.as_str().into());
1024                }
1025                Some(iri)
1026            }
1027            None => None,
1028        };
1029        let about_attr = match about_attr {
1030            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1031            None => None,
1032        };
1033        let resource_attr = match resource_attr {
1034            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1035            None => None,
1036        };
1037        let datatype_attr = match datatype_attr {
1038            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1039            None => None,
1040        };
1041        let type_attr = match type_attr {
1042            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1043            None => None,
1044        };
1045
1046        let expected_production = match self.state.last() {
1047            Some(RdfXmlState::Doc { .. }) => RdfXmlNextProduction::Rdf,
1048            Some(
1049                RdfXmlState::Rdf { .. }
1050                | RdfXmlState::PropertyElt { .. }
1051                | RdfXmlState::ParseTypeCollectionPropertyElt { .. },
1052            ) => RdfXmlNextProduction::NodeElt,
1053            Some(RdfXmlState::NodeElt { subject, .. }) => RdfXmlNextProduction::PropertyElt {
1054                subject: subject.clone(),
1055            },
1056            Some(RdfXmlState::ParseTypeLiteralPropertyElt { .. }) => {
1057                return Err(
1058                    RdfXmlSyntaxError::msg("ParseTypeLiteralPropertyElt production children should never be considered as a RDF/XML content").into()
1059                );
1060            }
1061            None => {
1062                return Err(RdfXmlSyntaxError::msg(
1063                    "No state in the stack: the XML is not balanced",
1064                )
1065                .into());
1066            }
1067        };
1068
1069        let new_state = match expected_production {
1070            RdfXmlNextProduction::Rdf => {
1071                if *tag_name == *RDF_RDF {
1072                    RdfXmlState::Rdf { base_iri, language }
1073                } else if RESERVED_RDF_ELEMENTS.contains(&&*tag_name) {
1074                    return Err(RdfXmlSyntaxError::msg(format!(
1075                        "Invalid node element tag name: {tag_name}"
1076                    ))
1077                    .into());
1078                } else {
1079                    self.build_node_elt(
1080                        self.parse_iri(tag_name)?,
1081                        base_iri,
1082                        language,
1083                        NodeElementAttributes {
1084                            id_attr,
1085                            node_id_attr,
1086                            about_attr,
1087                            type_attr,
1088                            property_attrs,
1089                        },
1090                        results,
1091                    )?
1092                }
1093            }
1094            RdfXmlNextProduction::NodeElt => {
1095                if RESERVED_RDF_ELEMENTS.contains(&&*tag_name) {
1096                    return Err(RdfXmlSyntaxError::msg(format!(
1097                        "Invalid property element tag name: {tag_name}"
1098                    ))
1099                    .into());
1100                }
1101                self.build_node_elt(
1102                    self.parse_iri(tag_name)?,
1103                    base_iri,
1104                    language,
1105                    NodeElementAttributes {
1106                        id_attr,
1107                        node_id_attr,
1108                        about_attr,
1109                        type_attr,
1110                        property_attrs,
1111                    },
1112                    results,
1113                )?
1114            }
1115            RdfXmlNextProduction::PropertyElt { subject } => {
1116                let iri = if *tag_name == *RDF_LI {
1117                    let Some(RdfXmlState::NodeElt { li_counter, .. }) = self.state.last_mut()
1118                    else {
1119                        return Err(RdfXmlSyntaxError::msg(format!(
1120                            "Invalid property element tag name: {tag_name}"
1121                        ))
1122                        .into());
1123                    };
1124                    *li_counter += 1;
1125                    NamedNode::new_unchecked(format!(
1126                        "http://www.w3.org/1999/02/22-rdf-syntax-ns#_{li_counter}"
1127                    ))
1128                } else if RESERVED_RDF_ELEMENTS.contains(&&*tag_name)
1129                    || *tag_name == *RDF_DESCRIPTION
1130                {
1131                    return Err(RdfXmlSyntaxError::msg(format!(
1132                        "Invalid property element tag name: {tag_name}"
1133                    ))
1134                    .into());
1135                } else {
1136                    self.parse_iri(tag_name)?
1137                };
1138                match parse_type {
1139                    RdfXmlParseType::Default => {
1140                        if resource_attr.is_some()
1141                            || node_id_attr.is_some()
1142                            || !property_attrs.is_empty()
1143                        {
1144                            let object = match (resource_attr, node_id_attr)
1145                            {
1146                                (Some(resource_attr), None) => NamedOrBlankNode::from(resource_attr),
1147                                (None, Some(node_id_attr)) => node_id_attr.into(),
1148                                (None, None) => BlankNode::default().into(),
1149                                (Some(_), Some(_)) => return Err(RdfXmlSyntaxError::msg("Not both rdf:resource and rdf:nodeID could be set at the same time").into())
1150                            };
1151                            self.emit_property_attrs(
1152                                &object,
1153                                property_attrs,
1154                                language.as_deref(),
1155                                results,
1156                            );
1157                            if let Some(type_attr) = type_attr {
1158                                results.push(Triple::new(
1159                                    crate::model::term::Subject::from(object.clone()),
1160                                    NamedNode::new_unchecked(RDF_TYPE),
1161                                    type_attr,
1162                                ));
1163                            }
1164                            RdfXmlState::PropertyElt {
1165                                iri,
1166                                base_iri,
1167                                language,
1168                                subject,
1169                                object: Some(NodeOrText::Node(object)),
1170                                id_attr,
1171                                datatype_attr,
1172                            }
1173                        } else {
1174                            RdfXmlState::PropertyElt {
1175                                iri,
1176                                base_iri,
1177                                language,
1178                                subject,
1179                                object: None,
1180                                id_attr,
1181                                datatype_attr,
1182                            }
1183                        }
1184                    }
1185                    RdfXmlParseType::Literal => RdfXmlState::ParseTypeLiteralPropertyElt {
1186                        iri,
1187                        base_iri,
1188                        language,
1189                        subject,
1190                        writer: Writer::new(Vec::default()),
1191                        id_attr,
1192                        emit: true,
1193                    },
1194                    RdfXmlParseType::Resource => Self::build_parse_type_resource_property_elt(
1195                        iri, base_iri, language, subject, id_attr, results,
1196                    ),
1197                    RdfXmlParseType::Collection => RdfXmlState::ParseTypeCollectionPropertyElt {
1198                        iri,
1199                        base_iri,
1200                        language,
1201                        subject,
1202                        objects: Vec::default(),
1203                        id_attr,
1204                    },
1205                    RdfXmlParseType::Other => RdfXmlState::ParseTypeLiteralPropertyElt {
1206                        iri,
1207                        base_iri,
1208                        language,
1209                        subject,
1210                        writer: Writer::new(Vec::default()),
1211                        id_attr,
1212                        emit: false,
1213                    },
1214                }
1215            }
1216        };
1217        self.state.push(new_state);
1218        Ok(())
1219    }
1220
1221    fn parse_end_event(
1222        &mut self,
1223        event: &BytesEnd<'_>,
1224        results: &mut Vec<Triple>,
1225    ) -> Result<(), RdfXmlParseError> {
1226        // Literal case
1227        if self.in_literal_depth > 0 {
1228            if let Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) =
1229                self.state.last_mut()
1230            {
1231                writer.write_event(Event::End(BytesEnd::new(
1232                    self.reader.decoder().decode(event.name().as_ref())?,
1233                )))?;
1234                self.in_literal_depth -= 1;
1235                return Ok(());
1236            }
1237        }
1238
1239        if let Some(current_state) = self.state.pop() {
1240            self.end_state(current_state, results)?;
1241        }
1242        Ok(())
1243    }
1244
1245    fn parse_text_event(&mut self, event: &BytesText<'_>) -> Result<(), RdfXmlParseError> {
1246        let text =
1247            unescape_with(std::str::from_utf8(event)?, |e| self.resolve_entity(e))?.to_string();
1248        match self.state.last_mut() {
1249            Some(RdfXmlState::PropertyElt { object, .. }) => {
1250                if is_object_defined(object) {
1251                    if text.bytes().all(is_whitespace) {
1252                        Ok(()) // whitespace anyway, we ignore
1253                    } else {
1254                        Err(
1255                            RdfXmlSyntaxError::msg(format!("Unexpected text event: '{text}'"))
1256                                .into(),
1257                        )
1258                    }
1259                } else {
1260                    *object = Some(NodeOrText::Text(text));
1261                    Ok(())
1262                }
1263            }
1264            Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) => {
1265                writer.write_event(Event::Text(BytesText::new(&text)))?;
1266                Ok(())
1267            }
1268            _ => {
1269                if text.bytes().all(is_whitespace) {
1270                    Ok(())
1271                } else {
1272                    Err(RdfXmlSyntaxError::msg(format!("Unexpected text event: '{text}'")).into())
1273                }
1274            }
1275        }
1276    }
1277
1278    fn resolve_tag_name(&self, qname: QName<'_>) -> Result<String, RdfXmlParseError> {
1279        let (namespace, local_name) = self.reader.resolver().resolve_element(qname);
1280        self.resolve_ns_name(namespace, local_name)
1281    }
1282
1283    fn resolve_attribute_name(&self, qname: QName<'_>) -> Result<String, RdfXmlParseError> {
1284        let (namespace, local_name) = self.reader.resolver().resolve_attribute(qname);
1285        self.resolve_ns_name(namespace, local_name)
1286    }
1287
1288    fn resolve_ns_name(
1289        &self,
1290        namespace: ResolveResult<'_>,
1291        local_name: LocalName<'_>,
1292    ) -> Result<String, RdfXmlParseError> {
1293        match namespace {
1294            ResolveResult::Bound(ns) => {
1295                let mut value = Vec::with_capacity(ns.as_ref().len() + local_name.as_ref().len());
1296                value.extend_from_slice(ns.as_ref());
1297                value.extend_from_slice(local_name.as_ref());
1298                Ok(unescape_with(&self.reader.decoder().decode(&value)?, |e| {
1299                    self.resolve_entity(e)
1300                })
1301                .map_err(Error::from)?
1302                .to_string())
1303            }
1304            ResolveResult::Unbound => {
1305                Err(RdfXmlSyntaxError::msg("XML namespaces are required in RDF/XML").into())
1306            }
1307            ResolveResult::Unknown(v) => Err(RdfXmlSyntaxError::msg(format!(
1308                "Unknown prefix {}:",
1309                self.reader.decoder().decode(&v)?
1310            ))
1311            .into()),
1312        }
1313    }
1314
1315    fn build_node_elt(
1316        &self,
1317        iri: NamedNode,
1318        base_iri: Option<Iri<String>>,
1319        language: Option<String>,
1320        attrs: NodeElementAttributes,
1321        results: &mut Vec<Triple>,
1322    ) -> Result<RdfXmlState, RdfXmlSyntaxError> {
1323        let subject = match (attrs.id_attr, attrs.node_id_attr, attrs.about_attr) {
1324            (Some(id_attr), None, None) => NamedOrBlankNode::from(id_attr),
1325            (None, Some(node_id_attr), None) => node_id_attr.into(),
1326            (None, None, Some(about_attr)) => about_attr.into(),
1327            (None, None, None) => BlankNode::default().into(),
1328            (Some(_), Some(_), _) => {
1329                return Err(RdfXmlSyntaxError::msg(
1330                    "Not both rdf:ID and rdf:nodeID could be set at the same time",
1331                ));
1332            }
1333            (_, Some(_), Some(_)) => {
1334                return Err(RdfXmlSyntaxError::msg(
1335                    "Not both rdf:nodeID and rdf:resource could be set at the same time",
1336                ));
1337            }
1338            (Some(_), _, Some(_)) => {
1339                return Err(RdfXmlSyntaxError::msg(
1340                    "Not both rdf:ID and rdf:resource could be set at the same time",
1341                ));
1342            }
1343        };
1344
1345        self.emit_property_attrs(&subject, attrs.property_attrs, language.as_deref(), results);
1346
1347        if let Some(type_attr) = attrs.type_attr {
1348            results.push(Triple::new(
1349                crate::model::term::Subject::from(subject.clone()),
1350                NamedNode::new_unchecked(RDF_TYPE),
1351                type_attr,
1352            ));
1353        }
1354
1355        if iri.as_str() != RDF_DESCRIPTION {
1356            results.push(Triple::new(
1357                crate::model::term::Subject::from(subject.clone()),
1358                NamedNode::new_unchecked(RDF_TYPE),
1359                iri,
1360            ));
1361        }
1362        Ok(RdfXmlState::NodeElt {
1363            base_iri,
1364            language,
1365            subject,
1366            li_counter: 0,
1367        })
1368    }
1369
1370    fn build_parse_type_resource_property_elt(
1371        iri: NamedNode,
1372        base_iri: Option<Iri<String>>,
1373        language: Option<String>,
1374        subject: NamedOrBlankNode,
1375        id_attr: Option<NamedNode>,
1376        results: &mut Vec<Triple>,
1377    ) -> RdfXmlState {
1378        let object = BlankNode::default();
1379        let triple = Triple::new(
1380            crate::model::term::Subject::from(subject),
1381            iri,
1382            object.clone(),
1383        );
1384        if let Some(id_attr) = id_attr {
1385            Self::reify(triple.clone(), id_attr, results);
1386        }
1387        results.push(triple);
1388        RdfXmlState::NodeElt {
1389            base_iri,
1390            language,
1391            subject: object.into(),
1392            li_counter: 0,
1393        }
1394    }
1395
1396    fn end_state(
1397        &mut self,
1398        state: RdfXmlState,
1399        results: &mut Vec<Triple>,
1400    ) -> Result<(), RdfXmlSyntaxError> {
1401        match state {
1402            RdfXmlState::PropertyElt {
1403                iri,
1404                language,
1405                subject,
1406                id_attr,
1407                datatype_attr,
1408                object,
1409                ..
1410            } => {
1411                let object = match object {
1412                    Some(NodeOrText::Node(node)) => match node {
1413                        NamedOrBlankNode::NamedNode(n) => Object::NamedNode(n),
1414                        NamedOrBlankNode::BlankNode(b) => Object::BlankNode(b),
1415                    },
1416                    Some(NodeOrText::Text(text)) => {
1417                        Object::Literal(self.new_literal(text, language, datatype_attr))
1418                    }
1419                    None => {
1420                        Object::Literal(self.new_literal(String::new(), language, datatype_attr))
1421                    }
1422                };
1423                let triple = Triple::new(crate::model::term::Subject::from(subject), iri, object);
1424                if let Some(id_attr) = id_attr {
1425                    Self::reify(triple.clone(), id_attr, results);
1426                }
1427                results.push(triple);
1428            }
1429            RdfXmlState::ParseTypeCollectionPropertyElt {
1430                iri,
1431                subject,
1432                id_attr,
1433                objects,
1434                ..
1435            } => {
1436                let mut current_node = NamedOrBlankNode::from(NamedNode::new_unchecked(RDF_NIL));
1437                for object in objects.into_iter().rev() {
1438                    let subject = NamedOrBlankNode::from(BlankNode::default());
1439                    results.push(Triple::new(
1440                        crate::model::term::Subject::from(subject.clone()),
1441                        NamedNode::new_unchecked(RDF_FIRST),
1442                        object,
1443                    ));
1444                    results.push(Triple::new(
1445                        crate::model::term::Subject::from(subject.clone()),
1446                        NamedNode::new_unchecked(RDF_REST),
1447                        crate::model::term::Object::from(current_node.clone()),
1448                    ));
1449                    current_node = subject;
1450                }
1451                let triple = Triple::new(
1452                    crate::model::term::Subject::from(subject),
1453                    iri,
1454                    crate::model::term::Object::from(current_node),
1455                );
1456                if let Some(id_attr) = id_attr {
1457                    Self::reify(triple.clone(), id_attr, results);
1458                }
1459                results.push(triple);
1460            }
1461            RdfXmlState::ParseTypeLiteralPropertyElt {
1462                iri,
1463                subject,
1464                id_attr,
1465                writer,
1466                emit,
1467                ..
1468            } => {
1469                if emit {
1470                    let object = writer.into_inner();
1471                    if object.is_empty() {
1472                        return Err(RdfXmlSyntaxError::msg(format!(
1473                            "No value found for rdf:XMLLiteral value of property {iri}"
1474                        )));
1475                    }
1476                    let triple = Triple::new(
1477                        crate::model::term::Subject::from(subject),
1478                        iri,
1479                        Literal::new_typed_literal(
1480                            str::from_utf8(&object).map_err(|_| {
1481                                RdfXmlSyntaxError::msg(
1482                                    "The XML literal is not in valid UTF-8".to_owned(),
1483                                )
1484                            })?,
1485                            NamedNode::new_unchecked(RDF_XML_LITERAL),
1486                        ),
1487                    );
1488                    if let Some(id_attr) = id_attr {
1489                        Self::reify(triple.clone(), id_attr, results);
1490                    }
1491                    results.push(triple);
1492                }
1493            }
1494            RdfXmlState::NodeElt { subject, .. } => match self.state.last_mut() {
1495                Some(RdfXmlState::PropertyElt { object, .. }) => {
1496                    if is_object_defined(object) {
1497                        return Err(RdfXmlSyntaxError::msg(
1498                            "Unexpected node, a text value is already present",
1499                        ));
1500                    }
1501                    *object = Some(NodeOrText::Node(subject))
1502                }
1503                Some(RdfXmlState::ParseTypeCollectionPropertyElt { objects, .. }) => {
1504                    objects.push(subject)
1505                }
1506                _ => (),
1507            },
1508            _ => (),
1509        }
1510        Ok(())
1511    }
1512
1513    fn new_literal(
1514        &self,
1515        value: String,
1516        language: Option<String>,
1517        datatype: Option<NamedNode>,
1518    ) -> Literal {
1519        if let Some(datatype) = datatype {
1520            Literal::new_typed_literal(value, datatype)
1521        } else if let Some(language) =
1522            language.or_else(|| self.current_language().map(ToOwned::to_owned))
1523        {
1524            Literal::new_language_tagged_literal_unchecked(value, language)
1525        } else {
1526            Literal::new_simple_literal(value)
1527        }
1528    }
1529
1530    fn reify(triple: Triple, statement_id: NamedNode, results: &mut Vec<Triple>) {
1531        results.push(Triple::new(
1532            statement_id.clone(),
1533            NamedNode::new_unchecked(RDF_TYPE),
1534            NamedNode::new_unchecked(RDF_STATEMENT),
1535        ));
1536        results.push(Triple::new(
1537            statement_id.clone(),
1538            NamedNode::new_unchecked(RDF_SUBJECT),
1539            match triple.subject() {
1540                Subject::NamedNode(n) => Object::NamedNode(n.clone()),
1541                Subject::BlankNode(b) => Object::BlankNode(b.clone()),
1542                Subject::Variable(v) => Object::Variable(v.clone()),
1543                Subject::QuotedTriple(qt) => Object::QuotedTriple(qt.clone()),
1544            },
1545        ));
1546        results.push(Triple::new(
1547            statement_id.clone(),
1548            NamedNode::new_unchecked(RDF_PREDICATE),
1549            match triple.predicate() {
1550                Predicate::NamedNode(n) => Object::NamedNode(n.clone()),
1551                Predicate::Variable(v) => Object::Variable(v.clone()),
1552            },
1553        ));
1554        results.push(Triple::new(
1555            statement_id,
1556            NamedNode::new_unchecked(RDF_OBJECT),
1557            triple.object().clone(),
1558        ));
1559    }
1560
1561    fn emit_property_attrs(
1562        &self,
1563        subject: &NamedOrBlankNode,
1564        literal_attributes: Vec<(NamedNode, String)>,
1565        language: Option<&str>,
1566        results: &mut Vec<Triple>,
1567    ) {
1568        for (literal_predicate, literal_value) in literal_attributes {
1569            results.push(Triple::new(
1570                crate::model::term::Subject::from(subject.clone()),
1571                literal_predicate,
1572                if let Some(language) = language.or_else(|| self.current_language()) {
1573                    Literal::new_lang(&literal_value, language)
1574                        .unwrap_or_else(|_| Literal::new(literal_value))
1575                } else {
1576                    Literal::new(literal_value)
1577                },
1578            ));
1579        }
1580    }
1581
1582    fn convert_attribute(&self, attribute: &Attribute<'_>) -> Result<String, RdfXmlParseError> {
1583        Ok(attribute
1584            .decode_and_unescape_value_with(self.reader.decoder(), |e| self.resolve_entity(e))?
1585            .into_owned())
1586    }
1587
1588    fn convert_iri_attribute(
1589        &self,
1590        base_iri: Option<&Iri<String>>,
1591        attribute: &Attribute<'_>,
1592    ) -> Result<NamedNode, RdfXmlParseError> {
1593        let converted = self.convert_attribute(attribute)?;
1594        self.resolve_iri(base_iri, converted)
1595            .map_err(RdfXmlParseError::Syntax)
1596    }
1597
1598    fn resolve_iri(
1599        &self,
1600        base_iri: Option<&Iri<String>>,
1601        relative_iri: String,
1602    ) -> Result<NamedNode, RdfXmlSyntaxError> {
1603        if let Some(base_iri) = base_iri.or_else(|| self.current_base_iri()) {
1604            Ok(NamedNode::new_unchecked(if self.lenient {
1605                base_iri.resolve_unchecked(&relative_iri).into_inner()
1606            } else {
1607                base_iri
1608                    .resolve(&relative_iri)
1609                    .map_err(|error| RdfXmlSyntaxError::invalid_iri(relative_iri, error))?
1610                    .into_inner()
1611            }))
1612        } else {
1613            self.parse_iri(relative_iri)
1614        }
1615    }
1616
1617    fn parse_iri(&self, relative_iri: String) -> Result<NamedNode, RdfXmlSyntaxError> {
1618        Ok(NamedNode::new_unchecked(if self.lenient {
1619            relative_iri
1620        } else {
1621            Iri::parse(relative_iri.clone())
1622                .map_err(|error| RdfXmlSyntaxError::invalid_iri(relative_iri, error))?
1623                .into_inner()
1624        }))
1625    }
1626
1627    fn current_language(&self) -> Option<&str> {
1628        for state in self.state.iter().rev() {
1629            match state {
1630                RdfXmlState::Doc { .. } => (),
1631                RdfXmlState::Rdf { language, .. }
1632                | RdfXmlState::NodeElt { language, .. }
1633                | RdfXmlState::PropertyElt { language, .. }
1634                | RdfXmlState::ParseTypeCollectionPropertyElt { language, .. }
1635                | RdfXmlState::ParseTypeLiteralPropertyElt { language, .. } => {
1636                    if let Some(language) = language {
1637                        return Some(language);
1638                    }
1639                }
1640            }
1641        }
1642        None
1643    }
1644
1645    fn current_base_iri(&self) -> Option<&Iri<String>> {
1646        for state in self.state.iter().rev() {
1647            match state {
1648                RdfXmlState::Doc { base_iri }
1649                | RdfXmlState::Rdf { base_iri, .. }
1650                | RdfXmlState::NodeElt { base_iri, .. }
1651                | RdfXmlState::PropertyElt { base_iri, .. }
1652                | RdfXmlState::ParseTypeCollectionPropertyElt { base_iri, .. }
1653                | RdfXmlState::ParseTypeLiteralPropertyElt { base_iri, .. } => {
1654                    if let Some(base_iri) = base_iri {
1655                        return Some(base_iri);
1656                    }
1657                }
1658            }
1659        }
1660        None
1661    }
1662
1663    fn resolve_entity(&self, e: &str) -> Option<&str> {
1664        resolve_xml_entity(e).or_else(|| self.custom_entities.get(e).map(String::as_str))
1665    }
1666}
1667
1668fn is_object_defined(object: &Option<NodeOrText>) -> bool {
1669    match object {
1670        Some(NodeOrText::Node(_)) => true,
1671        Some(NodeOrText::Text(t)) => !t.bytes().all(is_whitespace),
1672        None => false,
1673    }
1674}
1675
/// Returns `true` when `c` is a space, tab, line feed or carriage return byte.
fn is_whitespace(c: u8) -> bool {
    b" \t\n\r".contains(&c)
}
1679
/// Returns `true` when `encoding` is one of the accepted labels for UTF-8.
///
/// The comparison ignores ASCII case and requires the whole label to match.
fn is_utf8(encoding: &[u8]) -> bool {
    const UTF8_LABELS: [&[u8]; 6] = [
        b"unicode-1-1-utf-8",
        b"unicode11utf8",
        b"unicode20utf8",
        b"utf-8",
        b"utf8",
        b"x-unicode20utf8",
    ];
    UTF8_LABELS
        .iter()
        .any(|label| encoding.eq_ignore_ascii_case(label))
}