1use crate::model::literal::LanguageTag;
2use crate::model::term::{Object, Predicate, Subject};
3use crate::model::{BlankNode, Literal, NamedNode, NamedOrBlankNode, Term, Triple};
4use crate::rdfxml::error::{RdfXmlParseError, RdfXmlSyntaxError};
5use crate::rdfxml::utils::*;
6use oxiri::{Iri, IriParseError};
7use quick_xml::escape::{resolve_xml_entity, unescape_with};
8use quick_xml::events::attributes::Attribute;
9use quick_xml::events::*;
10use quick_xml::name::{LocalName, PrefixDeclaration, PrefixIter, QName, ResolveResult};
11use quick_xml::{Decoder, Error, NsReader, Writer};
12use std::borrow::Cow;
13use std::collections::{HashMap, HashSet};
14use std::io::{BufReader, Read};
15use std::str;
16#[cfg(feature = "async-tokio")]
17use tokio::io::{AsyncRead, BufReader as AsyncBufReader};
18
19impl From<NamedOrBlankNode> for Term {
20 fn from(node: NamedOrBlankNode) -> Self {
21 match node {
22 NamedOrBlankNode::NamedNode(n) => Term::NamedNode(n),
23 NamedOrBlankNode::BlankNode(n) => Term::BlankNode(n),
24 }
25 }
26}
27
/// Configuration of an RDF/XML parser.
///
/// The value is configured through the chainable methods of this type and is
/// then consumed by one of the `for_*` constructors, which bind it to an input
/// source. The derived `Default` yields strict validation and no base IRI.
#[derive(Default, Clone)]
#[must_use]
pub struct RdfXmlParser {
    /// When `true`, validations guarded by this flag are skipped (in this file:
    /// language tag and `xml:base` checking, duplicate `rdf:ID` detection and
    /// the filtering of invalid prefixes).
    lenient: bool,
    /// Base IRI the document starts with, if one was configured.
    base: Option<Iri<String>>,
}
68
69impl RdfXmlParser {
70 #[inline]
72 pub fn new() -> Self {
73 Self::default()
74 }
75
76 #[inline]
82 pub fn lenient(mut self) -> Self {
83 self.lenient = true;
84 self
85 }
86
87 #[deprecated(note = "Use `lenient()` instead", since = "0.2.0")]
88 #[inline]
89 pub fn unchecked(self) -> Self {
90 self.lenient()
91 }
92
93 #[inline]
94 pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
95 self.base = Some(Iri::parse(base_iri.into())?);
96 Ok(self)
97 }
98
99 pub fn for_reader<R: Read>(self, reader: R) -> ReaderRdfXmlParser<R> {
127 ReaderRdfXmlParser {
128 results: Vec::new(),
129 parser: self.into_internal(BufReader::new(reader)),
130 reader_buffer: Vec::default(),
131 }
132 }
133
134 #[cfg(feature = "async-tokio")]
167 pub fn for_tokio_async_reader<R: AsyncRead + Unpin>(
168 self,
169 reader: R,
170 ) -> TokioAsyncReaderRdfXmlParser<R> {
171 TokioAsyncReaderRdfXmlParser {
172 results: Vec::new(),
173 parser: self.into_internal(AsyncBufReader::new(reader)),
174 reader_buffer: Vec::default(),
175 }
176 }
177
178 pub fn for_slice(self, slice: &[u8]) -> SliceRdfXmlParser<'_> {
206 SliceRdfXmlParser {
207 results: Vec::new(),
208 parser: self.into_internal(slice),
209 reader_buffer: Vec::default(),
210 }
211 }
212
213 fn into_internal<T>(self, reader: T) -> InternalRdfXmlParser<T> {
214 let mut reader = NsReader::from_reader(reader);
215 reader.config_mut().expand_empty_elements = true;
216 InternalRdfXmlParser {
217 reader,
218 state: vec![RdfXmlState::Doc {
219 base_iri: self.base.clone(),
220 }],
221 custom_entities: HashMap::new(),
222 in_literal_depth: 0,
223 known_rdf_id: HashSet::default(),
224 is_end: false,
225 lenient: self.lenient,
226 }
227 }
228}
229
/// RDF/XML parser reading from a [`Read`] implementation.
///
/// It is an [`Iterator`] yielding `Result<Triple, RdfXmlParseError>` items.
#[must_use]
pub struct ReaderRdfXmlParser<R: Read> {
    /// Triples already produced by the state machine and not yet returned.
    results: Vec<Triple>,
    /// The state machine together with the buffered XML reader.
    parser: InternalRdfXmlParser<BufReader<R>>,
    /// Scratch buffer handed to the XML reader; cleared before each event is read.
    reader_buffer: Vec<u8>,
}
265
266impl<R: Read> Iterator for ReaderRdfXmlParser<R> {
267 type Item = Result<Triple, RdfXmlParseError>;
268
269 fn next(&mut self) -> Option<Self::Item> {
270 loop {
271 if let Some(triple) = self.results.pop() {
272 return Some(Ok(triple));
273 } else if self.parser.is_end {
274 return None;
275 }
276 if let Err(e) = self.parse_step() {
277 return Some(Err(e));
278 }
279 }
280 }
281}
282
283impl<R: Read> ReaderRdfXmlParser<R> {
284 pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
315 RdfXmlPrefixesIter {
316 inner: self.parser.reader.prefixes(),
317 decoder: self.parser.reader.decoder(),
318 lenient: self.parser.lenient,
319 }
320 }
321
322 pub fn base_iri(&self) -> Option<&str> {
341 Some(self.parser.current_base_iri()?.as_str())
342 }
343
344 pub fn buffer_position(&self) -> u64 {
346 self.parser.reader.buffer_position()
347 }
348
349 fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
350 self.reader_buffer.clear();
351 let event = self
352 .parser
353 .reader
354 .read_event_into(&mut self.reader_buffer)?;
355 self.parser.parse_event(event, &mut self.results)
356 }
357}
358
/// RDF/XML parser reading from a Tokio [`AsyncRead`] implementation.
///
/// Triples are pulled with the asynchronous `next` method.
#[cfg(feature = "async-tokio")]
#[must_use]
pub struct TokioAsyncReaderRdfXmlParser<R: AsyncRead + Unpin> {
    /// Triples already produced by the state machine and not yet returned.
    results: Vec<Triple>,
    /// The state machine together with the buffered asynchronous XML reader.
    parser: InternalRdfXmlParser<AsyncBufReader<R>>,
    /// Scratch buffer handed to the XML reader; cleared before each event is read.
    reader_buffer: Vec<u8>,
}
400
#[cfg(feature = "async-tokio")]
impl<R: AsyncRead + Unpin> TokioAsyncReaderRdfXmlParser<R> {
    /// Returns the next parsed triple, reading more XML events as long as no
    /// triple is pending. Returns `None` once the document end has been
    /// reached and all pending triples have been returned.
    pub async fn next(&mut self) -> Option<Result<Triple, RdfXmlParseError>> {
        // Pump the XML reader until at least one triple is available.
        while self.results.is_empty() {
            if self.parser.is_end {
                return None;
            }
            if let Err(error) = self.parse_step().await {
                return Some(Err(error));
            }
        }
        self.results.pop().map(Ok)
    }

    /// Iterates over the namespace prefixes currently known to the XML reader.
    pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
        let reader = &self.parser.reader;
        RdfXmlPrefixesIter {
            inner: reader.prefixes(),
            decoder: reader.decoder(),
            lenient: self.parser.lenient,
        }
    }

    /// Returns the base IRI currently in effect, if there is one.
    pub fn base_iri(&self) -> Option<&str> {
        self.parser.current_base_iri().map(|iri| iri.as_str())
    }

    /// Returns the position reported by the underlying XML reader.
    pub fn buffer_position(&self) -> u64 {
        self.parser.reader.buffer_position()
    }

    /// Reads a single XML event asynchronously and feeds it to the state
    /// machine, which may append triples to `self.results`.
    async fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
        let buffer = &mut self.reader_buffer;
        buffer.clear();
        let event = self.parser.reader.read_event_into_async(buffer).await?;
        self.parser.parse_event(event, &mut self.results)
    }
}
501
/// RDF/XML parser reading from an in-memory byte slice.
///
/// It is an [`Iterator`] yielding `Result<Triple, RdfXmlSyntaxError>` items.
#[must_use]
pub struct SliceRdfXmlParser<'a> {
    /// Triples already produced by the state machine and not yet returned.
    results: Vec<Triple>,
    /// The state machine together with the XML reader over the borrowed slice.
    parser: InternalRdfXmlParser<&'a [u8]>,
    /// Scratch buffer handed to the XML reader; cleared before each event is read.
    reader_buffer: Vec<u8>,
}
537
538impl Iterator for SliceRdfXmlParser<'_> {
539 type Item = Result<Triple, RdfXmlSyntaxError>;
540
541 fn next(&mut self) -> Option<Self::Item> {
542 loop {
543 if let Some(triple) = self.results.pop() {
544 return Some(Ok(triple));
545 } else if self.parser.is_end {
546 return None;
547 }
548 if let Err(RdfXmlParseError::Syntax(e)) = self.parse_step() {
549 return Some(Err(e));
551 }
552 }
553 }
554}
555
556impl SliceRdfXmlParser<'_> {
557 pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
588 RdfXmlPrefixesIter {
589 inner: self.parser.reader.prefixes(),
590 decoder: self.parser.reader.decoder(),
591 lenient: self.parser.lenient,
592 }
593 }
594
595 pub fn base_iri(&self) -> Option<&str> {
614 Some(self.parser.current_base_iri()?.as_str())
615 }
616
617 pub fn buffer_position(&self) -> u64 {
619 self.parser.reader.buffer_position()
620 }
621
622 fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
623 self.reader_buffer.clear();
624 let event = self
625 .parser
626 .reader
627 .read_event_into(&mut self.reader_buffer)?;
628 self.parser.parse_event(event, &mut self.results)
629 }
630}
631
/// Iterator over the `(prefix, namespace)` pairs known to an RDF/XML parser.
///
/// The default namespace is reported with an empty prefix.
pub struct RdfXmlPrefixesIter<'a> {
    /// Prefix declarations provided by the XML reader.
    inner: PrefixIter<'a>,
    /// Decoder used to turn the raw prefix and namespace bytes into strings.
    decoder: Decoder,
    /// When `false`, prefixes that are not NCNames and namespaces that are not
    /// valid IRIs are skipped.
    lenient: bool,
}
640
641impl<'a> Iterator for RdfXmlPrefixesIter<'a> {
642 type Item = (&'a str, &'a str);
643
644 #[inline]
645 fn next(&mut self) -> Option<Self::Item> {
646 loop {
647 let (key, value) = self.inner.next()?;
648 return Some((
649 match key {
650 PrefixDeclaration::Default => "",
651 PrefixDeclaration::Named(name) => {
652 let Ok(Cow::Borrowed(name)) = self.decoder.decode(name) else {
653 continue;
654 };
655 let Ok(Cow::Borrowed(name)) = unescape_with(name, |_| None) else {
656 continue;
657 };
658 if !self.lenient && !is_nc_name(name) {
659 continue; }
661 name
662 }
663 },
664 {
665 let Ok(Cow::Borrowed(value)) = self.decoder.decode(value.0) else {
666 continue;
667 };
668 let Ok(Cow::Borrowed(value)) = unescape_with(value, |_| None) else {
669 continue;
670 };
671 if !self.lenient && Iri::parse(value).is_err() {
672 continue; }
674 value
675 },
676 ));
677 }
678 }
679
680 #[inline]
681 fn size_hint(&self) -> (usize, Option<usize>) {
682 self.inner.size_hint()
683 }
684}
685
// Names from the RDF namespace that play a syntactic role in RDF/XML
// (attribute and element names handled specially by the parser).
const RDF_ABOUT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about";
const RDF_ABOUT_EACH: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach";
const RDF_ABOUT_EACH_PREFIX: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix";
const RDF_BAG_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID";
const RDF_DATATYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype";
const RDF_DESCRIPTION: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description";
const RDF_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID";
const RDF_LI: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#li";
const RDF_NODE_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID";
const RDF_PARSE_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#parseType";
const RDF_RDF: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF";
const RDF_RESOURCE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource";

// Names from the RDF namespace used as terms of the emitted triples
// (typing, collections, reification and the XML literal datatype).
const RDF_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
const RDF_NIL: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil";
const RDF_FIRST: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first";
const RDF_REST: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest";
const RDF_STATEMENT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement";
const RDF_SUBJECT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject";
const RDF_PREDICATE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate";
const RDF_OBJECT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object";
const RDF_XML_LITERAL: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral";

/// Names that are rejected when used as an element (tag) name.
const RESERVED_RDF_ELEMENTS: [&str; 11] = [
    RDF_ABOUT,
    RDF_ABOUT_EACH,
    RDF_ABOUT_EACH_PREFIX,
    RDF_BAG_ID,
    RDF_DATATYPE,
    RDF_ID,
    RDF_LI,
    RDF_NODE_ID,
    RDF_PARSE_TYPE,
    RDF_RDF,
    RDF_RESOURCE,
];
/// Names that are rejected when used as an attribute name, unless the
/// attribute loop has already handled them explicitly.
const RESERVED_RDF_ATTRIBUTES: [&str; 5] = [
    RDF_ABOUT_EACH,
    RDF_ABOUT_EACH_PREFIX,
    RDF_LI,
    RDF_RDF,
    RDF_RESOURCE,
];
730
/// Value collected so far for a property element: either a node (named or
/// blank) or raw text content.
#[derive(Clone, Debug)]
enum NodeOrText {
    /// The property value is a resource.
    Node(NamedOrBlankNode),
    /// The property value is text, later turned into a literal.
    Text(String),
}
736
/// One entry of the parser state stack; an entry is pushed for each element
/// start handled by the state machine and popped at the matching element end.
enum RdfXmlState {
    /// Initial state, before any element has been opened.
    Doc {
        base_iri: Option<Iri<String>>,
    },
    /// Inside the `rdf:RDF` root element.
    Rdf {
        base_iri: Option<Iri<String>>,
        language: Option<String>,
    },
    /// Inside a node element describing `subject`.
    NodeElt {
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        /// Number of `rdf:li` children seen so far; used to build `rdf:_n` predicates.
        li_counter: u64,
    },
    /// Inside a regular property element.
    PropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        /// Value collected so far (from attributes, text or a nested node element).
        object: Option<NodeOrText>,
        /// Resolved `rdf:ID`, if any; triggers reification of the emitted triple.
        id_attr: Option<NamedNode>,
        datatype_attr: Option<NamedNode>,
    },
    /// Inside a property element with `rdf:parseType="Collection"`.
    ParseTypeCollectionPropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        /// Subjects of the nested node elements, in document order.
        objects: Vec<NamedOrBlankNode>,
        id_attr: Option<NamedNode>,
    },
    /// Inside a property element whose content is captured as raw XML
    /// (`rdf:parseType="Literal"` or an unrecognized parse type).
    ParseTypeLiteralPropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        /// Accumulates the nested XML events.
        writer: Writer<Vec<u8>>,
        id_attr: Option<NamedNode>,
        /// `false` for unrecognized parse types: the content is consumed but
        /// no triple is emitted when the element ends.
        emit: bool,
    },
}
779
/// State machine shared by all the public parser front-ends.
struct InternalRdfXmlParser<R> {
    /// Namespace-aware XML reader over the input.
    reader: NsReader<R>,
    /// Stack of open productions; starts with a single `RdfXmlState::Doc`.
    state: Vec<RdfXmlState>,
    /// Entities declared with `<!ENTITY` in the DOCTYPE, keyed by name.
    custom_entities: HashMap<String, String>,
    /// Nesting depth of elements opened inside an XML-literal property element.
    in_literal_depth: usize,
    /// Resolved `rdf:ID` values seen so far (only tracked in strict mode).
    known_rdf_id: HashSet<String>,
    /// Set once the end of the input has been reached.
    is_end: bool,
    /// Whether validations are relaxed.
    lenient: bool,
}
789
/// Attributes of a node element, bundled to be passed to `build_node_elt`.
struct NodeElementAttributes {
    /// Resolved `rdf:ID` value.
    id_attr: Option<NamedNode>,
    /// Blank node built from `rdf:nodeID`.
    node_id_attr: Option<BlankNode>,
    /// Resolved `rdf:about` value.
    about_attr: Option<NamedNode>,
    /// Resolved `rdf:type` attribute value.
    type_attr: Option<NamedNode>,
    /// Remaining attributes, as (predicate, literal value) pairs.
    property_attrs: Vec<(NamedNode, String)>,
}
798
799impl<R> InternalRdfXmlParser<R> {
800 fn parse_event(
801 &mut self,
802 event: Event<'_>,
803 results: &mut Vec<Triple>,
804 ) -> Result<(), RdfXmlParseError> {
805 match event {
806 Event::Start(event) => self.parse_start_event(&event, results),
807 Event::End(event) => self.parse_end_event(&event, results),
808 Event::Empty(_) => Err(RdfXmlSyntaxError::msg(
809 "The expand_empty_elements option must be enabled",
810 )
811 .into()),
812 Event::Text(event) => self.parse_text_event(&event),
813 Event::CData(event) => self.parse_text_event(&event.escape()?),
814 Event::Comment(_) | Event::PI(_) | Event::GeneralRef(_) => Ok(()),
815 Event::Decl(decl) => {
816 if let Some(encoding) = decl.encoding() {
817 if !is_utf8(&encoding?) {
818 return Err(RdfXmlSyntaxError::msg(
819 "Only UTF-8 is supported by the RDF/XML parser",
820 )
821 .into());
822 }
823 }
824 Ok(())
825 }
826 Event::DocType(dt) => self.parse_doctype(&dt),
827 Event::Eof => {
828 self.is_end = true;
829 Ok(())
830 }
831 }
832 }
833
834 fn parse_doctype(&mut self, dt: &BytesText<'_>) -> Result<(), RdfXmlParseError> {
835 for input in self
837 .reader
838 .decoder()
839 .decode(dt.as_ref())?
840 .split('<')
841 .skip(1)
842 {
843 if let Some(input) = input.strip_prefix("!ENTITY") {
844 let input = input.trim_start().strip_prefix('%').unwrap_or(input);
845 let (entity_name, input) = input.trim_start().split_once(|c: char| c.is_ascii_whitespace()).ok_or_else(|| {
846 RdfXmlSyntaxError::msg(
847 "<!ENTITY declarations should contain both an entity name and an entity value",
848 )
849 })?;
850 let input = input.trim_start().strip_prefix('\"').ok_or_else(|| {
851 RdfXmlSyntaxError::msg("<!ENTITY values should be enclosed in double quotes")
852 })?;
853 let (entity_value, input) = input.split_once('"').ok_or_else(|| {
854 RdfXmlSyntaxError::msg(
855 "<!ENTITY declarations values should be enclosed in double quotes",
856 )
857 })?;
858 input.trim_start().strip_prefix('>').ok_or_else(|| {
859 RdfXmlSyntaxError::msg("<!ENTITY declarations values should end with >")
860 })?;
861
862 let entity_value =
864 unescape_with(entity_value, |e| self.resolve_entity(e)).map_err(Error::from)?;
865 self.custom_entities
866 .insert(entity_name.to_owned(), entity_value.to_string());
867 }
868 }
869 Ok(())
870 }
871
872 fn parse_start_event(
873 &mut self,
874 event: &BytesStart<'_>,
875 results: &mut Vec<Triple>,
876 ) -> Result<(), RdfXmlParseError> {
877 #[derive(PartialEq, Eq)]
878 enum RdfXmlParseType {
879 Default,
880 Collection,
881 Literal,
882 Resource,
883 Other,
884 }
885
886 #[derive(PartialEq, Eq)]
887 enum RdfXmlNextProduction {
888 Rdf,
889 NodeElt,
890 PropertyElt { subject: NamedOrBlankNode },
891 }
892
893 if let Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) = self.state.last_mut()
895 {
896 let mut clean_event = BytesStart::new(
897 self.reader
898 .decoder()
899 .decode(event.name().as_ref())?
900 .to_string(),
901 );
902 for attr in event.attributes() {
903 clean_event.push_attribute(attr.map_err(Error::InvalidAttr)?);
904 }
905 writer.write_event(Event::Start(clean_event))?;
906 self.in_literal_depth += 1;
907 return Ok(());
908 }
909
910 let tag_name = self.resolve_tag_name(event.name())?;
911
912 let mut language = None;
914 let mut base_iri = None;
915 let mut id_attr = None;
916 let mut node_id_attr = None;
917 let mut about_attr = None;
918 let mut property_attrs = Vec::default();
919 let mut resource_attr = None;
920 let mut datatype_attr = None;
921 let mut parse_type = RdfXmlParseType::Default;
922 let mut type_attr = None;
923
924 for attribute in event.attributes() {
925 let attribute = attribute.map_err(Error::InvalidAttr)?;
926 if attribute.key.as_ref().starts_with(b"xml") {
927 if attribute.key.as_ref() == b"xml:lang" {
928 let tag = self.convert_attribute(&attribute)?.to_ascii_lowercase();
929 language = Some(if self.lenient {
930 tag
931 } else {
932 LanguageTag::parse(tag.to_ascii_lowercase())
933 .map_err(|error| RdfXmlSyntaxError::invalid_language_tag(tag, error))?
934 .into_inner()
935 });
936 } else if attribute.key.as_ref() == b"xml:base" {
937 let iri = self.convert_attribute(&attribute)?;
938 base_iri = Some(if self.lenient {
939 Iri::parse_unchecked(iri.clone())
940 } else {
941 Iri::parse(iri.clone())
942 .map_err(|error| RdfXmlSyntaxError::invalid_iri(iri, error))?
943 })
944 } else {
945 }
947 } else {
948 let attribute_url = self.resolve_attribute_name(attribute.key)?;
949 if *attribute_url == *RDF_ID {
950 let mut id = self.convert_attribute(&attribute)?;
951 if !is_nc_name(&id) {
952 return Err(RdfXmlSyntaxError::msg(format!(
953 "{id} is not a valid rdf:ID value"
954 ))
955 .into());
956 }
957 id.insert(0, '#');
958 id_attr = Some(id);
959 } else if *attribute_url == *RDF_BAG_ID {
960 let bag_id = self.convert_attribute(&attribute)?;
961 if !is_nc_name(&bag_id) {
962 return Err(RdfXmlSyntaxError::msg(format!(
963 "{bag_id} is not a valid rdf:bagID value"
964 ))
965 .into());
966 }
967 } else if *attribute_url == *RDF_NODE_ID {
968 let id = self.convert_attribute(&attribute)?;
969 if !is_nc_name(&id) {
970 return Err(RdfXmlSyntaxError::msg(format!(
971 "{id} is not a valid rdf:nodeID value"
972 ))
973 .into());
974 }
975 node_id_attr = Some(BlankNode::new_unchecked(id));
976 } else if *attribute_url == *RDF_ABOUT {
977 about_attr = Some(attribute);
978 } else if *attribute_url == *RDF_RESOURCE {
979 resource_attr = Some(attribute);
980 } else if *attribute_url == *RDF_DATATYPE {
981 datatype_attr = Some(attribute);
982 } else if *attribute_url == *RDF_PARSE_TYPE {
983 parse_type = match attribute.value.as_ref() {
984 b"Collection" => RdfXmlParseType::Collection,
985 b"Literal" => RdfXmlParseType::Literal,
986 b"Resource" => RdfXmlParseType::Resource,
987 _ => RdfXmlParseType::Other,
988 };
989 } else if attribute_url == RDF_TYPE {
990 type_attr = Some(attribute);
991 } else if RESERVED_RDF_ATTRIBUTES.contains(&&*attribute_url) {
992 return Err(RdfXmlSyntaxError::msg(format!(
993 "{attribute_url} is not a valid attribute"
994 ))
995 .into());
996 } else {
997 property_attrs.push((
998 self.parse_iri(attribute_url)?,
999 self.convert_attribute(&attribute)?,
1000 ));
1001 }
1002 }
1003 }
1004
1005 let id_attr = match id_attr {
1007 Some(iri) => {
1008 let iri = self.resolve_iri(base_iri.as_ref(), iri)?;
1009 if !self.lenient {
1010 if self.known_rdf_id.contains(iri.as_str()) {
1011 return Err(RdfXmlSyntaxError::msg(format!(
1012 "{iri} has already been used as rdf:ID value"
1013 ))
1014 .into());
1015 }
1016 self.known_rdf_id.insert(iri.as_str().into());
1017 }
1018 Some(iri)
1019 }
1020 None => None,
1021 };
1022 let about_attr = match about_attr {
1023 Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1024 None => None,
1025 };
1026 let resource_attr = match resource_attr {
1027 Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1028 None => None,
1029 };
1030 let datatype_attr = match datatype_attr {
1031 Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1032 None => None,
1033 };
1034 let type_attr = match type_attr {
1035 Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1036 None => None,
1037 };
1038
1039 let expected_production = match self.state.last() {
1040 Some(RdfXmlState::Doc { .. }) => RdfXmlNextProduction::Rdf,
1041 Some(
1042 RdfXmlState::Rdf { .. }
1043 | RdfXmlState::PropertyElt { .. }
1044 | RdfXmlState::ParseTypeCollectionPropertyElt { .. },
1045 ) => RdfXmlNextProduction::NodeElt,
1046 Some(RdfXmlState::NodeElt { subject, .. }) => RdfXmlNextProduction::PropertyElt {
1047 subject: subject.clone(),
1048 },
1049 Some(RdfXmlState::ParseTypeLiteralPropertyElt { .. }) => {
1050 return Err(
1051 RdfXmlSyntaxError::msg("ParseTypeLiteralPropertyElt production children should never be considered as a RDF/XML content").into()
1052 );
1053 }
1054 None => {
1055 return Err(RdfXmlSyntaxError::msg(
1056 "No state in the stack: the XML is not balanced",
1057 )
1058 .into());
1059 }
1060 };
1061
1062 let new_state = match expected_production {
1063 RdfXmlNextProduction::Rdf => {
1064 if *tag_name == *RDF_RDF {
1065 RdfXmlState::Rdf { base_iri, language }
1066 } else if RESERVED_RDF_ELEMENTS.contains(&&*tag_name) {
1067 return Err(RdfXmlSyntaxError::msg(format!(
1068 "Invalid node element tag name: {tag_name}"
1069 ))
1070 .into());
1071 } else {
1072 self.build_node_elt(
1073 self.parse_iri(tag_name)?,
1074 base_iri,
1075 language,
1076 NodeElementAttributes {
1077 id_attr,
1078 node_id_attr,
1079 about_attr,
1080 type_attr,
1081 property_attrs,
1082 },
1083 results,
1084 )?
1085 }
1086 }
1087 RdfXmlNextProduction::NodeElt => {
1088 if RESERVED_RDF_ELEMENTS.contains(&&*tag_name) {
1089 return Err(RdfXmlSyntaxError::msg(format!(
1090 "Invalid property element tag name: {tag_name}"
1091 ))
1092 .into());
1093 }
1094 self.build_node_elt(
1095 self.parse_iri(tag_name)?,
1096 base_iri,
1097 language,
1098 NodeElementAttributes {
1099 id_attr,
1100 node_id_attr,
1101 about_attr,
1102 type_attr,
1103 property_attrs,
1104 },
1105 results,
1106 )?
1107 }
1108 RdfXmlNextProduction::PropertyElt { subject } => {
1109 let iri = if *tag_name == *RDF_LI {
1110 let Some(RdfXmlState::NodeElt { li_counter, .. }) = self.state.last_mut()
1111 else {
1112 return Err(RdfXmlSyntaxError::msg(format!(
1113 "Invalid property element tag name: {tag_name}"
1114 ))
1115 .into());
1116 };
1117 *li_counter += 1;
1118 NamedNode::new_unchecked(format!(
1119 "http://www.w3.org/1999/02/22-rdf-syntax-ns#_{li_counter}"
1120 ))
1121 } else if RESERVED_RDF_ELEMENTS.contains(&&*tag_name)
1122 || *tag_name == *RDF_DESCRIPTION
1123 {
1124 return Err(RdfXmlSyntaxError::msg(format!(
1125 "Invalid property element tag name: {tag_name}"
1126 ))
1127 .into());
1128 } else {
1129 self.parse_iri(tag_name)?
1130 };
1131 match parse_type {
1132 RdfXmlParseType::Default => {
1133 if resource_attr.is_some()
1134 || node_id_attr.is_some()
1135 || !property_attrs.is_empty()
1136 {
1137 let object = match (resource_attr, node_id_attr)
1138 {
1139 (Some(resource_attr), None) => NamedOrBlankNode::from(resource_attr),
1140 (None, Some(node_id_attr)) => node_id_attr.into(),
1141 (None, None) => BlankNode::default().into(),
1142 (Some(_), Some(_)) => return Err(RdfXmlSyntaxError::msg("Not both rdf:resource and rdf:nodeID could be set at the same time").into())
1143 };
1144 self.emit_property_attrs(
1145 &object,
1146 property_attrs,
1147 language.as_deref(),
1148 results,
1149 );
1150 if let Some(type_attr) = type_attr {
1151 results.push(Triple::new(
1152 crate::model::term::Subject::from(object.clone()),
1153 NamedNode::new_unchecked(RDF_TYPE),
1154 type_attr,
1155 ));
1156 }
1157 RdfXmlState::PropertyElt {
1158 iri,
1159 base_iri,
1160 language,
1161 subject,
1162 object: Some(NodeOrText::Node(object)),
1163 id_attr,
1164 datatype_attr,
1165 }
1166 } else {
1167 RdfXmlState::PropertyElt {
1168 iri,
1169 base_iri,
1170 language,
1171 subject,
1172 object: None,
1173 id_attr,
1174 datatype_attr,
1175 }
1176 }
1177 }
1178 RdfXmlParseType::Literal => RdfXmlState::ParseTypeLiteralPropertyElt {
1179 iri,
1180 base_iri,
1181 language,
1182 subject,
1183 writer: Writer::new(Vec::default()),
1184 id_attr,
1185 emit: true,
1186 },
1187 RdfXmlParseType::Resource => Self::build_parse_type_resource_property_elt(
1188 iri, base_iri, language, subject, id_attr, results,
1189 ),
1190 RdfXmlParseType::Collection => RdfXmlState::ParseTypeCollectionPropertyElt {
1191 iri,
1192 base_iri,
1193 language,
1194 subject,
1195 objects: Vec::default(),
1196 id_attr,
1197 },
1198 RdfXmlParseType::Other => RdfXmlState::ParseTypeLiteralPropertyElt {
1199 iri,
1200 base_iri,
1201 language,
1202 subject,
1203 writer: Writer::new(Vec::default()),
1204 id_attr,
1205 emit: false,
1206 },
1207 }
1208 }
1209 };
1210 self.state.push(new_state);
1211 Ok(())
1212 }
1213
1214 fn parse_end_event(
1215 &mut self,
1216 event: &BytesEnd<'_>,
1217 results: &mut Vec<Triple>,
1218 ) -> Result<(), RdfXmlParseError> {
1219 if self.in_literal_depth > 0 {
1221 if let Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) =
1222 self.state.last_mut()
1223 {
1224 writer.write_event(Event::End(BytesEnd::new(
1225 self.reader.decoder().decode(event.name().as_ref())?,
1226 )))?;
1227 self.in_literal_depth -= 1;
1228 return Ok(());
1229 }
1230 }
1231
1232 if let Some(current_state) = self.state.pop() {
1233 self.end_state(current_state, results)?;
1234 }
1235 Ok(())
1236 }
1237
1238 fn parse_text_event(&mut self, event: &BytesText<'_>) -> Result<(), RdfXmlParseError> {
1239 let text =
1240 unescape_with(std::str::from_utf8(event)?, |e| self.resolve_entity(e))?.to_string();
1241 match self.state.last_mut() {
1242 Some(RdfXmlState::PropertyElt { object, .. }) => {
1243 if is_object_defined(object) {
1244 if text.bytes().all(is_whitespace) {
1245 Ok(()) } else {
1247 Err(
1248 RdfXmlSyntaxError::msg(format!("Unexpected text event: '{text}'"))
1249 .into(),
1250 )
1251 }
1252 } else {
1253 *object = Some(NodeOrText::Text(text));
1254 Ok(())
1255 }
1256 }
1257 Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) => {
1258 writer.write_event(Event::Text(BytesText::new(&text)))?;
1259 Ok(())
1260 }
1261 _ => {
1262 if text.bytes().all(is_whitespace) {
1263 Ok(())
1264 } else {
1265 Err(RdfXmlSyntaxError::msg(format!("Unexpected text event: '{text}'")).into())
1266 }
1267 }
1268 }
1269 }
1270
1271 fn resolve_tag_name(&self, qname: QName<'_>) -> Result<String, RdfXmlParseError> {
1272 let (namespace, local_name) = self.reader.resolve_element(qname);
1273 self.resolve_ns_name(namespace, local_name)
1274 }
1275
1276 fn resolve_attribute_name(&self, qname: QName<'_>) -> Result<String, RdfXmlParseError> {
1277 let (namespace, local_name) = self.reader.resolve_attribute(qname);
1278 self.resolve_ns_name(namespace, local_name)
1279 }
1280
1281 fn resolve_ns_name(
1282 &self,
1283 namespace: ResolveResult<'_>,
1284 local_name: LocalName<'_>,
1285 ) -> Result<String, RdfXmlParseError> {
1286 match namespace {
1287 ResolveResult::Bound(ns) => {
1288 let mut value = Vec::with_capacity(ns.as_ref().len() + local_name.as_ref().len());
1289 value.extend_from_slice(ns.as_ref());
1290 value.extend_from_slice(local_name.as_ref());
1291 Ok(unescape_with(&self.reader.decoder().decode(&value)?, |e| {
1292 self.resolve_entity(e)
1293 })
1294 .map_err(Error::from)?
1295 .to_string())
1296 }
1297 ResolveResult::Unbound => {
1298 Err(RdfXmlSyntaxError::msg("XML namespaces are required in RDF/XML").into())
1299 }
1300 ResolveResult::Unknown(v) => Err(RdfXmlSyntaxError::msg(format!(
1301 "Unknown prefix {}:",
1302 self.reader.decoder().decode(&v)?
1303 ))
1304 .into()),
1305 }
1306 }
1307
1308 fn build_node_elt(
1309 &self,
1310 iri: NamedNode,
1311 base_iri: Option<Iri<String>>,
1312 language: Option<String>,
1313 attrs: NodeElementAttributes,
1314 results: &mut Vec<Triple>,
1315 ) -> Result<RdfXmlState, RdfXmlSyntaxError> {
1316 let subject = match (attrs.id_attr, attrs.node_id_attr, attrs.about_attr) {
1317 (Some(id_attr), None, None) => NamedOrBlankNode::from(id_attr),
1318 (None, Some(node_id_attr), None) => node_id_attr.into(),
1319 (None, None, Some(about_attr)) => about_attr.into(),
1320 (None, None, None) => BlankNode::default().into(),
1321 (Some(_), Some(_), _) => {
1322 return Err(RdfXmlSyntaxError::msg(
1323 "Not both rdf:ID and rdf:nodeID could be set at the same time",
1324 ));
1325 }
1326 (_, Some(_), Some(_)) => {
1327 return Err(RdfXmlSyntaxError::msg(
1328 "Not both rdf:nodeID and rdf:resource could be set at the same time",
1329 ));
1330 }
1331 (Some(_), _, Some(_)) => {
1332 return Err(RdfXmlSyntaxError::msg(
1333 "Not both rdf:ID and rdf:resource could be set at the same time",
1334 ));
1335 }
1336 };
1337
1338 self.emit_property_attrs(&subject, attrs.property_attrs, language.as_deref(), results);
1339
1340 if let Some(type_attr) = attrs.type_attr {
1341 results.push(Triple::new(
1342 crate::model::term::Subject::from(subject.clone()),
1343 NamedNode::new_unchecked(RDF_TYPE),
1344 type_attr,
1345 ));
1346 }
1347
1348 if iri.as_str() != RDF_DESCRIPTION {
1349 results.push(Triple::new(
1350 crate::model::term::Subject::from(subject.clone()),
1351 NamedNode::new_unchecked(RDF_TYPE),
1352 iri,
1353 ));
1354 }
1355 Ok(RdfXmlState::NodeElt {
1356 base_iri,
1357 language,
1358 subject,
1359 li_counter: 0,
1360 })
1361 }
1362
1363 fn build_parse_type_resource_property_elt(
1364 iri: NamedNode,
1365 base_iri: Option<Iri<String>>,
1366 language: Option<String>,
1367 subject: NamedOrBlankNode,
1368 id_attr: Option<NamedNode>,
1369 results: &mut Vec<Triple>,
1370 ) -> RdfXmlState {
1371 let object = BlankNode::default();
1372 let triple = Triple::new(
1373 crate::model::term::Subject::from(subject),
1374 iri,
1375 object.clone(),
1376 );
1377 if let Some(id_attr) = id_attr {
1378 Self::reify(triple.clone(), id_attr, results);
1379 }
1380 results.push(triple);
1381 RdfXmlState::NodeElt {
1382 base_iri,
1383 language,
1384 subject: object.into(),
1385 li_counter: 0,
1386 }
1387 }
1388
    /// Finalizes a state popped from the stack when its element ends.
    ///
    /// Property elements emit their triple (reified when an `rdf:ID` was
    /// given); a node element hands its subject over to the enclosing
    /// property element; the remaining states need no action.
    ///
    /// # Errors
    ///
    /// Fails if an XML literal is empty or not valid UTF-8, or if a node
    /// element ends inside a property element whose object is already defined.
    fn end_state(
        &mut self,
        state: RdfXmlState,
        results: &mut Vec<Triple>,
    ) -> Result<(), RdfXmlSyntaxError> {
        match state {
            RdfXmlState::PropertyElt {
                iri,
                language,
                subject,
                id_attr,
                datatype_attr,
                object,
                ..
            } => {
                let object = match object {
                    Some(NodeOrText::Node(node)) => match node {
                        NamedOrBlankNode::NamedNode(n) => Object::NamedNode(n),
                        NamedOrBlankNode::BlankNode(b) => Object::BlankNode(b),
                    },
                    Some(NodeOrText::Text(text)) => {
                        Object::Literal(self.new_literal(text, language, datatype_attr))
                    }
                    // No content at all: the value is the empty literal
                    None => {
                        Object::Literal(self.new_literal(String::new(), language, datatype_attr))
                    }
                };
                let triple = Triple::new(crate::model::term::Subject::from(subject), iri, object);
                if let Some(id_attr) = id_attr {
                    Self::reify(triple.clone(), id_attr, results);
                }
                results.push(triple);
            }
            RdfXmlState::ParseTypeCollectionPropertyElt {
                iri,
                subject,
                id_attr,
                objects,
                ..
            } => {
                // The list is built from its tail: it starts at rdf:nil and each
                // member, taken in reverse order, gets a new blank node cell
                let mut current_node = NamedOrBlankNode::from(NamedNode::new_unchecked(RDF_NIL));
                for object in objects.into_iter().rev() {
                    let subject = NamedOrBlankNode::from(BlankNode::default());
                    results.push(Triple::new(
                        crate::model::term::Subject::from(subject.clone()),
                        NamedNode::new_unchecked(RDF_FIRST),
                        object,
                    ));
                    results.push(Triple::new(
                        crate::model::term::Subject::from(subject.clone()),
                        NamedNode::new_unchecked(RDF_REST),
                        crate::model::term::Object::from(current_node.clone()),
                    ));
                    current_node = subject;
                }
                // `current_node` is now the head of the list (rdf:nil if it is empty)
                let triple = Triple::new(
                    crate::model::term::Subject::from(subject),
                    iri,
                    crate::model::term::Object::from(current_node),
                );
                if let Some(id_attr) = id_attr {
                    Self::reify(triple.clone(), id_attr, results);
                }
                results.push(triple);
            }
            RdfXmlState::ParseTypeLiteralPropertyElt {
                iri,
                subject,
                id_attr,
                writer,
                emit,
                ..
            } => {
                // `emit` is false for unrecognized parse types: nothing is produced
                if emit {
                    let object = writer.into_inner();
                    if object.is_empty() {
                        return Err(RdfXmlSyntaxError::msg(format!(
                            "No value found for rdf:XMLLiteral value of property {iri}"
                        )));
                    }
                    let triple = Triple::new(
                        crate::model::term::Subject::from(subject),
                        iri,
                        Literal::new_typed_literal(
                            str::from_utf8(&object).map_err(|_| {
                                RdfXmlSyntaxError::msg(
                                    "The XML literal is not in valid UTF-8".to_owned(),
                                )
                            })?,
                            NamedNode::new_unchecked(RDF_XML_LITERAL),
                        ),
                    );
                    if let Some(id_attr) = id_attr {
                        Self::reify(triple.clone(), id_attr, results);
                    }
                    results.push(triple);
                }
            }
            // A finished node element becomes the object (or a collection
            // member) of the property element that encloses it
            RdfXmlState::NodeElt { subject, .. } => match self.state.last_mut() {
                Some(RdfXmlState::PropertyElt { object, .. }) => {
                    if is_object_defined(object) {
                        return Err(RdfXmlSyntaxError::msg(
                            "Unexpected node, a text value is already present",
                        ));
                    }
                    *object = Some(NodeOrText::Node(subject))
                }
                Some(RdfXmlState::ParseTypeCollectionPropertyElt { objects, .. }) => {
                    objects.push(subject)
                }
                _ => (),
            },
            _ => (),
        }
        Ok(())
    }
1505
1506 fn new_literal(
1507 &self,
1508 value: String,
1509 language: Option<String>,
1510 datatype: Option<NamedNode>,
1511 ) -> Literal {
1512 if let Some(datatype) = datatype {
1513 Literal::new_typed_literal(value, datatype)
1514 } else if let Some(language) =
1515 language.or_else(|| self.current_language().map(ToOwned::to_owned))
1516 {
1517 Literal::new_language_tagged_literal_unchecked(value, language)
1518 } else {
1519 Literal::new_simple_literal(value)
1520 }
1521 }
1522
1523 fn reify(triple: Triple, statement_id: NamedNode, results: &mut Vec<Triple>) {
1524 results.push(Triple::new(
1525 statement_id.clone(),
1526 NamedNode::new_unchecked(RDF_TYPE),
1527 NamedNode::new_unchecked(RDF_STATEMENT),
1528 ));
1529 results.push(Triple::new(
1530 statement_id.clone(),
1531 NamedNode::new_unchecked(RDF_SUBJECT),
1532 match triple.subject() {
1533 Subject::NamedNode(n) => Object::NamedNode(n.clone()),
1534 Subject::BlankNode(b) => Object::BlankNode(b.clone()),
1535 Subject::Variable(v) => Object::Variable(v.clone()),
1536 Subject::QuotedTriple(qt) => Object::QuotedTriple(qt.clone()),
1537 },
1538 ));
1539 results.push(Triple::new(
1540 statement_id.clone(),
1541 NamedNode::new_unchecked(RDF_PREDICATE),
1542 match triple.predicate() {
1543 Predicate::NamedNode(n) => Object::NamedNode(n.clone()),
1544 Predicate::Variable(v) => Object::Variable(v.clone()),
1545 },
1546 ));
1547 results.push(Triple::new(
1548 statement_id,
1549 NamedNode::new_unchecked(RDF_OBJECT),
1550 triple.object().clone(),
1551 ));
1552 }
1553
1554 fn emit_property_attrs(
1555 &self,
1556 subject: &NamedOrBlankNode,
1557 literal_attributes: Vec<(NamedNode, String)>,
1558 language: Option<&str>,
1559 results: &mut Vec<Triple>,
1560 ) {
1561 for (literal_predicate, literal_value) in literal_attributes {
1562 results.push(Triple::new(
1563 crate::model::term::Subject::from(subject.clone()),
1564 literal_predicate,
1565 if let Some(language) = language.or_else(|| self.current_language()) {
1566 Literal::new_lang(&literal_value, language)
1567 .unwrap_or_else(|_| Literal::new(literal_value))
1568 } else {
1569 Literal::new(literal_value)
1570 },
1571 ));
1572 }
1573 }
1574
1575 fn convert_attribute(&self, attribute: &Attribute<'_>) -> Result<String, RdfXmlParseError> {
1576 Ok(attribute
1577 .decode_and_unescape_value_with(self.reader.decoder(), |e| self.resolve_entity(e))?
1578 .into_owned())
1579 }
1580
1581 fn convert_iri_attribute(
1582 &self,
1583 base_iri: Option<&Iri<String>>,
1584 attribute: &Attribute<'_>,
1585 ) -> Result<NamedNode, RdfXmlParseError> {
1586 let converted = self.convert_attribute(attribute)?;
1587 self.resolve_iri(base_iri, converted)
1588 .map_err(RdfXmlParseError::Syntax)
1589 }
1590
1591 fn resolve_iri(
1592 &self,
1593 base_iri: Option<&Iri<String>>,
1594 relative_iri: String,
1595 ) -> Result<NamedNode, RdfXmlSyntaxError> {
1596 if let Some(base_iri) = base_iri.or_else(|| self.current_base_iri()) {
1597 Ok(NamedNode::new_unchecked(if self.lenient {
1598 base_iri.resolve_unchecked(&relative_iri).into_inner()
1599 } else {
1600 base_iri
1601 .resolve(&relative_iri)
1602 .map_err(|error| RdfXmlSyntaxError::invalid_iri(relative_iri, error))?
1603 .into_inner()
1604 }))
1605 } else {
1606 self.parse_iri(relative_iri)
1607 }
1608 }
1609
1610 fn parse_iri(&self, relative_iri: String) -> Result<NamedNode, RdfXmlSyntaxError> {
1611 Ok(NamedNode::new_unchecked(if self.lenient {
1612 relative_iri
1613 } else {
1614 Iri::parse(relative_iri.clone())
1615 .map_err(|error| RdfXmlSyntaxError::invalid_iri(relative_iri, error))?
1616 .into_inner()
1617 }))
1618 }
1619
1620 fn current_language(&self) -> Option<&str> {
1621 for state in self.state.iter().rev() {
1622 match state {
1623 RdfXmlState::Doc { .. } => (),
1624 RdfXmlState::Rdf { language, .. }
1625 | RdfXmlState::NodeElt { language, .. }
1626 | RdfXmlState::PropertyElt { language, .. }
1627 | RdfXmlState::ParseTypeCollectionPropertyElt { language, .. }
1628 | RdfXmlState::ParseTypeLiteralPropertyElt { language, .. } => {
1629 if let Some(language) = language {
1630 return Some(language);
1631 }
1632 }
1633 }
1634 }
1635 None
1636 }
1637
1638 fn current_base_iri(&self) -> Option<&Iri<String>> {
1639 for state in self.state.iter().rev() {
1640 match state {
1641 RdfXmlState::Doc { base_iri }
1642 | RdfXmlState::Rdf { base_iri, .. }
1643 | RdfXmlState::NodeElt { base_iri, .. }
1644 | RdfXmlState::PropertyElt { base_iri, .. }
1645 | RdfXmlState::ParseTypeCollectionPropertyElt { base_iri, .. }
1646 | RdfXmlState::ParseTypeLiteralPropertyElt { base_iri, .. } => {
1647 if let Some(base_iri) = base_iri {
1648 return Some(base_iri);
1649 }
1650 }
1651 }
1652 }
1653 None
1654 }
1655
1656 fn resolve_entity(&self, e: &str) -> Option<&str> {
1657 resolve_xml_entity(e).or_else(|| self.custom_entities.get(e).map(String::as_str))
1658 }
1659}
1660
1661fn is_object_defined(object: &Option<NodeOrText>) -> bool {
1662 match object {
1663 Some(NodeOrText::Node(_)) => true,
1664 Some(NodeOrText::Text(t)) => !t.bytes().all(is_whitespace),
1665 None => false,
1666 }
1667}
1668
/// Returns `true` for exactly four bytes: space, tab, line feed and
/// carriage return.
fn is_whitespace(c: u8) -> bool {
    [b' ', b'\t', b'\n', b'\r'].contains(&c)
}
1672
/// Returns `true` when `encoding` is, ignoring ASCII case, one of the
/// accepted UTF-8 encoding labels listed below. The comparison is exact
/// apart from case: surrounding whitespace or any other byte makes it fail.
fn is_utf8(encoding: &[u8]) -> bool {
    const UTF8_LABELS: [&[u8]; 6] = [
        b"unicode-1-1-utf-8",
        b"unicode11utf8",
        b"unicode20utf8",
        b"utf-8",
        b"utf8",
        b"x-unicode20utf8",
    ];
    UTF8_LABELS
        .iter()
        .any(|&label| encoding.eq_ignore_ascii_case(label))
}