1use crate::model::literal::LanguageTag;
2use crate::model::term::{Object, Predicate, Subject};
3use crate::model::{BlankNode, Literal, NamedNode, NamedOrBlankNode, Term, Triple};
4use crate::rdfxml::error::{RdfXmlParseError, RdfXmlSyntaxError};
5use crate::rdfxml::utils::*;
6use oxiri::{Iri, IriParseError};
7use quick_xml::escape::{resolve_xml_entity, unescape_with};
8use quick_xml::events::attributes::Attribute;
9use quick_xml::events::*;
10use quick_xml::name::{LocalName, NamespaceBindingsIter, PrefixDeclaration, QName, ResolveResult};
11use quick_xml::{Decoder, Error, NsReader, Writer};
12use std::borrow::Cow;
13use std::collections::{HashMap, HashSet};
14use std::io::{BufReader, Read};
15use std::str;
16#[cfg(feature = "async-tokio")]
17use tokio::io::{AsyncRead, BufReader as AsyncBufReader};
18
19impl From<NamedOrBlankNode> for Term {
20 fn from(node: NamedOrBlankNode) -> Self {
21 match node {
22 NamedOrBlankNode::NamedNode(n) => Term::NamedNode(n),
23 NamedOrBlankNode::BlankNode(n) => Term::BlankNode(n),
24 }
25 }
26}
27
/// Builder holding the options used to create an RDF/XML parser.
///
/// The `for_*` methods consume the builder and bind it to a concrete input.
#[derive(Default, Clone)]
#[must_use]
pub struct RdfXmlParser {
    /// When `true`, the validation steps guarded by this flag are skipped while parsing.
    lenient: bool,
    /// Optional base IRI; it is stored in the initial `RdfXmlState::Doc` state of the parser.
    base: Option<Iri<String>>,
}
69
70impl RdfXmlParser {
71 #[inline]
73 pub fn new() -> Self {
74 Self::default()
75 }
76
77 #[inline]
83 pub fn lenient(mut self) -> Self {
84 self.lenient = true;
85 self
86 }
87
88 #[deprecated(note = "Use `lenient()` instead", since = "0.2.0")]
89 #[inline]
90 pub fn unchecked(self) -> Self {
91 self.lenient()
92 }
93
94 #[inline]
95 pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
96 self.base = Some(Iri::parse(base_iri.into())?);
97 Ok(self)
98 }
99
100 pub fn for_reader<R: Read>(self, reader: R) -> ReaderRdfXmlParser<R> {
129 ReaderRdfXmlParser {
130 results: Vec::new(),
131 parser: self.into_internal(BufReader::new(reader)),
132 reader_buffer: Vec::default(),
133 }
134 }
135
136 #[cfg(feature = "async-tokio")]
170 pub fn for_tokio_async_reader<R: AsyncRead + Unpin>(
171 self,
172 reader: R,
173 ) -> TokioAsyncReaderRdfXmlParser<R> {
174 TokioAsyncReaderRdfXmlParser {
175 results: Vec::new(),
176 parser: self.into_internal(AsyncBufReader::new(reader)),
177 reader_buffer: Vec::default(),
178 }
179 }
180
181 pub fn for_slice(self, slice: &[u8]) -> SliceRdfXmlParser<'_> {
210 SliceRdfXmlParser {
211 results: Vec::new(),
212 parser: self.into_internal(slice),
213 reader_buffer: Vec::default(),
214 }
215 }
216
217 fn into_internal<T>(self, reader: T) -> InternalRdfXmlParser<T> {
218 let mut reader = NsReader::from_reader(reader);
219 reader.config_mut().expand_empty_elements = true;
220 InternalRdfXmlParser {
221 reader,
222 state: vec![RdfXmlState::Doc {
223 base_iri: self.base.clone(),
224 }],
225 custom_entities: HashMap::new(),
226 in_literal_depth: 0,
227 known_rdf_id: HashSet::default(),
228 is_end: false,
229 lenient: self.lenient,
230 }
231 }
232}
233
/// Parses RDF/XML from a [`Read`] implementation; iterate over it to get the triples.
///
/// Built by `RdfXmlParser::for_reader`.
#[must_use]
pub struct ReaderRdfXmlParser<R: Read> {
    /// Triples already produced by the parser and not yet handed to the caller.
    results: Vec<Triple>,
    /// Parser state shared by all the front ends of this file.
    parser: InternalRdfXmlParser<BufReader<R>>,
    /// Scratch buffer passed to the XML reader; cleared before each event is read.
    reader_buffer: Vec<u8>,
}
270
271impl<R: Read> Iterator for ReaderRdfXmlParser<R> {
272 type Item = Result<Triple, RdfXmlParseError>;
273
274 fn next(&mut self) -> Option<Self::Item> {
275 loop {
276 if let Some(triple) = self.results.pop() {
277 return Some(Ok(triple));
278 } else if self.parser.is_end {
279 return None;
280 }
281 if let Err(e) = self.parse_step() {
282 return Some(Err(e));
283 }
284 }
285 }
286}
287
288impl<R: Read> ReaderRdfXmlParser<R> {
289 pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
320 RdfXmlPrefixesIter {
321 inner: self.parser.reader.resolver().bindings(),
322 decoder: self.parser.reader.decoder(),
323 lenient: self.parser.lenient,
324 }
325 }
326
327 pub fn base_iri(&self) -> Option<&str> {
346 Some(self.parser.current_base_iri()?.as_str())
347 }
348
349 pub fn buffer_position(&self) -> u64 {
351 self.parser.reader.buffer_position()
352 }
353
354 fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
355 self.reader_buffer.clear();
356 let event = self
357 .parser
358 .reader
359 .read_event_into(&mut self.reader_buffer)?;
360 self.parser.parse_event(event, &mut self.results)
361 }
362}
363
/// Parses RDF/XML from a Tokio [`AsyncRead`] implementation; call `next().await` to get
/// the triples.
///
/// Built by `RdfXmlParser::for_tokio_async_reader`.
#[cfg(feature = "async-tokio")]
#[must_use]
pub struct TokioAsyncReaderRdfXmlParser<R: AsyncRead + Unpin> {
    /// Triples already produced by the parser and not yet handed to the caller.
    results: Vec<Triple>,
    /// Parser state shared by all the front ends of this file.
    parser: InternalRdfXmlParser<AsyncBufReader<R>>,
    /// Scratch buffer passed to the XML reader; cleared before each event is read.
    reader_buffer: Vec<u8>,
}
406
#[cfg(feature = "async-tokio")]
impl<R: AsyncRead + Unpin> TokioAsyncReaderRdfXmlParser<R> {
    /// Returns the next triple, reading XML events until one is available.
    ///
    /// Pending triples are taken from the end of the internal buffer. `None` is returned
    /// once the buffer is empty and the end of the document has been reached. An error
    /// raised while processing an event is returned as `Some(Err(_))`.
    pub async fn next(&mut self) -> Option<Result<Triple, RdfXmlParseError>> {
        while self.results.is_empty() {
            if self.parser.is_end {
                return None;
            }
            if let Err(error) = self.parse_step().await {
                return Some(Err(error));
            }
        }
        self.results.pop().map(Ok)
    }

    /// Iterates over the `(prefix, namespace)` bindings exposed by the resolver of the
    /// underlying XML reader.
    pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
        let parser = &self.parser;
        let reader = &parser.reader;
        RdfXmlPrefixesIter {
            inner: reader.resolver().bindings(),
            decoder: reader.decoder(),
            lenient: parser.lenient,
        }
    }

    /// Returns the base IRI reported by the parser for its current position, if any.
    pub fn base_iri(&self) -> Option<&str> {
        self.parser.current_base_iri().map(|iri| iri.as_str())
    }

    /// Returns the `buffer_position()` of the underlying XML reader.
    pub fn buffer_position(&self) -> u64 {
        let reader = &self.parser.reader;
        reader.buffer_position()
    }

    /// Reads one XML event asynchronously and feeds it to the parser, which may push
    /// triples to `results`.
    async fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
        let buffer = &mut self.reader_buffer;
        buffer.clear();
        let event = self.parser.reader.read_event_into_async(buffer).await?;
        self.parser.parse_event(event, &mut self.results)
    }
}
507
/// Parses RDF/XML from a byte slice; iterate over it to get the triples.
///
/// Built by `RdfXmlParser::for_slice`.
#[must_use]
pub struct SliceRdfXmlParser<'a> {
    /// Triples already produced by the parser and not yet handed to the caller.
    results: Vec<Triple>,
    /// Parser state shared by all the front ends of this file.
    parser: InternalRdfXmlParser<&'a [u8]>,
    /// Scratch buffer passed to the XML reader; cleared before each event is read.
    reader_buffer: Vec<u8>,
}
544
545impl Iterator for SliceRdfXmlParser<'_> {
546 type Item = Result<Triple, RdfXmlSyntaxError>;
547
548 fn next(&mut self) -> Option<Self::Item> {
549 loop {
550 if let Some(triple) = self.results.pop() {
551 return Some(Ok(triple));
552 } else if self.parser.is_end {
553 return None;
554 }
555 if let Err(RdfXmlParseError::Syntax(e)) = self.parse_step() {
556 return Some(Err(e));
558 }
559 }
560 }
561}
562
563impl SliceRdfXmlParser<'_> {
564 pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
595 RdfXmlPrefixesIter {
596 inner: self.parser.reader.resolver().bindings(),
597 decoder: self.parser.reader.decoder(),
598 lenient: self.parser.lenient,
599 }
600 }
601
602 pub fn base_iri(&self) -> Option<&str> {
621 Some(self.parser.current_base_iri()?.as_str())
622 }
623
624 pub fn buffer_position(&self) -> u64 {
626 self.parser.reader.buffer_position()
627 }
628
629 fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
630 self.reader_buffer.clear();
631 let event = self
632 .parser
633 .reader
634 .read_event_into(&mut self.reader_buffer)?;
635 self.parser.parse_event(event, &mut self.results)
636 }
637}
638
/// Iterator over the `(prefix, namespace)` pairs returned by the `prefixes()` methods
/// of the parsers in this file.
pub struct RdfXmlPrefixesIter<'a> {
    /// Raw namespace bindings coming from the XML reader's resolver.
    inner: NamespaceBindingsIter<'a>,
    /// Decoder used to turn the raw bytes of each binding into text.
    decoder: Decoder,
    /// When `false`, bindings with an invalid prefix or namespace IRI are skipped.
    lenient: bool,
}
647
648impl<'a> Iterator for RdfXmlPrefixesIter<'a> {
649 type Item = (&'a str, &'a str);
650
651 #[inline]
652 fn next(&mut self) -> Option<Self::Item> {
653 loop {
654 let (key, value) = self.inner.next()?;
655 return Some((
656 match key {
657 PrefixDeclaration::Default => "",
658 PrefixDeclaration::Named(name) => {
659 let Ok(Cow::Borrowed(name)) = self.decoder.decode(name) else {
660 continue;
661 };
662 let Ok(Cow::Borrowed(name)) = unescape_with(name, |_| None) else {
663 continue;
664 };
665 if !self.lenient && !is_nc_name(name) {
666 continue; }
668 name
669 }
670 },
671 {
672 let Ok(Cow::Borrowed(value)) = self.decoder.decode(value.0) else {
673 continue;
674 };
675 let Ok(Cow::Borrowed(value)) = unescape_with(value, |_| None) else {
676 continue;
677 };
678 if !self.lenient && Iri::parse(value).is_err() {
679 continue; }
681 value
682 },
683 ));
684 }
685 }
686
687 #[inline]
688 fn size_hint(&self) -> (usize, Option<usize>) {
689 self.inner.size_hint()
690 }
691}
692
// Names of the RDF namespace that are compared against resolved element and attribute
// names while parsing.
const RDF_ABOUT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about";
const RDF_ABOUT_EACH: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach";
const RDF_ABOUT_EACH_PREFIX: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix";
const RDF_BAG_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID";
const RDF_DATATYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype";
const RDF_DESCRIPTION: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description";
const RDF_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID";
const RDF_LI: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#li";
const RDF_NODE_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID";
const RDF_PARSE_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#parseType";
const RDF_RDF: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF";
const RDF_RESOURCE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource";

// IRIs of the RDF vocabulary used in the triples generated by the parser
// (types, collections, reification and XML literals).
const RDF_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
const RDF_NIL: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil";
const RDF_FIRST: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first";
const RDF_REST: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest";
const RDF_STATEMENT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement";
const RDF_SUBJECT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject";
const RDF_PREDICATE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate";
const RDF_OBJECT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#object";
const RDF_XML_LITERAL: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral";
716
/// Resolved names that are rejected when they are used as an element tag name.
const RESERVED_RDF_ELEMENTS: [&str; 11] = [
    RDF_ABOUT,
    RDF_ABOUT_EACH,
    RDF_ABOUT_EACH_PREFIX,
    RDF_BAG_ID,
    RDF_DATATYPE,
    RDF_ID,
    RDF_LI,
    RDF_NODE_ID,
    RDF_PARSE_TYPE,
    RDF_RDF,
    RDF_RESOURCE,
];
/// Resolved names that are rejected when they reach the generic attribute handling,
/// i.e. after the attributes with a dedicated meaning have been matched.
const RESERVED_RDF_ATTRIBUTES: [&str; 5] = [
    RDF_ABOUT_EACH,
    RDF_ABOUT_EACH_PREFIX,
    RDF_LI,
    RDF_RDF,
    RDF_RESOURCE,
];
737
/// Content collected so far for a property element: either a node or a text value.
#[derive(Clone, Debug)]
enum NodeOrText {
    /// The object is a named or blank node.
    Node(NamedOrBlankNode),
    /// The object is the text found inside the element.
    Text(String),
}
743
/// One entry of the parser stack: a state is pushed for each opened element and
/// popped when the element ends.
///
/// `base_iri` and `language` hold the values set by `xml:base` and `xml:lang` on the
/// element itself (`None` when the attribute is absent).
enum RdfXmlState {
    /// Initial state, created with the base IRI configured on the builder.
    Doc {
        base_iri: Option<Iri<String>>,
    },
    /// Inside the `rdf:RDF` element.
    Rdf {
        base_iri: Option<Iri<String>>,
        language: Option<String>,
    },
    /// Inside a node element describing `subject`.
    NodeElt {
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        /// Number of `rdf:li` children seen so far, used to build the `rdf:_n` predicates.
        li_counter: u64,
    },
    /// Inside a property element without a recognized `rdf:parseType`.
    PropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        /// Object found so far, if any (from attributes, text or a child node element).
        object: Option<NodeOrText>,
        /// Resolved `rdf:ID`; when set the emitted triple is also reified.
        id_attr: Option<NamedNode>,
        datatype_attr: Option<NamedNode>,
    },
    /// Inside a property element with `rdf:parseType="Collection"`.
    ParseTypeCollectionPropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        /// Subjects of the child node elements, in document order.
        objects: Vec<NamedOrBlankNode>,
        id_attr: Option<NamedNode>,
    },
    /// Inside a property element whose content is copied as XML instead of being parsed.
    ParseTypeLiteralPropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: NamedOrBlankNode,
        /// Receives the events found inside the element.
        writer: Writer<Vec<u8>>,
        id_attr: Option<NamedNode>,
        /// `true` for `rdf:parseType="Literal"`; `false` for unrecognized parse types,
        /// in which case no triple is emitted when the element ends.
        emit: bool,
    },
}
786
/// Parser state shared by the reader, async reader and slice front ends.
struct InternalRdfXmlParser<R> {
    /// Namespace-aware XML reader providing the events.
    reader: NsReader<R>,
    /// Stack of the currently opened elements; starts with a single `Doc` state.
    state: Vec<RdfXmlState>,
    /// Entities declared with `<!ENTITY` in the DOCTYPE, keyed by entity name.
    custom_entities: HashMap<String, String>,
    /// Number of elements currently opened inside a `ParseTypeLiteralPropertyElt`.
    in_literal_depth: usize,
    /// Resolved `rdf:ID` values already seen; only filled when not lenient.
    known_rdf_id: HashSet<String>,
    /// Set once the `Eof` event has been received.
    is_end: bool,
    /// Copy of the builder's `lenient` flag.
    lenient: bool,
}
796
/// Attributes of a node element, already resolved, as consumed by `build_node_elt`.
struct NodeElementAttributes {
    /// Resolved `rdf:ID` value.
    id_attr: Option<NamedNode>,
    /// Blank node built from `rdf:nodeID`.
    node_id_attr: Option<BlankNode>,
    /// Resolved `rdf:about` value.
    about_attr: Option<NamedNode>,
    /// Resolved `rdf:type` value.
    type_attr: Option<NamedNode>,
    /// Remaining attributes, as `(predicate, literal value)` pairs.
    property_attrs: Vec<(NamedNode, String)>,
}
805
806impl<R> InternalRdfXmlParser<R> {
807 fn parse_event(
808 &mut self,
809 event: Event<'_>,
810 results: &mut Vec<Triple>,
811 ) -> Result<(), RdfXmlParseError> {
812 match event {
813 Event::Start(event) => self.parse_start_event(&event, results),
814 Event::End(event) => self.parse_end_event(&event, results),
815 Event::Empty(_) => Err(RdfXmlSyntaxError::msg(
816 "The expand_empty_elements option must be enabled",
817 )
818 .into()),
819 Event::Text(event) => self.parse_text_event(&event),
820 Event::CData(event) => self.parse_text_event(&event.escape()?),
821 Event::Comment(_) | Event::PI(_) | Event::GeneralRef(_) => Ok(()),
822 Event::Decl(decl) => {
823 if let Some(encoding) = decl.encoding() {
824 if !is_utf8(&encoding?) {
825 return Err(RdfXmlSyntaxError::msg(
826 "Only UTF-8 is supported by the RDF/XML parser",
827 )
828 .into());
829 }
830 }
831 Ok(())
832 }
833 Event::DocType(dt) => self.parse_doctype(&dt),
834 Event::Eof => {
835 self.is_end = true;
836 Ok(())
837 }
838 }
839 }
840
841 fn parse_doctype(&mut self, dt: &BytesText<'_>) -> Result<(), RdfXmlParseError> {
842 for input in self
844 .reader
845 .decoder()
846 .decode(dt.as_ref())?
847 .split('<')
848 .skip(1)
849 {
850 if let Some(input) = input.strip_prefix("!ENTITY") {
851 let input = input.trim_start().strip_prefix('%').unwrap_or(input);
852 let (entity_name, input) = input.trim_start().split_once(|c: char| c.is_ascii_whitespace()).ok_or_else(|| {
853 RdfXmlSyntaxError::msg(
854 "<!ENTITY declarations should contain both an entity name and an entity value",
855 )
856 })?;
857 let input = input.trim_start().strip_prefix('\"').ok_or_else(|| {
858 RdfXmlSyntaxError::msg("<!ENTITY values should be enclosed in double quotes")
859 })?;
860 let (entity_value, input) = input.split_once('"').ok_or_else(|| {
861 RdfXmlSyntaxError::msg(
862 "<!ENTITY declarations values should be enclosed in double quotes",
863 )
864 })?;
865 input.trim_start().strip_prefix('>').ok_or_else(|| {
866 RdfXmlSyntaxError::msg("<!ENTITY declarations values should end with >")
867 })?;
868
869 let entity_value =
871 unescape_with(entity_value, |e| self.resolve_entity(e)).map_err(Error::from)?;
872 self.custom_entities
873 .insert(entity_name.to_owned(), entity_value.to_string());
874 }
875 }
876 Ok(())
877 }
878
879 fn parse_start_event(
880 &mut self,
881 event: &BytesStart<'_>,
882 results: &mut Vec<Triple>,
883 ) -> Result<(), RdfXmlParseError> {
884 #[derive(PartialEq, Eq)]
885 enum RdfXmlParseType {
886 Default,
887 Collection,
888 Literal,
889 Resource,
890 Other,
891 }
892
893 #[derive(PartialEq, Eq)]
894 enum RdfXmlNextProduction {
895 Rdf,
896 NodeElt,
897 PropertyElt { subject: NamedOrBlankNode },
898 }
899
900 if let Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) = self.state.last_mut()
902 {
903 let mut clean_event = BytesStart::new(
904 self.reader
905 .decoder()
906 .decode(event.name().as_ref())?
907 .to_string(),
908 );
909 for attr in event.attributes() {
910 clean_event.push_attribute(attr.map_err(Error::InvalidAttr)?);
911 }
912 writer.write_event(Event::Start(clean_event))?;
913 self.in_literal_depth += 1;
914 return Ok(());
915 }
916
917 let tag_name = self.resolve_tag_name(event.name())?;
918
919 let mut language = None;
921 let mut base_iri = None;
922 let mut id_attr = None;
923 let mut node_id_attr = None;
924 let mut about_attr = None;
925 let mut property_attrs = Vec::default();
926 let mut resource_attr = None;
927 let mut datatype_attr = None;
928 let mut parse_type = RdfXmlParseType::Default;
929 let mut type_attr = None;
930
931 for attribute in event.attributes() {
932 let attribute = attribute.map_err(Error::InvalidAttr)?;
933 if attribute.key.as_ref().starts_with(b"xml") {
934 if attribute.key.as_ref() == b"xml:lang" {
935 let tag = self.convert_attribute(&attribute)?.to_ascii_lowercase();
936 language = Some(if self.lenient {
937 tag
938 } else {
939 LanguageTag::parse(tag.to_ascii_lowercase())
940 .map_err(|error| RdfXmlSyntaxError::invalid_language_tag(tag, error))?
941 .into_inner()
942 });
943 } else if attribute.key.as_ref() == b"xml:base" {
944 let iri = self.convert_attribute(&attribute)?;
945 base_iri = Some(if self.lenient {
946 Iri::parse_unchecked(iri.clone())
947 } else {
948 Iri::parse(iri.clone())
949 .map_err(|error| RdfXmlSyntaxError::invalid_iri(iri, error))?
950 })
951 } else {
952 }
954 } else {
955 let attribute_url = self.resolve_attribute_name(attribute.key)?;
956 if *attribute_url == *RDF_ID {
957 let mut id = self.convert_attribute(&attribute)?;
958 if !is_nc_name(&id) {
959 return Err(RdfXmlSyntaxError::msg(format!(
960 "{id} is not a valid rdf:ID value"
961 ))
962 .into());
963 }
964 id.insert(0, '#');
965 id_attr = Some(id);
966 } else if *attribute_url == *RDF_BAG_ID {
967 let bag_id = self.convert_attribute(&attribute)?;
968 if !is_nc_name(&bag_id) {
969 return Err(RdfXmlSyntaxError::msg(format!(
970 "{bag_id} is not a valid rdf:bagID value"
971 ))
972 .into());
973 }
974 } else if *attribute_url == *RDF_NODE_ID {
975 let id = self.convert_attribute(&attribute)?;
976 if !is_nc_name(&id) {
977 return Err(RdfXmlSyntaxError::msg(format!(
978 "{id} is not a valid rdf:nodeID value"
979 ))
980 .into());
981 }
982 node_id_attr = Some(BlankNode::new_unchecked(id));
983 } else if *attribute_url == *RDF_ABOUT {
984 about_attr = Some(attribute);
985 } else if *attribute_url == *RDF_RESOURCE {
986 resource_attr = Some(attribute);
987 } else if *attribute_url == *RDF_DATATYPE {
988 datatype_attr = Some(attribute);
989 } else if *attribute_url == *RDF_PARSE_TYPE {
990 parse_type = match attribute.value.as_ref() {
991 b"Collection" => RdfXmlParseType::Collection,
992 b"Literal" => RdfXmlParseType::Literal,
993 b"Resource" => RdfXmlParseType::Resource,
994 _ => RdfXmlParseType::Other,
995 };
996 } else if attribute_url == RDF_TYPE {
997 type_attr = Some(attribute);
998 } else if RESERVED_RDF_ATTRIBUTES.contains(&&*attribute_url) {
999 return Err(RdfXmlSyntaxError::msg(format!(
1000 "{attribute_url} is not a valid attribute"
1001 ))
1002 .into());
1003 } else {
1004 property_attrs.push((
1005 self.parse_iri(attribute_url)?,
1006 self.convert_attribute(&attribute)?,
1007 ));
1008 }
1009 }
1010 }
1011
1012 let id_attr = match id_attr {
1014 Some(iri) => {
1015 let iri = self.resolve_iri(base_iri.as_ref(), iri)?;
1016 if !self.lenient {
1017 if self.known_rdf_id.contains(iri.as_str()) {
1018 return Err(RdfXmlSyntaxError::msg(format!(
1019 "{iri} has already been used as rdf:ID value"
1020 ))
1021 .into());
1022 }
1023 self.known_rdf_id.insert(iri.as_str().into());
1024 }
1025 Some(iri)
1026 }
1027 None => None,
1028 };
1029 let about_attr = match about_attr {
1030 Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1031 None => None,
1032 };
1033 let resource_attr = match resource_attr {
1034 Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1035 None => None,
1036 };
1037 let datatype_attr = match datatype_attr {
1038 Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1039 None => None,
1040 };
1041 let type_attr = match type_attr {
1042 Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1043 None => None,
1044 };
1045
1046 let expected_production = match self.state.last() {
1047 Some(RdfXmlState::Doc { .. }) => RdfXmlNextProduction::Rdf,
1048 Some(
1049 RdfXmlState::Rdf { .. }
1050 | RdfXmlState::PropertyElt { .. }
1051 | RdfXmlState::ParseTypeCollectionPropertyElt { .. },
1052 ) => RdfXmlNextProduction::NodeElt,
1053 Some(RdfXmlState::NodeElt { subject, .. }) => RdfXmlNextProduction::PropertyElt {
1054 subject: subject.clone(),
1055 },
1056 Some(RdfXmlState::ParseTypeLiteralPropertyElt { .. }) => {
1057 return Err(
1058 RdfXmlSyntaxError::msg("ParseTypeLiteralPropertyElt production children should never be considered as a RDF/XML content").into()
1059 );
1060 }
1061 None => {
1062 return Err(RdfXmlSyntaxError::msg(
1063 "No state in the stack: the XML is not balanced",
1064 )
1065 .into());
1066 }
1067 };
1068
1069 let new_state = match expected_production {
1070 RdfXmlNextProduction::Rdf => {
1071 if *tag_name == *RDF_RDF {
1072 RdfXmlState::Rdf { base_iri, language }
1073 } else if RESERVED_RDF_ELEMENTS.contains(&&*tag_name) {
1074 return Err(RdfXmlSyntaxError::msg(format!(
1075 "Invalid node element tag name: {tag_name}"
1076 ))
1077 .into());
1078 } else {
1079 self.build_node_elt(
1080 self.parse_iri(tag_name)?,
1081 base_iri,
1082 language,
1083 NodeElementAttributes {
1084 id_attr,
1085 node_id_attr,
1086 about_attr,
1087 type_attr,
1088 property_attrs,
1089 },
1090 results,
1091 )?
1092 }
1093 }
1094 RdfXmlNextProduction::NodeElt => {
1095 if RESERVED_RDF_ELEMENTS.contains(&&*tag_name) {
1096 return Err(RdfXmlSyntaxError::msg(format!(
1097 "Invalid property element tag name: {tag_name}"
1098 ))
1099 .into());
1100 }
1101 self.build_node_elt(
1102 self.parse_iri(tag_name)?,
1103 base_iri,
1104 language,
1105 NodeElementAttributes {
1106 id_attr,
1107 node_id_attr,
1108 about_attr,
1109 type_attr,
1110 property_attrs,
1111 },
1112 results,
1113 )?
1114 }
1115 RdfXmlNextProduction::PropertyElt { subject } => {
1116 let iri = if *tag_name == *RDF_LI {
1117 let Some(RdfXmlState::NodeElt { li_counter, .. }) = self.state.last_mut()
1118 else {
1119 return Err(RdfXmlSyntaxError::msg(format!(
1120 "Invalid property element tag name: {tag_name}"
1121 ))
1122 .into());
1123 };
1124 *li_counter += 1;
1125 NamedNode::new_unchecked(format!(
1126 "http://www.w3.org/1999/02/22-rdf-syntax-ns#_{li_counter}"
1127 ))
1128 } else if RESERVED_RDF_ELEMENTS.contains(&&*tag_name)
1129 || *tag_name == *RDF_DESCRIPTION
1130 {
1131 return Err(RdfXmlSyntaxError::msg(format!(
1132 "Invalid property element tag name: {tag_name}"
1133 ))
1134 .into());
1135 } else {
1136 self.parse_iri(tag_name)?
1137 };
1138 match parse_type {
1139 RdfXmlParseType::Default => {
1140 if resource_attr.is_some()
1141 || node_id_attr.is_some()
1142 || !property_attrs.is_empty()
1143 {
1144 let object = match (resource_attr, node_id_attr)
1145 {
1146 (Some(resource_attr), None) => NamedOrBlankNode::from(resource_attr),
1147 (None, Some(node_id_attr)) => node_id_attr.into(),
1148 (None, None) => BlankNode::default().into(),
1149 (Some(_), Some(_)) => return Err(RdfXmlSyntaxError::msg("Not both rdf:resource and rdf:nodeID could be set at the same time").into())
1150 };
1151 self.emit_property_attrs(
1152 &object,
1153 property_attrs,
1154 language.as_deref(),
1155 results,
1156 );
1157 if let Some(type_attr) = type_attr {
1158 results.push(Triple::new(
1159 crate::model::term::Subject::from(object.clone()),
1160 NamedNode::new_unchecked(RDF_TYPE),
1161 type_attr,
1162 ));
1163 }
1164 RdfXmlState::PropertyElt {
1165 iri,
1166 base_iri,
1167 language,
1168 subject,
1169 object: Some(NodeOrText::Node(object)),
1170 id_attr,
1171 datatype_attr,
1172 }
1173 } else {
1174 RdfXmlState::PropertyElt {
1175 iri,
1176 base_iri,
1177 language,
1178 subject,
1179 object: None,
1180 id_attr,
1181 datatype_attr,
1182 }
1183 }
1184 }
1185 RdfXmlParseType::Literal => RdfXmlState::ParseTypeLiteralPropertyElt {
1186 iri,
1187 base_iri,
1188 language,
1189 subject,
1190 writer: Writer::new(Vec::default()),
1191 id_attr,
1192 emit: true,
1193 },
1194 RdfXmlParseType::Resource => Self::build_parse_type_resource_property_elt(
1195 iri, base_iri, language, subject, id_attr, results,
1196 ),
1197 RdfXmlParseType::Collection => RdfXmlState::ParseTypeCollectionPropertyElt {
1198 iri,
1199 base_iri,
1200 language,
1201 subject,
1202 objects: Vec::default(),
1203 id_attr,
1204 },
1205 RdfXmlParseType::Other => RdfXmlState::ParseTypeLiteralPropertyElt {
1206 iri,
1207 base_iri,
1208 language,
1209 subject,
1210 writer: Writer::new(Vec::default()),
1211 id_attr,
1212 emit: false,
1213 },
1214 }
1215 }
1216 };
1217 self.state.push(new_state);
1218 Ok(())
1219 }
1220
1221 fn parse_end_event(
1222 &mut self,
1223 event: &BytesEnd<'_>,
1224 results: &mut Vec<Triple>,
1225 ) -> Result<(), RdfXmlParseError> {
1226 if self.in_literal_depth > 0 {
1228 if let Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) =
1229 self.state.last_mut()
1230 {
1231 writer.write_event(Event::End(BytesEnd::new(
1232 self.reader.decoder().decode(event.name().as_ref())?,
1233 )))?;
1234 self.in_literal_depth -= 1;
1235 return Ok(());
1236 }
1237 }
1238
1239 if let Some(current_state) = self.state.pop() {
1240 self.end_state(current_state, results)?;
1241 }
1242 Ok(())
1243 }
1244
1245 fn parse_text_event(&mut self, event: &BytesText<'_>) -> Result<(), RdfXmlParseError> {
1246 let text =
1247 unescape_with(std::str::from_utf8(event)?, |e| self.resolve_entity(e))?.to_string();
1248 match self.state.last_mut() {
1249 Some(RdfXmlState::PropertyElt { object, .. }) => {
1250 if is_object_defined(object) {
1251 if text.bytes().all(is_whitespace) {
1252 Ok(()) } else {
1254 Err(
1255 RdfXmlSyntaxError::msg(format!("Unexpected text event: '{text}'"))
1256 .into(),
1257 )
1258 }
1259 } else {
1260 *object = Some(NodeOrText::Text(text));
1261 Ok(())
1262 }
1263 }
1264 Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) => {
1265 writer.write_event(Event::Text(BytesText::new(&text)))?;
1266 Ok(())
1267 }
1268 _ => {
1269 if text.bytes().all(is_whitespace) {
1270 Ok(())
1271 } else {
1272 Err(RdfXmlSyntaxError::msg(format!("Unexpected text event: '{text}'")).into())
1273 }
1274 }
1275 }
1276 }
1277
1278 fn resolve_tag_name(&self, qname: QName<'_>) -> Result<String, RdfXmlParseError> {
1279 let (namespace, local_name) = self.reader.resolver().resolve_element(qname);
1280 self.resolve_ns_name(namespace, local_name)
1281 }
1282
1283 fn resolve_attribute_name(&self, qname: QName<'_>) -> Result<String, RdfXmlParseError> {
1284 let (namespace, local_name) = self.reader.resolver().resolve_attribute(qname);
1285 self.resolve_ns_name(namespace, local_name)
1286 }
1287
1288 fn resolve_ns_name(
1289 &self,
1290 namespace: ResolveResult<'_>,
1291 local_name: LocalName<'_>,
1292 ) -> Result<String, RdfXmlParseError> {
1293 match namespace {
1294 ResolveResult::Bound(ns) => {
1295 let mut value = Vec::with_capacity(ns.as_ref().len() + local_name.as_ref().len());
1296 value.extend_from_slice(ns.as_ref());
1297 value.extend_from_slice(local_name.as_ref());
1298 Ok(unescape_with(&self.reader.decoder().decode(&value)?, |e| {
1299 self.resolve_entity(e)
1300 })
1301 .map_err(Error::from)?
1302 .to_string())
1303 }
1304 ResolveResult::Unbound => {
1305 Err(RdfXmlSyntaxError::msg("XML namespaces are required in RDF/XML").into())
1306 }
1307 ResolveResult::Unknown(v) => Err(RdfXmlSyntaxError::msg(format!(
1308 "Unknown prefix {}:",
1309 self.reader.decoder().decode(&v)?
1310 ))
1311 .into()),
1312 }
1313 }
1314
1315 fn build_node_elt(
1316 &self,
1317 iri: NamedNode,
1318 base_iri: Option<Iri<String>>,
1319 language: Option<String>,
1320 attrs: NodeElementAttributes,
1321 results: &mut Vec<Triple>,
1322 ) -> Result<RdfXmlState, RdfXmlSyntaxError> {
1323 let subject = match (attrs.id_attr, attrs.node_id_attr, attrs.about_attr) {
1324 (Some(id_attr), None, None) => NamedOrBlankNode::from(id_attr),
1325 (None, Some(node_id_attr), None) => node_id_attr.into(),
1326 (None, None, Some(about_attr)) => about_attr.into(),
1327 (None, None, None) => BlankNode::default().into(),
1328 (Some(_), Some(_), _) => {
1329 return Err(RdfXmlSyntaxError::msg(
1330 "Not both rdf:ID and rdf:nodeID could be set at the same time",
1331 ));
1332 }
1333 (_, Some(_), Some(_)) => {
1334 return Err(RdfXmlSyntaxError::msg(
1335 "Not both rdf:nodeID and rdf:resource could be set at the same time",
1336 ));
1337 }
1338 (Some(_), _, Some(_)) => {
1339 return Err(RdfXmlSyntaxError::msg(
1340 "Not both rdf:ID and rdf:resource could be set at the same time",
1341 ));
1342 }
1343 };
1344
1345 self.emit_property_attrs(&subject, attrs.property_attrs, language.as_deref(), results);
1346
1347 if let Some(type_attr) = attrs.type_attr {
1348 results.push(Triple::new(
1349 crate::model::term::Subject::from(subject.clone()),
1350 NamedNode::new_unchecked(RDF_TYPE),
1351 type_attr,
1352 ));
1353 }
1354
1355 if iri.as_str() != RDF_DESCRIPTION {
1356 results.push(Triple::new(
1357 crate::model::term::Subject::from(subject.clone()),
1358 NamedNode::new_unchecked(RDF_TYPE),
1359 iri,
1360 ));
1361 }
1362 Ok(RdfXmlState::NodeElt {
1363 base_iri,
1364 language,
1365 subject,
1366 li_counter: 0,
1367 })
1368 }
1369
1370 fn build_parse_type_resource_property_elt(
1371 iri: NamedNode,
1372 base_iri: Option<Iri<String>>,
1373 language: Option<String>,
1374 subject: NamedOrBlankNode,
1375 id_attr: Option<NamedNode>,
1376 results: &mut Vec<Triple>,
1377 ) -> RdfXmlState {
1378 let object = BlankNode::default();
1379 let triple = Triple::new(
1380 crate::model::term::Subject::from(subject),
1381 iri,
1382 object.clone(),
1383 );
1384 if let Some(id_attr) = id_attr {
1385 Self::reify(triple.clone(), id_attr, results);
1386 }
1387 results.push(triple);
1388 RdfXmlState::NodeElt {
1389 base_iri,
1390 language,
1391 subject: object.into(),
1392 li_counter: 0,
1393 }
1394 }
1395
1396 fn end_state(
1397 &mut self,
1398 state: RdfXmlState,
1399 results: &mut Vec<Triple>,
1400 ) -> Result<(), RdfXmlSyntaxError> {
1401 match state {
1402 RdfXmlState::PropertyElt {
1403 iri,
1404 language,
1405 subject,
1406 id_attr,
1407 datatype_attr,
1408 object,
1409 ..
1410 } => {
1411 let object = match object {
1412 Some(NodeOrText::Node(node)) => match node {
1413 NamedOrBlankNode::NamedNode(n) => Object::NamedNode(n),
1414 NamedOrBlankNode::BlankNode(b) => Object::BlankNode(b),
1415 },
1416 Some(NodeOrText::Text(text)) => {
1417 Object::Literal(self.new_literal(text, language, datatype_attr))
1418 }
1419 None => {
1420 Object::Literal(self.new_literal(String::new(), language, datatype_attr))
1421 }
1422 };
1423 let triple = Triple::new(crate::model::term::Subject::from(subject), iri, object);
1424 if let Some(id_attr) = id_attr {
1425 Self::reify(triple.clone(), id_attr, results);
1426 }
1427 results.push(triple);
1428 }
1429 RdfXmlState::ParseTypeCollectionPropertyElt {
1430 iri,
1431 subject,
1432 id_attr,
1433 objects,
1434 ..
1435 } => {
1436 let mut current_node = NamedOrBlankNode::from(NamedNode::new_unchecked(RDF_NIL));
1437 for object in objects.into_iter().rev() {
1438 let subject = NamedOrBlankNode::from(BlankNode::default());
1439 results.push(Triple::new(
1440 crate::model::term::Subject::from(subject.clone()),
1441 NamedNode::new_unchecked(RDF_FIRST),
1442 object,
1443 ));
1444 results.push(Triple::new(
1445 crate::model::term::Subject::from(subject.clone()),
1446 NamedNode::new_unchecked(RDF_REST),
1447 crate::model::term::Object::from(current_node.clone()),
1448 ));
1449 current_node = subject;
1450 }
1451 let triple = Triple::new(
1452 crate::model::term::Subject::from(subject),
1453 iri,
1454 crate::model::term::Object::from(current_node),
1455 );
1456 if let Some(id_attr) = id_attr {
1457 Self::reify(triple.clone(), id_attr, results);
1458 }
1459 results.push(triple);
1460 }
1461 RdfXmlState::ParseTypeLiteralPropertyElt {
1462 iri,
1463 subject,
1464 id_attr,
1465 writer,
1466 emit,
1467 ..
1468 } => {
1469 if emit {
1470 let object = writer.into_inner();
1471 if object.is_empty() {
1472 return Err(RdfXmlSyntaxError::msg(format!(
1473 "No value found for rdf:XMLLiteral value of property {iri}"
1474 )));
1475 }
1476 let triple = Triple::new(
1477 crate::model::term::Subject::from(subject),
1478 iri,
1479 Literal::new_typed_literal(
1480 str::from_utf8(&object).map_err(|_| {
1481 RdfXmlSyntaxError::msg(
1482 "The XML literal is not in valid UTF-8".to_owned(),
1483 )
1484 })?,
1485 NamedNode::new_unchecked(RDF_XML_LITERAL),
1486 ),
1487 );
1488 if let Some(id_attr) = id_attr {
1489 Self::reify(triple.clone(), id_attr, results);
1490 }
1491 results.push(triple);
1492 }
1493 }
1494 RdfXmlState::NodeElt { subject, .. } => match self.state.last_mut() {
1495 Some(RdfXmlState::PropertyElt { object, .. }) => {
1496 if is_object_defined(object) {
1497 return Err(RdfXmlSyntaxError::msg(
1498 "Unexpected node, a text value is already present",
1499 ));
1500 }
1501 *object = Some(NodeOrText::Node(subject))
1502 }
1503 Some(RdfXmlState::ParseTypeCollectionPropertyElt { objects, .. }) => {
1504 objects.push(subject)
1505 }
1506 _ => (),
1507 },
1508 _ => (),
1509 }
1510 Ok(())
1511 }
1512
1513 fn new_literal(
1514 &self,
1515 value: String,
1516 language: Option<String>,
1517 datatype: Option<NamedNode>,
1518 ) -> Literal {
1519 if let Some(datatype) = datatype {
1520 Literal::new_typed_literal(value, datatype)
1521 } else if let Some(language) =
1522 language.or_else(|| self.current_language().map(ToOwned::to_owned))
1523 {
1524 Literal::new_language_tagged_literal_unchecked(value, language)
1525 } else {
1526 Literal::new_simple_literal(value)
1527 }
1528 }
1529
1530 fn reify(triple: Triple, statement_id: NamedNode, results: &mut Vec<Triple>) {
1531 results.push(Triple::new(
1532 statement_id.clone(),
1533 NamedNode::new_unchecked(RDF_TYPE),
1534 NamedNode::new_unchecked(RDF_STATEMENT),
1535 ));
1536 results.push(Triple::new(
1537 statement_id.clone(),
1538 NamedNode::new_unchecked(RDF_SUBJECT),
1539 match triple.subject() {
1540 Subject::NamedNode(n) => Object::NamedNode(n.clone()),
1541 Subject::BlankNode(b) => Object::BlankNode(b.clone()),
1542 Subject::Variable(v) => Object::Variable(v.clone()),
1543 Subject::QuotedTriple(qt) => Object::QuotedTriple(qt.clone()),
1544 },
1545 ));
1546 results.push(Triple::new(
1547 statement_id.clone(),
1548 NamedNode::new_unchecked(RDF_PREDICATE),
1549 match triple.predicate() {
1550 Predicate::NamedNode(n) => Object::NamedNode(n.clone()),
1551 Predicate::Variable(v) => Object::Variable(v.clone()),
1552 },
1553 ));
1554 results.push(Triple::new(
1555 statement_id,
1556 NamedNode::new_unchecked(RDF_OBJECT),
1557 triple.object().clone(),
1558 ));
1559 }
1560
1561 fn emit_property_attrs(
1562 &self,
1563 subject: &NamedOrBlankNode,
1564 literal_attributes: Vec<(NamedNode, String)>,
1565 language: Option<&str>,
1566 results: &mut Vec<Triple>,
1567 ) {
1568 for (literal_predicate, literal_value) in literal_attributes {
1569 results.push(Triple::new(
1570 crate::model::term::Subject::from(subject.clone()),
1571 literal_predicate,
1572 if let Some(language) = language.or_else(|| self.current_language()) {
1573 Literal::new_lang(&literal_value, language)
1574 .unwrap_or_else(|_| Literal::new(literal_value))
1575 } else {
1576 Literal::new(literal_value)
1577 },
1578 ));
1579 }
1580 }
1581
1582 fn convert_attribute(&self, attribute: &Attribute<'_>) -> Result<String, RdfXmlParseError> {
1583 Ok(attribute
1584 .decode_and_unescape_value_with(self.reader.decoder(), |e| self.resolve_entity(e))?
1585 .into_owned())
1586 }
1587
1588 fn convert_iri_attribute(
1589 &self,
1590 base_iri: Option<&Iri<String>>,
1591 attribute: &Attribute<'_>,
1592 ) -> Result<NamedNode, RdfXmlParseError> {
1593 let converted = self.convert_attribute(attribute)?;
1594 self.resolve_iri(base_iri, converted)
1595 .map_err(RdfXmlParseError::Syntax)
1596 }
1597
1598 fn resolve_iri(
1599 &self,
1600 base_iri: Option<&Iri<String>>,
1601 relative_iri: String,
1602 ) -> Result<NamedNode, RdfXmlSyntaxError> {
1603 if let Some(base_iri) = base_iri.or_else(|| self.current_base_iri()) {
1604 Ok(NamedNode::new_unchecked(if self.lenient {
1605 base_iri.resolve_unchecked(&relative_iri).into_inner()
1606 } else {
1607 base_iri
1608 .resolve(&relative_iri)
1609 .map_err(|error| RdfXmlSyntaxError::invalid_iri(relative_iri, error))?
1610 .into_inner()
1611 }))
1612 } else {
1613 self.parse_iri(relative_iri)
1614 }
1615 }
1616
1617 fn parse_iri(&self, relative_iri: String) -> Result<NamedNode, RdfXmlSyntaxError> {
1618 Ok(NamedNode::new_unchecked(if self.lenient {
1619 relative_iri
1620 } else {
1621 Iri::parse(relative_iri.clone())
1622 .map_err(|error| RdfXmlSyntaxError::invalid_iri(relative_iri, error))?
1623 .into_inner()
1624 }))
1625 }
1626
1627 fn current_language(&self) -> Option<&str> {
1628 for state in self.state.iter().rev() {
1629 match state {
1630 RdfXmlState::Doc { .. } => (),
1631 RdfXmlState::Rdf { language, .. }
1632 | RdfXmlState::NodeElt { language, .. }
1633 | RdfXmlState::PropertyElt { language, .. }
1634 | RdfXmlState::ParseTypeCollectionPropertyElt { language, .. }
1635 | RdfXmlState::ParseTypeLiteralPropertyElt { language, .. } => {
1636 if let Some(language) = language {
1637 return Some(language);
1638 }
1639 }
1640 }
1641 }
1642 None
1643 }
1644
1645 fn current_base_iri(&self) -> Option<&Iri<String>> {
1646 for state in self.state.iter().rev() {
1647 match state {
1648 RdfXmlState::Doc { base_iri }
1649 | RdfXmlState::Rdf { base_iri, .. }
1650 | RdfXmlState::NodeElt { base_iri, .. }
1651 | RdfXmlState::PropertyElt { base_iri, .. }
1652 | RdfXmlState::ParseTypeCollectionPropertyElt { base_iri, .. }
1653 | RdfXmlState::ParseTypeLiteralPropertyElt { base_iri, .. } => {
1654 if let Some(base_iri) = base_iri {
1655 return Some(base_iri);
1656 }
1657 }
1658 }
1659 }
1660 None
1661 }
1662
1663 fn resolve_entity(&self, e: &str) -> Option<&str> {
1664 resolve_xml_entity(e).or_else(|| self.custom_entities.get(e).map(String::as_str))
1665 }
1666}
1667
1668fn is_object_defined(object: &Option<NodeOrText>) -> bool {
1669 match object {
1670 Some(NodeOrText::Node(_)) => true,
1671 Some(NodeOrText::Text(t)) => !t.bytes().all(is_whitespace),
1672 None => false,
1673 }
1674}
1675
/// Returns `true` when `c` is a space, tab, line feed or carriage return.
///
/// Other bytes that `u8::is_ascii_whitespace` would accept, such as form feed,
/// are deliberately not matched.
fn is_whitespace(c: u8) -> bool {
    b" \t\n\r".contains(&c)
}
1679
/// Returns `true` when `encoding` is one of the names accepted here for UTF-8.
///
/// The comparison ignores ASCII case and is otherwise exact: surrounding
/// whitespace or any other deviation makes the name unrecognized.
fn is_utf8(encoding: &[u8]) -> bool {
    // All entries are lowercase ASCII, so a case-insensitive comparison is
    // equivalent to lowercasing the input first, without the allocation.
    const UTF8_NAMES: [&[u8]; 6] = [
        b"unicode-1-1-utf-8",
        b"unicode11utf8",
        b"unicode20utf8",
        b"utf-8",
        b"utf8",
        b"x-unicode20utf8",
    ];
    UTF8_NAMES
        .iter()
        .any(|&name| encoding.eq_ignore_ascii_case(name))
}