oxrdf/
parser.rs

1#[cfg(feature = "rdf-12")]
2use crate::BaseDirection;
3use crate::vocab::xsd;
4use crate::{
5    BlankNode, BlankNodeIdParseError, GraphName, IriParseError, LanguageTagParseError, Literal,
6    NamedNode, Quad, Term, Triple, Variable, VariableNameParseError,
7};
8use std::borrow::Cow;
9use std::char;
10use std::str::{Chars, FromStr};
11
/// Upper bound on the recursion depth reached while parsing nested triple terms, so that deeply nested input produces a parse error instead of a stack overflow.
/// The value is a rule-of-thumb compromise between still accepting valid files and staying clear of stack overflows.
14const MAX_NUMBER_OF_NESTED_TRIPLES: usize = 128;
15
16impl FromStr for NamedNode {
17    type Err = TermParseError;
18
19    /// Parses a named node from its NTriples and Turtle serialization
20    ///
21    /// ```
22    /// use oxrdf::NamedNode;
23    /// use std::str::FromStr;
24    ///
25    /// assert_eq!(
26    ///     NamedNode::from_str("<http://example.com>")?,
27    ///     NamedNode::new("http://example.com")?
28    /// );
29    /// # Result::<_, Box<dyn std::error::Error>>::Ok(())
30    /// ```
31    fn from_str(s: &str) -> Result<Self, Self::Err> {
32        let (term, left) = read_named_node(s)?;
33        if !left.is_empty() {
34            return Err(Self::Err::msg(
35                "Named node serialization should end with a >",
36            ));
37        }
38        Ok(term)
39    }
40}
41
42impl FromStr for BlankNode {
43    type Err = TermParseError;
44
45    /// Parses a blank node from its NTriples serialization
46    ///
47    /// ```
48    /// use oxrdf::BlankNode;
49    /// use std::str::FromStr;
50    ///
51    /// assert_eq!(BlankNode::from_str("_:ex")?, BlankNode::new("ex")?);
52    /// # Result::<_, Box<dyn std::error::Error>>::Ok(())
53    /// ```
54    fn from_str(s: &str) -> Result<Self, Self::Err> {
55        let (term, left) = read_blank_node(s)?;
56        if !left.is_empty() {
57            return Err(Self::Err::msg(
58                "Blank node serialization should not contain whitespaces",
59            ));
60        }
61        Ok(term)
62    }
63}
64
65impl FromStr for Literal {
66    type Err = TermParseError;
67
68    /// Parses a literal from its NTriples serialization
69    ///
70    /// ```
71    /// use oxrdf::vocab::xsd;
72    /// use oxrdf::{Literal, NamedNode};
73    /// use std::str::FromStr;
74    ///
75    /// assert_eq!(
76    ///     Literal::from_str("\"ex\\n\"")?,
77    ///     Literal::new_simple_literal("ex\n")
78    /// );
79    /// assert_eq!(
80    ///     Literal::from_str("\"ex\"@en")?,
81    ///     Literal::new_language_tagged_literal("ex", "en")?
82    /// );
83    /// assert_eq!(
84    ///     Literal::from_str("\"2020\"^^<http://www.w3.org/2001/XMLSchema#gYear>")?,
85    ///     Literal::new_typed_literal(
86    ///         "2020",
87    ///         NamedNode::new("http://www.w3.org/2001/XMLSchema#gYear")?
88    ///     )
89    /// );
90    /// assert_eq!(
91    ///     Literal::from_str("true")?,
92    ///     Literal::new_typed_literal("true", xsd::BOOLEAN)
93    /// );
94    /// assert_eq!(
95    ///     Literal::from_str("+122")?,
96    ///     Literal::new_typed_literal("+122", xsd::INTEGER)
97    /// );
98    /// assert_eq!(
99    ///     Literal::from_str("-122.23")?,
100    ///     Literal::new_typed_literal("-122.23", xsd::DECIMAL)
101    /// );
102    /// assert_eq!(
103    ///     Literal::from_str("-122e+1")?,
104    ///     Literal::new_typed_literal("-122e+1", xsd::DOUBLE)
105    /// );
106    /// # Result::<_, Box<dyn std::error::Error>>::Ok(())
107    /// ```
108    fn from_str(s: &str) -> Result<Self, Self::Err> {
109        let (term, left) = read_literal(s)?;
110        if !left.is_empty() {
111            return Err(Self::Err::msg("Invalid literal serialization"));
112        }
113        Ok(term)
114    }
115}
116
117impl FromStr for Term {
118    type Err = TermParseError;
119
120    /// Parses a term from its NTriples serialization
121    ///
122    /// ```
123    /// use oxrdf::*;
124    /// use std::str::FromStr;
125    ///
126    /// assert_eq!(
127    ///     Term::from_str("\"ex\"")?,
128    ///     Literal::new_simple_literal("ex").into()
129    /// );
130    /// # Result::<_, Box<dyn std::error::Error>>::Ok(())
131    /// ```
132    fn from_str(s: &str) -> Result<Self, Self::Err> {
133        let (term, left) = read_term(s, 0)?;
134        if !left.is_empty() {
135            return Err(Self::Err::msg("Invalid term serialization"));
136        }
137        Ok(term)
138    }
139}
140
141impl FromStr for Triple {
142    type Err = TermParseError;
143
144    /// Parses a triple from its NTriples serialization
145    ///
146    /// ```
147    /// use oxrdf::{BlankNode, Literal, NamedNode, Triple};
148    /// use std::str::FromStr;
149    ///
150    /// assert_eq!(
151    ///     Triple::from_str("_:a <http://example.com/p> \"o\" .")?,
152    ///     Triple::new(
153    ///         BlankNode::new("a")?,
154    ///         NamedNode::new("http://example.com/p")?,
155    ///         Literal::new_simple_literal("o")
156    ///     )
157    /// );
158    /// # Result::<_, Box<dyn std::error::Error>>::Ok(())
159    /// ```
160    fn from_str(s: &str) -> Result<Self, Self::Err> {
161        let (triple, left) = read_triple(s, 0)?;
162        if !matches!(left.trim(), "" | ".") {
163            return Err(Self::Err::msg("Invalid triple serialization"));
164        }
165        Ok(triple)
166    }
167}
168
169impl FromStr for Quad {
170    type Err = TermParseError;
171
172    /// Parses a triple from its NQuads serialization
173    ///
174    /// ```
175    /// use oxrdf::{BlankNode, GraphName, Literal, NamedNode, Quad};
176    /// use std::str::FromStr;
177    ///
178    /// assert_eq!(
179    ///     Quad::from_str("_:a <http://example.com/p> \"o\" .")?,
180    ///     Quad::new(
181    ///         BlankNode::new("a")?,
182    ///         NamedNode::new("http://example.com/p")?,
183    ///         Literal::new_simple_literal("o"),
184    ///         GraphName::DefaultGraph
185    ///     )
186    /// );
187    /// assert_eq!(
188    ///     Quad::from_str("_:a <http://example.com/p> \"o\" <http://example.com/g> .")?,
189    ///     Quad::new(
190    ///         BlankNode::new("a")?,
191    ///         NamedNode::new("http://example.com/p")?,
192    ///         Literal::new_simple_literal("o"),
193    ///         NamedNode::new("http://example.com/g")?
194    ///     )
195    /// );
196    /// # Result::<_, Box<dyn std::error::Error>>::Ok(())
197    /// ```
198    fn from_str(s: &str) -> Result<Self, Self::Err> {
199        let (triple, left) = read_triple(s, 0)?;
200        if matches!(left.trim(), "" | ".") {
201            return Ok(triple.in_graph(GraphName::DefaultGraph));
202        }
203        let (graph_name, left) = read_term(left, 0)?;
204        if !matches!(left.trim(), "" | ".") {
205            return Err(Self::Err::msg("Invalid triple serialization"));
206        }
207        Ok(triple.in_graph(match graph_name {
208            Term::NamedNode(graph_name) => GraphName::from(graph_name),
209            Term::BlankNode(graph_name) => GraphName::from(graph_name),
210            Term::Literal(_) => {
211                return Err(TermParseError::msg(
212                    "Literals are not allowed in graph name position",
213                ));
214            }
215            #[cfg(feature = "rdf-12")]
216            Term::Triple(_) => {
217                return Err(TermParseError::msg(
218                    "Triple terms are not allowed in graph name position",
219                ));
220            }
221        }))
222    }
223}
224
225impl FromStr for Variable {
226    type Err = TermParseError;
227
228    /// Parses a variable from its SPARQL serialization
229    ///
230    /// ```
231    /// use oxrdf::Variable;
232    /// use std::str::FromStr;
233    ///
234    /// assert_eq!(Variable::from_str("$foo")?, Variable::new("foo")?);
235    /// # Result::<_, Box<dyn std::error::Error>>::Ok(())
236    /// ```
237    fn from_str(s: &str) -> Result<Self, Self::Err> {
238        if !s.starts_with('?') && !s.starts_with('$') {
239            return Err(Self::Err::msg(
240                "Variable serialization should start with ? or $",
241            ));
242        }
243        Self::new(&s[1..]).map_err(|error| {
244            TermParseError(TermParseErrorKind::Variable {
245                value: s.to_owned(),
246                error,
247            })
248        })
249    }
250}
251
252fn read_named_node(s: &str) -> Result<(NamedNode, &str), TermParseError> {
253    let s = s.trim();
254    if let Some(remain) = s.strip_prefix('<') {
255        let end = remain
256            .find('>')
257            .ok_or_else(|| TermParseError::msg("Named node serialization should end with a >"))?;
258        let (value, remain) = remain.split_at(end);
259        let remain = &remain[1..];
260        let value = if value.contains('\\') {
261            let mut escaped = String::with_capacity(value.len());
262            let mut chars = value.chars();
263            while let Some(c) = chars.next() {
264                if c == '\\' {
265                    match chars.next() {
266                        Some('u') => escaped.push(read_hexa_char(&mut chars, 4)?),
267                        Some('U') => escaped.push(read_hexa_char(&mut chars, 8)?),
268                        Some(c) => {
269                            escaped.push('\\');
270                            escaped.push(c);
271                        }
272                        None => escaped.push('\\'),
273                    }
274                } else {
275                    escaped.push(c);
276                }
277            }
278            Cow::Owned(escaped)
279        } else {
280            Cow::Borrowed(value)
281        };
282        let term = NamedNode::new(value.as_ref()).map_err(|error| {
283            TermParseError(TermParseErrorKind::Iri {
284                value: value.into_owned(),
285                error,
286            })
287        })?;
288        Ok((term, remain))
289    } else {
290        Err(TermParseError::msg(
291            "Named node serialization should start with a <",
292        ))
293    }
294}
295
296fn read_blank_node(s: &str) -> Result<(BlankNode, &str), TermParseError> {
297    let s = s.trim();
298    if let Some(remain) = s.strip_prefix("_:") {
299        let mut end = remain
300            .find(|v: char| {
301                v.is_whitespace()
302                    || matches!(
303                        v,
304                        '<' | '?'
305                            | '$'
306                            | '"'
307                            | '\''
308                            | '>'
309                            | '@'
310                            | '^'
311                            | ':'
312                            | '('
313                            | ')'
314                            | '{'
315                            | '}'
316                            | '['
317                            | ']'
318                    )
319            })
320            .unwrap_or(remain.len());
321        if let Some(pos) = remain[..end].find("..") {
322            end = pos;
323        }
324        if remain[..end].ends_with('.') {
325            // It can't end with '.'
326            end -= 1;
327        }
328        let (value, remain) = remain.split_at(end);
329        let term = BlankNode::new(value).map_err(|error| {
330            TermParseError(TermParseErrorKind::BlankNode {
331                value: value.to_owned(),
332                error,
333            })
334        })?;
335        Ok((term, remain))
336    } else {
337        Err(TermParseError::msg(
338            "Blank node serialization should start with '_:'",
339        ))
340    }
341}
342
343fn read_literal(s: &str) -> Result<(Literal, &str), TermParseError> {
344    let s = s.trim();
345    if let Some(s) = s.strip_prefix('"') {
346        let mut value = String::with_capacity(s.len());
347        let mut chars = s.chars();
348        while let Some(c) = chars.next() {
349            match c {
350                '"' => {
351                    let remain = chars.as_str().trim();
352                    return if let Some(remain) = remain.strip_prefix('@') {
353                        let end = remain
354                            .find(|v: char| !v.is_ascii_alphanumeric() && v != '-')
355                            .unwrap_or(remain.len());
356                        let (language, remain) = remain.split_at(end);
357                        #[cfg(feature = "rdf-12")]
358                        if let Some((language, direction)) = language.split_once("--") {
359                            return Ok((
360                                Literal::new_directional_language_tagged_literal(value, language, match direction {
361                                    "ltr" => BaseDirection::Ltr,
362                                    "rtl" => BaseDirection::Rtl,
363                                    _ => return Err(TermParseError(TermParseErrorKind::Msg(format!("The only two possible base directions are 'rtl' and 'ltr', found '{direction}'"))))
364                                }).map_err(
365                                    |error| {
366                                        TermParseError(TermParseErrorKind::LanguageTag {
367                                            value: language.to_owned(),
368                                            error,
369                                        })
370                                    },
371                                )?,
372                                remain,
373                            ));
374                        }
375                        Ok((
376                            Literal::new_language_tagged_literal(value, language).map_err(
377                                |error| {
378                                    TermParseError(TermParseErrorKind::LanguageTag {
379                                        value: language.to_owned(),
380                                        error,
381                                    })
382                                },
383                            )?,
384                            remain,
385                        ))
386                    } else if let Some(remain) = remain.strip_prefix("^^") {
387                        let (datatype, remain) = read_named_node(remain)?;
388                        Ok((Literal::new_typed_literal(value, datatype), remain))
389                    } else {
390                        Ok((Literal::new_simple_literal(value), remain))
391                    };
392                }
393                '\\' => {
394                    if let Some(c) = chars.next() {
395                        value.push(match c {
396                            't' => '\t',
397                            'b' => '\u{08}',
398                            'n' => '\n',
399                            'r' => '\r',
400                            'f' => '\u{0C}',
401                            '"' => '"',
402                            '\'' => '\'',
403                            '\\' => '\\',
404                            'u' => read_hexa_char(&mut chars, 4)?,
405                            'U' => read_hexa_char(&mut chars, 8)?,
406                            _ => return Err(TermParseError::msg("Unexpected escaped char")),
407                        })
408                    } else {
409                        return Err(TermParseError::msg("Unexpected literal end"));
410                    }
411                }
412                _ => value.push(c),
413            }
414        }
415        Err(TermParseError::msg("Unexpected literal end"))
416    } else if let Some(remain) = s.strip_prefix("true") {
417        Ok((Literal::new_typed_literal("true", xsd::BOOLEAN), remain))
418    } else if let Some(remain) = s.strip_prefix("false") {
419        Ok((Literal::new_typed_literal("false", xsd::BOOLEAN), remain))
420    } else {
421        let input = s.as_bytes();
422        if input.is_empty() {
423            return Err(TermParseError::msg("Empty term serialization"));
424        }
425
426        let mut cursor = match input.first() {
427            Some(b'+' | b'-') => 1,
428            _ => 0,
429        };
430        let mut with_dot = false;
431
432        let mut count_before: usize = 0;
433        while cursor < input.len() && b'0' <= input[cursor] && input[cursor] <= b'9' {
434            count_before += 1;
435            cursor += 1;
436        }
437
438        let mut count_after: usize = 0;
439        if cursor < input.len() && input[cursor] == b'.' {
440            with_dot = true;
441            cursor += 1;
442            while cursor < input.len() && b'0' <= input[cursor] && input[cursor] <= b'9' {
443                count_after += 1;
444                cursor += 1;
445            }
446        }
447
448        if cursor < input.len() && (input[cursor] == b'e' || input[cursor] == b'E') {
449            cursor += 1;
450            cursor += match input.get(cursor) {
451                Some(b'+' | b'-') => 1,
452                _ => 0,
453            };
454            let mut count_exponent = 0;
455            while cursor < input.len() && b'0' <= input[cursor] && input[cursor] <= b'9' {
456                count_exponent += 1;
457                cursor += 1;
458            }
459            if count_exponent > 0 {
460                Ok((Literal::new_typed_literal(s, xsd::DOUBLE), &s[cursor..]))
461            } else {
462                Err(TermParseError::msg(
463                    "Double serialization with an invalid exponent",
464                ))
465            }
466        } else if with_dot {
467            if count_after > 0 {
468                Ok((Literal::new_typed_literal(s, xsd::DECIMAL), &s[cursor..]))
469            } else {
470                Err(TermParseError::msg(
471                    "Decimal serialization without floating part",
472                ))
473            }
474        } else if count_before > 0 {
475            Ok((Literal::new_typed_literal(s, xsd::INTEGER), &s[cursor..]))
476        } else {
477            Err(TermParseError::msg("Empty integer serialization"))
478        }
479    }
480}
481
482fn read_term(s: &str, number_of_recursive_calls: usize) -> Result<(Term, &str), TermParseError> {
483    if number_of_recursive_calls == MAX_NUMBER_OF_NESTED_TRIPLES {
484        return Err(TermParseError::msg(
485            "Too many nested triples. The parser fails here to avoid a stack overflow.",
486        ));
487    }
488    let s = s.trim();
489    #[allow(unused_variables, clippy::allow_attributes)]
490    if let Some(remain) = s.strip_prefix("<<(") {
491        #[cfg(feature = "rdf-12")]
492        {
493            let (triple, remain) = read_triple(remain, number_of_recursive_calls + 1)?;
494            let remain = remain.trim_start();
495            if let Some(remain) = remain.strip_prefix(")>>") {
496                Ok((triple.into(), remain))
497            } else {
498                Err(TermParseError::msg(
499                    "Triple term serialization must be enclosed between <<( and )>>",
500                ))
501            }
502        }
503        #[cfg(not(feature = "rdf-12"))]
504        {
505            Err(TermParseError::msg("RDF 1.2 is not supported"))
506        }
507    } else if s.starts_with('<') {
508        let (term, remain) = read_named_node(s)?;
509        Ok((term.into(), remain))
510    } else if s.starts_with('_') {
511        let (term, remain) = read_blank_node(s)?;
512        Ok((term.into(), remain))
513    } else {
514        let (term, remain) = read_literal(s)?;
515        Ok((term.into(), remain))
516    }
517}
518
519fn read_triple(
520    s: &str,
521    number_of_recursive_calls: usize,
522) -> Result<(Triple, &str), TermParseError> {
523    let s = s.trim();
524    let (subject, remain) = read_term(s, number_of_recursive_calls + 1)?;
525    let (predicate, remain) = read_named_node(remain)?;
526    let (object, remain) = read_term(remain, number_of_recursive_calls + 1)?;
527    Ok((
528        Triple {
529            subject: match subject {
530                Term::NamedNode(s) => s.into(),
531                Term::BlankNode(s) => s.into(),
532                Term::Literal(_) => {
533                    return Err(TermParseError::msg(
534                        "Literals are not allowed in subject position",
535                    ));
536                }
537                #[cfg(feature = "rdf-12")]
538                Term::Triple(_) => {
539                    return Err(TermParseError::msg(
540                        "Triple terms are not allowed in subject position",
541                    ));
542                }
543            },
544            predicate,
545            object,
546        },
547        remain,
548    ))
549}
550
551fn read_hexa_char(input: &mut Chars<'_>, len: usize) -> Result<char, TermParseError> {
552    let mut value = 0;
553    for _ in 0..len {
554        if let Some(c) = input.next() {
555            value = value * 16
556                + match c {
557                    '0'..='9' => u32::from(c) - u32::from('0'),
558                    'a'..='f' => u32::from(c) - u32::from('a') + 10,
559                    'A'..='F' => u32::from(c) - u32::from('A') + 10,
560                    _ => {
561                        return Err(TermParseError::msg(format!(
562                            "Unexpected character in a unicode escape: {c}"
563                        )));
564                    }
565                }
566        } else {
567            return Err(TermParseError::msg("Unexpected literal string end"));
568        }
569    }
570    char::from_u32(value).ok_or_else(|| TermParseError::msg("Invalid encoded unicode code point"))
571}
572
/// An error raised during term serialization parsing using the [`FromStr`] trait.
///
/// This is a public wrapper around the private `TermParseErrorKind`, so the individual
/// failure cases are not part of the public API. `#[error(transparent)]` forwards the
/// displayed message to the wrapped kind.
#[derive(Debug, thiserror::Error)]
#[error(transparent)]
pub struct TermParseError(#[from] TermParseErrorKind);
577
/// The private failure cases wrapped by [`TermParseError`].
///
/// The structured variants keep the offending text in `value` next to the underlying
/// validation `error`; both appear in the displayed message.
#[derive(Debug, thiserror::Error)]
enum TermParseErrorKind {
    /// The text between `<` and `>` was rejected as an IRI.
    #[error("Error while parsing the named node '{value}': {error}")]
    Iri { error: IriParseError, value: String },
    /// The label following `_:` was rejected as a blank node identifier.
    #[error("Error while parsing the blank node '{value}': {error}")]
    BlankNode {
        error: BlankNodeIdParseError,
        value: String,
    },
    /// The text following `@` was rejected as a language tag.
    #[error("Error while parsing the language tag '{value}': {error}")]
    LanguageTag {
        error: LanguageTagParseError,
        value: String,
    },
    /// The variable name was rejected; `value` is the full serialization, `?`/`$` included.
    #[error("Error while parsing the variable '{value}': {error}")]
    Variable {
        error: VariableNameParseError,
        value: String,
    },
    /// Any other syntax error, described by a free-form message.
    #[error("{0}")]
    Msg(String),
}
601
602impl TermParseError {
603    pub(crate) fn msg(msg: impl Into<String>) -> Self {
604        Self(TermParseErrorKind::Msg(msg.into()))
605    }
606}
607
#[cfg(test)]
#[cfg(feature = "rdf-12")]
mod tests {
    use super::*;

    /// Checks unicode escapes in literals and IRIs, and the parsing of a triple term.
    #[test]
    fn triple_term_parsing() {
        // `\u` and `\U` escapes inside a string literal.
        let parsed: Term = "\"ex\\u00E9\\U000000E9\"".parse().unwrap();
        let expected: Term = Literal::new_simple_literal("ex\u{e9}\u{e9}").into();
        assert_eq!(parsed, expected);

        // The same escapes inside an IRI.
        let parsed: Term = "<http://example.com/\\u00E9\\U000000E9>".parse().unwrap();
        let expected: Term = NamedNode::new_unchecked("http://example.com/\u{e9}\u{e9}").into();
        assert_eq!(parsed, expected);

        // A triple term enclosed between `<<(` and `)>>`.
        let parsed: Term = "<<( _:s <http://example.com/p> \"o\" )>>".parse().unwrap();
        let expected: Term = Triple::new(
            BlankNode::new("s").unwrap(),
            NamedNode::new("http://example.com/p").unwrap(),
            Literal::new_simple_literal("o"),
        )
        .into();
        assert_eq!(parsed, expected);
    }
}
633}