Skip to main content

oxirs_core/format/
turtle.rs

1//! Turtle Format Parser and Serializer
2//!
3//! Extracted and adapted from OxiGraph oxttl with OxiRS enhancements.
4//! Based on W3C Turtle specification: <https://www.w3.org/TR/turtle/>
5
6use super::error::SerializeResult;
7use super::error::{ParseResult, RdfParseError};
8use super::serializer::QuadSerializer;
9use crate::model::{QuadRef, Triple, TripleRef};
10use std::collections::HashMap;
11use std::io::{Read, Write};
12
/// Configuration holder for parsing Turtle documents.
///
/// Values are set through the builder-style methods (`lenient`,
/// `with_base_iri`, `with_prefix`) and read back through the accessors.
#[derive(Clone, Debug)]
pub struct TurtleParser {
    /// `true` once lenient parsing has been requested.
    lenient: bool,
    /// Base IRI supplied by the caller, if any.
    base_iri: Option<String>,
    /// Registered namespace prefixes: prefix name -> namespace IRI.
    prefixes: HashMap<String, String>,
}
20
21impl TurtleParser {
22    /// Create a new Turtle parser
23    pub fn new() -> Self {
24        Self {
25            lenient: false,
26            base_iri: None,
27            prefixes: HashMap::new(),
28        }
29    }
30
31    /// Enable lenient parsing (skip some validations)
32    pub fn lenient(mut self) -> Self {
33        self.lenient = true;
34        self
35    }
36
37    /// Set base IRI for resolving relative IRIs
38    pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Self {
39        self.base_iri = Some(base_iri.into());
40        self
41    }
42
43    /// Add a namespace prefix
44    pub fn with_prefix(mut self, prefix: impl Into<String>, iri: impl Into<String>) -> Self {
45        self.prefixes.insert(prefix.into(), iri.into());
46        self
47    }
48
49    /// Parse Turtle from a reader
50    pub fn parse_reader<R: Read>(&self, mut reader: R) -> ParseResult<Vec<Triple>> {
51        // Read all data from the reader
52        let mut buffer = String::new();
53        reader.read_to_string(&mut buffer)?;
54
55        // Use the string parser (handles basic Turtle syntax)
56        // Note: Current implementation handles simple triples, prefixes, and base directives
57        // Advanced Turtle features (collections, lists, multi-line literals) are partially supported
58        self.parse_str(&buffer)
59    }
60
61    /// Parse Turtle from a byte slice
62    pub fn parse_slice(&self, slice: &[u8]) -> ParseResult<Vec<Triple>> {
63        // Convert to string and parse
64        // Note: Future optimization could use zero-copy parsing with byte-level operations
65        let content = std::str::from_utf8(slice)
66            .map_err(|e| RdfParseError::syntax(format!("Invalid UTF-8: {e}")))?;
67
68        self.parse_str(content)
69    }
70
71    /// Parse Turtle from a string
72    pub fn parse_str(&self, input: &str) -> ParseResult<Vec<Triple>> {
73        use super::parser::helpers::convert_quad;
74        use std::io::Cursor;
75
76        // Build oxttl parser with configuration
77        let mut oxttl_parser = oxttl::TurtleParser::new();
78
79        // Apply base IRI if set
80        if let Some(ref base) = self.base_iri {
81            oxttl_parser = oxttl_parser
82                .with_base_iri(base.as_str())
83                .unwrap_or_else(|_| oxttl::TurtleParser::new());
84        }
85
86        // Enable lenient mode if requested
87        if self.lenient {
88            oxttl_parser = oxttl_parser.lenient();
89        }
90
91        // Parse and collect triples
92        let reader = Cursor::new(input.as_bytes());
93        let mut triples = Vec::new();
94
95        for result in oxttl_parser.for_reader(reader) {
96            match result {
97                Ok(triple) => {
98                    // Convert oxrdf Triple to oxirs Triple via Quad
99                    let quad = oxrdf::Quad::new(
100                        triple.subject,
101                        triple.predicate,
102                        triple.object,
103                        oxrdf::GraphName::DefaultGraph,
104                    );
105                    let oxirs_quad = convert_quad(quad)?;
106                    triples.push(oxirs_quad.to_triple());
107                }
108                Err(e) => {
109                    if !self.lenient {
110                        return Err(RdfParseError::syntax(e.to_string()));
111                    }
112                    // In lenient mode, skip errors
113                }
114            }
115        }
116
117        Ok(triples)
118    }
119
120    /// Get current prefixes
121    pub fn prefixes(&self) -> &HashMap<String, String> {
122        &self.prefixes
123    }
124
125    /// Get current base IRI
126    pub fn base_iri(&self) -> Option<&str> {
127        self.base_iri.as_deref()
128    }
129
130    /// Check if lenient parsing is enabled
131    pub fn is_lenient(&self) -> bool {
132        self.lenient
133    }
134}
135
136impl Default for TurtleParser {
137    fn default() -> Self {
138        Self::new()
139    }
140}
141
/// Configuration holder for writing Turtle documents.
///
/// Values are set through the builder-style methods (`with_base_iri`,
/// `with_prefix`, `pretty`) and read back through the accessors.
#[derive(Clone, Debug)]
pub struct TurtleSerializer {
    /// Base IRI supplied by the caller, if any.
    base_iri: Option<String>,
    /// Registered namespace prefixes: prefix name -> namespace IRI.
    prefixes: HashMap<String, String>,
    /// `true` once pretty formatting has been requested.
    pretty: bool,
}
149
150impl TurtleSerializer {
151    /// Create a new Turtle serializer
152    pub fn new() -> Self {
153        Self {
154            base_iri: None,
155            prefixes: HashMap::new(),
156            pretty: false,
157        }
158    }
159
160    /// Set base IRI for generating relative IRIs
161    pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Self {
162        self.base_iri = Some(base_iri.into());
163        self
164    }
165
166    /// Add a namespace prefix
167    pub fn with_prefix(mut self, prefix: impl Into<String>, iri: impl Into<String>) -> Self {
168        self.prefixes.insert(prefix.into(), iri.into());
169        self
170    }
171
172    /// Enable pretty formatting
173    pub fn pretty(mut self) -> Self {
174        self.pretty = true;
175        self
176    }
177
178    /// Create a writer-based serializer
179    pub fn for_writer<W: Write>(self, writer: W) -> WriterTurtleSerializer<W> {
180        WriterTurtleSerializer::new(writer, self)
181    }
182
183    /// Serialize triples to a string
184    pub fn serialize_to_string(&self, triples: &[Triple]) -> SerializeResult<String> {
185        let mut buffer = Vec::new();
186        {
187            let mut serializer = self.clone().for_writer(&mut buffer);
188            for triple in triples {
189                serializer.serialize_triple(triple.as_ref())?;
190            }
191            serializer.finish()?;
192        }
193        String::from_utf8(buffer)
194            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
195    }
196
197    /// Get the prefixes
198    pub fn prefixes(&self) -> &HashMap<String, String> {
199        &self.prefixes
200    }
201
202    /// Get the base IRI
203    pub fn base_iri(&self) -> Option<&str> {
204        self.base_iri.as_deref()
205    }
206
207    /// Check if pretty formatting is enabled
208    pub fn is_pretty(&self) -> bool {
209        self.pretty
210    }
211}
212
213impl Default for TurtleSerializer {
214    fn default() -> Self {
215        Self::new()
216    }
217}
218
219/// Writer-based Turtle serializer
220pub struct WriterTurtleSerializer<W: Write> {
221    writer: W,
222    config: TurtleSerializer,
223    headers_written: bool,
224}
225
226impl<W: Write> WriterTurtleSerializer<W> {
227    /// Create a new writer serializer
228    pub fn new(writer: W, config: TurtleSerializer) -> Self {
229        Self {
230            writer,
231            config,
232            headers_written: false,
233        }
234    }
235
236    /// Serialize a triple
237    pub fn serialize_triple(&mut self, triple: TripleRef<'_>) -> SerializeResult<()> {
238        self.ensure_headers_written()?;
239
240        // Subject serialization
241        let subject_str = self.serialize_subject(triple.subject())?;
242
243        // Predicate serialization
244        let predicate_str = self.serialize_predicate(triple.predicate())?;
245
246        // Object serialization
247        let object_str = self.serialize_object(triple.object())?;
248
249        // Write the triple with proper formatting
250        if self.config.pretty {
251            writeln!(self.writer, "{subject_str} {predicate_str} {object_str} .")?;
252        } else {
253            writeln!(self.writer, "{subject_str} {predicate_str} {object_str}.")?;
254        }
255
256        Ok(())
257    }
258
259    /// Serialize a subject (NamedNode, BlankNode, or Variable)
260    fn serialize_subject(&self, subject: crate::model::SubjectRef<'_>) -> SerializeResult<String> {
261        use crate::model::SubjectRef;
262
263        match subject {
264            SubjectRef::NamedNode(node) => self.serialize_named_node(node.into()),
265            SubjectRef::BlankNode(node) => {
266                let node_str = node.as_str();
267                Ok(format!("_:{node_str}"))
268            }
269            SubjectRef::Variable(var) => {
270                let var_str = var.as_str();
271                Ok(format!("?{var_str}"))
272            }
273        }
274    }
275
276    /// Serialize a predicate (NamedNode or Variable)
277    fn serialize_predicate(
278        &self,
279        predicate: crate::model::PredicateRef<'_>,
280    ) -> SerializeResult<String> {
281        use crate::model::PredicateRef;
282
283        match predicate {
284            PredicateRef::NamedNode(node) => {
285                // Check for rdf:type shorthand
286                if node.as_str() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" {
287                    Ok("a".to_string())
288                } else {
289                    self.serialize_named_node(node.into())
290                }
291            }
292            PredicateRef::Variable(var) => {
293                let var_str = var.as_str();
294                Ok(format!("?{var_str}"))
295            }
296        }
297    }
298
299    /// Serialize an object (NamedNode, BlankNode, Literal, or Variable)
300    fn serialize_object(&self, object: crate::model::ObjectRef<'_>) -> SerializeResult<String> {
301        use crate::model::ObjectRef;
302
303        match object {
304            ObjectRef::NamedNode(node) => self.serialize_named_node(node.into()),
305            ObjectRef::BlankNode(node) => {
306                let node_str = node.as_str();
307                Ok(format!("_:{node_str}"))
308            }
309            ObjectRef::Literal(literal) => self.serialize_literal(literal),
310            ObjectRef::Variable(var) => {
311                let var_str = var.as_str();
312                Ok(format!("?{var_str}"))
313            }
314        }
315    }
316
317    /// Serialize a named node with prefix abbreviation
318    fn serialize_named_node(
319        &self,
320        node: crate::model::NamedNodeRef<'_>,
321    ) -> SerializeResult<String> {
322        let iri = node.as_str();
323
324        // Try to find a matching prefix
325        for (prefix, namespace) in &self.config.prefixes {
326            if iri.starts_with(namespace) {
327                let local = &iri[namespace.len()..];
328                // Check if local part is valid for prefixed name
329                if is_valid_local_name(local) {
330                    return Ok(format!("{prefix}:{local}"));
331                }
332            }
333        }
334
335        // Fall back to full IRI in angle brackets
336        Ok(format!("<{iri}>"))
337    }
338
339    /// Serialize a literal
340    fn serialize_literal(&self, literal: &crate::model::Literal) -> SerializeResult<String> {
341        let value = literal.value();
342
343        // Escape special characters in the string
344        let escaped_value = escape_turtle_string(value);
345
346        // Handle language tag
347        if let Some(lang) = literal.language() {
348            return Ok(format!("\"{escaped_value}\"@{lang}"));
349        }
350
351        // Handle datatype
352        let datatype = literal.datatype();
353        if datatype.as_str() == "http://www.w3.org/2001/XMLSchema#string" {
354            // XSD string is the default, no need to specify
355            Ok(format!("\"{escaped_value}\""))
356        } else {
357            // Serialize datatype as IRI
358            let datatype_str = self.serialize_named_node(datatype)?;
359            Ok(format!("\"{escaped_value}\"^^{datatype_str}"))
360        }
361    }
362
363    /// Finish serialization and return the writer
364    pub fn finish(self) -> SerializeResult<W> {
365        Ok(self.writer)
366    }
367
368    /// Ensure headers (prefixes, base) are written
369    fn ensure_headers_written(&mut self) -> SerializeResult<()> {
370        if self.headers_written {
371            return Ok(());
372        }
373
374        // Write base directive
375        if let Some(base) = &self.config.base_iri {
376            writeln!(self.writer, "@base <{base}> .")?;
377        }
378
379        // Write prefix directives
380        for (prefix, iri) in &self.config.prefixes {
381            writeln!(self.writer, "@prefix {prefix}: <{iri}> .")?;
382        }
383
384        // Add blank line after headers if we wrote any
385        if self.config.base_iri.is_some() || !self.config.prefixes.is_empty() {
386            writeln!(self.writer)?;
387        }
388
389        self.headers_written = true;
390        Ok(())
391    }
392}
393
394impl<W: Write> QuadSerializer<W> for WriterTurtleSerializer<W> {
395    fn serialize_quad(&mut self, quad: QuadRef<'_>) -> SerializeResult<()> {
396        // Turtle only supports default graph, so ignore named graphs
397        if quad.graph_name().is_default_graph() {
398            self.serialize_triple(quad.triple())
399        } else {
400            // Could log a warning here about ignoring named graph
401            Ok(())
402        }
403    }
404
405    fn finish(self: Box<Self>) -> SerializeResult<W> {
406        Ok(self.writer)
407    }
408}
409
410/// Check if a string is a valid local name for Turtle prefixed names
411fn is_valid_local_name(local: &str) -> bool {
412    if local.is_empty() {
413        return true; // Empty local names are allowed
414    }
415
416    // First character must be a name start char or underscore
417    let first_char = local
418        .chars()
419        .next()
420        .expect("local name validated to be non-empty");
421    if !is_pn_chars_base(first_char) && first_char != '_' {
422        return false;
423    }
424
425    // Rest of characters must be name chars, underscore, dot, or hyphen
426    for ch in local.chars().skip(1) {
427        if !is_pn_chars(ch) && ch != '.' && ch != '-' {
428            return false;
429        }
430    }
431
432    // Cannot end with a dot
433    !local.ends_with('.')
434}
435
/// Check if character is a `PN_CHARS_BASE` (per Turtle grammar).
///
/// Covers ASCII letters plus every code point range listed in the
/// production, including the supplementary range U+10000..=U+EFFFF.
fn is_pn_chars_base(ch: char) -> bool {
    matches!(
        ch,
        'A'..='Z'
            | 'a'..='z'
            | '\u{00C0}'..='\u{00D6}'
            | '\u{00D8}'..='\u{00F6}'
            | '\u{00F8}'..='\u{02FF}'
            | '\u{0370}'..='\u{037D}'
            | '\u{037F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}'
    )
}
451
452/// Check if character is a PN_CHARS (per Turtle grammar)
453fn is_pn_chars(ch: char) -> bool {
454    is_pn_chars_base(ch)
455        || ch == '_'
456        || ch.is_ascii_digit()
457        || ch == '\u{00B7}'
458        || ('\u{0300}'..='\u{036F}').contains(&ch)
459        || ('\u{203F}'..='\u{2040}').contains(&ch)
460}
461
/// Produce the escaped form of `input` for use inside a double-quoted
/// Turtle string.
///
/// Quote, backslash, newline, carriage return, tab, backspace and form feed
/// get their two-character escapes. Any other control character becomes a
/// `\uXXXX` escape. Everything else is copied unchanged.
fn escape_turtle_string(input: &str) -> String {
    input
        .chars()
        .fold(String::with_capacity(input.len()), |mut out, ch| {
            let short_escape = match ch {
                '"' => Some("\\\""),
                '\\' => Some("\\\\"),
                '\n' => Some("\\n"),
                '\r' => Some("\\r"),
                '\t' => Some("\\t"),
                '\x08' => Some("\\b"), // backspace
                '\x0C' => Some("\\f"), // form feed
                _ => None,
            };

            if let Some(escape) = short_escape {
                out.push_str(escape);
            } else if ch.is_control() {
                // Remaining control characters use a Unicode escape sequence
                let code = ch as u32;
                out.push_str(&format!("\\u{code:04X}"));
            } else {
                out.push(ch);
            }
            out
        })
}
486
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh parser is strict and carries no base IRI or prefixes.
    #[test]
    fn test_turtle_parser_creation() {
        let fresh = TurtleParser::new();
        assert!(!fresh.is_lenient());
        assert!(fresh.base_iri().is_none());
        assert!(fresh.prefixes().is_empty());
    }

    /// Builder calls on the parser are visible through its accessors.
    #[test]
    fn test_turtle_parser_configuration() {
        let configured = TurtleParser::new()
            .lenient()
            .with_base_iri("http://example.org/")
            .with_prefix("ex", "http://example.org/ns#");

        assert!(configured.is_lenient());
        assert_eq!(configured.base_iri(), Some("http://example.org/"));

        let expected_ns = "http://example.org/ns#".to_string();
        assert_eq!(configured.prefixes().get("ex"), Some(&expected_ns));
    }

    /// A fresh serializer is not pretty and carries no base IRI or prefixes.
    #[test]
    fn test_turtle_serializer_creation() {
        let fresh = TurtleSerializer::new();
        assert!(!fresh.is_pretty());
        assert!(fresh.base_iri().is_none());
        assert!(fresh.prefixes().is_empty());
    }

    /// Builder calls on the serializer are visible through its accessors.
    #[test]
    fn test_turtle_serializer_configuration() {
        let configured = TurtleSerializer::new()
            .pretty()
            .with_base_iri("http://example.org/")
            .with_prefix("ex", "http://example.org/ns#");

        assert!(configured.is_pretty());
        assert_eq!(configured.base_iri(), Some("http://example.org/"));

        let expected_ns = "http://example.org/ns#".to_string();
        assert_eq!(configured.prefixes().get("ex"), Some(&expected_ns));
    }

    /// Empty input parses successfully to zero triples.
    #[test]
    fn test_empty_turtle_parsing() {
        let outcome = TurtleParser::new().parse_str("");
        assert!(outcome.is_ok());
        assert!(outcome.expect("should have value").is_empty());
    }

    /// Input consisting only of comments parses successfully to zero triples.
    #[test]
    fn test_turtle_comments() {
        let comments_only = "# This is a comment\n# Another comment";
        let outcome = TurtleParser::new().parse_str(comments_only);
        assert!(outcome.is_ok());
        assert!(outcome.expect("should have value").is_empty());
    }
}