oxirs_ttl/toolkit/
format_converter.rs

1//! Format Conversion Utilities
2//!
3//! This module provides utilities for converting between different RDF formats.
4//! Supports both in-memory and streaming conversions with format auto-detection.
5//!
6//! # Examples
7//!
8//! ## Basic Format Conversion
9//!
10//! ```rust
11//! use oxirs_ttl::toolkit::format_converter::FormatConverter;
12//! use oxirs_ttl::toolkit::RdfFormat;
13//!
14//! let turtle_data = r#"
15//! @prefix ex: <http://example.org/> .
16//! ex:subject ex:predicate "object" .
17//! "#;
18//!
19//! let converter = FormatConverter::new();
20//! let ntriples = converter.convert_string(
21//!     turtle_data,
22//!     RdfFormat::Turtle,
23//!     RdfFormat::NTriples
24//! )?;
25//!
26//! assert!(ntriples.contains("<http://example.org/subject>"));
27//! # Ok::<(), Box<dyn std::error::Error>>(())
28//! ```
29//!
30//! ## Streaming Conversion
31//!
32//! ```rust
33//! use oxirs_ttl::toolkit::format_converter::FormatConverter;
34//! use oxirs_ttl::toolkit::RdfFormat;
35//! use std::io::Cursor;
36//!
//! let ntriples_data = b"<http://s> <http://p> <http://o> .";
//! let input = Cursor::new(ntriples_data);
39//! let mut output = Vec::new();
40//!
41//! let converter = FormatConverter::new();
42//! converter.convert_stream(
43//!     input,
44//!     &mut output,
45//!     RdfFormat::NTriples,
46//!     RdfFormat::Turtle
47//! )?;
48//!
49//! let result = String::from_utf8(output)?;
50//! assert!(result.contains("<http://s>"));
51//! # Ok::<(), Box<dyn std::error::Error>>(())
52//! ```
53
54use crate::error::TurtleParseError;
55use crate::formats::nquads::{NQuadsParser, NQuadsSerializer};
56use crate::formats::ntriples::{NTriplesParser, NTriplesSerializer};
57use crate::formats::trig::TriGParser;
58use crate::formats::turtle::{TurtleParser, TurtleSerializer};
59use crate::toolkit::{Parser, RdfFormat, SerializationConfig, Serializer};
60use oxirs_core::model::{Quad, Triple};
61use std::io::{BufRead, BufReader, Write};
62
63/// Result type for format conversion operations
64pub type ConversionResult<T> = Result<T, ConversionError>;
65
66/// Errors that can occur during format conversion
67#[derive(Debug, thiserror::Error)]
68pub enum ConversionError {
69    /// Parse error during conversion
70    #[error("Parse error: {0}")]
71    ParseError(#[from] TurtleParseError),
72
73    /// I/O error during conversion
74    #[error("I/O error: {0}")]
75    IoError(#[from] std::io::Error),
76
77    /// Unsupported format combination
78    #[error("Unsupported conversion from {0:?} to {1:?}")]
79    UnsupportedConversion(RdfFormat, RdfFormat),
80
81    /// Serialization error
82    #[error("Serialization error: {0}")]
83    SerializationError(String),
84
85    /// Invalid input data
86    #[error("Invalid input: {0}")]
87    InvalidInput(String),
88}
89
90/// Configuration for format conversion
91#[derive(Debug, Clone)]
92pub struct ConversionConfig {
93    /// Serialization configuration
94    pub serialization: SerializationConfig,
95    /// Whether to preserve prefixes when possible
96    pub preserve_prefixes: bool,
97    /// Whether to use lenient parsing
98    pub lenient_parsing: bool,
99    /// Batch size for streaming conversion
100    pub batch_size: usize,
101}
102
103impl Default for ConversionConfig {
104    fn default() -> Self {
105        Self {
106            serialization: SerializationConfig::default(),
107            preserve_prefixes: true,
108            lenient_parsing: false,
109            batch_size: 10_000,
110        }
111    }
112}
113
114impl ConversionConfig {
115    /// Create a new conversion config with default settings
116    pub fn new() -> Self {
117        Self::default()
118    }
119
120    /// Enable lenient parsing
121    pub fn with_lenient(mut self, lenient: bool) -> Self {
122        self.lenient_parsing = lenient;
123        self
124    }
125
126    /// Set whether to preserve prefixes
127    pub fn with_preserve_prefixes(mut self, preserve: bool) -> Self {
128        self.preserve_prefixes = preserve;
129        self
130    }
131
132    /// Set batch size for streaming
133    pub fn with_batch_size(mut self, size: usize) -> Self {
134        self.batch_size = size;
135        self
136    }
137
138    /// Set serialization configuration
139    pub fn with_serialization(mut self, config: SerializationConfig) -> Self {
140        self.serialization = config;
141        self
142    }
143}
144
/// Format converter for RDF data
///
/// Provides methods for converting between different RDF formats. The
/// conversion code in this file handles Turtle and N-Triples as triple
/// formats and N-Quads and TriG as quad formats.
#[derive(Debug)]
pub struct FormatConverter {
    /// Settings consulted while parsing and serializing.
    config: ConversionConfig,
}
152
153impl FormatConverter {
154    /// Create a new format converter with default configuration
155    pub fn new() -> Self {
156        Self {
157            config: ConversionConfig::default(),
158        }
159    }
160
161    /// Create a converter with custom configuration
162    pub fn with_config(config: ConversionConfig) -> Self {
163        Self { config }
164    }
165
166    /// Convert a string from one format to another
167    ///
168    /// # Example
169    ///
170    /// ```rust
171    /// use oxirs_ttl::toolkit::format_converter::FormatConverter;
172    /// use oxirs_ttl::toolkit::RdfFormat;
173    ///
174    /// let converter = FormatConverter::new();
175    /// let turtle = r#"<http://s> <http://p> <http://o> ."#;
176    /// let ntriples = converter.convert_string(
177    ///     turtle,
178    ///     RdfFormat::Turtle,
179    ///     RdfFormat::NTriples
180    /// )?;
181    /// # Ok::<(), Box<dyn std::error::Error>>(())
182    /// ```
183    pub fn convert_string(
184        &self,
185        input: &str,
186        from: RdfFormat,
187        to: RdfFormat,
188    ) -> ConversionResult<String> {
189        let mut output = Vec::new();
190        let input_bytes = input.as_bytes().to_vec();
191        let cursor = std::io::Cursor::new(input_bytes);
192        self.convert_stream(cursor, &mut output, from, to)?;
193        String::from_utf8(output).map_err(|e| {
194            ConversionError::SerializationError(format!("Invalid UTF-8 output: {}", e))
195        })
196    }
197
198    /// Convert between formats using streams
199    ///
200    /// This is the most memory-efficient method for large datasets.
201    pub fn convert_stream<R: BufRead + 'static, W: Write>(
202        &self,
203        input: R,
204        output: &mut W,
205        from: RdfFormat,
206        to: RdfFormat,
207    ) -> ConversionResult<()> {
208        // Check if formats support triples or quads
209        let from_has_quads = matches!(from, RdfFormat::NQuads | RdfFormat::TriG);
210        let to_has_quads = matches!(to, RdfFormat::NQuads | RdfFormat::TriG);
211
212        if from_has_quads && !to_has_quads {
213            // Convert quads to triples (extract default graph)
214            let quads = self.parse_quads(input, from)?;
215            let triples: Vec<Triple> = quads
216                .into_iter()
217                .filter_map(|q| match q.graph_name() {
218                    oxirs_core::model::GraphName::DefaultGraph => Some(Triple::new(
219                        q.subject().clone(),
220                        q.predicate().clone(),
221                        q.object().clone(),
222                    )),
223                    _ => None,
224                })
225                .collect();
226            self.serialize_triples(&triples, output, to)?;
227        } else if !from_has_quads && to_has_quads {
228            // Convert triples to quads (add to default graph)
229            let triples = self.parse_triples(input, from)?;
230            let quads: Vec<Quad> = triples
231                .into_iter()
232                .map(|t| {
233                    Quad::new(
234                        t.subject().clone(),
235                        t.predicate().clone(),
236                        t.object().clone(),
237                        oxirs_core::model::GraphName::DefaultGraph,
238                    )
239                })
240                .collect();
241            self.serialize_quads(&quads, output, to)?;
242        } else if from_has_quads && to_has_quads {
243            // Both use quads
244            let quads = self.parse_quads(input, from)?;
245            self.serialize_quads(&quads, output, to)?;
246        } else {
247            // Both use triples
248            let triples = self.parse_triples(input, from)?;
249            self.serialize_triples(&triples, output, to)?;
250        }
251
252        Ok(())
253    }
254
255    /// Parse triples from input
256    fn parse_triples<R: BufRead + 'static>(
257        &self,
258        input: R,
259        format: RdfFormat,
260    ) -> ConversionResult<Vec<Triple>> {
261        match format {
262            RdfFormat::Turtle => {
263                let parser = if self.config.lenient_parsing {
264                    TurtleParser::new_lenient()
265                } else {
266                    TurtleParser::new()
267                };
268                parser
269                    .for_reader(input)
270                    .collect::<Result<Vec<_>, _>>()
271                    .map_err(ConversionError::from)
272            }
273            RdfFormat::NTriples => {
274                let parser = NTriplesParser::new();
275                parser
276                    .for_reader(input)
277                    .collect::<Result<Vec<_>, _>>()
278                    .map_err(ConversionError::from)
279            }
280            _ => Err(ConversionError::UnsupportedConversion(
281                format,
282                RdfFormat::Turtle,
283            )),
284        }
285    }
286
287    /// Parse quads from input
288    fn parse_quads<R: BufRead + 'static>(
289        &self,
290        input: R,
291        format: RdfFormat,
292    ) -> ConversionResult<Vec<Quad>> {
293        match format {
294            RdfFormat::NQuads => {
295                let parser = NQuadsParser::new();
296                parser
297                    .for_reader(input)
298                    .collect::<Result<Vec<_>, _>>()
299                    .map_err(ConversionError::from)
300            }
301            RdfFormat::TriG => {
302                let parser = TriGParser::new();
303                parser
304                    .for_reader(input)
305                    .collect::<Result<Vec<_>, _>>()
306                    .map_err(ConversionError::from)
307            }
308            _ => Err(ConversionError::UnsupportedConversion(
309                format,
310                RdfFormat::TriG,
311            )),
312        }
313    }
314
315    /// Serialize triples to output
316    fn serialize_triples<W: Write>(
317        &self,
318        triples: &[Triple],
319        output: &mut W,
320        format: RdfFormat,
321    ) -> ConversionResult<()> {
322        match format {
323            RdfFormat::Turtle => {
324                let serializer = TurtleSerializer::with_config(self.config.serialization.clone());
325                serializer
326                    .serialize(triples, output)
327                    .map_err(|e| ConversionError::SerializationError(e.to_string()))?;
328            }
329            RdfFormat::NTriples => {
330                let serializer = NTriplesSerializer::new();
331                serializer
332                    .serialize(triples, output)
333                    .map_err(|e| ConversionError::SerializationError(e.to_string()))?;
334            }
335            _ => {
336                return Err(ConversionError::UnsupportedConversion(
337                    RdfFormat::Turtle,
338                    format,
339                ))
340            }
341        }
342        Ok(())
343    }
344
345    /// Serialize quads to output
346    fn serialize_quads<W: Write>(
347        &self,
348        quads: &[Quad],
349        output: &mut W,
350        format: RdfFormat,
351    ) -> ConversionResult<()> {
352        match format {
353            RdfFormat::NQuads => {
354                let serializer = NQuadsSerializer::new();
355                serializer
356                    .serialize(quads, output)
357                    .map_err(|e| ConversionError::SerializationError(e.to_string()))?;
358            }
359            RdfFormat::TriG => {
360                // TriG serialization not yet implemented in serializer
361                // For now, serialize as N-Quads
362                let serializer = NQuadsSerializer::new();
363                serializer
364                    .serialize(quads, output)
365                    .map_err(|e| ConversionError::SerializationError(e.to_string()))?;
366            }
367            _ => {
368                return Err(ConversionError::UnsupportedConversion(
369                    RdfFormat::TriG,
370                    format,
371                ))
372            }
373        }
374        Ok(())
375    }
376
377    /// Convert a file from one format to another
378    pub fn convert_file(
379        &self,
380        input_path: &str,
381        output_path: &str,
382        from: RdfFormat,
383        to: RdfFormat,
384    ) -> ConversionResult<ConversionStats> {
385        let input = std::fs::File::open(input_path)?;
386        let reader = BufReader::new(input);
387
388        let mut output = std::fs::File::create(output_path)?;
389
390        let start = std::time::Instant::now();
391        self.convert_stream(reader, &mut output, from, to)?;
392        let duration = start.elapsed();
393
394        Ok(ConversionStats {
395            duration,
396            items_processed: 0, // TODO: track this
397        })
398    }
399}
400
401impl Default for FormatConverter {
402    fn default() -> Self {
403        Self::new()
404    }
405}
406
/// Summary of a finished conversion.
#[derive(Debug, Clone)]
pub struct ConversionStats {
    /// Wall-clock time the conversion took.
    pub duration: std::time::Duration,
    /// How many items were processed.
    pub items_processed: usize,
}

impl ConversionStats {
    /// Items processed per second.
    ///
    /// Returns `0.0` when the recorded duration is zero, which avoids a
    /// division by zero.
    pub fn throughput(&self) -> f64 {
        let seconds = self.duration.as_secs_f64();
        if seconds <= 0.0 {
            return 0.0;
        }
        self.items_processed as f64 / seconds
    }
}
426
#[cfg(test)]
mod tests {
    use super::*;

    /// Prefixed names in Turtle must show up as full IRIs in the N-Triples
    /// output, and the literal must survive.
    #[test]
    fn test_turtle_to_ntriples() {
        let turtle = r#"
@prefix ex: <http://example.org/> .
ex:subject ex:predicate "object" .
        "#;

        let result = FormatConverter::new()
            .convert_string(turtle, RdfFormat::Turtle, RdfFormat::NTriples)
            .unwrap();

        for expected in [
            "<http://example.org/subject>",
            "<http://example.org/predicate>",
            "\"object\"",
        ] {
            assert!(result.contains(expected));
        }
    }

    /// The subject IRI of an N-Triples statement must appear in the Turtle
    /// output.
    #[test]
    fn test_ntriples_to_turtle() {
        let ntriples = "<http://example.org/s> <http://example.org/p> \"o\" .";
        let converter = FormatConverter::new();

        let result = converter
            .convert_string(ntriples, RdfFormat::NTriples, RdfFormat::Turtle)
            .unwrap();

        assert!(result.contains("<http://example.org/s>"));
    }

    /// `convert_stream` accepts a byte slice as reader and a `Vec<u8>` as
    /// writer.
    #[test]
    fn test_streaming_conversion() {
        let turtle: &'static [u8] = b"<http://s> <http://p> <http://o> .";
        let mut output = Vec::new();

        let converter = FormatConverter::new();
        converter
            .convert_stream(turtle, &mut output, RdfFormat::Turtle, RdfFormat::NTriples)
            .unwrap();

        let result = String::from_utf8(output).unwrap();
        assert!(result.contains("<http://s>"));
    }

    /// Each builder method must set exactly the field it is named after.
    #[test]
    fn test_config_builder() {
        let ConversionConfig {
            lenient_parsing,
            preserve_prefixes,
            batch_size,
            ..
        } = ConversionConfig::new()
            .with_lenient(true)
            .with_preserve_prefixes(false)
            .with_batch_size(5000);

        assert!(lenient_parsing);
        assert!(!preserve_prefixes);
        assert_eq!(batch_size, 5000);
    }
}