ggen_cli_lib/cmds/graph/
load.rs

1//! RDF graph loading and data ingestion functionality.
2//!
3//! This module provides comprehensive RDF data loading capabilities, supporting
4//! multiple formats (Turtle, N-Triples, RDF/XML, JSON-LD) with format detection,
5//! base IRI resolution, and merge operations. It validates input paths and
6//! formats to ensure secure and reliable data loading.
7//!
8//! # Examples
9//!
10//! ```bash
11//! ggen graph load data.ttl
12//! ggen graph load ontology.rdf --format rdfxml --base http://example.org/
13//! ggen graph load additional.ttl --merge
14//! ```
15//!
16//! # Errors
17//!
18//! Returns errors if the file path is invalid or contains traversal attempts,
19//! the RDF format is unsupported, the file cannot be read, or if RDF parsing
20//! fails due to malformed data.
21
22use clap::Args;
23use ggen_utils::error::Result;
24use std::path::{Component, Path};
25
26#[derive(Args, Debug)]
27pub struct LoadArgs {
28    /// RDF file to load
29    pub file: String,
30
31    /// RDF format (turtle, ntriples, rdfxml)
32    #[arg(long)]
33    pub format: Option<String>,
34
35    /// Base IRI for relative URIs
36    #[arg(long)]
37    pub base: Option<String>,
38
39    /// Merge with existing graph
40    #[arg(long)]
41    pub merge: bool,
42}
43
44#[cfg_attr(test, mockall::automock)]
45pub trait RdfLoader {
46    fn load(
47        &self, file: String, format: Option<String>, base: Option<String>, merge: bool,
48    ) -> Result<LoadStats>;
49}
50
/// Outcome of a load as reported by an [`RdfLoader`].
///
/// `PartialEq`/`Eq` are derived so that results can be compared directly
/// (for example with `assert_eq!` in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LoadStats {
    /// Number of triples read from the file; printed as "Loaded N triples".
    pub triples_loaded: usize,
    /// Size of the graph after the load; printed as "Total triples in graph"
    /// when merging.
    pub total_triples: usize,
    /// Format label shown in parentheses after the file name in the summary.
    pub format_detected: String,
}
57
58/// Validate and sanitize file path input
59fn validate_file_path(file: &str) -> Result<()> {
60    // Validate file path is not empty
61    if file.trim().is_empty() {
62        return Err(ggen_utils::error::Error::new("File path cannot be empty"));
63    }
64
65    // Validate file path length
66    if file.len() > 1000 {
67        return Err(ggen_utils::error::Error::new(
68            "File path too long (max 1000 characters)",
69        ));
70    }
71
72    // Use Path components for proper traversal protection
73    let path = Path::new(file);
74    if path.components().any(|c| matches!(c, Component::ParentDir)) {
75        return Err(ggen_utils::error::Error::new(
76            "Path traversal detected: paths containing '..' are not allowed",
77        ));
78    }
79
80    // Validate file path format (basic pattern check)
81    if !file
82        .chars()
83        .all(|c| c.is_alphanumeric() || c == '.' || c == '/' || c == '-' || c == '_' || c == '\\')
84    {
85        return Err(ggen_utils::error::Error::new(
86            "Invalid file path format: only alphanumeric characters, dots, slashes, dashes, underscores, and backslashes allowed",
87        ));
88    }
89
90    Ok(())
91}
92
93/// Validate and sanitize format input (if provided)
94fn validate_format(format: &Option<String>) -> Result<()> {
95    if let Some(format) = format {
96        // Validate format is not empty
97        if format.trim().is_empty() {
98            return Err(ggen_utils::error::Error::new("Format cannot be empty"));
99        }
100
101        // Validate format length
102        if format.len() > 50 {
103            return Err(ggen_utils::error::Error::new(
104                "Format too long (max 50 characters)",
105            ));
106        }
107
108        // Validate format format (basic pattern check)
109        if !format
110            .chars()
111            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
112        {
113            return Err(ggen_utils::error::Error::new(
114                "Invalid format: only alphanumeric characters, dashes, and underscores allowed",
115            ));
116        }
117
118        // Validate against known formats
119        let valid_formats = ["turtle", "ntriples", "rdfxml", "jsonld", "n3"];
120        if !valid_formats.contains(&format.to_lowercase().as_str()) {
121            return Err(ggen_utils::error::Error::new(
122                "Unsupported format: supported formats are turtle, ntriples, rdfxml, jsonld, n3",
123            ));
124        }
125    }
126
127    Ok(())
128}
129
130/// Validate and sanitize base IRI input (if provided)
131fn validate_base_iri(base: &Option<String>) -> Result<()> {
132    if let Some(base) = base {
133        // Validate base IRI is not empty
134        if base.trim().is_empty() {
135            return Err(ggen_utils::error::Error::new("Base IRI cannot be empty"));
136        }
137
138        // Validate base IRI length
139        if base.len() > 500 {
140            return Err(ggen_utils::error::Error::new(
141                "Base IRI too long (max 500 characters)",
142            ));
143        }
144
145        // Basic URI validation
146        if !base.starts_with("http://")
147            && !base.starts_with("https://")
148            && !base.starts_with("file://")
149        {
150            return Err(ggen_utils::error::Error::new(
151                "Invalid base IRI: must start with http://, https://, or file://",
152            ));
153        }
154    }
155
156    Ok(())
157}
158
/// Guesses the RDF serialization from the extension of `filename`.
///
/// The extension is compared ASCII case-insensitively, so `data.TTL` and
/// `ontology.Rdf` are recognised just like their lower-case spellings.
///
/// Returns one of `"turtle"`, `"ntriples"`, `"rdfxml"`, `"jsonld"` or `"n3"`.
/// A missing, non-UTF-8 or unrecognised extension yields `"turtle"`.
fn detect_format_from_extension(filename: &str) -> &'static str {
    // Normalise case once up front; file names from case-insensitive
    // file systems frequently carry upper-case extensions.
    let extension = std::path::Path::new(filename)
        .extension()
        .and_then(|ext| ext.to_str())
        .map(str::to_ascii_lowercase);

    match extension.as_deref() {
        Some("ttl") | Some("turtle") => "turtle",
        Some("nt") | Some("ntriples") => "ntriples",
        Some("rdf") | Some("xml") => "rdfxml",
        Some("jsonld") | Some("json") => "jsonld",
        Some("n3") => "n3",
        _ => "turtle", // Default to turtle
    }
}
171
172pub async fn run(args: &LoadArgs) -> Result<()> {
173    // Validate inputs
174    validate_file_path(&args.file)?;
175    validate_format(&args.format)?;
176    validate_base_iri(&args.base)?;
177
178    println!("📊 Loading RDF graph...");
179
180    // Check if file exists
181    let file_path = std::path::Path::new(&args.file);
182    if !file_path.exists() {
183        return Err(ggen_utils::error::Error::new(&format!(
184            "File not found: {}",
185            args.file
186        )));
187    }
188
189    // Detect format if not provided
190    let format = args
191        .format
192        .as_deref()
193        .unwrap_or_else(|| detect_format_from_extension(&args.file));
194
195    println!("📁 Loading file: {}", args.file);
196    println!("🔍 Format: {}", format);
197
198    if let Some(base) = &args.base {
199        println!("🌐 Base IRI: {}", base);
200    }
201
202    // Load the RDF file using ggen-core
203    let graph = ggen_core::Graph::load_from_file(&args.file)
204        .map_err(|e| ggen_utils::error::Error::new(&format!("Failed to load RDF file: {}", e)))?;
205
206    // Get graph statistics
207    let triples_count = graph.len();
208
209    if args.merge {
210        println!(
211            "✅ Merged {} triples from {} ({})",
212            triples_count, args.file, format
213        );
214        println!("📊 Total triples in graph: {}", triples_count);
215    } else {
216        println!(
217            "✅ Loaded {} triples from {} ({})",
218            triples_count, args.file, format
219        );
220    }
221
222    Ok(())
223}
224
225pub async fn run_with_deps(args: &LoadArgs, loader: &dyn RdfLoader) -> Result<()> {
226    // Validate inputs
227    validate_file_path(&args.file)?;
228    validate_format(&args.format)?;
229    validate_base_iri(&args.base)?;
230
231    // Show progress for loading operation
232    println!("🔍 Loading RDF file...");
233
234    let stats = loader.load(
235        args.file.clone(),
236        args.format.clone(),
237        args.base.clone(),
238        args.merge,
239    )?;
240
241    if args.merge {
242        println!(
243            "✅ Loaded {} triples from {} ({})",
244            stats.triples_loaded, args.file, stats.format_detected
245        );
246        println!("📊 Total triples in graph: {}", stats.total_triples);
247    } else {
248        println!(
249            "✅ Loaded {} triples from {} ({})",
250            stats.triples_loaded, args.file, stats.format_detected
251        );
252    }
253
254    Ok(())
255}
256
#[cfg(test)]
mod tests {
    use super::*;
    use mockall::predicate::*;

    /// Builds the stats a mocked loader hands back; every test reports the
    /// format as "Turtle".
    fn turtle_stats(triples_loaded: usize, total_triples: usize) -> LoadStats {
        LoadStats {
            triples_loaded,
            total_triples,
            format_detected: "Turtle".to_string(),
        }
    }

    /// A plain load with an explicit format forwards all four arguments
    /// to the loader exactly once.
    #[tokio::test]
    async fn test_load_rdf_file() {
        let args = LoadArgs {
            file: "data.ttl".to_string(),
            format: Some("turtle".to_string()),
            base: None,
            merge: false,
        };

        let mut mock_loader = MockRdfLoader::new();
        mock_loader
            .expect_load()
            .with(
                eq("data.ttl".to_string()),
                eq(Some("turtle".to_string())),
                eq(None::<String>),
                eq(false),
            )
            .times(1)
            .returning(|_, _, _, _| Ok(turtle_stats(100, 100)));

        let outcome = run_with_deps(&args, &mock_loader).await;
        assert!(outcome.is_ok());
    }

    /// With `merge: true` the flag reaches the loader and the call succeeds.
    #[tokio::test]
    async fn test_load_with_merge() {
        let args = LoadArgs {
            file: "additional.ttl".to_string(),
            format: None,
            base: None,
            merge: true,
        };

        let mut mock_loader = MockRdfLoader::new();
        mock_loader
            .expect_load()
            .with(
                eq("additional.ttl".to_string()),
                always(),
                always(),
                eq(true),
            )
            .times(1)
            .returning(|_, _, _, _| Ok(turtle_stats(50, 150)));

        let outcome = run_with_deps(&args, &mock_loader).await;
        assert!(outcome.is_ok());
    }

    /// A valid base IRI passes validation and is forwarded to the loader.
    #[tokio::test]
    async fn test_load_with_base_iri() {
        let args = LoadArgs {
            file: "relative.ttl".to_string(),
            format: None,
            base: Some("http://example.org/".to_string()),
            merge: false,
        };

        let mut mock_loader = MockRdfLoader::new();
        mock_loader
            .expect_load()
            .with(
                eq("relative.ttl".to_string()),
                always(),
                eq(Some("http://example.org/".to_string())),
                eq(false),
            )
            .times(1)
            .returning(|_, _, _, _| Ok(turtle_stats(25, 25)));

        let outcome = run_with_deps(&args, &mock_loader).await;
        assert!(outcome.is_ok());
    }
}