Skip to main content

ext_rdf/
lib.rs

1//! ext-rdf: RDF Linked Data import extension for KyuGraph.
2//!
3//! Provides RDF file parsing (Turtle, N-Triples, N-Quads, RDF/XML) and
4//! automatic schema inference following Linked Data Principles:
5//!
6//! - `rdf:type` → node table assignment
7//! - Literal-valued predicates → node properties
8//! - URI-valued predicates → relationships (hyperlinks as edges)
9//! - URI serves as primary key on every node table
10//!
11//! ## Library API
12//!
13//! ```ignore
14//! let triples = ext_rdf::parse_triples("foaf.ttl")?;
15//! let schema = ext_rdf::infer_schema(&triples)?;
16//! ```
17//!
18//! ## Extension procedures
19//!
20//! ```cypher
21//! CALL rdf.stats('file.ttl')
22//! CALL rdf.prefixes('file.ttl')
23//! CALL rdf.types('file.ttl')
24//! ```
25
26pub mod model;
27pub mod parser;
28pub mod schema;
29
30use std::collections::HashMap;
31
32use kyu_extension::{Extension, ProcColumn, ProcParam, ProcRow, ProcedureSignature};
33use kyu_types::{LogicalType, TypedValue};
34use smol_str::SmolStr;
35
36pub use model::{RdfNodeTable, RdfObject, RdfRelTable, RdfSchema, Triple};
37pub use parser::parse_triples;
38pub use schema::infer_schema;
39
/// RDF extension providing inspection procedures.
///
/// The type is a field-less unit struct, so every instance is interchangeable
/// and copying it is free. `Default` is derived and yields the same value as
/// [`RdfExtension::new`].
#[derive(Debug, Clone, Copy, Default)]
pub struct RdfExtension;

impl RdfExtension {
    /// Creates the extension.
    ///
    /// Usable in `const` contexts because the struct carries no state.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
54
55impl Extension for RdfExtension {
56    fn name(&self) -> &str {
57        "rdf"
58    }
59
60    fn procedures(&self) -> Vec<ProcedureSignature> {
61        vec![
62            ProcedureSignature {
63                name: "stats".into(),
64                params: vec![ProcParam {
65                    name: "path".into(),
66                    type_desc: "STRING".into(),
67                }],
68                columns: vec![
69                    ProcColumn {
70                        name: "triple_count".into(),
71                        data_type: LogicalType::Int64,
72                    },
73                    ProcColumn {
74                        name: "subject_count".into(),
75                        data_type: LogicalType::Int64,
76                    },
77                    ProcColumn {
78                        name: "predicate_count".into(),
79                        data_type: LogicalType::Int64,
80                    },
81                    ProcColumn {
82                        name: "type_count".into(),
83                        data_type: LogicalType::Int64,
84                    },
85                ],
86            },
87            ProcedureSignature {
88                name: "prefixes".into(),
89                params: vec![ProcParam {
90                    name: "path".into(),
91                    type_desc: "STRING".into(),
92                }],
93                columns: vec![
94                    ProcColumn {
95                        name: "prefix".into(),
96                        data_type: LogicalType::String,
97                    },
98                    ProcColumn {
99                        name: "namespace".into(),
100                        data_type: LogicalType::String,
101                    },
102                ],
103            },
104            ProcedureSignature {
105                name: "types".into(),
106                params: vec![ProcParam {
107                    name: "path".into(),
108                    type_desc: "STRING".into(),
109                }],
110                columns: vec![
111                    ProcColumn {
112                        name: "type_uri".into(),
113                        data_type: LogicalType::String,
114                    },
115                    ProcColumn {
116                        name: "local_name".into(),
117                        data_type: LogicalType::String,
118                    },
119                    ProcColumn {
120                        name: "count".into(),
121                        data_type: LogicalType::Int64,
122                    },
123                ],
124            },
125        ]
126    }
127
128    fn execute(
129        &self,
130        procedure: &str,
131        args: &[String],
132        _adjacency: &HashMap<i64, Vec<(i64, f64)>>,
133    ) -> Result<Vec<ProcRow>, String> {
134        let path = args.first().ok_or("rdf.* requires a file path argument")?;
135
136        match procedure {
137            "stats" => exec_stats(path),
138            "prefixes" => exec_prefixes(path),
139            "types" => exec_types(path),
140            _ => Err(format!("unknown procedure: {procedure}")),
141        }
142    }
143}
144
145fn exec_stats(path: &str) -> Result<Vec<ProcRow>, String> {
146    let triples = parse_triples(path).map_err(|e| e.to_string())?;
147
148    let mut subjects = std::collections::HashSet::new();
149    let mut predicates = std::collections::HashSet::new();
150    let mut type_count = 0u64;
151
152    for t in &triples {
153        subjects.insert(&t.subject);
154        predicates.insert(&t.predicate);
155        if t.predicate == model::RDF_TYPE {
156            type_count += 1;
157        }
158    }
159
160    Ok(vec![vec![
161        TypedValue::Int64(triples.len() as i64),
162        TypedValue::Int64(subjects.len() as i64),
163        TypedValue::Int64(predicates.len() as i64),
164        TypedValue::Int64(type_count as i64),
165    ]])
166}
167
168fn exec_prefixes(path: &str) -> Result<Vec<ProcRow>, String> {
169    let prefixes = parser::parse_prefixes(path).map_err(|e| e.to_string())?;
170    Ok(prefixes
171        .into_iter()
172        .map(|(prefix, ns)| {
173            vec![
174                TypedValue::String(SmolStr::new(prefix)),
175                TypedValue::String(SmolStr::new(ns)),
176            ]
177        })
178        .collect())
179}
180
181fn exec_types(path: &str) -> Result<Vec<ProcRow>, String> {
182    let triples = parse_triples(path).map_err(|e| e.to_string())?;
183
184    let mut type_counts: HashMap<String, i64> = HashMap::new();
185    for t in &triples {
186        if t.predicate == model::RDF_TYPE
187            && let RdfObject::Uri(ref uri) = t.object
188        {
189            *type_counts.entry(uri.clone()).or_insert(0) += 1;
190        }
191    }
192
193    let mut rows: Vec<ProcRow> = type_counts
194        .into_iter()
195        .map(|(uri, count)| {
196            let name = model::local_name(&uri).to_string();
197            vec![
198                TypedValue::String(SmolStr::new(&uri)),
199                TypedValue::String(SmolStr::new(name)),
200                TypedValue::Int64(count),
201            ]
202        })
203        .collect();
204    rows.sort_by(|a, b| {
205        let a_count = match &a[2] {
206            TypedValue::Int64(n) => *n,
207            _ => 0,
208        };
209        let b_count = match &b[2] {
210            TypedValue::Int64(n) => *n,
211            _ => 0,
212        };
213        b_count.cmp(&a_count)
214    });
215
216    Ok(rows)
217}