oxirs_core/assembler/builder.rs
1//! Jena Assembler document parser.
2//!
3//! Converts a Turtle-format Jena Assembler document (or a pre-expanded set of
4//! string triples) into an [`AssemblerConfig`] value.
5//!
6//! ## Design
7//!
8//! The implementation has two layers:
9//!
10//! 1. **[`AssemblerBuilder::from_triples`]** — the core graph-walking logic.
11//! Accepts `(subject, predicate, object)` string tuples (IRIs and literals
12//! in N-Triples-like form, already expanded), and produces the config.
13//! This is the primary testing seam: it works without any external parser.
14//!
15//! 2. **[`AssemblerBuilder::from_turtle`]** — uses `oxttl` (already a workspace
16//! dep) to parse Turtle into `oxrdf::Triple` values, then converts each
17//! triple into a string tuple and delegates to `from_triples`.
18//!
19//! ## Triple representation for `from_triples`
20//!
21//! Subjects and objects that are IRIs are stored as bare IRI strings
22//! (e.g. `"http://example.org/ds"`). Blank nodes are stored with a leading
23//! `"_:"` sigil (e.g. `"_:b0"`). Literal objects are stored with their
24//! N-Triples quotation (e.g. `"\"/data/db\""` or `"\"hello\"^^<xsd:string>"`).
25//! The builder strips the surrounding double-quotes when it extracts literal
26//! values.
27
28use std::collections::HashMap;
29use std::io::Cursor;
30use std::path::PathBuf;
31
32use super::config::{AssemblerConfig, DatasetConfig, GraphConfig, StoreBackend};
33use super::vocab::{
34 JA_CONTENT_URL, JA_DEFAULT_GRAPH, JA_GRAPH, JA_GRAPH_NAME, JA_MEMORY_DATASET, JA_MEMORY_MODEL,
35 JA_NAMED_GRAPH, JA_RDF_DATASET, RDF_TYPE, TDB2_DATASET, TDB2_LOCATION,
36};
37
38// ---------------------------------------------------------------------------
39// AssemblerError
40// ---------------------------------------------------------------------------
41
/// Failure modes of the Jena Assembler parser.
///
/// Returned by the `AssemblerBuilder` entry points and by the internal
/// backend-resolution helpers.
#[derive(Debug)]
pub enum AssemblerError {
    /// The Turtle input was rejected by the parser; the payload is the
    /// parser's own error message.
    ParseError(String),

    /// A triple that the description requires is not present in the graph.
    MissingRequired {
        /// Subject on which the property was expected.
        resource: String,
        /// IRI of the property that was not found.
        property: String,
    },

    /// A `tdb2:location` value was present but unusable (currently: the
    /// value is empty). A `tdb2:location` that is absent altogether is
    /// reported as [`AssemblerError::MissingRequired`] instead.
    InvalidLocation(String),
}
55
56impl std::fmt::Display for AssemblerError {
57 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
58 match self {
59 AssemblerError::ParseError(msg) => write!(f, "Assembler parse error: {msg}"),
60 AssemblerError::MissingRequired { resource, property } => {
61 write!(f, "Missing required property <{property}> on <{resource}>")
62 }
63 AssemblerError::InvalidLocation(msg) => {
64 write!(f, "Invalid tdb2:location: {msg}")
65 }
66 }
67 }
68}
69
70impl std::error::Error for AssemblerError {}
71
72// ---------------------------------------------------------------------------
73// Internal adjacency map
74// ---------------------------------------------------------------------------
75
/// Subject-indexed view of the triple set: each subject maps to its
/// `(predicate, object)` pairs in the order the triples were supplied.
///
/// Subjects are bare IRIs or `_:`-prefixed blank-node IDs.
type AdjMap = HashMap<String, Vec<(String, String)>>;

/// Group `triples` by subject into an [`AdjMap`].
///
/// Every string is cloned, so the returned map does not borrow from the
/// input slice. Duplicate triples are kept as duplicate pairs.
fn adjacency_map(triples: &[(String, String, String)]) -> AdjMap {
    triples
        .iter()
        .fold(AdjMap::new(), |mut graph, (subject, predicate, object)| {
            graph
                .entry(subject.clone())
                .or_default()
                .push((predicate.clone(), object.clone()));
            graph
        })
}
90
91/// Return all objects for the given subject + predicate pair.
92fn objects_of<'a>(map: &'a AdjMap, subject: &str, predicate: &str) -> Vec<&'a str> {
93 match map.get(subject) {
94 None => vec![],
95 Some(pairs) => pairs
96 .iter()
97 .filter(|(p, _)| p == predicate)
98 .map(|(_, o)| o.as_str())
99 .collect(),
100 }
101}
102
103/// Return the first object for the given subject + predicate pair.
104fn first_object<'a>(map: &'a AdjMap, subject: &str, predicate: &str) -> Option<&'a str> {
105 objects_of(map, subject, predicate).into_iter().next()
106}
107
108// ---------------------------------------------------------------------------
109// Literal stripping
110// ---------------------------------------------------------------------------
111
/// Pull the lexical form out of an N-Triples-like literal string.
///
/// Accepted shapes:
/// - `"some text"`         → `some text`
/// - `"some text"@en`      → `some text`
/// - `"some text"^^<...>`  → `some text`
///
/// Anything that does not start with a double quote (an IRI or a blank-node
/// ID) is handed back untouched. The value ends at the *last* double quote,
/// so quotes inside the value are kept; escape sequences are not decoded.
/// If there is no closing quote, everything after the opening quote is
/// returned.
fn strip_literal(raw: &str) -> &str {
    match raw.strip_prefix('"') {
        // Not a literal: IRI or blank node.
        None => raw,
        // Cut at the last quote; a `@lang` / `^^type` suffix follows it.
        Some(rest) => rest.rfind('"').map_or(rest, |close| &rest[..close]),
    }
}
129
130// ---------------------------------------------------------------------------
131// Backend resolution
132// ---------------------------------------------------------------------------
133
134fn resolve_backend(
135 map: &AdjMap,
136 resource: &str,
137 type_iri: &str,
138) -> Result<StoreBackend, AssemblerError> {
139 if type_iri == JA_MEMORY_MODEL || type_iri == JA_MEMORY_DATASET || type_iri == JA_RDF_DATASET {
140 Ok(StoreBackend::InMemory)
141 } else if type_iri == TDB2_DATASET {
142 let loc_raw = first_object(map, resource, TDB2_LOCATION).ok_or_else(|| {
143 AssemblerError::MissingRequired {
144 resource: resource.to_owned(),
145 property: TDB2_LOCATION.to_owned(),
146 }
147 })?;
148 let loc_str = strip_literal(loc_raw);
149 if loc_str.is_empty() {
150 return Err(AssemblerError::InvalidLocation(
151 "tdb2:location value is empty".to_owned(),
152 ));
153 }
154 Ok(StoreBackend::Tdb2 {
155 location: PathBuf::from(loc_str),
156 })
157 } else {
158 Ok(StoreBackend::Unknown(type_iri.to_owned()))
159 }
160}
161
162// ---------------------------------------------------------------------------
163// GraphConfig resolution
164// ---------------------------------------------------------------------------
165
166/// Build a [`GraphConfig`] for a blank-node or IRI `graph_resource` subject.
167///
168/// `graph_name` is the named-graph IRI (or `None` for the default graph).
169fn build_graph_config(
170 map: &AdjMap,
171 graph_resource: &str,
172 graph_name: Option<String>,
173) -> GraphConfig {
174 // Determine backend from rdf:type on the model resource (best-effort; default InMemory)
175 let backend = objects_of(map, graph_resource, RDF_TYPE)
176 .into_iter()
177 .find_map(|type_iri| resolve_backend(map, graph_resource, type_iri).ok())
178 .unwrap_or(StoreBackend::InMemory);
179
180 // Collect ja:contentURL values from the model resource itself and any
181 // linked ja:content blank node.
182 let mut content_urls: Vec<String> = Vec::new();
183
184 // Direct contentURL on model resource
185 for url_raw in objects_of(map, graph_resource, JA_CONTENT_URL) {
186 content_urls.push(strip_literal(url_raw).to_owned());
187 }
188
189 // Indirect via ja:content → blank node → ja:contentURL
190 for content_bnode in objects_of(map, graph_resource, super::vocab::JA_CONTENT) {
191 for url_raw in objects_of(map, content_bnode, JA_CONTENT_URL) {
192 content_urls.push(strip_literal(url_raw).to_owned());
193 }
194 }
195
196 GraphConfig {
197 graph_name,
198 backend,
199 content_urls,
200 }
201}
202
203// ---------------------------------------------------------------------------
204// DatasetConfig resolution
205// ---------------------------------------------------------------------------
206
207fn build_dataset_config(
208 map: &AdjMap,
209 resource: &str,
210 type_iri: &str,
211) -> Result<DatasetConfig, AssemblerError> {
212 let backend = resolve_backend(map, resource, type_iri)?;
213
214 // Collect named-graph descriptions: ja:namedGraph → blank node with
215 // ja:graphName <iri> and ja:graph <model>
216 let mut named_graphs: Vec<GraphConfig> = Vec::new();
217
218 for ng_bnode in objects_of(map, resource, JA_NAMED_GRAPH) {
219 // ja:graphName gives the named-graph IRI
220 let graph_name = first_object(map, ng_bnode, JA_GRAPH_NAME).map(|s| s.to_owned());
221
222 // ja:graph gives the model resource
223 if let Some(model_resource) = first_object(map, ng_bnode, JA_GRAPH) {
224 named_graphs.push(build_graph_config(map, model_resource, graph_name));
225 } else {
226 // No model resource — treat the blank node itself as the model
227 named_graphs.push(build_graph_config(map, ng_bnode, graph_name));
228 }
229 }
230
231 // Default graph: ja:defaultGraph → model resource
232 let default_graph = first_object(map, resource, JA_DEFAULT_GRAPH)
233 .map(|model_resource| build_graph_config(map, model_resource, None));
234
235 Ok(DatasetConfig {
236 resource_iri: resource.to_owned(),
237 backend,
238 named_graphs,
239 default_graph,
240 })
241}
242
243// ---------------------------------------------------------------------------
244// AssemblerBuilder
245// ---------------------------------------------------------------------------
246
247/// Parses Jena Assembler documents into [`AssemblerConfig`] values.
248pub struct AssemblerBuilder;
249
250impl AssemblerBuilder {
251 /// Parse an `(subject, predicate, object)` triple set into an
252 /// [`AssemblerConfig`].
253 ///
254 /// Each element of the slice is a `(String, String, String)` tuple where:
255 /// - Subjects are bare IRI strings or `"_:id"` for blank nodes.
256 /// - Predicates are bare IRI strings.
257 /// - Objects are bare IRI strings, `"_:id"` blank nodes, or N-Triples
258 /// quoted literals (e.g. `"\"/data/db\""`).
259 ///
260 /// This function is the primary testing seam; it does not require a Turtle
261 /// parser and works entirely from pre-expanded triples.
262 pub fn from_triples(
263 triples: &[(String, String, String)],
264 ) -> Result<AssemblerConfig, AssemblerError> {
265 let map = adjacency_map(triples);
266
267 // Collect all subjects that are typed as dataset resources.
268 //
269 // Strategy: find every subject that has an `rdf:type` triple whose
270 // object is one of the recognised Jena/TDB2 dataset class IRIs. When
271 // the type IRI is unrecognised, a `DatasetConfig` with
272 // `backend: Unknown(type_iri)` is still produced — this preserves
273 // information for callers that handle proprietary or future extensions.
274 //
275 // Blank-node subjects are skipped: they are intermediate nodes (e.g.
276 // named-graph descriptions), not top-level dataset resources.
277 let mut datasets: Vec<DatasetConfig> = Vec::new();
278
279 // Collect (subject, type_iri) pairs for all typed, non-blank subjects
280 let typed_subjects: Vec<(String, String)> = map
281 .iter()
282 .filter(|(subject, _)| !subject.starts_with("_:"))
283 .flat_map(|(subject, pairs)| {
284 pairs
285 .iter()
286 .filter(|(pred, _)| pred == RDF_TYPE)
287 .map(|(_, obj)| (subject.clone(), obj.clone()))
288 .collect::<Vec<_>>()
289 })
290 .collect();
291
292 for (subject, type_iri) in typed_subjects {
293 match build_dataset_config(&map, &subject, &type_iri) {
294 Ok(cfg) => datasets.push(cfg),
295 Err(e) => return Err(e),
296 }
297 }
298
299 // Stable ordering by resource IRI for deterministic output
300 datasets.sort_by(|a, b| a.resource_iri.cmp(&b.resource_iri));
301
302 Ok(AssemblerConfig { datasets })
303 }
304
305 /// Parse a Turtle-format Jena Assembler document into an
306 /// [`AssemblerConfig`].
307 ///
308 /// Uses `oxttl` (a workspace dependency) to parse the Turtle, then
309 /// delegates to [`Self::from_triples`].
310 ///
311 /// # Example
312 ///
313 /// ```rust
314 /// use oxirs_core::assembler::AssemblerBuilder;
315 ///
316 /// let ttl = r#"
317 /// @prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
318 /// <http://example.org/ds> a ja:MemoryDataset .
319 /// "#;
320 /// let config = AssemblerBuilder::from_turtle(ttl).unwrap();
321 /// assert_eq!(config.len(), 1);
322 /// ```
323 pub fn from_turtle(input: &str) -> Result<AssemblerConfig, AssemblerError> {
324 let reader = Cursor::new(input.as_bytes());
325 let parser = oxttl::TurtleParser::new().lenient();
326
327 let mut triples: Vec<(String, String, String)> = Vec::new();
328
329 for result in parser.for_reader(reader) {
330 match result {
331 Ok(triple) => {
332 let subject = subject_to_key(&triple.subject);
333 let predicate = triple.predicate.as_str().to_owned();
334 let object = term_to_value(&triple.object);
335 triples.push((subject, predicate, object));
336 }
337 Err(e) => {
338 return Err(AssemblerError::ParseError(e.to_string()));
339 }
340 }
341 }
342
343 Self::from_triples(&triples)
344 }
345}
346
347// ---------------------------------------------------------------------------
348// oxrdf term → string helpers
349// ---------------------------------------------------------------------------
350
351fn subject_to_key(subject: &oxrdf::NamedOrBlankNode) -> String {
352 match subject {
353 oxrdf::NamedOrBlankNode::NamedNode(n) => n.as_str().to_owned(),
354 oxrdf::NamedOrBlankNode::BlankNode(b) => format!("_:{}", b.as_str()),
355 }
356}
357
358fn term_to_value(term: &oxrdf::Term) -> String {
359 match term {
360 oxrdf::Term::NamedNode(n) => n.as_str().to_owned(),
361 oxrdf::Term::BlankNode(b) => format!("_:{}", b.as_str()),
362 oxrdf::Term::Literal(lit) => {
363 // Store as a quoted string so strip_literal can unwrap it later
364 format!("\"{}\"", lit.value())
365 }
366 // RDF-star triple terms — use N-Triples form
367 #[allow(unreachable_patterns)]
368 _ => term.to_string(),
369 }
370}