Skip to main content

oxirs_core/assembler/
builder.rs

1//! Jena Assembler document parser.
2//!
3//! Converts a Turtle-format Jena Assembler document (or a pre-expanded set of
4//! string triples) into an [`AssemblerConfig`] value.
5//!
6//! ## Design
7//!
8//! The implementation has two layers:
9//!
10//! 1. **[`AssemblerBuilder::from_triples`]** — the core graph-walking logic.
11//!    Accepts `(subject, predicate, object)` string tuples (IRIs and literals
12//!    in N-Triples-like form, already expanded), and produces the config.
13//!    This is the primary testing seam: it works without any external parser.
14//!
15//! 2. **[`AssemblerBuilder::from_turtle`]** — uses `oxttl` (already a workspace
16//!    dep) to parse Turtle into `oxrdf::Triple` values, then converts each
17//!    triple into a string tuple and delegates to `from_triples`.
18//!
19//! ## Triple representation for `from_triples`
20//!
21//! Subjects and objects that are IRIs are stored as bare IRI strings
22//! (e.g. `"http://example.org/ds"`).  Blank nodes are stored with a leading
23//! `"_:"` sigil (e.g. `"_:b0"`).  Literal objects are stored with their
24//! N-Triples quotation (e.g. `"\"/data/db\""` or `"\"hello\"^^<xsd:string>"`).
25//! The builder strips the surrounding double-quotes when it extracts literal
26//! values.
27
28use std::collections::HashMap;
29use std::io::Cursor;
30use std::path::PathBuf;
31
32use super::config::{AssemblerConfig, DatasetConfig, GraphConfig, StoreBackend};
33use super::vocab::{
34    JA_CONTENT_URL, JA_DEFAULT_GRAPH, JA_GRAPH, JA_GRAPH_NAME, JA_MEMORY_DATASET, JA_MEMORY_MODEL,
35    JA_NAMED_GRAPH, JA_RDF_DATASET, RDF_TYPE, TDB2_DATASET, TDB2_LOCATION,
36};
37
38// ---------------------------------------------------------------------------
39// AssemblerError
40// ---------------------------------------------------------------------------
41
/// Failure modes of the Jena Assembler parser.
#[derive(Debug)]
pub enum AssemblerError {
    /// Parsing the Turtle source failed; the payload is the parser's message.
    ParseError(String),

    /// A triple that must be present was not found in the graph.
    ///
    /// `resource` identifies the subject that lacks the triple and `property`
    /// is the predicate IRI that was expected on it.
    MissingRequired { resource: String, property: String },

    /// A recognised backend type could not be instantiated because its
    /// location value is unusable (e.g. an empty `tdb2:location`).
    InvalidLocation(String),
}

impl std::fmt::Display for AssemblerError {
    /// Render a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::MissingRequired { resource, property } => {
                write!(f, "Missing required property <{property}> on <{resource}>")
            }
            Self::InvalidLocation(msg) => write!(f, "Invalid tdb2:location: {msg}"),
            Self::ParseError(msg) => write!(f, "Assembler parse error: {msg}"),
        }
    }
}

// No variant wraps an underlying error, so the default `source()` is kept.
impl std::error::Error for AssemblerError {}
71
72// ---------------------------------------------------------------------------
73// Internal adjacency map
74// ---------------------------------------------------------------------------
75
/// Lightweight in-memory graph index: each subject maps to the
/// `(predicate, object)` pairs asserted about it, in input order.
///
/// Subjects are stored exactly as supplied: bare IRIs or `_:`-prefixed
/// blank-node identifiers.
type AdjMap = HashMap<String, Vec<(String, String)>>;

/// Index `triples` by subject.
///
/// Every triple contributes one `(predicate, object)` entry to its subject's
/// list; entries keep the relative order in which the triples appear in the
/// slice.  An empty slice yields an empty map.
fn adjacency_map(triples: &[(String, String, String)]) -> AdjMap {
    triples
        .iter()
        .fold(AdjMap::new(), |mut index, (subject, predicate, object)| {
            index
                .entry(subject.clone())
                .or_default()
                .push((predicate.clone(), object.clone()));
            index
        })
}
90
91/// Return all objects for the given subject + predicate pair.
92fn objects_of<'a>(map: &'a AdjMap, subject: &str, predicate: &str) -> Vec<&'a str> {
93    match map.get(subject) {
94        None => vec![],
95        Some(pairs) => pairs
96            .iter()
97            .filter(|(p, _)| p == predicate)
98            .map(|(_, o)| o.as_str())
99            .collect(),
100    }
101}
102
103/// Return the first object for the given subject + predicate pair.
104fn first_object<'a>(map: &'a AdjMap, subject: &str, predicate: &str) -> Option<&'a str> {
105    objects_of(map, subject, predicate).into_iter().next()
106}
107
108// ---------------------------------------------------------------------------
109// Literal stripping
110// ---------------------------------------------------------------------------
111
/// Return the lexical form of an N-Triples-like literal.
///
/// Accepted literal shapes and their results:
/// - `"some text"` → `some text`
/// - `"some text"@en` → `some text`
/// - `"some text"^^<...>` → `some text`
///
/// The value is everything between the opening quote and the *last* quote in
/// the string, so quotes inside the text are kept and no unescaping is done.
/// If no closing quote exists, everything after the opening quote is returned.
///
/// Input that does not start with a double quote (an IRI or blank-node key)
/// is handed back unchanged.
fn strip_literal(raw: &str) -> &str {
    match raw.strip_prefix('"') {
        // Not a literal: nothing to unwrap.
        None => raw,
        // Cut at the final quote; any `@lang` / `^^type` suffix follows it.
        Some(rest) => rest.rsplit_once('"').map_or(rest, |(value, _suffix)| value),
    }
}
129
130// ---------------------------------------------------------------------------
131// Backend resolution
132// ---------------------------------------------------------------------------
133
134fn resolve_backend(
135    map: &AdjMap,
136    resource: &str,
137    type_iri: &str,
138) -> Result<StoreBackend, AssemblerError> {
139    if type_iri == JA_MEMORY_MODEL || type_iri == JA_MEMORY_DATASET || type_iri == JA_RDF_DATASET {
140        Ok(StoreBackend::InMemory)
141    } else if type_iri == TDB2_DATASET {
142        let loc_raw = first_object(map, resource, TDB2_LOCATION).ok_or_else(|| {
143            AssemblerError::MissingRequired {
144                resource: resource.to_owned(),
145                property: TDB2_LOCATION.to_owned(),
146            }
147        })?;
148        let loc_str = strip_literal(loc_raw);
149        if loc_str.is_empty() {
150            return Err(AssemblerError::InvalidLocation(
151                "tdb2:location value is empty".to_owned(),
152            ));
153        }
154        Ok(StoreBackend::Tdb2 {
155            location: PathBuf::from(loc_str),
156        })
157    } else {
158        Ok(StoreBackend::Unknown(type_iri.to_owned()))
159    }
160}
161
162// ---------------------------------------------------------------------------
163// GraphConfig resolution
164// ---------------------------------------------------------------------------
165
166/// Build a [`GraphConfig`] for a blank-node or IRI `graph_resource` subject.
167///
168/// `graph_name` is the named-graph IRI (or `None` for the default graph).
169fn build_graph_config(
170    map: &AdjMap,
171    graph_resource: &str,
172    graph_name: Option<String>,
173) -> GraphConfig {
174    // Determine backend from rdf:type on the model resource (best-effort; default InMemory)
175    let backend = objects_of(map, graph_resource, RDF_TYPE)
176        .into_iter()
177        .find_map(|type_iri| resolve_backend(map, graph_resource, type_iri).ok())
178        .unwrap_or(StoreBackend::InMemory);
179
180    // Collect ja:contentURL values from the model resource itself and any
181    // linked ja:content blank node.
182    let mut content_urls: Vec<String> = Vec::new();
183
184    // Direct contentURL on model resource
185    for url_raw in objects_of(map, graph_resource, JA_CONTENT_URL) {
186        content_urls.push(strip_literal(url_raw).to_owned());
187    }
188
189    // Indirect via ja:content → blank node → ja:contentURL
190    for content_bnode in objects_of(map, graph_resource, super::vocab::JA_CONTENT) {
191        for url_raw in objects_of(map, content_bnode, JA_CONTENT_URL) {
192            content_urls.push(strip_literal(url_raw).to_owned());
193        }
194    }
195
196    GraphConfig {
197        graph_name,
198        backend,
199        content_urls,
200    }
201}
202
203// ---------------------------------------------------------------------------
204// DatasetConfig resolution
205// ---------------------------------------------------------------------------
206
207fn build_dataset_config(
208    map: &AdjMap,
209    resource: &str,
210    type_iri: &str,
211) -> Result<DatasetConfig, AssemblerError> {
212    let backend = resolve_backend(map, resource, type_iri)?;
213
214    // Collect named-graph descriptions: ja:namedGraph → blank node with
215    //   ja:graphName <iri> and ja:graph <model>
216    let mut named_graphs: Vec<GraphConfig> = Vec::new();
217
218    for ng_bnode in objects_of(map, resource, JA_NAMED_GRAPH) {
219        // ja:graphName gives the named-graph IRI
220        let graph_name = first_object(map, ng_bnode, JA_GRAPH_NAME).map(|s| s.to_owned());
221
222        // ja:graph gives the model resource
223        if let Some(model_resource) = first_object(map, ng_bnode, JA_GRAPH) {
224            named_graphs.push(build_graph_config(map, model_resource, graph_name));
225        } else {
226            // No model resource — treat the blank node itself as the model
227            named_graphs.push(build_graph_config(map, ng_bnode, graph_name));
228        }
229    }
230
231    // Default graph: ja:defaultGraph → model resource
232    let default_graph = first_object(map, resource, JA_DEFAULT_GRAPH)
233        .map(|model_resource| build_graph_config(map, model_resource, None));
234
235    Ok(DatasetConfig {
236        resource_iri: resource.to_owned(),
237        backend,
238        named_graphs,
239        default_graph,
240    })
241}
242
243// ---------------------------------------------------------------------------
244// AssemblerBuilder
245// ---------------------------------------------------------------------------
246
247/// Parses Jena Assembler documents into [`AssemblerConfig`] values.
248pub struct AssemblerBuilder;
249
250impl AssemblerBuilder {
251    /// Parse an `(subject, predicate, object)` triple set into an
252    /// [`AssemblerConfig`].
253    ///
254    /// Each element of the slice is a `(String, String, String)` tuple where:
255    /// - Subjects are bare IRI strings or `"_:id"` for blank nodes.
256    /// - Predicates are bare IRI strings.
257    /// - Objects are bare IRI strings, `"_:id"` blank nodes, or N-Triples
258    ///   quoted literals (e.g. `"\"/data/db\""`).
259    ///
260    /// This function is the primary testing seam; it does not require a Turtle
261    /// parser and works entirely from pre-expanded triples.
262    pub fn from_triples(
263        triples: &[(String, String, String)],
264    ) -> Result<AssemblerConfig, AssemblerError> {
265        let map = adjacency_map(triples);
266
267        // Collect all subjects that are typed as dataset resources.
268        //
269        // Strategy: find every subject that has an `rdf:type` triple whose
270        // object is one of the recognised Jena/TDB2 dataset class IRIs.  When
271        // the type IRI is unrecognised, a `DatasetConfig` with
272        // `backend: Unknown(type_iri)` is still produced — this preserves
273        // information for callers that handle proprietary or future extensions.
274        //
275        // Blank-node subjects are skipped: they are intermediate nodes (e.g.
276        // named-graph descriptions), not top-level dataset resources.
277        let mut datasets: Vec<DatasetConfig> = Vec::new();
278
279        // Collect (subject, type_iri) pairs for all typed, non-blank subjects
280        let typed_subjects: Vec<(String, String)> = map
281            .iter()
282            .filter(|(subject, _)| !subject.starts_with("_:"))
283            .flat_map(|(subject, pairs)| {
284                pairs
285                    .iter()
286                    .filter(|(pred, _)| pred == RDF_TYPE)
287                    .map(|(_, obj)| (subject.clone(), obj.clone()))
288                    .collect::<Vec<_>>()
289            })
290            .collect();
291
292        for (subject, type_iri) in typed_subjects {
293            match build_dataset_config(&map, &subject, &type_iri) {
294                Ok(cfg) => datasets.push(cfg),
295                Err(e) => return Err(e),
296            }
297        }
298
299        // Stable ordering by resource IRI for deterministic output
300        datasets.sort_by(|a, b| a.resource_iri.cmp(&b.resource_iri));
301
302        Ok(AssemblerConfig { datasets })
303    }
304
305    /// Parse a Turtle-format Jena Assembler document into an
306    /// [`AssemblerConfig`].
307    ///
308    /// Uses `oxttl` (a workspace dependency) to parse the Turtle, then
309    /// delegates to [`Self::from_triples`].
310    ///
311    /// # Example
312    ///
313    /// ```rust
314    /// use oxirs_core::assembler::AssemblerBuilder;
315    ///
316    /// let ttl = r#"
317    ///     @prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
318    ///     <http://example.org/ds> a ja:MemoryDataset .
319    /// "#;
320    /// let config = AssemblerBuilder::from_turtle(ttl).unwrap();
321    /// assert_eq!(config.len(), 1);
322    /// ```
323    pub fn from_turtle(input: &str) -> Result<AssemblerConfig, AssemblerError> {
324        let reader = Cursor::new(input.as_bytes());
325        let parser = oxttl::TurtleParser::new().lenient();
326
327        let mut triples: Vec<(String, String, String)> = Vec::new();
328
329        for result in parser.for_reader(reader) {
330            match result {
331                Ok(triple) => {
332                    let subject = subject_to_key(&triple.subject);
333                    let predicate = triple.predicate.as_str().to_owned();
334                    let object = term_to_value(&triple.object);
335                    triples.push((subject, predicate, object));
336                }
337                Err(e) => {
338                    return Err(AssemblerError::ParseError(e.to_string()));
339                }
340            }
341        }
342
343        Self::from_triples(&triples)
344    }
345}
346
347// ---------------------------------------------------------------------------
348// oxrdf term → string helpers
349// ---------------------------------------------------------------------------
350
351fn subject_to_key(subject: &oxrdf::NamedOrBlankNode) -> String {
352    match subject {
353        oxrdf::NamedOrBlankNode::NamedNode(n) => n.as_str().to_owned(),
354        oxrdf::NamedOrBlankNode::BlankNode(b) => format!("_:{}", b.as_str()),
355    }
356}
357
358fn term_to_value(term: &oxrdf::Term) -> String {
359    match term {
360        oxrdf::Term::NamedNode(n) => n.as_str().to_owned(),
361        oxrdf::Term::BlankNode(b) => format!("_:{}", b.as_str()),
362        oxrdf::Term::Literal(lit) => {
363            // Store as a quoted string so strip_literal can unwrap it later
364            format!("\"{}\"", lit.value())
365        }
366        // RDF-star triple terms — use N-Triples form
367        #[allow(unreachable_patterns)]
368        _ => term.to_string(),
369    }
370}