Skip to main content

shifty_engine/
frozen.rs

1//! Immutable dictionary-encoded RDF dataset built at the inference→validation
2//! boundary. Implements `spareval::QueryableDataset` so Oxigraph's prepared
3//! SPARQL evaluator can execute queries against it without a mutable Store.
4//!
5//! Stage 2 of `docs/05-sparql-execution.md`.
6
7use std::cell::RefCell;
8use std::collections::{HashMap, HashSet};
9use std::convert::Infallible;
10use std::rc::Rc;
11
12use crate::path_plan::ReachStep;
13use oxrdf::{Graph, NamedNode, Term};
14use shifty_opt::ClosureKind;
15use spareval::{InternalQuad, QueryableDataset};
16
17/// IRI under which the shapes graph is loaded into the named-graph slot.
18pub(crate) const SHAPES_GRAPH_IRI: &str = "urn:x-shacl:shapes-graph";
19
20/// Dictionary index for a term. Chosen as `u32` to keep triple arrays at 12
21/// bytes each; the 4B ceiling is unreachable in practice.
22pub type TermId = u32;
23
24/// Which graph a scan reads from.
25#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
26pub(crate) enum GraphSel {
27    /// The default (data) graph.
28    Default,
29    /// A specific named graph, identified by its graph-IRI's `TermId`.
30    Named(TermId),
31}
32
33// ── Term dictionary ──────────────────────────────────────────────────────────
34
35struct TermDictInner {
36    id_to_term: Vec<Term>,
37    term_to_id: HashMap<Term, TermId>,
38}
39
40/// Bidirectional Term ↔ TermId map. Interior-mutable so that
41/// `QueryableDataset::internalize_term` (which takes `&self`) can lazily
42/// assign IDs to query constants that did not appear in the loaded triples.
43struct TermDictionary(RefCell<TermDictInner>);
44
45impl TermDictionary {
46    fn new() -> Self {
47        Self(RefCell::new(TermDictInner {
48            id_to_term: Vec::new(),
49            term_to_id: HashMap::new(),
50        }))
51    }
52
53    /// Return existing ID or assign a new one.
54    fn intern(&self, term: Term) -> TermId {
55        let mut inner = self.0.borrow_mut();
56        if let Some(&id) = inner.term_to_id.get(&term) {
57            return id;
58        }
59        let id = inner.id_to_term.len() as TermId;
60        inner.term_to_id.insert(term.clone(), id);
61        inner.id_to_term.push(term);
62        id
63    }
64
65    #[cfg(test)]
66    fn get(&self, term: &Term) -> Option<TermId> {
67        self.0.borrow().term_to_id.get(term).copied()
68    }
69
70    fn externalize(&self, id: TermId) -> Option<Term> {
71        self.0.borrow().id_to_term.get(id as usize).cloned()
72    }
73}
74
75// ── Triple index ─────────────────────────────────────────────────────────────
76
77/// Three sorted arrays giving the six access patterns documented in §161:
78/// `spo` covers S-first and membership; `pos` covers P-first lookups;
79/// `osp` covers O-first lookups.
80struct TripleIndex {
81    spo: Vec<[TermId; 3]>, // elements are [s, p, o]
82    pos: Vec<[TermId; 3]>, // elements are [p, o, s]
83    osp: Vec<[TermId; 3]>, // elements are [o, s, p]
84}
85
86impl TripleIndex {
87    fn build(mut triples: Vec<[TermId; 3]>) -> Self {
88        triples.sort_unstable();
89        triples.dedup();
90
91        let mut pos: Vec<[TermId; 3]> = triples.iter().map(|&[s, p, o]| [p, o, s]).collect();
92        pos.sort_unstable();
93
94        let mut osp: Vec<[TermId; 3]> = triples.iter().map(|&[s, p, o]| [o, s, p]).collect();
95        osp.sort_unstable();
96
97        Self {
98            spo: triples,
99            pos,
100            osp,
101        }
102    }
103
104    fn contains(&self, s: TermId, p: TermId, o: TermId) -> bool {
105        self.spo.binary_search(&[s, p, o]).is_ok()
106    }
107
108    fn extend(&mut self, triples: Vec<[TermId; 3]>) {
109        if triples.is_empty() {
110            return;
111        }
112        self.spo.extend(triples.iter().copied());
113        self.spo.sort_unstable();
114        self.spo.dedup();
115
116        self.pos.extend(triples.iter().map(|&[s, p, o]| [p, o, s]));
117        self.pos.sort_unstable();
118        self.pos.dedup();
119
120        self.osp
121            .extend(triples.into_iter().map(|[s, p, o]| [o, s, p]));
122        self.osp.sort_unstable();
123        self.osp.dedup();
124    }
125
126    fn range_s(&self, s: TermId) -> &[[TermId; 3]] {
127        let lo = self.spo.partition_point(|t| t[0] < s);
128        let hi = self.spo.partition_point(|t| t[0] <= s);
129        &self.spo[lo..hi]
130    }
131
132    fn range_sp(&self, s: TermId, p: TermId) -> &[[TermId; 3]] {
133        let lo = self.spo.partition_point(|t| (t[0], t[1]) < (s, p));
134        let hi = self.spo.partition_point(|t| (t[0], t[1]) <= (s, p));
135        &self.spo[lo..hi]
136    }
137
138    fn range_p(&self, p: TermId) -> &[[TermId; 3]] {
139        let lo = self.pos.partition_point(|t| t[0] < p);
140        let hi = self.pos.partition_point(|t| t[0] <= p);
141        &self.pos[lo..hi]
142    }
143
144    fn range_po(&self, p: TermId, o: TermId) -> &[[TermId; 3]] {
145        let lo = self.pos.partition_point(|t| (t[0], t[1]) < (p, o));
146        let hi = self.pos.partition_point(|t| (t[0], t[1]) <= (p, o));
147        &self.pos[lo..hi]
148    }
149
150    fn range_o(&self, o: TermId) -> &[[TermId; 3]] {
151        let lo = self.osp.partition_point(|t| t[0] < o);
152        let hi = self.osp.partition_point(|t| t[0] <= o);
153        &self.osp[lo..hi]
154    }
155}
156
157// ── Statistics ───────────────────────────────────────────────────────────────
158
159/// Dataset statistics used in stage 3+ for join-order planning (doc §198).
160pub struct DatasetStatistics {
161    pub triple_count: u64,
162    pub distinct_subjects: u64,
163    pub distinct_objects: u64,
164    /// Triples per predicate ID.
165    pub predicate_cardinality: HashMap<TermId, u64>,
166    subjects: HashSet<TermId>,
167    objects: HashSet<TermId>,
168}
169
170impl DatasetStatistics {
171    fn compute(triples: &TripleIndex) -> Self {
172        let triple_count = triples.spo.len() as u64;
173        let subjects: HashSet<TermId> = triples.spo.iter().map(|t| t[0]).collect();
174        let objects: HashSet<TermId> = triples.spo.iter().map(|t| t[2]).collect();
175        let mut predicate_cardinality: HashMap<TermId, u64> = HashMap::new();
176        for &[_, p, _] in &triples.spo {
177            *predicate_cardinality.entry(p).or_insert(0) += 1;
178        }
179
180        Self {
181            triple_count,
182            distinct_subjects: subjects.len() as u64,
183            distinct_objects: objects.len() as u64,
184            predicate_cardinality,
185            subjects,
186            objects,
187        }
188    }
189
190    fn extend(&mut self, triples: &[[TermId; 3]]) {
191        self.triple_count += triples.len() as u64;
192        for &[subject, predicate, object] in triples {
193            self.subjects.insert(subject);
194            self.objects.insert(object);
195            *self.predicate_cardinality.entry(predicate).or_insert(0) += 1;
196        }
197        self.distinct_subjects = self.subjects.len() as u64;
198        self.distinct_objects = self.objects.len() as u64;
199    }
200}
201
202// ── FrozenIndexedDataset ─────────────────────────────────────────────────────
203
204/// Immutable, dictionary-encoded snapshot of a post-inference RDF dataset.
205/// Intended to be built once at the inference→validation boundary and shared
206/// across all per-focus-node SPARQL evaluations.
207pub struct FrozenIndexedDataset {
208    terms: TermDictionary,
209    default_graph: TripleIndex,
210    /// Named graphs keyed by the graph-IRI's TermId. Each value is a
211    /// simple SPO-sorted vec (named graphs are typically small).
212    named_graphs: HashMap<TermId, Vec<[TermId; 3]>>,
213    reach_cache: RefCell<ReachCache>,
214    pub stats: DatasetStatistics,
215}
216
217const MAX_CACHED_REACH_IDS: usize = 1_000_000;
218
219#[derive(PartialEq, Eq, Hash)]
220struct ReachCacheKey {
221    node: TermId,
222    step: ReachStep,
223    kind: ClosureKind,
224    graph: GraphSel,
225}
226
227#[derive(Default)]
228struct ReachCache {
229    entries: HashMap<ReachCacheKey, Rc<HashSet<TermId>>>,
230    cached_ids: usize,
231}
232
233impl FrozenIndexedDataset {
234    /// Build from a single graph loaded into the default graph slot.
235    pub fn from_graph(graph: &Graph) -> Self {
236        let terms = TermDictionary::new();
237        let triples = intern_graph(graph, &terms);
238        let default_graph = TripleIndex::build(triples);
239        let stats = DatasetStatistics::compute(&default_graph);
240        Self {
241            terms,
242            default_graph,
243            named_graphs: HashMap::new(),
244            reach_cache: RefCell::new(ReachCache::default()),
245            stats,
246        }
247    }
248
249    /// Build a default graph from the set union of two source graphs without
250    /// materializing an intermediate `Graph`.
251    pub fn from_graph_union(left: &Graph, right: &Graph) -> Self {
252        let terms = TermDictionary::new();
253        let mut triples = intern_graph(left, &terms);
254        triples.extend(intern_graph(right, &terms));
255        let default_graph = TripleIndex::build(triples);
256        let stats = DatasetStatistics::compute(&default_graph);
257        Self {
258            terms,
259            default_graph,
260            named_graphs: HashMap::new(),
261            reach_cache: RefCell::new(ReachCache::default()),
262            stats,
263        }
264    }
265
266    /// Intern a term against this dataset's dictionary, returning its `TermId`.
267    /// Unknown terms (e.g. query constants absent from the data) receive a fresh
268    /// id that matches no stored triple — exactly the semantics a scan needs.
269    pub(crate) fn intern(&self, term: &Term) -> TermId {
270        self.terms.intern(term.clone())
271    }
272
273    /// Build a [`PlanStats`] for use by the query planner. Converts TermId-keyed
274    /// statistics to Term-keyed form so the planner can look up predicate
275    /// cardinalities without touching the dictionary directly.
276    pub(crate) fn plan_stats(&self) -> shifty_opt::PlanStats {
277        let predicate_cardinality: HashMap<Term, u64> = self
278            .stats
279            .predicate_cardinality
280            .iter()
281            .filter_map(|(&id, &count)| Some((self.externalize(id)?, count)))
282            .collect();
283        let distinct_predicates = predicate_cardinality.len() as u64;
284        shifty_opt::PlanStats {
285            total_triples: self.stats.triple_count,
286            distinct_subjects: self.stats.distinct_subjects,
287            distinct_objects: self.stats.distinct_objects,
288            distinct_predicates,
289            predicate_cardinality,
290        }
291    }
292
293    /// Map a `TermId` back to its RDF term. Returns `None` only for ids that did
294    /// not originate from this dataset.
295    pub(crate) fn externalize(&self, id: TermId) -> Option<Term> {
296        self.terms.externalize(id)
297    }
298
299    /// Like [`externalize`](Self::externalize) but for ids that are guaranteed to
300    /// originate from this dataset (e.g. a [`scan`](Self::scan) result), so the
301    /// lookup cannot fail.
302    pub(crate) fn externalize_id(&self, id: TermId) -> Term {
303        self.terms
304            .externalize(id)
305            .expect("scanned TermId originates from this dataset")
306    }
307
308    pub(crate) fn contains_ids(&self, subject: TermId, predicate: TermId, object: TermId) -> bool {
309        self.default_graph.contains(subject, predicate, object)
310    }
311
312    pub(crate) fn encode_triple(&self, triple: &oxrdf::Triple) -> [TermId; 3] {
313        [
314            self.terms.intern(triple.subject.clone().into()),
315            self.terms.intern(Term::NamedNode(triple.predicate.clone())),
316            self.terms.intern(triple.object.clone()),
317        ]
318    }
319
320    pub(crate) fn cached_reach(
321        &self,
322        node: TermId,
323        step: &ReachStep,
324        kind: ClosureKind,
325        graph: GraphSel,
326    ) -> Option<Rc<HashSet<TermId>>> {
327        self.reach_cache
328            .borrow()
329            .entries
330            .get(&ReachCacheKey {
331                node,
332                step: step.clone(),
333                kind,
334                graph,
335            })
336            .cloned()
337    }
338
339    pub(crate) fn cache_reach(
340        &self,
341        node: TermId,
342        step: &ReachStep,
343        kind: ClosureKind,
344        graph: GraphSel,
345        result: Rc<HashSet<TermId>>,
346    ) {
347        let mut cache = self.reach_cache.borrow_mut();
348        if cache.cached_ids.saturating_add(result.len()) > MAX_CACHED_REACH_IDS {
349            return;
350        }
351        let key = ReachCacheKey {
352            node,
353            step: step.clone(),
354            kind,
355            graph,
356        };
357        if cache.entries.contains_key(&key) {
358            return;
359        }
360        cache.cached_ids += result.len();
361        cache.entries.insert(key, result);
362    }
363
364    /// Scan triples in the selected graph matching an optional S/P/O pattern,
365    /// yielding `(subject, predicate, object)` term-id triples. Reuses the same
366    /// sorted-index access paths as the `QueryableDataset` impl.
367    pub(crate) fn scan(
368        &self,
369        s: Option<TermId>,
370        p: Option<TermId>,
371        o: Option<TermId>,
372        graph: GraphSel,
373    ) -> Box<dyn Iterator<Item = [TermId; 3]> + '_> {
374        let iter = match graph {
375            GraphSel::Default => default_graph_quads(self, s, p, o),
376            GraphSel::Named(g) => named_graph_quads(self, g, s, p, o),
377        };
378        Box::new(iter.map(|q| {
379            let q = q.expect("infallible");
380            [q.subject, q.predicate, q.object]
381        }))
382    }
383
384    pub(crate) fn triples_for_predicate(
385        &self,
386        predicate: &NamedNode,
387    ) -> Box<dyn Iterator<Item = (Term, Term)> + '_> {
388        let p = self.intern(&Term::NamedNode(predicate.clone()));
389        Box::new(
390            self.scan(None, Some(p), None, GraphSel::Default)
391                .map(|[subject, _, object]| {
392                    (self.externalize_id(subject), self.externalize_id(object))
393                }),
394        )
395    }
396
397    pub(crate) fn outgoing(
398        &self,
399        subject: &Term,
400    ) -> Box<dyn Iterator<Item = (NamedNode, Term)> + '_> {
401        let s = self.intern(subject);
402        Box::new(
403            self.scan(Some(s), None, None, GraphSel::Default)
404                .filter_map(|[_, predicate, object]| {
405                    let Term::NamedNode(predicate) = self.externalize_id(predicate) else {
406                        return None;
407                    };
408                    Some((predicate, self.externalize_id(object)))
409                }),
410        )
411    }
412
413    /// Add a committed inference batch to the default-graph indexes.
414    pub(crate) fn extend_triples<'a>(
415        &mut self,
416        triples: impl IntoIterator<Item = &'a oxrdf::Triple>,
417    ) {
418        let mut encoded: Vec<_> = triples
419            .into_iter()
420            .map(|triple| self.encode_triple(triple))
421            .collect();
422        encoded.sort_unstable();
423        encoded.dedup();
424        encoded.retain(|&[s, p, o]| !self.default_graph.contains(s, p, o));
425        self.stats.extend(&encoded);
426        self.default_graph.extend(encoded);
427        *self.reach_cache.borrow_mut() = ReachCache::default();
428    }
429
430    /// Build with `context` in the default graph and `shapes` in the named
431    /// graph `urn:x-shacl:shapes-graph`, mirroring what `SparqlExecutor::build`
432    /// does with the Oxigraph Store.
433    pub fn from_graphs(context: &Graph, shapes: &Graph) -> Self {
434        let terms = TermDictionary::new();
435        let triples = intern_graph(context, &terms);
436        let default_graph = TripleIndex::build(triples);
437        let stats = DatasetStatistics::compute(&default_graph);
438
439        let shapes_iri = NamedNode::new(SHAPES_GRAPH_IRI).expect("static IRI is valid");
440        let graph_id = terms.intern(Term::NamedNode(shapes_iri));
441        let mut named_triples = intern_graph(shapes, &terms);
442        named_triples.sort_unstable();
443        named_triples.dedup();
444
445        let mut named_graphs = HashMap::new();
446        named_graphs.insert(graph_id, named_triples);
447
448        Self {
449            terms,
450            default_graph,
451            named_graphs,
452            reach_cache: RefCell::new(ReachCache::default()),
453            stats,
454        }
455    }
456
457    /// Build a union default graph while also exposing `shapes` through the
458    /// named `$shapesGraph` slot.
459    pub fn from_graph_union_with_shapes(data: &Graph, shapes: &Graph) -> Self {
460        let terms = TermDictionary::new();
461        let mut triples = intern_graph(data, &terms);
462        triples.extend(intern_graph(shapes, &terms));
463        let default_graph = TripleIndex::build(triples);
464        let stats = DatasetStatistics::compute(&default_graph);
465
466        let shapes_iri = NamedNode::new(SHAPES_GRAPH_IRI).expect("static IRI is valid");
467        let graph_id = terms.intern(Term::NamedNode(shapes_iri));
468        let mut named_triples = intern_graph(shapes, &terms);
469        named_triples.sort_unstable();
470        named_triples.dedup();
471        let mut named_graphs = HashMap::new();
472        named_graphs.insert(graph_id, named_triples);
473
474        Self {
475            terms,
476            default_graph,
477            named_graphs,
478            reach_cache: RefCell::new(ReachCache::default()),
479            stats,
480        }
481    }
482}
483
484fn intern_graph(graph: &Graph, terms: &TermDictionary) -> Vec<[TermId; 3]> {
485    graph
486        .iter()
487        .map(|triple| {
488            let s = terms.intern(triple.subject.into_owned().into());
489            let p = terms.intern(Term::NamedNode(triple.predicate.into_owned()));
490            let o = terms.intern(triple.object.into_owned());
491            [s, p, o]
492        })
493        .collect()
494}
495
496// ── QueryableDataset impl ────────────────────────────────────────────────────
497
498#[allow(refining_impl_trait)]
499impl<'a> QueryableDataset<'a> for &'a FrozenIndexedDataset {
500    type InternalTerm = TermId;
501    type Error = Infallible;
502
503    fn internal_quads_for_pattern(
504        &self,
505        subject: Option<&TermId>,
506        predicate: Option<&TermId>,
507        object: Option<&TermId>,
508        graph_name: Option<Option<&TermId>>,
509    ) -> QuadIter<'a> {
510        // Dereference once to get &'a FrozenIndexedDataset, preserving the full
511        // 'a lifetime instead of the shorter borrow lifetime of &self. (Not an
512        // auto-deref: the explicit `*` is what keeps the `'a` lifetime.)
513        #[allow(clippy::explicit_auto_deref)]
514        let ds: &'a FrozenIndexedDataset = *self;
515        let s = subject.copied();
516        let p = predicate.copied();
517        let o = object.copied();
518        match graph_name {
519            Some(None) => default_graph_quads(ds, s, p, o),
520            Some(Some(&g)) => named_graph_quads(ds, g, s, p, o),
521            None => all_named_quads(ds, s, p, o),
522        }
523    }
524
525    fn internalize_term(&self, term: Term) -> Result<TermId, Infallible> {
526        // Lazily assigns IDs to query constants that weren't in the loaded triples.
527        // These will never match any triple index entry but get unique IDs so
528        // term-equality in SPARQL expressions is still correct.
529        Ok(self.terms.intern(term))
530    }
531
532    fn externalize_term(&self, id: TermId) -> Result<Term, Infallible> {
533        Ok(self
534            .terms
535            .externalize(id)
536            .expect("TermId always originates from this dataset's internalize_term or internal_quads_for_pattern"))
537    }
538}
539
540// ─── quad-iterator helpers ───────────────────────────────────────────────────
541
542type QuadIter<'a> = Box<dyn Iterator<Item = Result<InternalQuad<TermId>, Infallible>> + 'a>;
543
544fn mk_quad(s: TermId, p: TermId, o: TermId, g: Option<TermId>) -> InternalQuad<TermId> {
545    InternalQuad {
546        subject: s,
547        predicate: p,
548        object: o,
549        graph_name: g,
550    }
551}
552
553/// Quads from the default graph matching an optional S/P/O pattern.
554fn default_graph_quads<'a>(
555    ds: &'a FrozenIndexedDataset,
556    s: Option<TermId>,
557    p: Option<TermId>,
558    o: Option<TermId>,
559) -> QuadIter<'a> {
560    match (s, p, o) {
561        // Membership check
562        (Some(s), Some(p), Some(o)) => {
563            let hit = ds.default_graph.contains(s, p, o);
564            Box::new(hit.then(|| Ok(mk_quad(s, p, o, None))).into_iter())
565        }
566        // Subject + predicate lookup (SPO range for S,P prefix)
567        (Some(s), Some(p), None) => {
568            let slice = ds.default_graph.range_sp(s, p);
569            Box::new(slice.iter().map(|&t| Ok(mk_quad(t[0], t[1], t[2], None))))
570        }
571        // Subject + object: scan S range, filter by O
572        (Some(s), None, Some(o)) => {
573            let slice = ds.default_graph.range_s(s);
574            Box::new(
575                slice
576                    .iter()
577                    .filter(move |t| t[2] == o)
578                    .map(|&t| Ok(mk_quad(t[0], t[1], t[2], None))),
579            )
580        }
581        // Subject only
582        (Some(s), None, None) => {
583            let slice = ds.default_graph.range_s(s);
584            Box::new(slice.iter().map(|&t| Ok(mk_quad(t[0], t[1], t[2], None))))
585        }
586        // Predicate + object (POS range; elements are [p,o,s])
587        (None, Some(p), Some(o)) => {
588            let slice = ds.default_graph.range_po(p, o);
589            Box::new(slice.iter().map(|&t| Ok(mk_quad(t[2], t[0], t[1], None))))
590        }
591        // Predicate only (POS range; elements are [p,o,s])
592        (None, Some(p), None) => {
593            let slice = ds.default_graph.range_p(p);
594            Box::new(slice.iter().map(|&t| Ok(mk_quad(t[2], t[0], t[1], None))))
595        }
596        // Object only (OSP range; elements are [o,s,p])
597        (None, None, Some(o)) => {
598            let slice = ds.default_graph.range_o(o);
599            Box::new(slice.iter().map(|&t| Ok(mk_quad(t[1], t[2], t[0], None))))
600        }
601        // Full scan
602        (None, None, None) => Box::new(
603            ds.default_graph
604                .spo
605                .iter()
606                .map(|&t| Ok(mk_quad(t[0], t[1], t[2], None))),
607        ),
608    }
609}
610
611/// Quads from a specific named graph matching an optional S/P/O pattern.
612/// Named graphs are small (usually just the shapes graph), so a linear scan is
613/// sufficient for correctness; stage 5 can add indexes if profiling shows need.
614fn named_graph_quads<'a>(
615    ds: &'a FrozenIndexedDataset,
616    g: TermId,
617    s: Option<TermId>,
618    p: Option<TermId>,
619    o: Option<TermId>,
620) -> QuadIter<'a> {
621    let Some(triples) = ds.named_graphs.get(&g) else {
622        return Box::new(std::iter::empty());
623    };
624    Box::new(
625        triples
626            .iter()
627            .filter(move |&&[ts, tp, to]| {
628                s.is_none_or(|v| ts == v) && p.is_none_or(|v| tp == v) && o.is_none_or(|v| to == v)
629            })
630            .map(move |&[ts, tp, to]| Ok(mk_quad(ts, tp, to, Some(g)))),
631    )
632}
633
634/// Quads from ALL named graphs (but NOT the default graph). Used when the
635/// `graph_name` pattern is `None` (SPARQL: no GRAPH clause, any named graph).
636fn all_named_quads<'a>(
637    ds: &'a FrozenIndexedDataset,
638    s: Option<TermId>,
639    p: Option<TermId>,
640    o: Option<TermId>,
641) -> QuadIter<'a> {
642    Box::new(
643        ds.named_graphs
644            .keys()
645            .copied()
646            .flat_map(move |g| named_graph_quads(ds, g, s, p, o)),
647    )
648}
649
650// ── Tests ────────────────────────────────────────────────────────────────────
651
652#[cfg(test)]
653mod tests {
654    use super::*;
655    use oxrdf::{NamedNode, Triple};
656    use spareval::QueryableDataset;
657
658    fn nn(iri: &str) -> NamedNode {
659        NamedNode::new(iri).unwrap()
660    }
661
662    fn triple_nnn(s: &str, p: &str, o: &str) -> Triple {
663        Triple::new(nn(s), nn(p), nn(o))
664    }
665
666    fn small_graph() -> Graph {
667        let mut g = Graph::new();
668        g.insert(triple_nnn("http://ex/a", "http://ex/p", "http://ex/b").as_ref());
669        g.insert(triple_nnn("http://ex/a", "http://ex/p", "http://ex/c").as_ref());
670        g.insert(triple_nnn("http://ex/b", "http://ex/q", "http://ex/c").as_ref());
671        g
672    }
673
674    #[test]
675    fn intern_round_trips() {
676        let g = small_graph();
677        let ds = FrozenIndexedDataset::from_graph(&g);
678        let a = Term::NamedNode(nn("http://ex/a"));
679        let id = ds.terms.intern(a.clone());
680        assert_eq!(ds.terms.externalize(id), Some(a));
681    }
682
683    #[test]
684    fn contains_triple() {
685        let g = small_graph();
686        let ds = FrozenIndexedDataset::from_graph(&g);
687        let s = ds.terms.get(&Term::NamedNode(nn("http://ex/a"))).unwrap();
688        let p = ds.terms.get(&Term::NamedNode(nn("http://ex/p"))).unwrap();
689        let o = ds.terms.get(&Term::NamedNode(nn("http://ex/b"))).unwrap();
690        assert!(ds.default_graph.contains(s, p, o));
691    }
692
693    #[test]
694    fn missing_triple_not_found() {
695        let g = small_graph();
696        let ds = FrozenIndexedDataset::from_graph(&g);
697        let s = ds.terms.get(&Term::NamedNode(nn("http://ex/a"))).unwrap();
698        let p = ds.terms.get(&Term::NamedNode(nn("http://ex/q"))).unwrap();
699        let o = ds.terms.get(&Term::NamedNode(nn("http://ex/c"))).unwrap();
700        assert!(!ds.default_graph.contains(s, p, o));
701    }
702
703    #[test]
704    fn range_s_returns_correct_triples() {
705        let g = small_graph();
706        let ds = FrozenIndexedDataset::from_graph(&g);
707        let s = ds.terms.get(&Term::NamedNode(nn("http://ex/a"))).unwrap();
708        let slice = ds.default_graph.range_s(s);
709        assert_eq!(slice.len(), 2); // a has two triples
710    }
711
712    #[test]
713    fn extend_triples_updates_indexes_and_statistics() {
714        let g = small_graph();
715        let mut ds = FrozenIndexedDataset::from_graph(&g);
716        let added = triple_nnn("http://ex/new", "http://ex/p", "http://ex/b");
717        ds.extend_triples([&added, &added]);
718
719        let s = ds.intern(&Term::NamedNode(nn("http://ex/new")));
720        let p = ds.intern(&Term::NamedNode(nn("http://ex/p")));
721        let o = ds.intern(&Term::NamedNode(nn("http://ex/b")));
722        assert!(ds.contains_ids(s, p, o));
723        assert_eq!(ds.stats.triple_count, 4);
724        assert_eq!(ds.stats.predicate_cardinality[&p], 3);
725    }
726
727    #[test]
728    fn internalize_unknown_term_gets_unique_id() {
729        let g = small_graph();
730        let ds = FrozenIndexedDataset::from_graph(&g);
731        let rds: &FrozenIndexedDataset = &ds;
732        let unknown1 = Term::NamedNode(nn("http://ex/unknown1"));
733        let unknown2 = Term::NamedNode(nn("http://ex/unknown2"));
734        let id1 = rds.internalize_term(unknown1.clone()).unwrap();
735        let id2 = rds.internalize_term(unknown2.clone()).unwrap();
736        assert_ne!(id1, id2, "different unknown terms must get different IDs");
737        // Same term gets same ID (idempotent)
738        let id1b = rds.internalize_term(unknown1).unwrap();
739        assert_eq!(id1, id1b);
740    }
741
742    #[test]
743    fn queryable_dataset_default_graph_pattern() {
744        let g = small_graph();
745        let ds = FrozenIndexedDataset::from_graph(&g);
746        let rds: &FrozenIndexedDataset = &ds;
747        // Query: ?s ?p ?o in default graph → should return all 3 triples
748        let quads: Vec<_> = rds
749            .internal_quads_for_pattern(None, None, None, Some(None))
750            .map(|r| r.unwrap())
751            .collect();
752        assert_eq!(quads.len(), 3);
753    }
754
755    #[test]
756    fn queryable_dataset_named_graph_empty_when_not_loaded() {
757        let g = small_graph();
758        let ds = FrozenIndexedDataset::from_graph(&g);
759        let rds: &FrozenIndexedDataset = &ds;
760        // any named graph → nothing (no named graphs loaded)
761        let quads: Vec<_> = rds
762            .internal_quads_for_pattern(None, None, None, None)
763            .collect();
764        assert!(quads.is_empty());
765    }
766
767    #[test]
768    fn from_graphs_loads_both_default_and_named() {
769        let data = small_graph();
770        let mut shapes = Graph::new();
771        shapes.insert(
772            triple_nnn(
773                "http://ex/S",
774                "http://www.w3.org/ns/shacl#targetNode",
775                "http://ex/a",
776            )
777            .as_ref(),
778        );
779        let ds = FrozenIndexedDataset::from_graphs(&data, &shapes);
780        let rds: &FrozenIndexedDataset = &ds;
781
782        // default graph has 3 triples
783        let default_quads: Vec<_> = rds
784            .internal_quads_for_pattern(None, None, None, Some(None))
785            .map(|r| r.unwrap())
786            .collect();
787        assert_eq!(default_quads.len(), 3);
788
789        // named graph has 1 triple
790        let shapes_iri_id = ds
791            .terms
792            .get(&Term::NamedNode(nn(SHAPES_GRAPH_IRI)))
793            .expect("shapes graph IRI should be interned");
794        let named_quads: Vec<_> = rds
795            .internal_quads_for_pattern(None, None, None, Some(Some(&shapes_iri_id)))
796            .map(|r| r.unwrap())
797            .collect();
798        assert_eq!(named_quads.len(), 1);
799    }
800
801    #[test]
802    fn graph_union_builds_one_deduplicated_default_graph() {
803        let left = small_graph();
804        let mut right = Graph::new();
805        right.insert(triple_nnn("http://ex/a", "http://ex/p", "http://ex/b").as_ref());
806        right.insert(triple_nnn("http://ex/new", "http://ex/p", "http://ex/b").as_ref());
807
808        let ds = FrozenIndexedDataset::from_graph_union(&left, &right);
809
810        assert_eq!(ds.stats.triple_count, 4);
811    }
812}