// arrow_graph_core/kg_store.rs

//! KgStore — full-featured Arrow-native knowledge graph store.
//!
//! Adds namespace prefix management, keyword search, knowledge gap tracking,
//! and bulk operations on top of [`ArrowGraphStore`].
//!
//! # Quick Start
//!
//! ```rust
//! use arrow_graph_core::kg_store::KgStore;
//!
//! let mut store = KgStore::new();
//! store.bind_prefix("ex", "http://example.org/");
//!
//! store.add_triple("ex:Alice", "rdf:type", "ex:Person", None, 1.0).unwrap();
//!
//! let results = store.search_by_keywords(&["Alice"]);
//! assert_eq!(results.len(), 1);
//! ```
19
use crate::schema::col;
use crate::store::{ArrowGraphStore, QuerySpec, StoreError, Triple};
use crate::triple_store::{StoredTriple, batches_to_stored_triples};

use arrow::array::StringArray;
use std::collections::HashMap;
use std::collections::HashSet;
26
/// Builds the prefix table every new store starts with: the common
/// RDF-family vocabularies (`rdf`, `rdfs`, `owl`, `xsd`, `foaf`, `prov`),
/// each mapped to its namespace URI.
fn default_prefixes() -> HashMap<String, String> {
    // (prefix, namespace URI) pairs; kept as a table so additions are one line.
    const STANDARD: [(&str, &str); 6] = [
        ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
        ("rdfs", "http://www.w3.org/2000/01/rdf-schema#"),
        ("owl", "http://www.w3.org/2002/07/owl#"),
        ("xsd", "http://www.w3.org/2001/XMLSchema#"),
        ("foaf", "http://xmlns.com/foaf/0.1/"),
        ("prov", "http://www.w3.org/ns/prov#"),
    ];

    STANDARD
        .iter()
        .map(|&(prefix, uri)| (prefix.to_string(), uri.to_string()))
        .collect()
}
44
/// A knowledge gap — something the graph doesn't know.
///
/// Gaps are plain values: they can be cloned and compared field by field.
/// `Eq` is intentionally not derived because `confidence` is an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct KnowledgeGap {
    /// The question that could not be answered.
    pub question: String,
    /// Keywords associated with the question.
    pub keywords: Vec<String>,
    /// Caller-supplied confidence score for the gap.
    // NOTE(review): no range is enforced anywhere in this file — confirm the
    // expected scale (0.0–1.0?) with callers.
    pub confidence: f64,
    /// Concepts identified as missing from the graph.
    pub missing_concepts: Vec<String>,
    /// Whether the gap has been marked as resolved.
    pub resolved: bool,
}
54
/// Statistics about the knowledge graph.
///
/// A plain snapshot of counters; two snapshots compare equal when every
/// counter matches, and `KgStats::default()` is the all-zero snapshot.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct KgStats {
    /// Number of triples counted.
    pub total_triples: usize,
    /// Number of distinct subject values.
    pub unique_subjects: usize,
    /// Number of distinct predicate values.
    pub unique_predicates: usize,
    /// Number of distinct object values.
    pub unique_objects: usize,
    /// Number of namespaces known to the store.
    pub namespace_count: usize,
}
64
65/// Full-featured Arrow-native knowledge graph store.
66///
67/// Provides:
68/// - Namespace prefix management (expand/compact URIs)
69/// - Pattern-based triple queries
70/// - Keyword search (case-insensitive substring matching)
71/// - Knowledge gap tracking
72/// - Bulk add operations
73pub struct KgStore {
74    inner: ArrowGraphStore,
75    prefixes: HashMap<String, String>,
76    gaps: Vec<KnowledgeGap>,
77    default_namespace: String,
78    default_layer: Option<u8>,
79}
80
81impl KgStore {
82    /// Create a new store with standard RDF prefixes and a single "default" namespace.
83    pub fn new() -> Self {
84        Self {
85            inner: ArrowGraphStore::new(&["default"]),
86            prefixes: default_prefixes(),
87            gaps: Vec::new(),
88            default_namespace: "default".to_string(),
89            default_layer: Some(0),
90        }
91    }
92
93    /// Create with custom namespace partitions and layer.
94    pub fn with_config(namespaces: &[&str], default_namespace: &str, layer: Option<u8>) -> Self {
95        Self {
96            inner: ArrowGraphStore::new(namespaces),
97            prefixes: default_prefixes(),
98            gaps: Vec::new(),
99            default_namespace: default_namespace.to_string(),
100            default_layer: layer,
101        }
102    }
103
104    // ── Prefix management ─────────────────────────────────────────────
105
106    /// Bind a namespace prefix (e.g., `"rdf"` → `"http://www.w3.org/..."`).
107    pub fn bind_prefix(&mut self, prefix: &str, uri: &str) {
108        self.prefixes.insert(prefix.to_string(), uri.to_string());
109    }
110
111    /// Expand a prefixed URI (e.g., "rdf:type" → full URI).
112    /// Returns the original string if no prefix matches.
113    pub fn expand_uri(&self, value: &str) -> String {
114        if let Some(idx) = value.find(':') {
115            let prefix = &value[..idx];
116            let local = &value[idx + 1..];
117            if let Some(ns_uri) = self.prefixes.get(prefix) {
118                return format!("{ns_uri}{local}");
119            }
120        }
121        value.to_string()
122    }
123
124    /// Compact a full URI to prefixed form.
125    /// Matches longest prefix first to avoid ambiguity.
126    pub fn compact_uri(&self, uri: &str) -> String {
127        let mut sorted: Vec<_> = self.prefixes.iter().collect();
128        sorted.sort_by(|a, b| b.1.len().cmp(&a.1.len()));
129
130        for (prefix, ns_uri) in sorted {
131            if let Some(local) = uri.strip_prefix(ns_uri.as_str()) {
132                return format!("{prefix}:{local}");
133            }
134        }
135        uri.to_string()
136    }
137
138    /// Get all bound prefixes.
139    pub fn prefixes(&self) -> &HashMap<String, String> {
140        &self.prefixes
141    }
142
143    // ── Triple operations ─────────────────────────────────────────────
144
145    /// Add a triple with automatic namespace expansion on URIs.
146    pub fn add_triple(
147        &mut self,
148        subject: &str,
149        predicate: &str,
150        object: &str,
151        source: Option<&str>,
152        confidence: f64,
153    ) -> Result<String, StoreError> {
154        let triple = Triple {
155            subject: self.expand_uri(subject),
156            predicate: self.expand_uri(predicate),
157            object: self.expand_uri(object),
158            graph: None,
159            confidence: Some(confidence),
160            source_document: source.map(|s| s.to_string()),
161            source_chunk_id: None,
162            extracted_by: source.map(|s| s.to_string()),
163            caused_by: None,
164            derived_from: None,
165            consolidated_at: None,
166        };
167        self.inner
168            .add_triple(&triple, &self.default_namespace, self.default_layer)
169    }
170
171    /// Add multiple triples in batch.
172    pub fn add_triples(
173        &mut self,
174        triples: &[(&str, &str, &str, f64)],
175        source: Option<&str>,
176    ) -> Result<Vec<String>, StoreError> {
177        let ts: Vec<Triple> = triples
178            .iter()
179            .map(|(s, p, o, conf)| Triple {
180                subject: self.expand_uri(s),
181                predicate: self.expand_uri(p),
182                object: self.expand_uri(o),
183                graph: None,
184                confidence: Some(*conf),
185                source_document: source.map(|s| s.to_string()),
186                source_chunk_id: None,
187                extracted_by: source.map(|s| s.to_string()),
188                caused_by: None,
189                derived_from: None,
190                consolidated_at: None,
191            })
192            .collect();
193        self.inner
194            .add_batch(&ts, &self.default_namespace, self.default_layer)
195    }
196
197    /// Query by (s, p, o) pattern. None means wildcard. URIs are expanded.
198    pub fn query(
199        &self,
200        subject: Option<&str>,
201        predicate: Option<&str>,
202        object: Option<&str>,
203    ) -> Result<Vec<StoredTriple>, StoreError> {
204        let spec = QuerySpec {
205            subject: subject.map(|s| self.expand_uri(s)),
206            predicate: predicate.map(|s| self.expand_uri(s)),
207            object: object.map(|s| self.expand_uri(s)),
208            ..Default::default()
209        };
210        let batches = self.inner.query(&spec)?;
211        Ok(batches_to_stored_triples(&batches))
212    }
213
214    /// Search by keywords (case-insensitive substring match on s/p/o).
215    pub fn search_by_keywords(&self, keywords: &[&str]) -> Vec<(StoredTriple, String)> {
216        let spec = QuerySpec::default();
217        let batches = self.inner.query(&spec).unwrap_or_default();
218        let mut results = Vec::new();
219
220        for batch in &batches {
221            let subjects = batch
222                .column(col::SUBJECT)
223                .as_any()
224                .downcast_ref::<StringArray>()
225                .expect("subject column");
226            let predicates = batch
227                .column(col::PREDICATE)
228                .as_any()
229                .downcast_ref::<StringArray>()
230                .expect("predicate column");
231            let objects = batch
232                .column(col::OBJECT)
233                .as_any()
234                .downcast_ref::<StringArray>()
235                .expect("object column");
236
237            for i in 0..batch.num_rows() {
238                let s = subjects.value(i).to_lowercase();
239                let p = predicates.value(i).to_lowercase();
240                let o = objects.value(i).to_lowercase();
241
242                for kw in keywords {
243                    let kw_lower = kw.to_lowercase();
244                    if s.contains(&kw_lower) || p.contains(&kw_lower) || o.contains(&kw_lower) {
245                        results.push((
246                            crate::triple_store::extract_stored_triple(batch, i),
247                            kw.to_string(),
248                        ));
249                        break;
250                    }
251                }
252            }
253        }
254        results
255    }
256
257    /// Clear all triples.
258    pub fn clear(&mut self) {
259        self.inner.clear();
260    }
261
262    // ── Knowledge gap tracking ────────────────────────────────────────
263
264    /// Record a knowledge gap.
265    pub fn record_knowledge_gap(
266        &mut self,
267        question: &str,
268        keywords: &[&str],
269        confidence: f64,
270        missing_concepts: &[&str],
271    ) -> usize {
272        let gap = KnowledgeGap {
273            question: question.to_string(),
274            keywords: keywords.iter().map(|s| s.to_string()).collect(),
275            confidence,
276            missing_concepts: missing_concepts.iter().map(|s| s.to_string()).collect(),
277            resolved: false,
278        };
279        self.gaps.push(gap);
280        self.gaps.len() - 1
281    }
282
283    /// Get unresolved knowledge gaps.
284    pub fn unresolved_gaps(&self) -> Vec<&KnowledgeGap> {
285        self.gaps.iter().filter(|g| !g.resolved).collect()
286    }
287
288    /// Resolve a knowledge gap by index.
289    pub fn resolve_gap(&mut self, index: usize) -> bool {
290        if let Some(gap) = self.gaps.get_mut(index) {
291            gap.resolved = true;
292            true
293        } else {
294            false
295        }
296    }
297
298    // ── Statistics ────────────────────────────────────────────────────
299
300    /// Get store statistics.
301    pub fn statistics(&self) -> KgStats {
302        let spec = QuerySpec::default();
303        let batches = self.inner.query(&spec).unwrap_or_default();
304        let triples = batches_to_stored_triples(&batches);
305
306        let mut subjects = std::collections::HashSet::new();
307        let mut predicates = std::collections::HashSet::new();
308        let mut objects = std::collections::HashSet::new();
309
310        for t in &triples {
311            subjects.insert(t.subject.clone());
312            predicates.insert(t.predicate.clone());
313            objects.insert(t.object.clone());
314        }
315
316        KgStats {
317            total_triples: triples.len(),
318            unique_subjects: subjects.len(),
319            unique_predicates: predicates.len(),
320            unique_objects: objects.len(),
321            namespace_count: self.prefixes.len(),
322        }
323    }
324
325    /// Total triple count.
326    pub fn len(&self) -> usize {
327        self.inner.len()
328    }
329
330    /// Whether empty.
331    pub fn is_empty(&self) -> bool {
332        self.inner.is_empty()
333    }
334
335    /// Get reference to underlying ArrowGraphStore.
336    pub fn inner(&self) -> &ArrowGraphStore {
337        &self.inner
338    }
339
340    /// Get mutable reference to underlying ArrowGraphStore.
341    pub fn inner_mut(&mut self) -> &mut ArrowGraphStore {
342        &mut self.inner
343    }
344}
345
346impl Default for KgStore {
347    fn default() -> Self {
348        Self::new()
349    }
350}
351
#[cfg(test)]
mod tests {
    use super::*;

    /// Fully expanded form of `rdf:type`.
    const RDF_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";

    /// Fresh store with the `ex` prefix bound to `http://example.org/`.
    fn store_with_ex_prefix() -> KgStore {
        let mut store = KgStore::new();
        store.bind_prefix("ex", "http://example.org/");
        store
    }

    /// Store holding the single triple `ex:Alice rdf:type ex:Person`.
    fn store_with_alice() -> KgStore {
        let mut store = store_with_ex_prefix();
        store
            .add_triple("ex:Alice", "rdf:type", "ex:Person", None, 1.0)
            .unwrap();
        store
    }

    // A bound prefix expands; a value without ':' is returned untouched.
    #[test]
    fn test_prefix_expand() {
        let store = KgStore::new();
        assert_eq!(store.expand_uri("rdf:type"), RDF_TYPE);
        assert_eq!(store.expand_uri("no_prefix"), "no_prefix");
    }

    // A known namespace compacts; an unknown URI is returned untouched.
    #[test]
    fn test_prefix_compact() {
        let store = KgStore::new();
        assert_eq!(store.compact_uri(RDF_TYPE), "rdf:type");

        let unknown = "http://unknown/foo";
        assert_eq!(store.compact_uri(unknown), unknown);
    }

    // Triples are stored in expanded form.
    #[test]
    fn test_add_with_prefix_expansion() {
        let store = store_with_alice();

        let results = store
            .query(Some("http://example.org/Alice"), None, None)
            .unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].predicate, RDF_TYPE);
    }

    // Query patterns are expanded the same way as inserted terms.
    #[test]
    fn test_query_with_prefix() {
        let store = store_with_alice();

        let results = store.query(Some("ex:Alice"), None, None).unwrap();
        assert_eq!(results.len(), 1);
    }

    // Only the matching triple is returned, tagged with the keyword that hit.
    #[test]
    fn test_keyword_search() {
        let mut store = KgStore::new();
        for (s, p, o) in [("Alice", "knows", "Bob"), ("Carol", "likes", "Dave")] {
            store.add_triple(s, p, o, None, 1.0).unwrap();
        }

        let results = store.search_by_keywords(&["Alice"]);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].1, "Alice");
    }

    // Keyword case does not matter.
    #[test]
    fn test_keyword_search_case_insensitive() {
        let mut store = KgStore::new();
        store
            .add_triple("Alice", "knows", "Bob", None, 1.0)
            .unwrap();

        assert_eq!(store.search_by_keywords(&["alice"]).len(), 1);
        assert_eq!(store.search_by_keywords(&["ALICE"]).len(), 1);
    }

    // A recorded gap is unresolved until resolved through its index.
    #[test]
    fn test_knowledge_gaps() {
        let mut store = KgStore::new();
        let idx = store.record_knowledge_gap(
            "What is photosynthesis?",
            &["photosynthesis", "plants"],
            0.3,
            &["chloroplast"],
        );

        assert_eq!(store.unresolved_gaps().len(), 1);
        assert!(store.resolve_gap(idx));
        assert_eq!(store.unresolved_gaps().len(), 0);
    }

    // Bulk insert returns one id per triple and stores all of them.
    #[test]
    fn test_bulk_add() {
        let mut store = store_with_ex_prefix();
        let batch = [
            ("ex:A", "rdf:type", "ex:Person", 1.0),
            ("ex:B", "rdf:type", "ex:Person", 1.0),
            ("ex:C", "rdf:type", "ex:Person", 1.0),
        ];

        let ids = store.add_triples(&batch, Some("bulk_import")).unwrap();
        assert_eq!(ids.len(), 3);
        assert_eq!(store.len(), 3);
    }

    #[test]
    fn test_statistics() {
        let mut store = KgStore::new();
        store.add_triple("s1", "p1", "o1", None, 1.0).unwrap();
        store.add_triple("s2", "p1", "o2", None, 1.0).unwrap();

        let stats = store.statistics();
        assert_eq!(stats.total_triples, 2);
        assert_eq!(stats.unique_subjects, 2);
        assert_eq!(stats.unique_predicates, 1);
        assert!(stats.namespace_count >= 6); // Standard prefixes
    }

    // A custom-configured store starts out empty.
    #[test]
    fn test_with_config() {
        let store = KgStore::with_config(&["world", "code"], "world", Some(1));
        assert!(store.is_empty());
    }
}