Skip to main content

sqlite_knowledge_graph/
lib.rs

1//! SQLite-based Knowledge Graph Library
2//!
3//! This library provides a knowledge graph implementation built on SQLite with support for:
4//! - Entities with typed properties
5//! - Relations between entities with weights
6//! - Vector embeddings for semantic search
7//! - Custom SQLite functions for direct SQL operations
8//! - RAG (Retrieval-Augmented Generation) query functions
9//! - Graph algorithms (PageRank, Louvain, Connected Components)
10//!
11//! ## SQLite Extension
12//!
13//! This crate can be compiled as a SQLite loadable extension:
14//! ```bash
15//! cargo build --release
16//! sqlite3 db.db ".load ./target/release/libsqlite_knowledge_graph.dylib"
17//! sqlite3 db.db "SELECT kg_version();"
18//! ```
19
20pub mod algorithms;
21pub mod embed;
22pub mod error;
23pub mod export;
24pub mod extension;
25pub mod functions;
26pub mod graph;
27pub mod migrate;
28pub mod schema;
29pub mod vector;
30
31pub use algorithms::{
32    analyze_graph, connected_components, louvain_communities, pagerank, CommunityResult,
33    PageRankConfig,
34};
35pub use embed::{
36    check_dependencies, get_entities_needing_embedding, EmbeddingConfig, EmbeddingGenerator,
37    EmbeddingStats,
38};
39pub use error::{Error, Result};
40pub use export::{D3ExportGraph, D3ExportMetadata, D3Link, D3Node, DotConfig};
41pub use extension::sqlite3_sqlite_knowledge_graph_init;
42pub use functions::register_functions;
43pub use graph::{Direction, GraphStats, PathStep, TraversalNode, TraversalPath, TraversalQuery};
44pub use graph::{Entity, Neighbor, Relation};
45pub use graph::{HigherOrderNeighbor, HigherOrderPath, HigherOrderPathStep, Hyperedge};
46pub use migrate::{
47    build_relationships, migrate_all, migrate_papers, migrate_skills, MigrationStats,
48};
49pub use schema::{create_schema, schema_exists};
50pub use vector::{cosine_similarity, SearchResult, VectorStore};
51pub use vector::{TurboQuantConfig, TurboQuantIndex, TurboQuantStats};
52
53use rusqlite::Connection;
54use serde::{Deserialize, Serialize};
55
56/// Semantic search result with entity information.
57#[derive(Debug, Clone, Serialize, Deserialize)]
58pub struct SearchResultWithEntity {
59    pub entity: Entity,
60    pub similarity: f32,
61}
62
63/// Graph context for an entity (root + neighbors).
64#[derive(Debug, Clone, Serialize, Deserialize)]
65pub struct GraphContext {
66    pub root_entity: Entity,
67    pub neighbors: Vec<Neighbor>,
68}
69
70/// Hybrid search result combining semantic similarity and graph context.
71#[derive(Debug, Clone, Serialize, Deserialize)]
72pub struct HybridSearchResult {
73    pub entity: Entity,
74    pub similarity: f32,
75    pub context: Option<GraphContext>,
76}
77
78/// Knowledge Graph Manager - main entry point for the library.
79#[derive(Debug)]
80pub struct KnowledgeGraph {
81    conn: Connection,
82}
83
84impl KnowledgeGraph {
85    /// Open a new knowledge graph database connection.
86    pub fn open<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
87        let conn = Connection::open(path)?;
88
89        // Enable foreign keys
90        conn.execute("PRAGMA foreign_keys = ON", [])?;
91
92        // Create schema if not exists
93        if !schema_exists(&conn)? {
94            create_schema(&conn)?;
95        }
96
97        // Register custom functions
98        register_functions(&conn)?;
99
100        Ok(Self { conn })
101    }
102
103    /// Open an in-memory knowledge graph (useful for testing).
104    pub fn open_in_memory() -> Result<Self> {
105        let conn = Connection::open_in_memory()?;
106
107        // Enable foreign keys
108        conn.execute("PRAGMA foreign_keys = ON", [])?;
109
110        // Create schema
111        create_schema(&conn)?;
112
113        // Register custom functions
114        register_functions(&conn)?;
115
116        Ok(Self { conn })
117    }
118
119    /// Get a reference to the underlying SQLite connection.
120    pub fn connection(&self) -> &Connection {
121        &self.conn
122    }
123
124    /// Begin a transaction for batch operations.
125    pub fn transaction(&self) -> Result<rusqlite::Transaction<'_>> {
126        Ok(self.conn.unchecked_transaction()?)
127    }
128
129    /// Insert an entity into the knowledge graph.
130    pub fn insert_entity(&self, entity: &Entity) -> Result<i64> {
131        graph::insert_entity(&self.conn, entity)
132    }
133
134    /// Get an entity by ID.
135    pub fn get_entity(&self, id: i64) -> Result<Entity> {
136        graph::get_entity(&self.conn, id)
137    }
138
139    /// List entities with optional filtering.
140    pub fn list_entities(
141        &self,
142        entity_type: Option<&str>,
143        limit: Option<i64>,
144    ) -> Result<Vec<Entity>> {
145        graph::list_entities(&self.conn, entity_type, limit)
146    }
147
148    /// Update an entity.
149    pub fn update_entity(&self, entity: &Entity) -> Result<()> {
150        graph::update_entity(&self.conn, entity)
151    }
152
153    /// Delete an entity.
154    pub fn delete_entity(&self, id: i64) -> Result<()> {
155        graph::delete_entity(&self.conn, id)
156    }
157
158    /// Insert a relation between entities.
159    pub fn insert_relation(&self, relation: &Relation) -> Result<i64> {
160        graph::insert_relation(&self.conn, relation)
161    }
162
163    /// Get neighbors of an entity using BFS traversal.
164    pub fn get_neighbors(&self, entity_id: i64, depth: u32) -> Result<Vec<Neighbor>> {
165        graph::get_neighbors(&self.conn, entity_id, depth)
166    }
167
168    /// Insert a vector embedding for an entity.
169    pub fn insert_vector(&self, entity_id: i64, vector: Vec<f32>) -> Result<()> {
170        let store = VectorStore::new();
171        store.insert_vector(&self.conn, entity_id, vector)
172    }
173
174    /// Search for similar entities using vector embeddings.
175    pub fn search_vectors(&self, query: Vec<f32>, k: usize) -> Result<Vec<SearchResult>> {
176        let store = VectorStore::new();
177        store.search_vectors(&self.conn, query, k)
178    }
179
180    // ========== TurboQuant Vector Index ==========
181
182    /// Create a TurboQuant index for fast approximate nearest neighbor search.
183    ///
184    /// TurboQuant provides:
185    /// - Instant indexing (no training required)
186    /// - 6x memory compression
187    /// - Near-zero accuracy loss
188    ///
189    /// # Arguments
190    /// * `config` - Optional configuration (uses defaults if None)
191    ///
192    /// # Example
193    /// ```ignore
194    /// let config = TurboQuantConfig {
195    ///     dimension: 384,
196    ///     bit_width: 3,
197    ///     seed: 42,
198    /// };
199    /// let mut index = kg.create_turboquant_index(Some(config))?;
200    ///
201    /// // Add vectors to index
202    /// for (entity_id, vector) in all_vectors {
203    ///     index.add_vector(entity_id, &vector)?;
204    /// }
205    ///
206    /// // Fast search
207    /// let results = index.search(&query_vector, 10)?;
208    /// ```
209    pub fn create_turboquant_index(
210        &self,
211        config: Option<TurboQuantConfig>,
212    ) -> Result<TurboQuantIndex> {
213        let config = config.unwrap_or_default();
214
215        TurboQuantIndex::new(config)
216    }
217
218    /// Build a TurboQuant index from all existing vectors in the database.
219    /// This is a convenience method that loads all vectors and indexes them.
220    pub fn build_turboquant_index(
221        &self,
222        config: Option<TurboQuantConfig>,
223    ) -> Result<TurboQuantIndex> {
224        // Get dimension from first vector
225        let dimension = self.get_vector_dimension()?.unwrap_or(384);
226
227        let config = config.unwrap_or(TurboQuantConfig {
228            dimension,
229            bit_width: 3,
230            seed: 42,
231        });
232
233        let mut index = TurboQuantIndex::new(config)?;
234
235        // Load all vectors
236        let vectors = self.load_all_vectors()?;
237
238        for (entity_id, vector) in vectors {
239            index.add_vector(entity_id, &vector)?;
240        }
241
242        Ok(index)
243    }
244
245    /// Get the dimension of stored vectors (if any exist).
246    fn get_vector_dimension(&self) -> Result<Option<usize>> {
247        let result = self
248            .conn
249            .query_row("SELECT dimension FROM kg_vectors LIMIT 1", [], |row| {
250                row.get::<_, i64>(0)
251            });
252
253        match result {
254            Ok(dim) => Ok(Some(dim as usize)),
255            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
256            Err(e) => Err(e.into()),
257        }
258    }
259
260    /// Load all vectors from the database.
261    fn load_all_vectors(&self) -> Result<Vec<(i64, Vec<f32>)>> {
262        let mut stmt = self
263            .conn
264            .prepare("SELECT entity_id, vector, dimension FROM kg_vectors")?;
265
266        let rows = stmt.query_map([], |row| {
267            let entity_id: i64 = row.get(0)?;
268            let vector_blob: Vec<u8> = row.get(1)?;
269            let dimension: i64 = row.get(2)?;
270
271            let mut vector = Vec::with_capacity(dimension as usize);
272            for chunk in vector_blob.chunks_exact(4) {
273                let bytes: [u8; 4] = chunk.try_into().unwrap();
274                vector.push(f32::from_le_bytes(bytes));
275            }
276
277            Ok((entity_id, vector))
278        })?;
279
280        let mut vectors = Vec::new();
281        for row in rows {
282            vectors.push(row?);
283        }
284
285        Ok(vectors)
286    }
287
288    // ========== Higher-Order Relations (Hyperedges) ==========
289
290    /// Insert a hyperedge (higher-order relation) into the knowledge graph.
291    pub fn insert_hyperedge(&self, hyperedge: &Hyperedge) -> Result<i64> {
292        graph::insert_hyperedge(&self.conn, hyperedge)
293    }
294
295    /// Get a hyperedge by ID.
296    pub fn get_hyperedge(&self, id: i64) -> Result<Hyperedge> {
297        graph::get_hyperedge(&self.conn, id)
298    }
299
300    /// List hyperedges with optional filtering.
301    pub fn list_hyperedges(
302        &self,
303        hyperedge_type: Option<&str>,
304        min_arity: Option<usize>,
305        max_arity: Option<usize>,
306        limit: Option<i64>,
307    ) -> Result<Vec<Hyperedge>> {
308        graph::list_hyperedges(&self.conn, hyperedge_type, min_arity, max_arity, limit)
309    }
310
311    /// Update a hyperedge.
312    pub fn update_hyperedge(&self, hyperedge: &Hyperedge) -> Result<()> {
313        graph::update_hyperedge(&self.conn, hyperedge)
314    }
315
316    /// Delete a hyperedge by ID.
317    pub fn delete_hyperedge(&self, id: i64) -> Result<()> {
318        graph::delete_hyperedge(&self.conn, id)
319    }
320
321    /// Get higher-order neighbors of an entity (connected through hyperedges).
322    pub fn get_higher_order_neighbors(
323        &self,
324        entity_id: i64,
325        min_arity: Option<usize>,
326        max_arity: Option<usize>,
327    ) -> Result<Vec<HigherOrderNeighbor>> {
328        graph::get_higher_order_neighbors(&self.conn, entity_id, min_arity, max_arity)
329    }
330
331    /// Get all hyperedges that an entity participates in.
332    pub fn get_entity_hyperedges(&self, entity_id: i64) -> Result<Vec<Hyperedge>> {
333        graph::get_entity_hyperedges(&self.conn, entity_id)
334    }
335
336    /// Higher-order BFS traversal through hyperedges.
337    pub fn kg_higher_order_bfs(
338        &self,
339        start_id: i64,
340        max_depth: u32,
341        min_arity: Option<usize>,
342    ) -> Result<Vec<TraversalNode>> {
343        graph::higher_order_bfs(&self.conn, start_id, max_depth, min_arity)
344    }
345
346    /// Find shortest path between two entities through hyperedges.
347    pub fn kg_higher_order_shortest_path(
348        &self,
349        from_id: i64,
350        to_id: i64,
351        max_depth: u32,
352    ) -> Result<Option<HigherOrderPath>> {
353        graph::higher_order_shortest_path(&self.conn, from_id, to_id, max_depth)
354    }
355
356    /// Compute hyperedge degree centrality for an entity.
357    pub fn kg_hyperedge_degree(&self, entity_id: i64) -> Result<f64> {
358        graph::hyperedge_degree(&self.conn, entity_id)
359    }
360
361    /// Compute entity-level hypergraph PageRank using Zhou formula.
362    pub fn kg_hypergraph_entity_pagerank(
363        &self,
364        damping: Option<f64>,
365        max_iter: Option<usize>,
366        tolerance: Option<f64>,
367    ) -> Result<std::collections::HashMap<i64, f64>> {
368        graph::hypergraph_entity_pagerank(
369            &self.conn,
370            damping.unwrap_or(0.85),
371            max_iter.unwrap_or(100),
372            tolerance.unwrap_or(1e-6),
373        )
374    }
375
376    // ========== RAG Query Functions ==========
377
378    /// Semantic search using vector embeddings.
379    /// Returns entities sorted by similarity score.
380    pub fn kg_semantic_search(
381        &self,
382        query_embedding: Vec<f32>,
383        k: usize,
384    ) -> Result<Vec<SearchResultWithEntity>> {
385        let results = self.search_vectors(query_embedding, k)?;
386
387        let mut entities_with_results = Vec::new();
388        for result in results {
389            let entity = self.get_entity(result.entity_id)?;
390            entities_with_results.push(SearchResultWithEntity {
391                entity,
392                similarity: result.similarity,
393            });
394        }
395
396        Ok(entities_with_results)
397    }
398
399    /// Get context around an entity using graph traversal.
400    /// Returns neighbors up to the specified depth.
401    pub fn kg_get_context(&self, entity_id: i64, depth: u32) -> Result<GraphContext> {
402        let root_entity = self.get_entity(entity_id)?;
403        let neighbors = self.get_neighbors(entity_id, depth)?;
404
405        Ok(GraphContext {
406            root_entity,
407            neighbors,
408        })
409    }
410
411    /// Hybrid search combining semantic search and graph context.
412    /// Performs semantic search first, then retrieves context for top-k results.
413    pub fn kg_hybrid_search(
414        &self,
415        _query_text: &str,
416        query_embedding: Vec<f32>,
417        k: usize,
418    ) -> Result<Vec<HybridSearchResult>> {
419        let semantic_results = self.kg_semantic_search(query_embedding, k)?;
420
421        let mut hybrid_results = Vec::new();
422        for result in semantic_results.iter() {
423            let entity_id = result.entity.id.ok_or(Error::EntityNotFound(0))?;
424            let context = self.kg_get_context(entity_id, 1)?; // Depth 1 context
425
426            hybrid_results.push(HybridSearchResult {
427                entity: result.entity.clone(),
428                similarity: result.similarity,
429                context: Some(context),
430            });
431        }
432
433        Ok(hybrid_results)
434    }
435
436    // ========== Graph Traversal Functions ==========
437
438    /// BFS traversal from a starting entity.
439    /// Returns all reachable entities within max_depth with depth information.
440    pub fn kg_bfs_traversal(
441        &self,
442        start_id: i64,
443        direction: Direction,
444        max_depth: u32,
445    ) -> Result<Vec<TraversalNode>> {
446        let query = TraversalQuery {
447            direction,
448            max_depth,
449            ..Default::default()
450        };
451        graph::bfs_traversal(&self.conn, start_id, query)
452    }
453
454    /// DFS traversal from a starting entity.
455    /// Returns all reachable entities within max_depth.
456    pub fn kg_dfs_traversal(
457        &self,
458        start_id: i64,
459        direction: Direction,
460        max_depth: u32,
461    ) -> Result<Vec<TraversalNode>> {
462        let query = TraversalQuery {
463            direction,
464            max_depth,
465            ..Default::default()
466        };
467        graph::dfs_traversal(&self.conn, start_id, query)
468    }
469
470    /// Find shortest path between two entities using BFS.
471    /// Returns the path with all intermediate steps (if exists).
472    pub fn kg_shortest_path(
473        &self,
474        from_id: i64,
475        to_id: i64,
476        max_depth: u32,
477    ) -> Result<Option<TraversalPath>> {
478        graph::find_shortest_path(&self.conn, from_id, to_id, max_depth)
479    }
480
481    /// Compute graph statistics.
482    pub fn kg_graph_stats(&self) -> Result<GraphStats> {
483        graph::compute_graph_stats(&self.conn)
484    }
485
486    // ========== Graph Algorithms ==========
487
488    /// Compute PageRank scores for all entities.
489    /// Returns a vector of (entity_id, score) sorted by score descending.
490    pub fn kg_pagerank(&self, config: Option<PageRankConfig>) -> Result<Vec<(i64, f64)>> {
491        algorithms::pagerank(&self.conn, config.unwrap_or_default())
492    }
493
494    /// Detect communities using Louvain algorithm.
495    /// Returns community memberships and modularity score.
496    pub fn kg_louvain(&self) -> Result<CommunityResult> {
497        algorithms::louvain_communities(&self.conn)
498    }
499
500    /// Find connected components in the graph.
501    /// Returns a list of components, each being a list of entity IDs.
502    pub fn kg_connected_components(&self) -> Result<Vec<Vec<i64>>> {
503        algorithms::connected_components(&self.conn)
504    }
505
506    /// Run full graph analysis (PageRank + Louvain + Connected Components).
507    pub fn kg_analyze(&self) -> Result<algorithms::GraphAnalysis> {
508        algorithms::analyze_graph(&self.conn)
509    }
510
511    // ========== Visualization Export ==========
512
513    /// Export the knowledge graph in D3.js JSON format.
514    ///
515    /// Returns a `D3ExportGraph` containing nodes, links, and metadata,
516    /// ready for use with D3.js force-directed graph visualizations.
517    ///
518    /// # Example
519    /// ```ignore
520    /// let graph = kg.export_json()?;
521    /// let json = serde_json::to_string_pretty(&graph)?;
522    /// std::fs::write("graph.json", json)?;
523    /// ```
524    pub fn export_json(&self) -> Result<D3ExportGraph> {
525        export::export_d3_json(&self.conn)
526    }
527
528    /// Export the knowledge graph in DOT (Graphviz) format.
529    ///
530    /// Returns a DOT format string that can be rendered with Graphviz tools
531    /// (`dot`, `neato`, `fdp`, etc.).
532    ///
533    /// # Example
534    /// ```ignore
535    /// let dot = kg.export_dot(&DotConfig::default())?;
536    /// std::fs::write("graph.dot", dot)?;
537    /// // Then: dot -Tpng graph.dot -o graph.png
538    /// ```
539    pub fn export_dot(&self, config: &DotConfig) -> Result<String> {
540        export::export_dot(&self.conn, config)
541    }
542}
543
544#[cfg(test)]
545mod tests {
546    use super::*;
547
548    #[test]
549    fn test_open_in_memory() {
550        let kg = KnowledgeGraph::open_in_memory().unwrap();
551        assert!(schema_exists(kg.connection()).unwrap());
552    }
553
554    #[test]
555    fn test_crud_operations() {
556        let kg = KnowledgeGraph::open_in_memory().unwrap();
557
558        // Create entity
559        let mut entity = Entity::new("paper", "Test Paper");
560        entity.set_property("author", serde_json::json!("John Doe"));
561        let id = kg.insert_entity(&entity).unwrap();
562
563        // Read entity
564        let retrieved = kg.get_entity(id).unwrap();
565        assert_eq!(retrieved.name, "Test Paper");
566
567        // List entities
568        let entities = kg.list_entities(Some("paper"), None).unwrap();
569        assert_eq!(entities.len(), 1);
570
571        // Update entity
572        let mut updated = retrieved.clone();
573        updated.set_property("year", serde_json::json!(2024));
574        kg.update_entity(&updated).unwrap();
575
576        // Delete entity
577        kg.delete_entity(id).unwrap();
578        let entities = kg.list_entities(None, None).unwrap();
579        assert_eq!(entities.len(), 0);
580    }
581
582    #[test]
583    fn test_graph_traversal() {
584        let kg = KnowledgeGraph::open_in_memory().unwrap();
585
586        // Create entities
587        let id1 = kg.insert_entity(&Entity::new("paper", "Paper 1")).unwrap();
588        let id2 = kg.insert_entity(&Entity::new("paper", "Paper 2")).unwrap();
589        let id3 = kg.insert_entity(&Entity::new("paper", "Paper 3")).unwrap();
590
591        // Create relations
592        kg.insert_relation(&Relation::new(id1, id2, "cites", 0.8).unwrap())
593            .unwrap();
594        kg.insert_relation(&Relation::new(id2, id3, "cites", 0.9).unwrap())
595            .unwrap();
596
597        // Get neighbors depth 1
598        let neighbors = kg.get_neighbors(id1, 1).unwrap();
599        assert_eq!(neighbors.len(), 1);
600
601        // Get neighbors depth 2
602        let neighbors = kg.get_neighbors(id1, 2).unwrap();
603        assert_eq!(neighbors.len(), 2);
604    }
605
606    #[test]
607    fn test_vector_search() {
608        let kg = KnowledgeGraph::open_in_memory().unwrap();
609
610        // Create entities
611        let id1 = kg.insert_entity(&Entity::new("paper", "Paper 1")).unwrap();
612        let id2 = kg.insert_entity(&Entity::new("paper", "Paper 2")).unwrap();
613
614        // Insert vectors
615        kg.insert_vector(id1, vec![1.0, 0.0, 0.0]).unwrap();
616        kg.insert_vector(id2, vec![0.0, 1.0, 0.0]).unwrap();
617
618        // Search
619        let results = kg.search_vectors(vec![1.0, 0.0, 0.0], 2).unwrap();
620        assert_eq!(results.len(), 2);
621        assert_eq!(results[0].entity_id, id1);
622    }
623}