Skip to main content

sqlite_knowledge_graph/
lib.rs

1//! SQLite-based Knowledge Graph Library
2//!
3//! This library provides a knowledge graph implementation built on SQLite with support for:
4//! - Entities with typed properties
5//! - Relations between entities with weights
6//! - Vector embeddings for semantic search
7//! - Custom SQLite functions for direct SQL operations
8//! - RAG (Retrieval-Augmented Generation) query functions
9//! - Graph algorithms (PageRank, Louvain, Connected Components)
10//!
11//! ## SQLite Extension
12//!
13//! This crate can be compiled as a SQLite loadable extension:
14//! ```bash
15//! cargo build --release
16//! sqlite3 db.db ".load ./target/release/libsqlite_knowledge_graph.dylib"
17//! sqlite3 db.db "SELECT kg_version();"
18//! ```
19
20pub mod algorithms;
21pub mod embed;
22pub mod error;
23pub mod export;
24pub mod extension;
25pub mod functions;
26pub mod graph;
27pub mod migrate;
28pub mod rag;
29pub mod schema;
30pub mod vector;
31
32pub use algorithms::{
33    analyze_graph, connected_components, louvain_communities, pagerank, CommunityResult,
34    PageRankConfig,
35};
36pub use embed::{
37    check_dependencies, get_entities_needing_embedding, EmbeddingConfig, EmbeddingGenerator,
38    EmbeddingStats,
39};
40pub use error::{Error, Result};
41pub use export::{D3ExportGraph, D3ExportMetadata, D3Link, D3Node, DotConfig};
42pub use extension::sqlite3_sqlite_knowledge_graph_init;
43pub use functions::register_functions;
44pub use graph::{Direction, GraphStats, PathStep, TraversalNode, TraversalPath, TraversalQuery};
45pub use graph::{Entity, Neighbor, Relation};
46pub use graph::{HigherOrderNeighbor, HigherOrderPath, HigherOrderPathStep, Hyperedge};
47pub use migrate::{
48    build_relationships, migrate_all, migrate_papers, migrate_skills, MigrationStats,
49};
50pub use rag::{embedder::Embedder, embedder::FixedEmbedder, RagConfig, RagEngine, RagResult};
51pub use schema::{create_schema, schema_exists};
52pub use vector::{cosine_similarity, SearchResult, VectorStore};
53pub use vector::{TurboQuantConfig, TurboQuantIndex, TurboQuantStats};
54
55use rusqlite::Connection;
56use serde::{Deserialize, Serialize};
57
58/// Semantic search result with entity information.
59#[derive(Debug, Clone, Serialize, Deserialize)]
60pub struct SearchResultWithEntity {
61    pub entity: Entity,
62    pub similarity: f32,
63}
64
65/// Graph context for an entity (root + neighbors).
66#[derive(Debug, Clone, Serialize, Deserialize)]
67pub struct GraphContext {
68    pub root_entity: Entity,
69    pub neighbors: Vec<Neighbor>,
70}
71
72/// Hybrid search result combining semantic similarity and graph context.
73#[derive(Debug, Clone, Serialize, Deserialize)]
74pub struct HybridSearchResult {
75    pub entity: Entity,
76    pub similarity: f32,
77    pub context: Option<GraphContext>,
78}
79
80/// Knowledge Graph Manager - main entry point for the library.
81#[derive(Debug)]
82pub struct KnowledgeGraph {
83    conn: Connection,
84}
85
86impl KnowledgeGraph {
87    /// Open a new knowledge graph database connection.
88    pub fn open<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
89        let conn = Connection::open(path)?;
90
91        // Enable foreign keys
92        conn.execute("PRAGMA foreign_keys = ON", [])?;
93
94        // Create schema if not exists
95        if !schema_exists(&conn)? {
96            create_schema(&conn)?;
97        }
98
99        // Register custom functions
100        register_functions(&conn)?;
101
102        Ok(Self { conn })
103    }
104
105    /// Open an in-memory knowledge graph (useful for testing).
106    pub fn open_in_memory() -> Result<Self> {
107        let conn = Connection::open_in_memory()?;
108
109        // Enable foreign keys
110        conn.execute("PRAGMA foreign_keys = ON", [])?;
111
112        // Create schema
113        create_schema(&conn)?;
114
115        // Register custom functions
116        register_functions(&conn)?;
117
118        Ok(Self { conn })
119    }
120
121    /// Get a reference to the underlying SQLite connection.
122    pub fn connection(&self) -> &Connection {
123        &self.conn
124    }
125
126    /// Begin a transaction for batch operations.
127    pub fn transaction(&self) -> Result<rusqlite::Transaction<'_>> {
128        Ok(self.conn.unchecked_transaction()?)
129    }
130
131    /// Insert an entity into the knowledge graph.
132    pub fn insert_entity(&self, entity: &Entity) -> Result<i64> {
133        graph::insert_entity(&self.conn, entity)
134    }
135
136    /// Get an entity by ID.
137    pub fn get_entity(&self, id: i64) -> Result<Entity> {
138        graph::get_entity(&self.conn, id)
139    }
140
141    /// List entities with optional filtering.
142    pub fn list_entities(
143        &self,
144        entity_type: Option<&str>,
145        limit: Option<i64>,
146    ) -> Result<Vec<Entity>> {
147        graph::list_entities(&self.conn, entity_type, limit)
148    }
149
150    /// Update an entity.
151    pub fn update_entity(&self, entity: &Entity) -> Result<()> {
152        graph::update_entity(&self.conn, entity)
153    }
154
155    /// Delete an entity.
156    pub fn delete_entity(&self, id: i64) -> Result<()> {
157        graph::delete_entity(&self.conn, id)
158    }
159
160    /// Insert a relation between entities.
161    pub fn insert_relation(&self, relation: &Relation) -> Result<i64> {
162        graph::insert_relation(&self.conn, relation)
163    }
164
165    /// Get neighbors of an entity using BFS traversal.
166    pub fn get_neighbors(&self, entity_id: i64, depth: u32) -> Result<Vec<Neighbor>> {
167        graph::get_neighbors(&self.conn, entity_id, depth)
168    }
169
170    /// Insert a vector embedding for an entity.
171    pub fn insert_vector(&self, entity_id: i64, vector: Vec<f32>) -> Result<()> {
172        let store = VectorStore::new();
173        store.insert_vector(&self.conn, entity_id, vector)
174    }
175
176    /// Search for similar entities using vector embeddings.
177    pub fn search_vectors(&self, query: Vec<f32>, k: usize) -> Result<Vec<SearchResult>> {
178        let store = VectorStore::new();
179        store.search_vectors(&self.conn, query, k)
180    }
181
182    // ========== TurboQuant Vector Index ==========
183
184    /// Create a TurboQuant index for fast approximate nearest neighbor search.
185    ///
186    /// TurboQuant provides:
187    /// - Instant indexing (no training required)
188    /// - 6x memory compression
189    /// - Near-zero accuracy loss
190    ///
191    /// # Arguments
192    /// * `config` - Optional configuration (uses defaults if None)
193    ///
194    /// # Example
195    /// ```ignore
196    /// let config = TurboQuantConfig {
197    ///     dimension: 384,
198    ///     bit_width: 3,
199    ///     seed: 42,
200    /// };
201    /// let mut index = kg.create_turboquant_index(Some(config))?;
202    ///
203    /// // Add vectors to index
204    /// for (entity_id, vector) in all_vectors {
205    ///     index.add_vector(entity_id, &vector)?;
206    /// }
207    ///
208    /// // Fast search
209    /// let results = index.search(&query_vector, 10)?;
210    /// ```
211    pub fn create_turboquant_index(
212        &self,
213        config: Option<TurboQuantConfig>,
214    ) -> Result<TurboQuantIndex> {
215        let config = config.unwrap_or_default();
216
217        TurboQuantIndex::new(config)
218    }
219
220    /// Build a TurboQuant index from all existing vectors in the database.
221    /// This is a convenience method that loads all vectors and indexes them.
222    pub fn build_turboquant_index(
223        &self,
224        config: Option<TurboQuantConfig>,
225    ) -> Result<TurboQuantIndex> {
226        // Get dimension from first vector
227        let dimension = self.get_vector_dimension()?.unwrap_or(384);
228
229        let config = config.unwrap_or(TurboQuantConfig {
230            dimension,
231            bit_width: 3,
232            seed: 42,
233        });
234
235        let mut index = TurboQuantIndex::new(config)?;
236
237        // Load all vectors
238        let vectors = self.load_all_vectors()?;
239
240        for (entity_id, vector) in vectors {
241            index.add_vector(entity_id, &vector)?;
242        }
243
244        Ok(index)
245    }
246
247    /// Get the dimension of stored vectors (if any exist).
248    fn get_vector_dimension(&self) -> Result<Option<usize>> {
249        let result = self
250            .conn
251            .query_row("SELECT dimension FROM kg_vectors LIMIT 1", [], |row| {
252                row.get::<_, i64>(0)
253            });
254
255        match result {
256            Ok(dim) => Ok(Some(dim as usize)),
257            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
258            Err(e) => Err(e.into()),
259        }
260    }
261
262    /// Load all vectors from the database.
263    fn load_all_vectors(&self) -> Result<Vec<(i64, Vec<f32>)>> {
264        let mut stmt = self
265            .conn
266            .prepare("SELECT entity_id, vector, dimension FROM kg_vectors")?;
267
268        let rows = stmt.query_map([], |row| {
269            let entity_id: i64 = row.get(0)?;
270            let vector_blob: Vec<u8> = row.get(1)?;
271            let dimension: i64 = row.get(2)?;
272
273            let mut vector = Vec::with_capacity(dimension as usize);
274            for chunk in vector_blob.chunks_exact(4) {
275                let bytes: [u8; 4] = chunk.try_into().unwrap();
276                vector.push(f32::from_le_bytes(bytes));
277            }
278
279            Ok((entity_id, vector))
280        })?;
281
282        let mut vectors = Vec::new();
283        for row in rows {
284            vectors.push(row?);
285        }
286
287        Ok(vectors)
288    }
289
290    // ========== Higher-Order Relations (Hyperedges) ==========
291
292    /// Insert a hyperedge (higher-order relation) into the knowledge graph.
293    pub fn insert_hyperedge(&self, hyperedge: &Hyperedge) -> Result<i64> {
294        graph::insert_hyperedge(&self.conn, hyperedge)
295    }
296
297    /// Get a hyperedge by ID.
298    pub fn get_hyperedge(&self, id: i64) -> Result<Hyperedge> {
299        graph::get_hyperedge(&self.conn, id)
300    }
301
302    /// List hyperedges with optional filtering.
303    pub fn list_hyperedges(
304        &self,
305        hyperedge_type: Option<&str>,
306        min_arity: Option<usize>,
307        max_arity: Option<usize>,
308        limit: Option<i64>,
309    ) -> Result<Vec<Hyperedge>> {
310        graph::list_hyperedges(&self.conn, hyperedge_type, min_arity, max_arity, limit)
311    }
312
313    /// Update a hyperedge.
314    pub fn update_hyperedge(&self, hyperedge: &Hyperedge) -> Result<()> {
315        graph::update_hyperedge(&self.conn, hyperedge)
316    }
317
318    /// Delete a hyperedge by ID.
319    pub fn delete_hyperedge(&self, id: i64) -> Result<()> {
320        graph::delete_hyperedge(&self.conn, id)
321    }
322
323    /// Get higher-order neighbors of an entity (connected through hyperedges).
324    pub fn get_higher_order_neighbors(
325        &self,
326        entity_id: i64,
327        min_arity: Option<usize>,
328        max_arity: Option<usize>,
329    ) -> Result<Vec<HigherOrderNeighbor>> {
330        graph::get_higher_order_neighbors(&self.conn, entity_id, min_arity, max_arity)
331    }
332
333    /// Get all hyperedges that an entity participates in.
334    pub fn get_entity_hyperedges(&self, entity_id: i64) -> Result<Vec<Hyperedge>> {
335        graph::get_entity_hyperedges(&self.conn, entity_id)
336    }
337
338    /// Higher-order BFS traversal through hyperedges.
339    pub fn kg_higher_order_bfs(
340        &self,
341        start_id: i64,
342        max_depth: u32,
343        min_arity: Option<usize>,
344    ) -> Result<Vec<TraversalNode>> {
345        graph::higher_order_bfs(&self.conn, start_id, max_depth, min_arity)
346    }
347
348    /// Find shortest path between two entities through hyperedges.
349    pub fn kg_higher_order_shortest_path(
350        &self,
351        from_id: i64,
352        to_id: i64,
353        max_depth: u32,
354    ) -> Result<Option<HigherOrderPath>> {
355        graph::higher_order_shortest_path(&self.conn, from_id, to_id, max_depth)
356    }
357
358    /// Compute hyperedge degree centrality for an entity.
359    pub fn kg_hyperedge_degree(&self, entity_id: i64) -> Result<f64> {
360        graph::hyperedge_degree(&self.conn, entity_id)
361    }
362
363    /// Compute entity-level hypergraph PageRank using Zhou formula.
364    pub fn kg_hypergraph_entity_pagerank(
365        &self,
366        damping: Option<f64>,
367        max_iter: Option<usize>,
368        tolerance: Option<f64>,
369    ) -> Result<std::collections::HashMap<i64, f64>> {
370        graph::hypergraph_entity_pagerank(
371            &self.conn,
372            damping.unwrap_or(0.85),
373            max_iter.unwrap_or(100),
374            tolerance.unwrap_or(1e-6),
375        )
376    }
377
378    // ========== RAG Query Functions ==========
379
380    /// Semantic search using vector embeddings.
381    /// Returns entities sorted by similarity score.
382    pub fn kg_semantic_search(
383        &self,
384        query_embedding: Vec<f32>,
385        k: usize,
386    ) -> Result<Vec<SearchResultWithEntity>> {
387        let results = self.search_vectors(query_embedding, k)?;
388
389        let mut entities_with_results = Vec::new();
390        for result in results {
391            let entity = self.get_entity(result.entity_id)?;
392            entities_with_results.push(SearchResultWithEntity {
393                entity,
394                similarity: result.similarity,
395            });
396        }
397
398        Ok(entities_with_results)
399    }
400
401    /// Get context around an entity using graph traversal.
402    /// Returns neighbors up to the specified depth.
403    pub fn kg_get_context(&self, entity_id: i64, depth: u32) -> Result<GraphContext> {
404        let root_entity = self.get_entity(entity_id)?;
405        let neighbors = self.get_neighbors(entity_id, depth)?;
406
407        Ok(GraphContext {
408            root_entity,
409            neighbors,
410        })
411    }
412
413    /// Hybrid search combining semantic search and graph context.
414    /// Performs semantic search first, then retrieves context for top-k results.
415    pub fn kg_hybrid_search(
416        &self,
417        _query_text: &str,
418        query_embedding: Vec<f32>,
419        k: usize,
420    ) -> Result<Vec<HybridSearchResult>> {
421        let semantic_results = self.kg_semantic_search(query_embedding, k)?;
422
423        let mut hybrid_results = Vec::new();
424        for result in semantic_results.iter() {
425            let entity_id = result.entity.id.ok_or(Error::EntityNotFound(0))?;
426            let context = self.kg_get_context(entity_id, 1)?; // Depth 1 context
427
428            hybrid_results.push(HybridSearchResult {
429                entity: result.entity.clone(),
430                similarity: result.similarity,
431                context: Some(context),
432            });
433        }
434
435        Ok(hybrid_results)
436    }
437
438    // ========== Graph Traversal Functions ==========
439
440    /// BFS traversal from a starting entity.
441    /// Returns all reachable entities within max_depth with depth information.
442    pub fn kg_bfs_traversal(
443        &self,
444        start_id: i64,
445        direction: Direction,
446        max_depth: u32,
447    ) -> Result<Vec<TraversalNode>> {
448        let query = TraversalQuery {
449            direction,
450            max_depth,
451            ..Default::default()
452        };
453        graph::bfs_traversal(&self.conn, start_id, query)
454    }
455
456    /// DFS traversal from a starting entity.
457    /// Returns all reachable entities within max_depth.
458    pub fn kg_dfs_traversal(
459        &self,
460        start_id: i64,
461        direction: Direction,
462        max_depth: u32,
463    ) -> Result<Vec<TraversalNode>> {
464        let query = TraversalQuery {
465            direction,
466            max_depth,
467            ..Default::default()
468        };
469        graph::dfs_traversal(&self.conn, start_id, query)
470    }
471
472    /// Find shortest path between two entities using BFS.
473    /// Returns the path with all intermediate steps (if exists).
474    pub fn kg_shortest_path(
475        &self,
476        from_id: i64,
477        to_id: i64,
478        max_depth: u32,
479    ) -> Result<Option<TraversalPath>> {
480        graph::find_shortest_path(&self.conn, from_id, to_id, max_depth)
481    }
482
483    /// Compute graph statistics.
484    pub fn kg_graph_stats(&self) -> Result<GraphStats> {
485        graph::compute_graph_stats(&self.conn)
486    }
487
488    // ========== Graph Algorithms ==========
489
490    /// Compute PageRank scores for all entities.
491    /// Returns a vector of (entity_id, score) sorted by score descending.
492    pub fn kg_pagerank(&self, config: Option<PageRankConfig>) -> Result<Vec<(i64, f64)>> {
493        algorithms::pagerank(&self.conn, config.unwrap_or_default())
494    }
495
496    /// Detect communities using Louvain algorithm.
497    /// Returns community memberships and modularity score.
498    pub fn kg_louvain(&self) -> Result<CommunityResult> {
499        algorithms::louvain_communities(&self.conn)
500    }
501
502    /// Find connected components in the graph.
503    /// Returns a list of components, each being a list of entity IDs.
504    pub fn kg_connected_components(&self) -> Result<Vec<Vec<i64>>> {
505        algorithms::connected_components(&self.conn)
506    }
507
508    /// Run full graph analysis (PageRank + Louvain + Connected Components).
509    pub fn kg_analyze(&self) -> Result<algorithms::GraphAnalysis> {
510        algorithms::analyze_graph(&self.conn)
511    }
512
513    // ========== Visualization Export ==========
514
515    /// Export the knowledge graph in D3.js JSON format.
516    ///
517    /// Returns a `D3ExportGraph` containing nodes, links, and metadata,
518    /// ready for use with D3.js force-directed graph visualizations.
519    ///
520    /// # Example
521    /// ```ignore
522    /// let graph = kg.export_json()?;
523    /// let json = serde_json::to_string_pretty(&graph)?;
524    /// std::fs::write("graph.json", json)?;
525    /// ```
526    pub fn export_json(&self) -> Result<D3ExportGraph> {
527        export::export_d3_json(&self.conn)
528    }
529
530    /// Export the knowledge graph in DOT (Graphviz) format.
531    ///
532    /// Returns a DOT format string that can be rendered with Graphviz tools
533    /// (`dot`, `neato`, `fdp`, etc.).
534    ///
535    /// # Example
536    /// ```ignore
537    /// let dot = kg.export_dot(&DotConfig::default())?;
538    /// std::fs::write("graph.dot", dot)?;
539    /// // Then: dot -Tpng graph.dot -o graph.png
540    /// ```
541    pub fn export_dot(&self, config: &DotConfig) -> Result<String> {
542        export::export_dot(&self.conn, config)
543    }
544}
545
546#[cfg(test)]
547mod tests {
548    use super::*;
549
550    #[test]
551    fn test_open_in_memory() {
552        let kg = KnowledgeGraph::open_in_memory().unwrap();
553        assert!(schema_exists(kg.connection()).unwrap());
554    }
555
556    #[test]
557    fn test_crud_operations() {
558        let kg = KnowledgeGraph::open_in_memory().unwrap();
559
560        // Create entity
561        let mut entity = Entity::new("paper", "Test Paper");
562        entity.set_property("author", serde_json::json!("John Doe"));
563        let id = kg.insert_entity(&entity).unwrap();
564
565        // Read entity
566        let retrieved = kg.get_entity(id).unwrap();
567        assert_eq!(retrieved.name, "Test Paper");
568
569        // List entities
570        let entities = kg.list_entities(Some("paper"), None).unwrap();
571        assert_eq!(entities.len(), 1);
572
573        // Update entity
574        let mut updated = retrieved.clone();
575        updated.set_property("year", serde_json::json!(2024));
576        kg.update_entity(&updated).unwrap();
577
578        // Delete entity
579        kg.delete_entity(id).unwrap();
580        let entities = kg.list_entities(None, None).unwrap();
581        assert_eq!(entities.len(), 0);
582    }
583
584    #[test]
585    fn test_graph_traversal() {
586        let kg = KnowledgeGraph::open_in_memory().unwrap();
587
588        // Create entities
589        let id1 = kg.insert_entity(&Entity::new("paper", "Paper 1")).unwrap();
590        let id2 = kg.insert_entity(&Entity::new("paper", "Paper 2")).unwrap();
591        let id3 = kg.insert_entity(&Entity::new("paper", "Paper 3")).unwrap();
592
593        // Create relations
594        kg.insert_relation(&Relation::new(id1, id2, "cites", 0.8).unwrap())
595            .unwrap();
596        kg.insert_relation(&Relation::new(id2, id3, "cites", 0.9).unwrap())
597            .unwrap();
598
599        // Get neighbors depth 1
600        let neighbors = kg.get_neighbors(id1, 1).unwrap();
601        assert_eq!(neighbors.len(), 1);
602
603        // Get neighbors depth 2
604        let neighbors = kg.get_neighbors(id1, 2).unwrap();
605        assert_eq!(neighbors.len(), 2);
606    }
607
608    #[test]
609    fn test_vector_search() {
610        let kg = KnowledgeGraph::open_in_memory().unwrap();
611
612        // Create entities
613        let id1 = kg.insert_entity(&Entity::new("paper", "Paper 1")).unwrap();
614        let id2 = kg.insert_entity(&Entity::new("paper", "Paper 2")).unwrap();
615
616        // Insert vectors
617        kg.insert_vector(id1, vec![1.0, 0.0, 0.0]).unwrap();
618        kg.insert_vector(id2, vec![0.0, 1.0, 0.0]).unwrap();
619
620        // Search
621        let results = kg.search_vectors(vec![1.0, 0.0, 0.0], 2).unwrap();
622        assert_eq!(results.len(), 2);
623        assert_eq!(results[0].entity_id, id1);
624    }
625}