//! graphrag_core/graph/mod.rs — knowledge-graph construction and analysis.

1use crate::{
2    core::{Document, Entity, KnowledgeGraph, Relationship},
3    entity::EntityExtractor,
4    text::TextProcessor,
5    Result,
6};
7
8#[cfg(feature = "parallel-processing")]
9use rayon::prelude::*;
10
11use std::collections::HashMap;
12
13// Incremental updates require async feature
14#[cfg(feature = "async")]
15pub mod incremental;
16
17pub mod analytics;
18pub mod embeddings;
19pub mod temporal;
20pub mod traversal;
21
22// PageRank module is only available when the feature is enabled
23#[cfg(feature = "pagerank")]
24pub mod pagerank;
25
26// Leiden community detection (feature-gated)
27#[cfg(feature = "leiden")]
28pub mod leiden;
29
30#[cfg(feature = "async")]
31pub use incremental::{
32    ConflictStrategy, IncrementalGraphManager, IncrementalStatistics,
33};
34
35pub use analytics::{
36    Community, CentralityScores, Path, GraphAnalytics,
37};
38
39pub use embeddings::{
40    EmbeddingConfig, EmbeddingGraph, Node2Vec, GraphSAGE,
41    GraphSAGEConfig, Aggregator,
42};
43
44pub use temporal::{
45    TemporalGraph, TemporalEdge, Snapshot, TemporalQuery,
46    TemporalAnalytics, EvolutionMetrics,
47};
48
49pub use traversal::{
50    GraphTraversal, TraversalConfig, TraversalResult,
51};
52
53// PageRank exports are only available when the feature is enabled
54#[cfg(feature = "pagerank")]
55pub use pagerank::{MultiModalScores, PageRankConfig, PersonalizedPageRank, ScoreWeights};
56
57// Leiden exports are only available when the feature is enabled
58#[cfg(feature = "leiden")]
59pub use leiden::{
60    EntityMetadata, HierarchicalCommunities, LeidenConfig, LeidenCommunityDetector,
61};
62
63/// Graph builder for constructing knowledge graphs from documents
64pub struct GraphBuilder {
65    text_processor: TextProcessor,
66    entity_extractor: EntityExtractor,
67    similarity_threshold: f32,
68    max_connections: usize,
69}
70
71impl GraphBuilder {
72    /// Create a new graph builder
73    pub fn new(
74        chunk_size: usize,
75        chunk_overlap: usize,
76        min_confidence: f32,
77        similarity_threshold: f32,
78        max_connections: usize,
79    ) -> Result<Self> {
80        Ok(Self {
81            text_processor: TextProcessor::new(chunk_size, chunk_overlap)?,
82            entity_extractor: EntityExtractor::new(min_confidence)?,
83            similarity_threshold,
84            max_connections,
85        })
86    }
87
88    /// Build a knowledge graph from a collection of documents
89    pub fn build_graph(&mut self, documents: Vec<Document>) -> Result<KnowledgeGraph> {
90        let mut graph = KnowledgeGraph::new();
91
92        // Process documents (in parallel if feature enabled)
93        #[cfg(feature = "parallel-processing")]
94        let processed_docs: Result<Vec<_>> = documents
95            .into_par_iter()
96            .map(|doc| self.process_document(doc))
97            .collect();
98
99        #[cfg(not(feature = "parallel-processing"))]
100        let processed_docs: Result<Vec<_>> = documents
101            .into_iter()
102            .map(|doc| self.process_document(doc))
103            .collect();
104
105        let processed_docs = processed_docs?;
106
107        // Add all documents and their chunks to the graph
108        for (doc, chunks, entities) in processed_docs {
109            let updated_doc = doc.with_chunks(chunks);
110
111            // Add entities to the graph
112            let mut entity_map = HashMap::new();
113            for entity in entities {
114                let node_idx = graph.add_entity(entity.clone())?;
115                entity_map.insert(entity.id.clone(), node_idx);
116            }
117
118            // Extract and add relationships
119            for chunk in &updated_doc.chunks {
120                let chunk_entities: Vec<Entity> = chunk
121                    .entities
122                    .iter()
123                    .filter_map(|id| graph.get_entity(id).cloned())
124                    .collect();
125
126                let relationships = self
127                    .entity_extractor
128                    .extract_relationships(&chunk_entities, chunk)?;
129
130                for (source_id, target_id, relation_type) in relationships {
131                    let relationship = Relationship {
132                        source: source_id,
133                        target: target_id,
134                        relation_type,
135                        confidence: 0.8, // Default confidence for extracted relationships
136                        context: vec![chunk.id.clone()],
137                    };
138
139                    graph.add_relationship(relationship)?;
140                }
141            }
142
143            graph.add_document(updated_doc)?;
144        }
145
146        // Build semantic similarity connections
147        self.build_semantic_connections(&mut graph)?;
148
149        Ok(graph)
150    }
151
152    /// Process a single document
153    fn process_document(
154        &self,
155        document: Document,
156    ) -> Result<(Document, Vec<crate::core::TextChunk>, Vec<Entity>)> {
157        // Chunk the document
158        let chunks = self.text_processor.chunk_text(&document)?;
159
160        // Extract entities from chunks (in parallel if feature enabled)
161        #[cfg(feature = "parallel-processing")]
162        let entities_per_chunk: Result<Vec<_>> = chunks
163            .par_iter()
164            .map(|chunk| self.entity_extractor.extract_from_chunk(chunk))
165            .collect();
166
167        #[cfg(not(feature = "parallel-processing"))]
168        let entities_per_chunk: Result<Vec<_>> = chunks
169            .iter()
170            .map(|chunk| self.entity_extractor.extract_from_chunk(chunk))
171            .collect();
172
173        let entities_per_chunk = entities_per_chunk?;
174
175        // Flatten and deduplicate entities
176        let mut all_entities = Vec::new();
177        let mut entity_to_chunks = HashMap::new();
178
179        for (chunk_idx, entities) in entities_per_chunk.into_iter().enumerate() {
180            for entity in entities {
181                let entity_id = entity.id.clone();
182
183                // Track which chunks contain this entity
184                entity_to_chunks
185                    .entry(entity_id.clone())
186                    .or_insert_with(Vec::new)
187                    .push(chunk_idx);
188
189                all_entities.push(entity);
190            }
191        }
192
193        // Deduplicate entities and merge mentions
194        let deduplicated_entities = self.deduplicate_and_merge_entities(all_entities)?;
195
196        // Update chunks with entity references
197        let mut updated_chunks = chunks;
198        for (entity_id, chunk_indices) in entity_to_chunks {
199            for &chunk_idx in &chunk_indices {
200                if chunk_idx < updated_chunks.len() {
201                    updated_chunks[chunk_idx].entities.push(entity_id.clone());
202                }
203            }
204        }
205
206        Ok((document, updated_chunks, deduplicated_entities))
207    }
208
209    /// Deduplicate entities and merge their mentions
210    fn deduplicate_and_merge_entities(&self, entities: Vec<Entity>) -> Result<Vec<Entity>> {
211        let mut entity_map: HashMap<String, Entity> = HashMap::new();
212
213        for entity in entities {
214            let key = format!("{}_{}", entity.entity_type, entity.name.to_lowercase());
215
216            match entity_map.get_mut(&key) {
217                Some(existing) => {
218                    // Merge mentions
219                    existing.mentions.extend(entity.mentions);
220                    // Take the highest confidence
221                    if entity.confidence > existing.confidence {
222                        existing.confidence = entity.confidence;
223                    }
224                }
225                None => {
226                    entity_map.insert(key, entity);
227                }
228            }
229        }
230
231        Ok(entity_map.into_values().collect())
232    }
233
234    /// Build semantic similarity connections between entities
235    fn build_semantic_connections(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
236        let entities: Vec<Entity> = graph.entities().cloned().collect();
237
238        // For entities with embeddings, find similar entities
239        for (i, entity1) in entities.iter().enumerate() {
240            if let Some(embedding1) = &entity1.embedding {
241                let mut similarities = Vec::new();
242
243                for (j, entity2) in entities.iter().enumerate() {
244                    if i != j {
245                        if let Some(embedding2) = &entity2.embedding {
246                            let similarity = self.cosine_similarity(embedding1, embedding2);
247                            if similarity > self.similarity_threshold {
248                                similarities.push((j, similarity));
249                            }
250                        }
251                    }
252                }
253
254                // Sort by similarity and take top connections
255                similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
256                similarities.truncate(self.max_connections);
257
258                // Add semantic similarity relationships
259                for (j, similarity) in similarities {
260                    let entity2 = &entities[j];
261                    let relationship = Relationship {
262                        source: entity1.id.clone(),
263                        target: entity2.id.clone(),
264                        relation_type: "SEMANTICALLY_SIMILAR".to_string(),
265                        confidence: similarity,
266                        context: Vec::new(),
267                    };
268
269                    graph.add_relationship(relationship)?;
270                }
271            }
272        }
273
274        Ok(())
275    }
276
277    /// Calculate cosine similarity between two vectors
278    fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
279        if a.len() != b.len() {
280            return 0.0;
281        }
282
283        let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
284        let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
285        let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
286
287        if norm_a == 0.0 || norm_b == 0.0 {
288            0.0
289        } else {
290            dot_product / (norm_a * norm_b)
291        }
292    }
293
294    /// Add embeddings to entities
295    pub fn add_entity_embeddings(
296        &mut self,
297        graph: &mut KnowledgeGraph,
298        embedding_fn: impl Fn(&str) -> Result<Vec<f32>>,
299    ) -> Result<()> {
300        // This would typically call an embedding service
301        // For now, we'll create placeholder embeddings
302
303        for entity in graph.entities() {
304            if entity.embedding.is_none() {
305                let _embedding = embedding_fn(&entity.name)?;
306                // Note: In a real implementation, you'd need to update the entity in the graph
307                // This requires a mutable reference to the entity, which is not available here
308                // You'd need to redesign the graph structure to allow updating entities
309            }
310        }
311
312        Ok(())
313    }
314
315    /// Analyze graph statistics
316    pub fn analyze_graph(&self, graph: &KnowledgeGraph) -> GraphStatistics {
317        let entity_count = graph.entities().count();
318        let document_count = graph.documents().count();
319        let chunk_count = graph.chunks().count();
320
321        let entity_types: HashMap<String, usize> =
322            graph.entities().fold(HashMap::new(), |mut acc, entity| {
323                *acc.entry(entity.entity_type.clone()).or_insert(0) += 1;
324                acc
325            });
326
327        GraphStatistics {
328            entity_count,
329            document_count,
330            chunk_count,
331            entity_types,
332            average_entities_per_chunk: if chunk_count > 0 {
333                entity_count as f32 / chunk_count as f32
334            } else {
335                0.0
336            },
337        }
338    }
339}
340
/// Statistics about the constructed graph
#[derive(Debug)]
pub struct GraphStatistics {
    /// Total number of entities
    pub entity_count: usize,
    /// Total number of documents
    pub document_count: usize,
    /// Total number of chunks
    pub chunk_count: usize,
    /// Count of entities by type
    pub entity_types: HashMap<String, usize>,
    /// Average number of entities per chunk
    pub average_entities_per_chunk: f32,
}
355
356impl GraphStatistics {
357    /// Print graph statistics
358    #[allow(dead_code)]
359    pub fn print(&self) {
360        tracing::info!("Graph Statistics:");
361        tracing::info!("  Documents: {}", self.document_count);
362        tracing::info!("  Chunks: {}", self.chunk_count);
363        tracing::info!("  Entities: {}", self.entity_count);
364        tracing::info!(
365            "  Average entities per chunk: {:.2}",
366            self.average_entities_per_chunk
367        );
368        tracing::info!("  Entity types:");
369        for (entity_type, count) in &self.entity_types {
370            tracing::info!("    {entity_type}: {count}");
371        }
372    }
373}
374
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::{Document, DocumentId};

    /// End-to-end smoke test: two small documents should produce a graph
    /// with entities, both documents, and at least one chunk each.
    #[test]
    fn test_graph_building() {
        let mut builder = GraphBuilder::new(500, 100, 0.7, 0.8, 5).unwrap();

        let documents = vec![
            Document::new(
                DocumentId::new("doc1".to_string()),
                "Test Document 1".to_string(),
                "John Smith works at Acme Corp. The company is based in New York.".to_string(),
            ),
            Document::new(
                DocumentId::new("doc2".to_string()),
                "Test Document 2".to_string(),
                "Jane Doe is a professor at MIT. She lives in Boston.".to_string(),
            ),
        ];

        let graph = builder.build_graph(documents).unwrap();
        let stats = builder.analyze_graph(&graph);

        assert!(stats.entity_count > 0);
        assert_eq!(stats.document_count, 2);
        assert!(stats.chunk_count >= 2);
    }

    /// Identical vectors have similarity 1.0; orthogonal vectors have 0.0.
    #[test]
    fn test_cosine_similarity() {
        let builder = GraphBuilder::new(500, 100, 0.7, 0.8, 5).unwrap();

        let vec1 = vec![1.0, 0.0, 0.0];
        let vec2 = vec![1.0, 0.0, 0.0];
        let vec3 = vec![0.0, 1.0, 0.0];

        assert!((builder.cosine_similarity(&vec1, &vec2) - 1.0).abs() < 0.001);
        assert!((builder.cosine_similarity(&vec1, &vec3) - 0.0).abs() < 0.001);
    }
}