//! Knowledge-graph construction module (`graphrag_core/graph/mod.rs`).
1use crate::{
2    core::{Document, Entity, KnowledgeGraph, Relationship},
3    entity::EntityExtractor,
4    text::TextProcessor,
5    Result,
6};
7
8#[cfg(feature = "parallel-processing")]
9use rayon::prelude::*;
10
11use std::collections::HashMap;
12
13// Incremental updates require async feature
14#[cfg(feature = "async")]
15pub mod incremental;
16
17pub mod analytics;
18pub mod embeddings;
19pub mod temporal;
20pub mod traversal;
21
22// Hierarchical relationship clustering (Phase 3.1)
23#[cfg(feature = "async")]
24pub mod hierarchical_relationships;
25
26// PageRank module is only available when the feature is enabled
27#[cfg(feature = "pagerank")]
28pub mod pagerank;
29
30// Leiden community detection (feature-gated)
31#[cfg(feature = "leiden")]
32pub mod leiden;
33
34#[cfg(feature = "async")]
35pub use incremental::{ConflictStrategy, IncrementalGraphManager, IncrementalStatistics};
36
37pub use analytics::{CentralityScores, Community, GraphAnalytics, Path};
38
39pub use embeddings::{
40    Aggregator, EmbeddingConfig, EmbeddingGraph, GraphSAGE, GraphSAGEConfig, Node2Vec,
41};
42
43pub use temporal::{
44    EvolutionMetrics, Snapshot, TemporalAnalytics, TemporalEdge, TemporalGraph, TemporalQuery,
45};
46
47pub use traversal::{GraphTraversal, TraversalConfig, TraversalResult};
48
49// PageRank exports are only available when the feature is enabled
50#[cfg(feature = "pagerank")]
51pub use pagerank::{MultiModalScores, PageRankConfig, PersonalizedPageRank, ScoreWeights};
52
53// Leiden exports are only available when the feature is enabled
54#[cfg(feature = "leiden")]
55pub use leiden::{EntityMetadata, HierarchicalCommunities, LeidenCommunityDetector, LeidenConfig};
56
57// Hierarchical relationships exports are only available when async is enabled
58#[cfg(feature = "async")]
59pub use hierarchical_relationships::{
60    HierarchyBuilder, HierarchyLevel, RelationshipCluster, RelationshipHierarchy,
61};
62
/// Graph builder for constructing knowledge graphs from documents
pub struct GraphBuilder {
    /// Splits documents into (possibly overlapping) text chunks.
    text_processor: TextProcessor,
    /// Extracts entities and relationships from individual chunks.
    entity_extractor: EntityExtractor,
    /// Minimum cosine similarity for adding a SEMANTICALLY_SIMILAR edge.
    similarity_threshold: f32,
    /// Maximum number of semantic-similarity edges created per entity.
    max_connections: usize,
}
70
71impl GraphBuilder {
72    /// Create a new graph builder
73    pub fn new(
74        chunk_size: usize,
75        chunk_overlap: usize,
76        min_confidence: f32,
77        similarity_threshold: f32,
78        max_connections: usize,
79    ) -> Result<Self> {
80        Ok(Self {
81            text_processor: TextProcessor::new(chunk_size, chunk_overlap)?,
82            entity_extractor: EntityExtractor::new(min_confidence)?,
83            similarity_threshold,
84            max_connections,
85        })
86    }
87
88    /// Build a knowledge graph from a collection of documents
89    pub fn build_graph(&mut self, documents: Vec<Document>) -> Result<KnowledgeGraph> {
90        let mut graph = KnowledgeGraph::new();
91
92        // Process documents (in parallel if feature enabled)
93        #[cfg(feature = "parallel-processing")]
94        let processed_docs: Result<Vec<_>> = documents
95            .into_par_iter()
96            .map(|doc| self.process_document(doc))
97            .collect();
98
99        #[cfg(not(feature = "parallel-processing"))]
100        let processed_docs: Result<Vec<_>> = documents
101            .into_iter()
102            .map(|doc| self.process_document(doc))
103            .collect();
104
105        let processed_docs = processed_docs?;
106
107        // Add all documents and their chunks to the graph
108        for (doc, chunks, entities) in processed_docs {
109            let updated_doc = doc.with_chunks(chunks);
110
111            // Add entities to the graph
112            let mut entity_map = HashMap::new();
113            for entity in entities {
114                let node_idx = graph.add_entity(entity.clone())?;
115                entity_map.insert(entity.id.clone(), node_idx);
116            }
117
118            // Extract and add relationships
119            for chunk in &updated_doc.chunks {
120                let chunk_entities: Vec<Entity> = chunk
121                    .entities
122                    .iter()
123                    .filter_map(|id| graph.get_entity(id).cloned())
124                    .collect();
125
126                let relationships = self
127                    .entity_extractor
128                    .extract_relationships(&chunk_entities, chunk)?;
129
130                for (source_id, target_id, relation_type) in relationships {
131                    let relationship = Relationship {
132                        source: source_id,
133                        target: target_id,
134                        relation_type,
135                        confidence: 0.8, // Default confidence for extracted relationships
136                        context: vec![chunk.id.clone()],
137                        embedding: None,
138                        temporal_type: None,
139                        temporal_range: None,
140                        causal_strength: None,
141                    };
142
143                    graph.add_relationship(relationship)?;
144                }
145            }
146
147            graph.add_document(updated_doc)?;
148        }
149
150        // Build semantic similarity connections
151        self.build_semantic_connections(&mut graph)?;
152
153        Ok(graph)
154    }
155
156    /// Process a single document
157    fn process_document(
158        &self,
159        document: Document,
160    ) -> Result<(Document, Vec<crate::core::TextChunk>, Vec<Entity>)> {
161        // Chunk the document
162        let chunks = self.text_processor.chunk_text(&document)?;
163
164        // Extract entities from chunks (in parallel if feature enabled)
165        #[cfg(feature = "parallel-processing")]
166        let entities_per_chunk: Result<Vec<_>> = chunks
167            .par_iter()
168            .map(|chunk| self.entity_extractor.extract_from_chunk(chunk))
169            .collect();
170
171        #[cfg(not(feature = "parallel-processing"))]
172        let entities_per_chunk: Result<Vec<_>> = chunks
173            .iter()
174            .map(|chunk| self.entity_extractor.extract_from_chunk(chunk))
175            .collect();
176
177        let entities_per_chunk = entities_per_chunk?;
178
179        // Flatten and deduplicate entities
180        let mut all_entities = Vec::new();
181        let mut entity_to_chunks = HashMap::new();
182
183        for (chunk_idx, entities) in entities_per_chunk.into_iter().enumerate() {
184            for entity in entities {
185                let entity_id = entity.id.clone();
186
187                // Track which chunks contain this entity
188                entity_to_chunks
189                    .entry(entity_id.clone())
190                    .or_insert_with(Vec::new)
191                    .push(chunk_idx);
192
193                all_entities.push(entity);
194            }
195        }
196
197        // Deduplicate entities and merge mentions
198        let deduplicated_entities = self.deduplicate_and_merge_entities(all_entities)?;
199
200        // Update chunks with entity references
201        let mut updated_chunks = chunks;
202        for (entity_id, chunk_indices) in entity_to_chunks {
203            for &chunk_idx in &chunk_indices {
204                if chunk_idx < updated_chunks.len() {
205                    updated_chunks[chunk_idx].entities.push(entity_id.clone());
206                }
207            }
208        }
209
210        Ok((document, updated_chunks, deduplicated_entities))
211    }
212
213    /// Deduplicate entities and merge their mentions
214    fn deduplicate_and_merge_entities(&self, entities: Vec<Entity>) -> Result<Vec<Entity>> {
215        let mut entity_map: HashMap<String, Entity> = HashMap::new();
216
217        for entity in entities {
218            let key = format!("{}_{}", entity.entity_type, entity.name.to_lowercase());
219
220            match entity_map.get_mut(&key) {
221                Some(existing) => {
222                    // Merge mentions
223                    existing.mentions.extend(entity.mentions);
224                    // Take the highest confidence
225                    if entity.confidence > existing.confidence {
226                        existing.confidence = entity.confidence;
227                    }
228                },
229                None => {
230                    entity_map.insert(key, entity);
231                },
232            }
233        }
234
235        Ok(entity_map.into_values().collect())
236    }
237
238    /// Build semantic similarity connections between entities
239    fn build_semantic_connections(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
240        let entities: Vec<Entity> = graph.entities().cloned().collect();
241
242        // For entities with embeddings, find similar entities
243        for (i, entity1) in entities.iter().enumerate() {
244            if let Some(embedding1) = &entity1.embedding {
245                let mut similarities = Vec::new();
246
247                for (j, entity2) in entities.iter().enumerate() {
248                    if i != j {
249                        if let Some(embedding2) = &entity2.embedding {
250                            let similarity = self.cosine_similarity(embedding1, embedding2);
251                            if similarity > self.similarity_threshold {
252                                similarities.push((j, similarity));
253                            }
254                        }
255                    }
256                }
257
258                // Sort by similarity and take top connections
259                similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
260                similarities.truncate(self.max_connections);
261
262                // Add semantic similarity relationships
263                for (j, similarity) in similarities {
264                    let entity2 = &entities[j];
265                    let relationship = Relationship {
266                        source: entity1.id.clone(),
267                        target: entity2.id.clone(),
268                        relation_type: "SEMANTICALLY_SIMILAR".to_string(),
269                        confidence: similarity,
270                        context: Vec::new(),
271                        embedding: None,
272                        temporal_type: None,
273                        temporal_range: None,
274                        causal_strength: None,
275                    };
276
277                    graph.add_relationship(relationship)?;
278                }
279            }
280        }
281
282        Ok(())
283    }
284
285    /// Calculate cosine similarity between two vectors
286    fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
287        if a.len() != b.len() {
288            return 0.0;
289        }
290
291        let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
292        let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
293        let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
294
295        if norm_a == 0.0 || norm_b == 0.0 {
296            0.0
297        } else {
298            dot_product / (norm_a * norm_b)
299        }
300    }
301
302    /// Add embeddings to entities
303    pub fn add_entity_embeddings(
304        &mut self,
305        graph: &mut KnowledgeGraph,
306        embedding_fn: impl Fn(&str) -> Result<Vec<f32>>,
307    ) -> Result<()> {
308        // This would typically call an embedding service
309        // For now, we'll create placeholder embeddings
310
311        for entity in graph.entities() {
312            if entity.embedding.is_none() {
313                let _embedding = embedding_fn(&entity.name)?;
314                // Note: In a real implementation, you'd need to update the entity in the graph
315                // This requires a mutable reference to the entity, which is not available here
316                // You'd need to redesign the graph structure to allow updating entities
317            }
318        }
319
320        Ok(())
321    }
322
323    /// Analyze graph statistics
324    pub fn analyze_graph(&self, graph: &KnowledgeGraph) -> GraphStatistics {
325        let entity_count = graph.entities().count();
326        let document_count = graph.documents().count();
327        let chunk_count = graph.chunks().count();
328
329        let entity_types: HashMap<String, usize> =
330            graph.entities().fold(HashMap::new(), |mut acc, entity| {
331                *acc.entry(entity.entity_type.clone()).or_insert(0) += 1;
332                acc
333            });
334
335        GraphStatistics {
336            entity_count,
337            document_count,
338            chunk_count,
339            entity_types,
340            average_entities_per_chunk: if chunk_count > 0 {
341                entity_count as f32 / chunk_count as f32
342            } else {
343                0.0
344            },
345        }
346    }
347}
348
/// Statistics about the constructed graph
///
/// Plain value type: `Clone` and `PartialEq` are derived so callers can
/// snapshot and compare statistics (e.g. before/after an update).
#[derive(Debug, Clone, PartialEq)]
pub struct GraphStatistics {
    /// Total number of entities
    pub entity_count: usize,
    /// Total number of documents
    pub document_count: usize,
    /// Total number of chunks
    pub chunk_count: usize,
    /// Count of entities by type
    pub entity_types: HashMap<String, usize>,
    /// Average number of entities per chunk
    pub average_entities_per_chunk: f32,
}
363
364impl GraphStatistics {
365    /// Print graph statistics
366    #[allow(dead_code)]
367    pub fn print(&self) {
368        tracing::info!("Graph Statistics:");
369        tracing::info!("  Documents: {}", self.document_count);
370        tracing::info!("  Chunks: {}", self.chunk_count);
371        tracing::info!("  Entities: {}", self.entity_count);
372        tracing::info!(
373            "  Average entities per chunk: {:.2}",
374            self.average_entities_per_chunk
375        );
376        tracing::info!("  Entity types:");
377        for (entity_type, count) in &self.entity_types {
378            tracing::info!("    {entity_type}: {count}");
379        }
380    }
381}
382
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::{Document, DocumentId};

    /// End-to-end smoke test: two small documents should yield a graph
    /// containing entities, both documents, and at least one chunk each.
    #[test]
    fn test_graph_building() {
        let mut builder = GraphBuilder::new(500, 100, 0.7, 0.8, 5).unwrap();

        let doc1 = Document::new(
            DocumentId::new("doc1".to_string()),
            "Test Document 1".to_string(),
            "John Smith works at Acme Corp. The company is based in New York.".to_string(),
        );
        let doc2 = Document::new(
            DocumentId::new("doc2".to_string()),
            "Test Document 2".to_string(),
            "Jane Doe is a professor at MIT. She lives in Boston.".to_string(),
        );

        let graph = builder.build_graph(vec![doc1, doc2]).unwrap();
        let stats = builder.analyze_graph(&graph);

        assert!(stats.entity_count > 0);
        assert_eq!(stats.document_count, 2);
        assert!(stats.chunk_count >= 2);
    }

    /// Identical unit vectors have similarity 1; orthogonal vectors, 0.
    #[test]
    fn test_cosine_similarity() {
        let builder = GraphBuilder::new(500, 100, 0.7, 0.8, 5).unwrap();

        let x_axis = vec![1.0, 0.0, 0.0];
        let x_axis_copy = vec![1.0, 0.0, 0.0];
        let y_axis = vec![0.0, 1.0, 0.0];

        assert!((builder.cosine_similarity(&x_axis, &x_axis_copy) - 1.0).abs() < 0.001);
        assert!((builder.cosine_similarity(&x_axis, &y_axis) - 0.0).abs() < 0.001);
    }
}