1use crate::{
2 core::{Document, Entity, KnowledgeGraph, Relationship},
3 entity::EntityExtractor,
4 text::TextProcessor,
5 Result,
6};
7
8#[cfg(feature = "parallel-processing")]
9use rayon::prelude::*;
10
11use std::collections::HashMap;
12
13#[cfg(feature = "async")]
15pub mod incremental;
16
17pub mod analytics;
18pub mod embeddings;
19pub mod temporal;
20pub mod traversal;
21
22#[cfg(feature = "pagerank")]
24pub mod pagerank;
25
26#[cfg(feature = "leiden")]
28pub mod leiden;
29
30#[cfg(feature = "async")]
31pub use incremental::{
32 ConflictStrategy, IncrementalGraphManager, IncrementalStatistics,
33};
34
35pub use analytics::{
36 Community, CentralityScores, Path, GraphAnalytics,
37};
38
39pub use embeddings::{
40 EmbeddingConfig, EmbeddingGraph, Node2Vec, GraphSAGE,
41 GraphSAGEConfig, Aggregator,
42};
43
44pub use temporal::{
45 TemporalGraph, TemporalEdge, Snapshot, TemporalQuery,
46 TemporalAnalytics, EvolutionMetrics,
47};
48
49pub use traversal::{
50 GraphTraversal, TraversalConfig, TraversalResult,
51};
52
53#[cfg(feature = "pagerank")]
55pub use pagerank::{MultiModalScores, PageRankConfig, PersonalizedPageRank, ScoreWeights};
56
57#[cfg(feature = "leiden")]
59pub use leiden::{
60 EntityMetadata, HierarchicalCommunities, LeidenConfig, LeidenCommunityDetector,
61};
62
/// Builds a [`KnowledgeGraph`] from raw documents: chunks text, extracts
/// entities and relationships per chunk, and links semantically similar
/// entities via embedding cosine similarity.
pub struct GraphBuilder {
    /// Splits document text into (possibly overlapping) chunks.
    text_processor: TextProcessor,
    /// Extracts entities and relationships from individual chunks.
    entity_extractor: EntityExtractor,
    /// Minimum cosine similarity required to create a semantic edge.
    similarity_threshold: f32,
    /// Maximum number of semantic edges kept per entity.
    max_connections: usize,
}
70
71impl GraphBuilder {
72 pub fn new(
74 chunk_size: usize,
75 chunk_overlap: usize,
76 min_confidence: f32,
77 similarity_threshold: f32,
78 max_connections: usize,
79 ) -> Result<Self> {
80 Ok(Self {
81 text_processor: TextProcessor::new(chunk_size, chunk_overlap)?,
82 entity_extractor: EntityExtractor::new(min_confidence)?,
83 similarity_threshold,
84 max_connections,
85 })
86 }
87
88 pub fn build_graph(&mut self, documents: Vec<Document>) -> Result<KnowledgeGraph> {
90 let mut graph = KnowledgeGraph::new();
91
92 #[cfg(feature = "parallel-processing")]
94 let processed_docs: Result<Vec<_>> = documents
95 .into_par_iter()
96 .map(|doc| self.process_document(doc))
97 .collect();
98
99 #[cfg(not(feature = "parallel-processing"))]
100 let processed_docs: Result<Vec<_>> = documents
101 .into_iter()
102 .map(|doc| self.process_document(doc))
103 .collect();
104
105 let processed_docs = processed_docs?;
106
107 for (doc, chunks, entities) in processed_docs {
109 let updated_doc = doc.with_chunks(chunks);
110
111 let mut entity_map = HashMap::new();
113 for entity in entities {
114 let node_idx = graph.add_entity(entity.clone())?;
115 entity_map.insert(entity.id.clone(), node_idx);
116 }
117
118 for chunk in &updated_doc.chunks {
120 let chunk_entities: Vec<Entity> = chunk
121 .entities
122 .iter()
123 .filter_map(|id| graph.get_entity(id).cloned())
124 .collect();
125
126 let relationships = self
127 .entity_extractor
128 .extract_relationships(&chunk_entities, chunk)?;
129
130 for (source_id, target_id, relation_type) in relationships {
131 let relationship = Relationship {
132 source: source_id,
133 target: target_id,
134 relation_type,
135 confidence: 0.8, context: vec![chunk.id.clone()],
137 };
138
139 graph.add_relationship(relationship)?;
140 }
141 }
142
143 graph.add_document(updated_doc)?;
144 }
145
146 self.build_semantic_connections(&mut graph)?;
148
149 Ok(graph)
150 }
151
152 fn process_document(
154 &self,
155 document: Document,
156 ) -> Result<(Document, Vec<crate::core::TextChunk>, Vec<Entity>)> {
157 let chunks = self.text_processor.chunk_text(&document)?;
159
160 #[cfg(feature = "parallel-processing")]
162 let entities_per_chunk: Result<Vec<_>> = chunks
163 .par_iter()
164 .map(|chunk| self.entity_extractor.extract_from_chunk(chunk))
165 .collect();
166
167 #[cfg(not(feature = "parallel-processing"))]
168 let entities_per_chunk: Result<Vec<_>> = chunks
169 .iter()
170 .map(|chunk| self.entity_extractor.extract_from_chunk(chunk))
171 .collect();
172
173 let entities_per_chunk = entities_per_chunk?;
174
175 let mut all_entities = Vec::new();
177 let mut entity_to_chunks = HashMap::new();
178
179 for (chunk_idx, entities) in entities_per_chunk.into_iter().enumerate() {
180 for entity in entities {
181 let entity_id = entity.id.clone();
182
183 entity_to_chunks
185 .entry(entity_id.clone())
186 .or_insert_with(Vec::new)
187 .push(chunk_idx);
188
189 all_entities.push(entity);
190 }
191 }
192
193 let deduplicated_entities = self.deduplicate_and_merge_entities(all_entities)?;
195
196 let mut updated_chunks = chunks;
198 for (entity_id, chunk_indices) in entity_to_chunks {
199 for &chunk_idx in &chunk_indices {
200 if chunk_idx < updated_chunks.len() {
201 updated_chunks[chunk_idx].entities.push(entity_id.clone());
202 }
203 }
204 }
205
206 Ok((document, updated_chunks, deduplicated_entities))
207 }
208
209 fn deduplicate_and_merge_entities(&self, entities: Vec<Entity>) -> Result<Vec<Entity>> {
211 let mut entity_map: HashMap<String, Entity> = HashMap::new();
212
213 for entity in entities {
214 let key = format!("{}_{}", entity.entity_type, entity.name.to_lowercase());
215
216 match entity_map.get_mut(&key) {
217 Some(existing) => {
218 existing.mentions.extend(entity.mentions);
220 if entity.confidence > existing.confidence {
222 existing.confidence = entity.confidence;
223 }
224 }
225 None => {
226 entity_map.insert(key, entity);
227 }
228 }
229 }
230
231 Ok(entity_map.into_values().collect())
232 }
233
234 fn build_semantic_connections(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
236 let entities: Vec<Entity> = graph.entities().cloned().collect();
237
238 for (i, entity1) in entities.iter().enumerate() {
240 if let Some(embedding1) = &entity1.embedding {
241 let mut similarities = Vec::new();
242
243 for (j, entity2) in entities.iter().enumerate() {
244 if i != j {
245 if let Some(embedding2) = &entity2.embedding {
246 let similarity = self.cosine_similarity(embedding1, embedding2);
247 if similarity > self.similarity_threshold {
248 similarities.push((j, similarity));
249 }
250 }
251 }
252 }
253
254 similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
256 similarities.truncate(self.max_connections);
257
258 for (j, similarity) in similarities {
260 let entity2 = &entities[j];
261 let relationship = Relationship {
262 source: entity1.id.clone(),
263 target: entity2.id.clone(),
264 relation_type: "SEMANTICALLY_SIMILAR".to_string(),
265 confidence: similarity,
266 context: Vec::new(),
267 };
268
269 graph.add_relationship(relationship)?;
270 }
271 }
272 }
273
274 Ok(())
275 }
276
277 fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
279 if a.len() != b.len() {
280 return 0.0;
281 }
282
283 let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
284 let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
285 let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
286
287 if norm_a == 0.0 || norm_b == 0.0 {
288 0.0
289 } else {
290 dot_product / (norm_a * norm_b)
291 }
292 }
293
    /// Intended to populate embeddings for entities that lack one, using
    /// `embedding_fn` (entity name -> embedding vector).
    ///
    /// NOTE(review): the computed embedding is currently DISCARDED — it is
    /// bound to `_embedding` and never stored. `graph.entities()` appears to
    /// yield shared references (see `build_semantic_connections`, which calls
    /// `.cloned()` on it), so the entity cannot be mutated through this
    /// iterator. Writing the embedding back needs a mutable accessor on
    /// `KnowledgeGraph` — confirm intended behavior and fix upstream.
    ///
    /// # Errors
    /// Propagates the first error returned by `embedding_fn`.
    pub fn add_entity_embeddings(
        &mut self,
        graph: &mut KnowledgeGraph,
        embedding_fn: impl Fn(&str) -> Result<Vec<f32>>,
    ) -> Result<()> {
        for entity in graph.entities() {
            if entity.embedding.is_none() {
                // Computed but dropped — see NOTE in the doc comment above.
                let _embedding = embedding_fn(&entity.name)?;
            }
        }

        Ok(())
    }
314
315 pub fn analyze_graph(&self, graph: &KnowledgeGraph) -> GraphStatistics {
317 let entity_count = graph.entities().count();
318 let document_count = graph.documents().count();
319 let chunk_count = graph.chunks().count();
320
321 let entity_types: HashMap<String, usize> =
322 graph.entities().fold(HashMap::new(), |mut acc, entity| {
323 *acc.entry(entity.entity_type.clone()).or_insert(0) += 1;
324 acc
325 });
326
327 GraphStatistics {
328 entity_count,
329 document_count,
330 chunk_count,
331 entity_types,
332 average_entities_per_chunk: if chunk_count > 0 {
333 entity_count as f32 / chunk_count as f32
334 } else {
335 0.0
336 },
337 }
338 }
339}
340
/// Summary statistics for a [`KnowledgeGraph`], produced by
/// [`GraphBuilder::analyze_graph`].
#[derive(Debug)]
pub struct GraphStatistics {
    /// Total number of entity nodes in the graph.
    pub entity_count: usize,
    /// Total number of documents added to the graph.
    pub document_count: usize,
    /// Total number of text chunks across all documents.
    pub chunk_count: usize,
    /// Entity count per entity-type label.
    pub entity_types: HashMap<String, usize>,
    /// `entity_count / chunk_count`, or `0.0` when there are no chunks.
    pub average_entities_per_chunk: f32,
}
355
impl GraphStatistics {
    /// Logs every statistic at `info` level via `tracing`.
    #[allow(dead_code)]
    pub fn print(&self) {
        tracing::info!("Graph Statistics:");
        tracing::info!("  Documents: {}", self.document_count);
        tracing::info!("  Chunks: {}", self.chunk_count);
        tracing::info!("  Entities: {}", self.entity_count);
        tracing::info!(
            "  Average entities per chunk: {:.2}",
            self.average_entities_per_chunk
        );
        tracing::info!("  Entity types:");
        // Iteration order is HashMap order, i.e. unspecified.
        for (entity_type, count) in &self.entity_types {
            tracing::info!("    {entity_type}: {count}");
        }
    }
}
374
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::{Document, DocumentId};

    /// End-to-end smoke test: two small documents should produce a graph
    /// containing entities, both documents, and at least one chunk each.
    #[test]
    fn test_graph_building() {
        let mut builder = GraphBuilder::new(500, 100, 0.7, 0.8, 5).unwrap();

        let documents = vec![
            Document::new(
                DocumentId::new("doc1".to_string()),
                "Test Document 1".to_string(),
                "John Smith works at Acme Corp. The company is based in New York.".to_string(),
            ),
            Document::new(
                DocumentId::new("doc2".to_string()),
                "Test Document 2".to_string(),
                "Jane Doe is a professor at MIT. She lives in Boston.".to_string(),
            ),
        ];

        let graph = builder.build_graph(documents).unwrap();
        let stats = builder.analyze_graph(&graph);

        assert!(stats.entity_count > 0);
        assert_eq!(stats.document_count, 2);
        assert!(stats.chunk_count >= 2);
    }

    /// Identical unit vectors have similarity 1.0; orthogonal ones 0.0.
    #[test]
    fn test_cosine_similarity() {
        let builder = GraphBuilder::new(500, 100, 0.7, 0.8, 5).unwrap();

        let unit_x = vec![1.0, 0.0, 0.0];
        let unit_x_again = vec![1.0, 0.0, 0.0];
        let unit_y = vec![0.0, 1.0, 0.0];

        assert!((builder.cosine_similarity(&unit_x, &unit_x_again) - 1.0).abs() < 0.001);
        assert!((builder.cosine_similarity(&unit_x, &unit_y) - 0.0).abs() < 0.001);
    }
}