1use crate::{
7 core::{Document, Entity, KnowledgeGraph, Relationship},
8 entity::EntityExtractor,
9 text::TextProcessor,
10 Result,
11};
12
13#[cfg(feature = "parallel-processing")]
14use rayon::prelude::*;
15
16use std::collections::HashMap;
17
18#[cfg(feature = "async")]
20pub mod incremental;
21
22pub mod analytics;
23pub mod embeddings;
24pub mod temporal;
25pub mod traversal;
26
27#[cfg(feature = "async")]
29pub mod hierarchical_relationships;
30
31#[cfg(feature = "pagerank")]
33pub mod pagerank;
34
35#[cfg(feature = "leiden")]
37pub mod leiden;
38
39#[cfg(feature = "async")]
40pub use incremental::{ConflictStrategy, IncrementalGraphManager, IncrementalStatistics};
41
42pub use analytics::{CentralityScores, Community, GraphAnalytics, Path};
43
44pub use embeddings::{
45 Aggregator, EmbeddingConfig, EmbeddingGraph, GraphSAGE, GraphSAGEConfig, Node2Vec,
46};
47
48pub use temporal::{
49 EvolutionMetrics, Snapshot, TemporalAnalytics, TemporalEdge, TemporalGraph, TemporalQuery,
50};
51
52pub use traversal::{GraphTraversal, TraversalConfig, TraversalResult};
53
54#[cfg(feature = "pagerank")]
56pub use pagerank::{MultiModalScores, PageRankConfig, PersonalizedPageRank, ScoreWeights};
57
58#[cfg(feature = "leiden")]
60pub use leiden::{EntityMetadata, HierarchicalCommunities, LeidenCommunityDetector, LeidenConfig};
61
62#[cfg(feature = "async")]
64pub use hierarchical_relationships::{
65 HierarchyBuilder, HierarchyLevel, RelationshipCluster, RelationshipHierarchy,
66};
67
68pub struct GraphBuilder {
70 text_processor: TextProcessor,
71 entity_extractor: EntityExtractor,
72 similarity_threshold: f32,
73 max_connections: usize,
74}
75
76impl GraphBuilder {
77 pub fn new(
79 chunk_size: usize,
80 chunk_overlap: usize,
81 min_confidence: f32,
82 similarity_threshold: f32,
83 max_connections: usize,
84 ) -> Result<Self> {
85 Ok(Self {
86 text_processor: TextProcessor::new(chunk_size, chunk_overlap)?,
87 entity_extractor: EntityExtractor::new(min_confidence)?,
88 similarity_threshold,
89 max_connections,
90 })
91 }
92
93 pub fn build_graph(&mut self, documents: Vec<Document>) -> Result<KnowledgeGraph> {
95 let mut graph = KnowledgeGraph::new();
96
97 #[cfg(feature = "parallel-processing")]
99 let processed_docs: Result<Vec<_>> = documents
100 .into_par_iter()
101 .map(|doc| self.process_document(doc))
102 .collect();
103
104 #[cfg(not(feature = "parallel-processing"))]
105 let processed_docs: Result<Vec<_>> = documents
106 .into_iter()
107 .map(|doc| self.process_document(doc))
108 .collect();
109
110 let processed_docs = processed_docs?;
111
112 for (doc, chunks, entities) in processed_docs {
114 let updated_doc = doc.with_chunks(chunks);
115
116 let mut entity_map = HashMap::new();
118 for entity in entities {
119 let node_idx = graph.add_entity(entity.clone())?;
120 entity_map.insert(entity.id.clone(), node_idx);
121 }
122
123 for chunk in &updated_doc.chunks {
125 let chunk_entities: Vec<Entity> = chunk
126 .entities
127 .iter()
128 .filter_map(|id| graph.get_entity(id).cloned())
129 .collect();
130
131 let relationships = self
132 .entity_extractor
133 .extract_relationships(&chunk_entities, chunk)?;
134
135 for (source_id, target_id, relation_type) in relationships {
136 let relationship = Relationship {
137 source: source_id,
138 target: target_id,
139 relation_type,
140 confidence: 0.8, context: vec![chunk.id.clone()],
142 embedding: None,
143 temporal_type: None,
144 temporal_range: None,
145 causal_strength: None,
146 };
147
148 graph.add_relationship(relationship)?;
149 }
150 }
151
152 graph.add_document(updated_doc)?;
153 }
154
155 self.build_semantic_connections(&mut graph)?;
157
158 Ok(graph)
159 }
160
161 fn process_document(
163 &self,
164 document: Document,
165 ) -> Result<(Document, Vec<crate::core::TextChunk>, Vec<Entity>)> {
166 let chunks = self.text_processor.chunk_text(&document)?;
168
169 #[cfg(feature = "parallel-processing")]
171 let entities_per_chunk: Result<Vec<_>> = chunks
172 .par_iter()
173 .map(|chunk| self.entity_extractor.extract_from_chunk(chunk))
174 .collect();
175
176 #[cfg(not(feature = "parallel-processing"))]
177 let entities_per_chunk: Result<Vec<_>> = chunks
178 .iter()
179 .map(|chunk| self.entity_extractor.extract_from_chunk(chunk))
180 .collect();
181
182 let entities_per_chunk = entities_per_chunk?;
183
184 let mut all_entities = Vec::new();
186 let mut entity_to_chunks = HashMap::new();
187
188 for (chunk_idx, entities) in entities_per_chunk.into_iter().enumerate() {
189 for entity in entities {
190 let entity_id = entity.id.clone();
191
192 entity_to_chunks
194 .entry(entity_id.clone())
195 .or_insert_with(Vec::new)
196 .push(chunk_idx);
197
198 all_entities.push(entity);
199 }
200 }
201
202 let deduplicated_entities = self.deduplicate_and_merge_entities(all_entities)?;
204
205 let mut updated_chunks = chunks;
207 for (entity_id, chunk_indices) in entity_to_chunks {
208 for &chunk_idx in &chunk_indices {
209 if chunk_idx < updated_chunks.len() {
210 updated_chunks[chunk_idx].entities.push(entity_id.clone());
211 }
212 }
213 }
214
215 Ok((document, updated_chunks, deduplicated_entities))
216 }
217
218 fn deduplicate_and_merge_entities(&self, entities: Vec<Entity>) -> Result<Vec<Entity>> {
220 let mut entity_map: HashMap<String, Entity> = HashMap::new();
221
222 for entity in entities {
223 let key = format!("{}_{}", entity.entity_type, entity.name.to_lowercase());
224
225 match entity_map.get_mut(&key) {
226 Some(existing) => {
227 existing.mentions.extend(entity.mentions);
229 if entity.confidence > existing.confidence {
231 existing.confidence = entity.confidence;
232 }
233 },
234 None => {
235 entity_map.insert(key, entity);
236 },
237 }
238 }
239
240 Ok(entity_map.into_values().collect())
241 }
242
243 fn build_semantic_connections(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
245 let entities: Vec<Entity> = graph.entities().cloned().collect();
246
247 for (i, entity1) in entities.iter().enumerate() {
249 if let Some(embedding1) = &entity1.embedding {
250 let mut similarities = Vec::new();
251
252 for (j, entity2) in entities.iter().enumerate() {
253 if i != j {
254 if let Some(embedding2) = &entity2.embedding {
255 let similarity = self.cosine_similarity(embedding1, embedding2);
256 if similarity > self.similarity_threshold {
257 similarities.push((j, similarity));
258 }
259 }
260 }
261 }
262
263 similarities
265 .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
266 similarities.truncate(self.max_connections);
267
268 for (j, similarity) in similarities {
270 let entity2 = &entities[j];
271 let relationship = Relationship {
272 source: entity1.id.clone(),
273 target: entity2.id.clone(),
274 relation_type: "SEMANTICALLY_SIMILAR".to_string(),
275 confidence: similarity,
276 context: Vec::new(),
277 embedding: None,
278 temporal_type: None,
279 temporal_range: None,
280 causal_strength: None,
281 };
282
283 graph.add_relationship(relationship)?;
284 }
285 }
286 }
287
288 Ok(())
289 }
290
291 fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
293 if a.len() != b.len() {
294 return 0.0;
295 }
296
297 let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
298 let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
299 let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
300
301 if norm_a == 0.0 || norm_b == 0.0 {
302 0.0
303 } else {
304 dot_product / (norm_a * norm_b)
305 }
306 }
307
308 pub fn add_entity_embeddings(
310 &mut self,
311 graph: &mut KnowledgeGraph,
312 embedding_fn: impl Fn(&str) -> Result<Vec<f32>>,
313 ) -> Result<()> {
314 for entity in graph.entities() {
318 if entity.embedding.is_none() {
319 let _embedding = embedding_fn(&entity.name)?;
320 }
324 }
325
326 Ok(())
327 }
328
329 pub fn analyze_graph(&self, graph: &KnowledgeGraph) -> GraphStatistics {
331 let entity_count = graph.entities().count();
332 let document_count = graph.documents().count();
333 let chunk_count = graph.chunks().count();
334
335 let entity_types: HashMap<String, usize> =
336 graph.entities().fold(HashMap::new(), |mut acc, entity| {
337 *acc.entry(entity.entity_type.clone()).or_insert(0) += 1;
338 acc
339 });
340
341 GraphStatistics {
342 entity_count,
343 document_count,
344 chunk_count,
345 entity_types,
346 average_entities_per_chunk: if chunk_count > 0 {
347 entity_count as f32 / chunk_count as f32
348 } else {
349 0.0
350 },
351 }
352 }
353}
354
355#[derive(Debug)]
357pub struct GraphStatistics {
358 pub entity_count: usize,
360 pub document_count: usize,
362 pub chunk_count: usize,
364 pub entity_types: HashMap<String, usize>,
366 pub average_entities_per_chunk: f32,
368}
369
370impl GraphStatistics {
371 #[allow(dead_code)]
373 pub fn print(&self) {
374 #[cfg(feature = "tracing")]
375 tracing::info!("Graph Statistics:");
376 #[cfg(feature = "tracing")]
377 tracing::info!(" Documents: {}", self.document_count);
378 #[cfg(feature = "tracing")]
379 tracing::info!(" Chunks: {}", self.chunk_count);
380 #[cfg(feature = "tracing")]
381 tracing::info!(" Entities: {}", self.entity_count);
382 #[cfg(feature = "tracing")]
383 tracing::info!(
384 " Average entities per chunk: {:.2}",
385 self.average_entities_per_chunk
386 );
387 #[cfg(feature = "tracing")]
388 tracing::info!(" Entity types:");
389 #[cfg(feature = "tracing")]
390 for (entity_type, count) in &self.entity_types {
391 tracing::info!(" {entity_type}: {count}");
392 }
393 }
394}
395
396#[cfg(test)]
397mod tests {
398 use super::*;
399 use crate::core::{Document, DocumentId};
400
401 #[test]
402 fn test_graph_building() {
403 let mut builder = GraphBuilder::new(500, 100, 0.7, 0.8, 5).unwrap();
404
405 let documents = vec![
406 Document::new(
407 DocumentId::new("doc1".to_string()),
408 "Test Document 1".to_string(),
409 "John Smith works at Acme Corp. The company is based in New York.".to_string(),
410 ),
411 Document::new(
412 DocumentId::new("doc2".to_string()),
413 "Test Document 2".to_string(),
414 "Jane Doe is a professor at MIT. She lives in Boston.".to_string(),
415 ),
416 ];
417
418 let graph = builder.build_graph(documents).unwrap();
419 let stats = builder.analyze_graph(&graph);
420
421 assert!(stats.entity_count > 0);
422 assert_eq!(stats.document_count, 2);
423 assert!(stats.chunk_count >= 2);
424 }
425
426 #[test]
427 fn test_cosine_similarity() {
428 let builder = GraphBuilder::new(500, 100, 0.7, 0.8, 5).unwrap();
429
430 let vec1 = vec![1.0, 0.0, 0.0];
431 let vec2 = vec![1.0, 0.0, 0.0];
432 let vec3 = vec![0.0, 1.0, 0.0];
433
434 assert!((builder.cosine_similarity(&vec1, &vec2) - 1.0).abs() < 0.001);
435 assert!((builder.cosine_similarity(&vec1, &vec3) - 0.0).abs() < 0.001);
436 }
437}