1use crate::{
2 core::{Document, Entity, KnowledgeGraph, Relationship},
3 entity::EntityExtractor,
4 text::TextProcessor,
5 Result,
6};
7
8#[cfg(feature = "parallel-processing")]
9use rayon::prelude::*;
10
11use std::collections::HashMap;
12
13#[cfg(feature = "async")]
15pub mod incremental;
16
17pub mod analytics;
18pub mod embeddings;
19pub mod temporal;
20pub mod traversal;
21
22#[cfg(feature = "async")]
24pub mod hierarchical_relationships;
25
26#[cfg(feature = "pagerank")]
28pub mod pagerank;
29
30#[cfg(feature = "leiden")]
32pub mod leiden;
33
34#[cfg(feature = "async")]
35pub use incremental::{ConflictStrategy, IncrementalGraphManager, IncrementalStatistics};
36
37pub use analytics::{CentralityScores, Community, GraphAnalytics, Path};
38
39pub use embeddings::{
40 Aggregator, EmbeddingConfig, EmbeddingGraph, GraphSAGE, GraphSAGEConfig, Node2Vec,
41};
42
43pub use temporal::{
44 EvolutionMetrics, Snapshot, TemporalAnalytics, TemporalEdge, TemporalGraph, TemporalQuery,
45};
46
47pub use traversal::{GraphTraversal, TraversalConfig, TraversalResult};
48
49#[cfg(feature = "pagerank")]
51pub use pagerank::{MultiModalScores, PageRankConfig, PersonalizedPageRank, ScoreWeights};
52
53#[cfg(feature = "leiden")]
55pub use leiden::{EntityMetadata, HierarchicalCommunities, LeidenCommunityDetector, LeidenConfig};
56
57#[cfg(feature = "async")]
59pub use hierarchical_relationships::{
60 HierarchyBuilder, HierarchyLevel, RelationshipCluster, RelationshipHierarchy,
61};
62
/// Builds a `KnowledgeGraph` from raw documents: chunks text, extracts and
/// deduplicates entities, derives per-chunk relationships, and links
/// semantically similar entities by embedding cosine similarity.
pub struct GraphBuilder {
    /// Splits document text into (possibly overlapping) chunks.
    text_processor: TextProcessor,
    /// Extracts entities and relationships from individual chunks.
    entity_extractor: EntityExtractor,
    /// Minimum cosine similarity required before two entity embeddings
    /// are linked with a `SEMANTICALLY_SIMILAR` relationship.
    similarity_threshold: f32,
    /// Maximum number of semantic-similarity links created per entity.
    max_connections: usize,
}
70
71impl GraphBuilder {
72 pub fn new(
74 chunk_size: usize,
75 chunk_overlap: usize,
76 min_confidence: f32,
77 similarity_threshold: f32,
78 max_connections: usize,
79 ) -> Result<Self> {
80 Ok(Self {
81 text_processor: TextProcessor::new(chunk_size, chunk_overlap)?,
82 entity_extractor: EntityExtractor::new(min_confidence)?,
83 similarity_threshold,
84 max_connections,
85 })
86 }
87
88 pub fn build_graph(&mut self, documents: Vec<Document>) -> Result<KnowledgeGraph> {
90 let mut graph = KnowledgeGraph::new();
91
92 #[cfg(feature = "parallel-processing")]
94 let processed_docs: Result<Vec<_>> = documents
95 .into_par_iter()
96 .map(|doc| self.process_document(doc))
97 .collect();
98
99 #[cfg(not(feature = "parallel-processing"))]
100 let processed_docs: Result<Vec<_>> = documents
101 .into_iter()
102 .map(|doc| self.process_document(doc))
103 .collect();
104
105 let processed_docs = processed_docs?;
106
107 for (doc, chunks, entities) in processed_docs {
109 let updated_doc = doc.with_chunks(chunks);
110
111 let mut entity_map = HashMap::new();
113 for entity in entities {
114 let node_idx = graph.add_entity(entity.clone())?;
115 entity_map.insert(entity.id.clone(), node_idx);
116 }
117
118 for chunk in &updated_doc.chunks {
120 let chunk_entities: Vec<Entity> = chunk
121 .entities
122 .iter()
123 .filter_map(|id| graph.get_entity(id).cloned())
124 .collect();
125
126 let relationships = self
127 .entity_extractor
128 .extract_relationships(&chunk_entities, chunk)?;
129
130 for (source_id, target_id, relation_type) in relationships {
131 let relationship = Relationship {
132 source: source_id,
133 target: target_id,
134 relation_type,
135 confidence: 0.8, context: vec![chunk.id.clone()],
137 embedding: None,
138 temporal_type: None,
139 temporal_range: None,
140 causal_strength: None,
141 };
142
143 graph.add_relationship(relationship)?;
144 }
145 }
146
147 graph.add_document(updated_doc)?;
148 }
149
150 self.build_semantic_connections(&mut graph)?;
152
153 Ok(graph)
154 }
155
156 fn process_document(
158 &self,
159 document: Document,
160 ) -> Result<(Document, Vec<crate::core::TextChunk>, Vec<Entity>)> {
161 let chunks = self.text_processor.chunk_text(&document)?;
163
164 #[cfg(feature = "parallel-processing")]
166 let entities_per_chunk: Result<Vec<_>> = chunks
167 .par_iter()
168 .map(|chunk| self.entity_extractor.extract_from_chunk(chunk))
169 .collect();
170
171 #[cfg(not(feature = "parallel-processing"))]
172 let entities_per_chunk: Result<Vec<_>> = chunks
173 .iter()
174 .map(|chunk| self.entity_extractor.extract_from_chunk(chunk))
175 .collect();
176
177 let entities_per_chunk = entities_per_chunk?;
178
179 let mut all_entities = Vec::new();
181 let mut entity_to_chunks = HashMap::new();
182
183 for (chunk_idx, entities) in entities_per_chunk.into_iter().enumerate() {
184 for entity in entities {
185 let entity_id = entity.id.clone();
186
187 entity_to_chunks
189 .entry(entity_id.clone())
190 .or_insert_with(Vec::new)
191 .push(chunk_idx);
192
193 all_entities.push(entity);
194 }
195 }
196
197 let deduplicated_entities = self.deduplicate_and_merge_entities(all_entities)?;
199
200 let mut updated_chunks = chunks;
202 for (entity_id, chunk_indices) in entity_to_chunks {
203 for &chunk_idx in &chunk_indices {
204 if chunk_idx < updated_chunks.len() {
205 updated_chunks[chunk_idx].entities.push(entity_id.clone());
206 }
207 }
208 }
209
210 Ok((document, updated_chunks, deduplicated_entities))
211 }
212
213 fn deduplicate_and_merge_entities(&self, entities: Vec<Entity>) -> Result<Vec<Entity>> {
215 let mut entity_map: HashMap<String, Entity> = HashMap::new();
216
217 for entity in entities {
218 let key = format!("{}_{}", entity.entity_type, entity.name.to_lowercase());
219
220 match entity_map.get_mut(&key) {
221 Some(existing) => {
222 existing.mentions.extend(entity.mentions);
224 if entity.confidence > existing.confidence {
226 existing.confidence = entity.confidence;
227 }
228 },
229 None => {
230 entity_map.insert(key, entity);
231 },
232 }
233 }
234
235 Ok(entity_map.into_values().collect())
236 }
237
238 fn build_semantic_connections(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
240 let entities: Vec<Entity> = graph.entities().cloned().collect();
241
242 for (i, entity1) in entities.iter().enumerate() {
244 if let Some(embedding1) = &entity1.embedding {
245 let mut similarities = Vec::new();
246
247 for (j, entity2) in entities.iter().enumerate() {
248 if i != j {
249 if let Some(embedding2) = &entity2.embedding {
250 let similarity = self.cosine_similarity(embedding1, embedding2);
251 if similarity > self.similarity_threshold {
252 similarities.push((j, similarity));
253 }
254 }
255 }
256 }
257
258 similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
260 similarities.truncate(self.max_connections);
261
262 for (j, similarity) in similarities {
264 let entity2 = &entities[j];
265 let relationship = Relationship {
266 source: entity1.id.clone(),
267 target: entity2.id.clone(),
268 relation_type: "SEMANTICALLY_SIMILAR".to_string(),
269 confidence: similarity,
270 context: Vec::new(),
271 embedding: None,
272 temporal_type: None,
273 temporal_range: None,
274 causal_strength: None,
275 };
276
277 graph.add_relationship(relationship)?;
278 }
279 }
280 }
281
282 Ok(())
283 }
284
285 fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
287 if a.len() != b.len() {
288 return 0.0;
289 }
290
291 let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
292 let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
293 let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
294
295 if norm_a == 0.0 || norm_b == 0.0 {
296 0.0
297 } else {
298 dot_product / (norm_a * norm_b)
299 }
300 }
301
302 pub fn add_entity_embeddings(
304 &mut self,
305 graph: &mut KnowledgeGraph,
306 embedding_fn: impl Fn(&str) -> Result<Vec<f32>>,
307 ) -> Result<()> {
308 for entity in graph.entities() {
312 if entity.embedding.is_none() {
313 let _embedding = embedding_fn(&entity.name)?;
314 }
318 }
319
320 Ok(())
321 }
322
323 pub fn analyze_graph(&self, graph: &KnowledgeGraph) -> GraphStatistics {
325 let entity_count = graph.entities().count();
326 let document_count = graph.documents().count();
327 let chunk_count = graph.chunks().count();
328
329 let entity_types: HashMap<String, usize> =
330 graph.entities().fold(HashMap::new(), |mut acc, entity| {
331 *acc.entry(entity.entity_type.clone()).or_insert(0) += 1;
332 acc
333 });
334
335 GraphStatistics {
336 entity_count,
337 document_count,
338 chunk_count,
339 entity_types,
340 average_entities_per_chunk: if chunk_count > 0 {
341 entity_count as f32 / chunk_count as f32
342 } else {
343 0.0
344 },
345 }
346 }
347}
348
/// Summary statistics for a knowledge graph, produced by
/// `GraphBuilder::analyze_graph`.
#[derive(Debug)]
pub struct GraphStatistics {
    /// Total number of entities in the graph.
    pub entity_count: usize,
    /// Total number of documents added to the graph.
    pub document_count: usize,
    /// Total number of text chunks across all documents.
    pub chunk_count: usize,
    /// Entity count per entity-type name.
    pub entity_types: HashMap<String, usize>,
    /// `entity_count / chunk_count`, or 0.0 when there are no chunks.
    pub average_entities_per_chunk: f32,
}
363
impl GraphStatistics {
    /// Logs a human-readable summary of the statistics at `info` level
    /// via `tracing`.
    ///
    /// `#[allow(dead_code)]` keeps this diagnostic helper available even
    /// in build configurations where nothing calls it.
    #[allow(dead_code)]
    pub fn print(&self) {
        tracing::info!("Graph Statistics:");
        tracing::info!(" Documents: {}", self.document_count);
        tracing::info!(" Chunks: {}", self.chunk_count);
        tracing::info!(" Entities: {}", self.entity_count);
        tracing::info!(
            " Average entities per chunk: {:.2}",
            self.average_entities_per_chunk
        );
        tracing::info!(" Entity types:");
        // One line per entity type; HashMap iteration order is arbitrary.
        for (entity_type, count) in &self.entity_types {
            tracing::info!(" {entity_type}: {count}");
        }
    }
}
382
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::{Document, DocumentId};

    /// End-to-end smoke test: two small documents should yield a graph
    /// with at least one entity, both documents, and two or more chunks.
    #[test]
    fn test_graph_building() {
        let mut graph_builder = GraphBuilder::new(500, 100, 0.7, 0.8, 5).unwrap();

        // (id, title, body) fixtures, turned into `Document`s below.
        let fixtures = [
            (
                "doc1",
                "Test Document 1",
                "John Smith works at Acme Corp. The company is based in New York.",
            ),
            (
                "doc2",
                "Test Document 2",
                "Jane Doe is a professor at MIT. She lives in Boston.",
            ),
        ];

        let documents: Vec<Document> = fixtures
            .into_iter()
            .map(|(id, title, body)| {
                Document::new(
                    DocumentId::new(id.to_string()),
                    title.to_string(),
                    body.to_string(),
                )
            })
            .collect();

        let graph = graph_builder.build_graph(documents).unwrap();
        let stats = graph_builder.analyze_graph(&graph);

        assert!(stats.entity_count > 0);
        assert_eq!(stats.document_count, 2);
        assert!(stats.chunk_count >= 2);
    }

    /// Cosine similarity is 1 for identical vectors and 0 for orthogonal
    /// ones (within floating-point tolerance).
    #[test]
    fn test_cosine_similarity() {
        let graph_builder = GraphBuilder::new(500, 100, 0.7, 0.8, 5).unwrap();

        let x_axis = vec![1.0, 0.0, 0.0];
        let x_axis_again = vec![1.0, 0.0, 0.0];
        let y_axis = vec![0.0, 1.0, 0.0];

        assert!((graph_builder.cosine_similarity(&x_axis, &x_axis_again) - 1.0).abs() < 0.001);
        assert!((graph_builder.cosine_similarity(&x_axis, &y_axis) - 0.0).abs() < 0.001);
    }
}
424}