1use crate::{
2 core::{Document, Entity, KnowledgeGraph, Relationship},
3 entity::EntityExtractor,
4 text::TextProcessor,
5 Result,
6};
7
8#[cfg(feature = "parallel-processing")]
9use rayon::prelude::*;
10
11use std::collections::HashMap;
12
13#[cfg(feature = "async")]
15pub mod incremental;
16
17pub mod analytics;
18pub mod embeddings;
19pub mod temporal;
20pub mod traversal;
21
22#[cfg(feature = "pagerank")]
24pub mod pagerank;
25
26#[cfg(feature = "leiden")]
28pub mod leiden;
29
30#[cfg(feature = "async")]
31pub use incremental::{
32 ConflictStrategy, IncrementalGraphManager, IncrementalStatistics,
33};
34
35pub use analytics::{
36 Community, CentralityScores, Path, GraphAnalytics,
37};
38
39pub use embeddings::{
40 EmbeddingConfig, EmbeddingGraph, Node2Vec, GraphSAGE,
41 GraphSAGEConfig, Aggregator,
42};
43
44pub use temporal::{
45 TemporalGraph, TemporalEdge, Snapshot, TemporalQuery,
46 TemporalAnalytics, EvolutionMetrics,
47};
48
49pub use traversal::{
50 GraphTraversal, TraversalConfig, TraversalResult,
51};
52
53#[cfg(feature = "pagerank")]
55pub use pagerank::{MultiModalScores, PageRankConfig, PersonalizedPageRank, ScoreWeights};
56
57#[cfg(feature = "leiden")]
59pub use leiden::{
60 EntityMetadata, HierarchicalCommunities, LeidenConfig, LeidenCommunityDetector,
61};
62
/// Builds a [`KnowledgeGraph`] from raw documents: chunks text, extracts
/// entities and relationships per chunk, and links semantically similar
/// entities via embedding cosine similarity.
pub struct GraphBuilder {
    /// Splits document text into (possibly overlapping) chunks.
    text_processor: TextProcessor,
    /// Extracts entities and relationships from individual chunks.
    entity_extractor: EntityExtractor,
    /// Minimum cosine similarity required to create a semantic edge.
    similarity_threshold: f32,
    /// Maximum number of semantic edges kept per entity.
    max_connections: usize,
}
70
71impl GraphBuilder {
72 pub fn new(
74 chunk_size: usize,
75 chunk_overlap: usize,
76 min_confidence: f32,
77 similarity_threshold: f32,
78 max_connections: usize,
79 ) -> Result<Self> {
80 Ok(Self {
81 text_processor: TextProcessor::new(chunk_size, chunk_overlap)?,
82 entity_extractor: EntityExtractor::new(min_confidence)?,
83 similarity_threshold,
84 max_connections,
85 })
86 }
87
88 pub fn build_graph(&mut self, documents: Vec<Document>) -> Result<KnowledgeGraph> {
90 let mut graph = KnowledgeGraph::new();
91
92 #[cfg(feature = "parallel-processing")]
94 let processed_docs: Result<Vec<_>> = documents
95 .into_par_iter()
96 .map(|doc| self.process_document(doc))
97 .collect();
98
99 #[cfg(not(feature = "parallel-processing"))]
100 let processed_docs: Result<Vec<_>> = documents
101 .into_iter()
102 .map(|doc| self.process_document(doc))
103 .collect();
104
105 let processed_docs = processed_docs?;
106
107 for (doc, chunks, entities) in processed_docs {
109 let updated_doc = doc.with_chunks(chunks);
110
111 let mut entity_map = HashMap::new();
113 for entity in entities {
114 let node_idx = graph.add_entity(entity.clone())?;
115 entity_map.insert(entity.id.clone(), node_idx);
116 }
117
118 for chunk in &updated_doc.chunks {
120 let chunk_entities: Vec<Entity> = chunk
121 .entities
122 .iter()
123 .filter_map(|id| graph.get_entity(id).cloned())
124 .collect();
125
126 let relationships = self
127 .entity_extractor
128 .extract_relationships(&chunk_entities, chunk)?;
129
130 for (source_id, target_id, relation_type) in relationships {
131 let relationship = Relationship {
132 source: source_id,
133 target: target_id,
134 relation_type,
135 confidence: 0.8, context: vec![chunk.id.clone()],
137 };
138
139 graph.add_relationship(relationship)?;
140 }
141 }
142
143 graph.add_document(updated_doc)?;
144 }
145
146 self.build_semantic_connections(&mut graph)?;
148
149 Ok(graph)
150 }
151
152 fn process_document(
154 &self,
155 document: Document,
156 ) -> Result<(Document, Vec<crate::core::TextChunk>, Vec<Entity>)> {
157 let chunks = self.text_processor.chunk_text(&document)?;
159
160 #[cfg(feature = "parallel-processing")]
162 let entities_per_chunk: Result<Vec<_>> = chunks
163 .par_iter()
164 .map(|chunk| self.entity_extractor.extract_from_chunk(chunk))
165 .collect();
166
167 #[cfg(not(feature = "parallel-processing"))]
168 let entities_per_chunk: Result<Vec<_>> = chunks
169 .iter()
170 .map(|chunk| self.entity_extractor.extract_from_chunk(chunk))
171 .collect();
172
173 let entities_per_chunk = entities_per_chunk?;
174
175 let mut all_entities = Vec::new();
177 let mut entity_to_chunks = HashMap::new();
178
179 for (chunk_idx, entities) in entities_per_chunk.into_iter().enumerate() {
180 for entity in entities {
181 let entity_id = entity.id.clone();
182
183 entity_to_chunks
185 .entry(entity_id.clone())
186 .or_insert_with(Vec::new)
187 .push(chunk_idx);
188
189 all_entities.push(entity);
190 }
191 }
192
193 let deduplicated_entities = self.deduplicate_and_merge_entities(all_entities)?;
195
196 let mut updated_chunks = chunks;
198 for (entity_id, chunk_indices) in entity_to_chunks {
199 for &chunk_idx in &chunk_indices {
200 if chunk_idx < updated_chunks.len() {
201 updated_chunks[chunk_idx].entities.push(entity_id.clone());
202 }
203 }
204 }
205
206 Ok((document, updated_chunks, deduplicated_entities))
207 }
208
209 fn deduplicate_and_merge_entities(&self, entities: Vec<Entity>) -> Result<Vec<Entity>> {
211 let mut entity_map: HashMap<String, Entity> = HashMap::new();
212
213 for entity in entities {
214 let key = format!("{}_{}", entity.entity_type, entity.name.to_lowercase());
215
216 match entity_map.get_mut(&key) {
217 Some(existing) => {
218 existing.mentions.extend(entity.mentions);
220 if entity.confidence > existing.confidence {
222 existing.confidence = entity.confidence;
223 }
224 }
225 None => {
226 entity_map.insert(key, entity);
227 }
228 }
229 }
230
231 Ok(entity_map.into_values().collect())
232 }
233
234 fn build_semantic_connections(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
236 let entities: Vec<Entity> = graph.entities().cloned().collect();
237
238 for (i, entity1) in entities.iter().enumerate() {
240 if let Some(embedding1) = &entity1.embedding {
241 let mut similarities = Vec::new();
242
243 for (j, entity2) in entities.iter().enumerate() {
244 if i != j {
245 if let Some(embedding2) = &entity2.embedding {
246 let similarity = self.cosine_similarity(embedding1, embedding2);
247 if similarity > self.similarity_threshold {
248 similarities.push((j, similarity));
249 }
250 }
251 }
252 }
253
254 similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
256 similarities.truncate(self.max_connections);
257
258 for (j, similarity) in similarities {
260 let entity2 = &entities[j];
261 let relationship = Relationship {
262 source: entity1.id.clone(),
263 target: entity2.id.clone(),
264 relation_type: "SEMANTICALLY_SIMILAR".to_string(),
265 confidence: similarity,
266 context: Vec::new(),
267 };
268
269 graph.add_relationship(relationship)?;
270 }
271 }
272 }
273
274 Ok(())
275 }
276
277 fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
279 if a.len() != b.len() {
280 return 0.0;
281 }
282
283 let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
284 let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
285 let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
286
287 if norm_a == 0.0 || norm_b == 0.0 {
288 0.0
289 } else {
290 dot_product / (norm_a * norm_b)
291 }
292 }
293
    /// Intended to populate embeddings for entities that lack one, using
    /// `embedding_fn` (entity name -> embedding vector).
    ///
    /// NOTE(review): the computed embedding is currently DISCARDED — it is
    /// bound to `_embedding` and never stored. `graph.entities()` appears to
    /// yield shared references (see `build_semantic_connections`, which calls
    /// `.cloned()` on it), so the entity cannot be mutated through this
    /// iterator. Writing the embedding back needs a mutable accessor on
    /// `KnowledgeGraph` — confirm intended behavior and fix upstream.
    ///
    /// # Errors
    /// Propagates the first error returned by `embedding_fn`.
    pub fn add_entity_embeddings(
        &mut self,
        graph: &mut KnowledgeGraph,
        embedding_fn: impl Fn(&str) -> Result<Vec<f32>>,
    ) -> Result<()> {
        for entity in graph.entities() {
            if entity.embedding.is_none() {
                // Computed but dropped — see NOTE in the doc comment above.
                let _embedding = embedding_fn(&entity.name)?;
            }
        }

        Ok(())
    }
314
315 pub fn analyze_graph(&self, graph: &KnowledgeGraph) -> GraphStatistics {
317 let entity_count = graph.entities().count();
318 let document_count = graph.documents().count();
319 let chunk_count = graph.chunks().count();
320
321 let entity_types: HashMap<String, usize> =
322 graph.entities().fold(HashMap::new(), |mut acc, entity| {
323 *acc.entry(entity.entity_type.clone()).or_insert(0) += 1;
324 acc
325 });
326
327 GraphStatistics {
328 entity_count,
329 document_count,
330 chunk_count,
331 entity_types,
332 average_entities_per_chunk: if chunk_count > 0 {
333 entity_count as f32 / chunk_count as f32
334 } else {
335 0.0
336 },
337 }
338 }
339}
340
/// Summary statistics for a [`KnowledgeGraph`], produced by
/// [`GraphBuilder::analyze_graph`].
#[derive(Debug)]
pub struct GraphStatistics {
    /// Total number of entity nodes in the graph.
    pub entity_count: usize,
    /// Total number of documents added to the graph.
    pub document_count: usize,
    /// Total number of text chunks across all documents.
    pub chunk_count: usize,
    /// Entity count per entity-type label.
    pub entity_types: HashMap<String, usize>,
    /// `entity_count / chunk_count`, or `0.0` when there are no chunks.
    pub average_entities_per_chunk: f32,
}
355
impl GraphStatistics {
    /// Logs every statistic at `info` level via `tracing`.
    #[allow(dead_code)]
    pub fn print(&self) {
        tracing::info!("Graph Statistics:");
        tracing::info!("  Documents: {}", self.document_count);
        tracing::info!("  Chunks: {}", self.chunk_count);
        tracing::info!("  Entities: {}", self.entity_count);
        tracing::info!(
            "  Average entities per chunk: {:.2}",
            self.average_entities_per_chunk
        );
        tracing::info!("  Entity types:");
        // Iteration order is HashMap order, i.e. unspecified.
        for (entity_type, count) in &self.entity_types {
            tracing::info!("    {entity_type}: {count}");
        }
    }
}
374
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::{Document, DocumentId};

    /// End-to-end smoke test: two small documents should produce a graph
    /// containing entities, both documents, and at least one chunk each.
    #[test]
    fn test_graph_building() {
        let mut builder = GraphBuilder::new(500, 100, 0.7, 0.8, 5).unwrap();

        let documents = vec![
            Document::new(
                DocumentId::new("doc1".to_string()),
                "Test Document 1".to_string(),
                "John Smith works at Acme Corp. The company is based in New York.".to_string(),
            ),
            Document::new(
                DocumentId::new("doc2".to_string()),
                "Test Document 2".to_string(),
                "Jane Doe is a professor at MIT. She lives in Boston.".to_string(),
            ),
        ];

        let graph = builder.build_graph(documents).unwrap();
        let stats = builder.analyze_graph(&graph);

        assert!(stats.entity_count > 0);
        assert_eq!(stats.document_count, 2);
        assert!(stats.chunk_count >= 2);
    }

    /// Identical unit vectors have similarity 1.0; orthogonal ones 0.0.
    #[test]
    fn test_cosine_similarity() {
        let builder = GraphBuilder::new(500, 100, 0.7, 0.8, 5).unwrap();

        let unit_x = vec![1.0, 0.0, 0.0];
        let unit_x_again = vec![1.0, 0.0, 0.0];
        let unit_y = vec![0.0, 1.0, 0.0];

        assert!((builder.cosine_similarity(&unit_x, &unit_x_again) - 1.0).abs() < 0.001);
        assert!((builder.cosine_similarity(&unit_x, &unit_y) - 0.0).abs() < 0.001);
    }
}