1use crate::{
10 core::{Entity, Relationship, Result, TextChunk},
11 entity::{
12 llm_extractor::LLMEntityExtractor,
13 prompts::{EntityData, RelationshipData},
14 },
15 ollama::OllamaClient,
16};
17use serde::{Deserialize, Serialize};
18use std::collections::HashMap;
19
/// Tuning knobs for multi-round ("gleaning") LLM entity extraction.
#[derive(Debug, Clone)]
pub struct GleaningConfig {
    /// Upper bound on extraction rounds, counting the initial pass as round 1.
    pub max_gleaning_rounds: usize,
    /// NOTE(review): not read anywhere in this file — presumably a score
    /// cutoff used by a completion check elsewhere; confirm against callers.
    pub completion_threshold: f64,
    /// NOTE(review): also not read in this file — likely a minimum entity
    /// confidence applied by a consumer; confirm.
    pub entity_confidence_threshold: f64,
    /// When true, each gleaning round first asks the LLM whether extraction
    /// is already complete and stops early if so.
    pub use_llm_completion_check: bool,
    /// Entity categories the inner LLM extractor is asked to find.
    pub entity_types: Vec<String>,
    /// Sampling temperature forwarded to the inner LLM extractor.
    pub temperature: f32,
    /// Token budget forwarded to the inner LLM extractor.
    pub max_tokens: usize,
}
38
39impl Default for GleaningConfig {
40 fn default() -> Self {
41 Self {
42 max_gleaning_rounds: 4, completion_threshold: 0.85,
44 entity_confidence_threshold: 0.7,
45 use_llm_completion_check: true, entity_types: vec![
47 "PERSON".to_string(),
48 "ORGANIZATION".to_string(),
49 "LOCATION".to_string(),
50 "EVENT".to_string(),
51 "CONCEPT".to_string(),
52 ],
53 temperature: 0.0, max_tokens: 1500,
55 }
56 }
57}
58
/// Outcome of an extraction-completeness assessment.
///
/// NOTE(review): this type is neither constructed nor read anywhere in this
/// file; presumably it is (de)serialized from an LLM completion-check
/// response — confirm against the prompts module before relying on it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionCompletionStatus {
    /// Whether the extraction is judged complete.
    pub is_complete: bool,
    /// Confidence in that judgment.
    pub confidence: f64,
    /// Aspects of the text believed missing from the extraction so far.
    pub missing_aspects: Vec<String>,
    /// Suggested follow-up extraction targets.
    pub suggestions: Vec<String>,
}
71
/// Entity extractor that wraps an [`LLMEntityExtractor`] and runs it in
/// multiple "gleaning" rounds to recover entities and relationships that a
/// single pass would miss.
pub struct GleaningEntityExtractor {
    // Inner single-pass LLM extractor; also used for completion checks and
    // "extract additional" follow-up prompts.
    llm_extractor: LLMEntityExtractor,
    // Round limits, thresholds, and prompt settings.
    config: GleaningConfig,
}
80
81impl GleaningEntityExtractor {
82 pub fn new(ollama_client: OllamaClient, config: GleaningConfig) -> Self {
88 let keep_alive = ollama_client.config().keep_alive.clone();
90
91 let llm_extractor = LLMEntityExtractor::new(ollama_client, config.entity_types.clone())
93 .with_temperature(config.temperature)
94 .with_max_tokens(config.max_tokens)
95 .with_keep_alive(keep_alive);
96
97 Self {
98 llm_extractor,
99 config,
100 }
101 }
102
    /// Extract entities and relationships from `chunk` using iterative
    /// "gleaning": an initial LLM pass followed by up to
    /// `max_gleaning_rounds - 1` follow-up passes that ask the LLM for
    /// anything it missed.
    ///
    /// Stops early when (a) the optional LLM completion check reports the
    /// extraction complete, or (b) a round yields no new items. Entities are
    /// merged by name across rounds; relationships are appended and
    /// deduplicated at the end.
    ///
    /// # Errors
    /// Propagates any error from the underlying LLM extractor calls.
    #[cfg(feature = "async")]
    pub async fn extract_with_gleaning(
        &self,
        chunk: &TextChunk,
    ) -> Result<(Vec<Entity>, Vec<Relationship>)> {
        tracing::info!(
            "🔍 Starting REAL LLM gleaning extraction for chunk: {} ({} chars)",
            chunk.id,
            chunk.content.len()
        );

        let start_time = std::time::Instant::now();

        // Accumulators for everything found across all rounds, in the
        // lightweight prompt-facing form.
        let mut all_entity_data: Vec<EntityData> = Vec::new();
        let mut all_relationship_data: Vec<RelationshipData> = Vec::new();

        // Round 1: unconditional initial extraction.
        tracing::info!("📝 Round 1: Initial LLM extraction...");
        let round_start = std::time::Instant::now();

        let (initial_entities, initial_relationships) =
            self.llm_extractor.extract_from_chunk(chunk).await?;

        tracing::info!(
            "✅ Round 1 complete: {} entities, {} relationships ({:.1}s)",
            initial_entities.len(),
            initial_relationships.len(),
            round_start.elapsed().as_secs_f32()
        );

        // Convert domain objects into the data form the follow-up prompts use.
        let mut entity_data = self.convert_entities_to_data(&initial_entities);
        let mut relationship_data = self.convert_relationships_to_data(&initial_relationships);

        all_entity_data.append(&mut entity_data);
        all_relationship_data.append(&mut relationship_data);

        // Rounds 2..=max: glean additional items until complete or exhausted.
        for round in 2..=self.config.max_gleaning_rounds {
            tracing::info!("📝 Round {}: Gleaning continuation...", round);
            let round_start = std::time::Instant::now();

            // Optional early exit: ask the LLM whether the results so far
            // already cover the chunk.
            if self.config.use_llm_completion_check {
                let is_complete = self
                    .llm_extractor
                    .check_completion(chunk, &all_entity_data, &all_relationship_data)
                    .await?;

                if is_complete {
                    tracing::info!(
                        "✅ LLM determined extraction is COMPLETE after {} rounds ({:.1}s total)",
                        round - 1,
                        start_time.elapsed().as_secs_f32()
                    );
                    break;
                }

                tracing::debug!("⚠️ LLM determined extraction is INCOMPLETE, continuing...");
            }

            // Ask only for items NOT already in the accumulated lists.
            let (additional_entities, additional_relationships) = self
                .llm_extractor
                .extract_additional(chunk, &all_entity_data, &all_relationship_data)
                .await?;

            tracing::info!(
                "✅ Round {} complete: {} new entities, {} new relationships ({:.1}s)",
                round,
                additional_entities.len(),
                additional_relationships.len(),
                round_start.elapsed().as_secs_f32()
            );

            // Nothing new — further rounds are unlikely to help, so stop.
            if additional_entities.is_empty() && additional_relationships.is_empty() {
                tracing::info!(
                    "🛑 No additional entities found in round {}, stopping gleaning",
                    round
                );
                break;
            }

            let new_entity_data = self.convert_entities_to_data(&additional_entities);
            let mut new_relationship_data =
                self.convert_relationships_to_data(&additional_relationships);

            // Entities merge by lowercased name (longer description wins);
            // relationships are appended and deduplicated after the loop.
            all_entity_data = self.merge_entity_data(all_entity_data, new_entity_data);
            all_relationship_data.append(&mut new_relationship_data);
        }

        // Materialize final domain objects from the accumulated data.
        let final_entities =
            self.convert_data_to_entities(&all_entity_data, &chunk.id, &chunk.content)?;
        let final_relationships =
            self.convert_data_to_relationships(&all_relationship_data, &final_entities)?;

        let deduplicated_relationships = self.deduplicate_relationships(final_relationships);

        let total_time = start_time.elapsed().as_secs_f32();

        tracing::info!(
            "🎉 REAL LLM gleaning complete: {} entities, {} relationships ({:.1}s total)",
            final_entities.len(),
            deduplicated_relationships.len(),
            total_time
        );

        Ok((final_entities, deduplicated_relationships))
    }
227
228 fn merge_entity_data(
233 &self,
234 existing: Vec<EntityData>,
235 new: Vec<EntityData>,
236 ) -> Vec<EntityData> {
237 let mut merged: HashMap<String, EntityData> = HashMap::new();
238
239 for entity in existing {
241 let key = entity.name.to_lowercase();
242 merged.insert(key, entity);
243 }
244
245 for new_entity in new {
247 let key = new_entity.name.to_lowercase();
248
249 match merged.get(&key) {
250 Some(existing_entity) => {
251 if new_entity.description.len() > existing_entity.description.len() {
253 tracing::debug!(
254 "📝 Merging entity '{}': keeping longer description ({} chars vs {} chars)",
255 new_entity.name,
256 new_entity.description.len(),
257 existing_entity.description.len()
258 );
259 merged.insert(key, new_entity);
260 } else {
261 tracing::debug!(
262 "📝 Entity '{}' already exists with longer description, keeping existing",
263 new_entity.name
264 );
265 }
266 },
267 None => {
268 merged.insert(key, new_entity);
270 },
271 }
272 }
273
274 merged.into_values().collect()
275 }
276
277 fn convert_entities_to_data(&self, entities: &[Entity]) -> Vec<EntityData> {
279 entities
280 .iter()
281 .map(|e| EntityData {
282 name: e.name.clone(),
283 entity_type: e.entity_type.clone(),
284 description: format!("{} (confidence: {:.2})", e.entity_type, e.confidence),
285 })
286 .collect()
287 }
288
289 fn convert_relationships_to_data(
291 &self,
292 relationships: &[Relationship],
293 ) -> Vec<RelationshipData> {
294 relationships
295 .iter()
296 .map(|r| RelationshipData {
297 source: r.source.0.clone(),
298 target: r.target.0.clone(),
299 description: r.relation_type.clone(),
300 strength: r.confidence as f64,
301 })
302 .collect()
303 }
304
305 fn convert_data_to_entities(
307 &self,
308 entity_data: &[EntityData],
309 chunk_id: &crate::core::ChunkId,
310 chunk_text: &str,
311 ) -> Result<Vec<Entity>> {
312 let mut entities = Vec::new();
313
314 for entity_item in entity_data {
315 let entity_id = crate::core::EntityId::new(format!(
317 "{}_{}",
318 entity_item.entity_type,
319 self.normalize_name(&entity_item.name)
320 ));
321
322 let mentions = self.find_mentions(&entity_item.name, chunk_id, chunk_text);
324
325 let entity = Entity::new(
327 entity_id,
328 entity_item.name.clone(),
329 entity_item.entity_type.clone(),
330 0.9, )
332 .with_mentions(mentions);
333
334 entities.push(entity);
335 }
336
337 Ok(entities)
338 }
339
340 fn find_mentions(
342 &self,
343 name: &str,
344 chunk_id: &crate::core::ChunkId,
345 text: &str,
346 ) -> Vec<crate::core::EntityMention> {
347 let mut mentions = Vec::new();
348 let mut start = 0;
349
350 while let Some(pos) = text[start..].find(name) {
351 let actual_pos = start + pos;
352 mentions.push(crate::core::EntityMention {
353 chunk_id: chunk_id.clone(),
354 start_offset: actual_pos,
355 end_offset: actual_pos + name.len(),
356 confidence: 0.9,
357 });
358 start = actual_pos + name.len();
359 }
360
361 if mentions.is_empty() {
363 let name_lower = name.to_lowercase();
364 let text_lower = text.to_lowercase();
365 let mut start = 0;
366
367 while let Some(pos) = text_lower[start..].find(&name_lower) {
368 let actual_pos = start + pos;
369 mentions.push(crate::core::EntityMention {
370 chunk_id: chunk_id.clone(),
371 start_offset: actual_pos,
372 end_offset: actual_pos + name.len(),
373 confidence: 0.85,
374 });
375 start = actual_pos + name.len();
376 }
377 }
378
379 mentions
380 }
381
382 fn convert_data_to_relationships(
384 &self,
385 relationship_data: &[RelationshipData],
386 entities: &[Entity],
387 ) -> Result<Vec<Relationship>> {
388 let mut relationships = Vec::new();
389
390 let mut name_to_entity: HashMap<String, &Entity> = HashMap::new();
392 for entity in entities {
393 name_to_entity.insert(entity.name.to_lowercase(), entity);
394 }
395
396 for rel_item in relationship_data {
397 let source_entity = name_to_entity.get(&rel_item.source.to_lowercase());
399 let target_entity = name_to_entity.get(&rel_item.target.to_lowercase());
400
401 if let (Some(source), Some(target)) = (source_entity, target_entity) {
402 let relationship = Relationship {
403 source: source.id.clone(),
404 target: target.id.clone(),
405 relation_type: rel_item.description.clone(),
406 confidence: rel_item.strength as f32,
407 context: vec![],
408 embedding: None,
409 temporal_type: None,
410 temporal_range: None,
411 causal_strength: None,
412 };
413
414 relationships.push(relationship);
415 } else {
416 tracing::warn!(
417 "Skipping relationship: entity not found. Source: {}, Target: {}",
418 rel_item.source,
419 rel_item.target
420 );
421 }
422 }
423
424 Ok(relationships)
425 }
426
427 fn deduplicate_relationships(&self, relationships: Vec<Relationship>) -> Vec<Relationship> {
429 let mut seen = std::collections::HashSet::new();
430 let mut deduplicated = Vec::new();
431
432 for relationship in relationships {
433 let key = format!(
434 "{}->{}:{}",
435 relationship.source, relationship.target, relationship.relation_type
436 );
437
438 if !seen.contains(&key) {
439 seen.insert(key);
440 deduplicated.push(relationship);
441 }
442 }
443
444 deduplicated
445 }
446
447 fn normalize_name(&self, name: &str) -> String {
449 name.to_lowercase()
450 .chars()
451 .filter(|c| c.is_alphanumeric() || *c == '_')
452 .collect::<String>()
453 .replace(' ', "_")
454 }
455
456 pub fn get_statistics(&self) -> GleaningStatistics {
458 GleaningStatistics {
459 config: self.config.clone(),
460 llm_available: true, }
462 }
463}
464
/// Snapshot of the extractor's configuration for reporting and diagnostics.
#[derive(Debug, Clone)]
pub struct GleaningStatistics {
    /// The configuration the extractor was built with.
    pub config: GleaningConfig,
    /// Always `true` as produced by `get_statistics` (no liveness probe is
    /// performed).
    pub llm_available: bool,
}
473
impl GleaningStatistics {
    /// Log a human-readable summary of the gleaning configuration at INFO
    /// level via `tracing`.
    #[allow(dead_code)]
    pub fn print(&self) {
        tracing::info!("🔍 REAL LLM Gleaning Extraction Statistics");
        tracing::info!("   Max rounds: {}", self.config.max_gleaning_rounds);
        tracing::info!(
            "   Completion threshold: {:.2}",
            self.config.completion_threshold
        );
        tracing::info!(
            "   Entity confidence threshold: {:.2}",
            self.config.entity_confidence_threshold
        );
        tracing::info!(
            "   Uses LLM completion check: {}",
            self.config.use_llm_completion_check
        );
        tracing::info!("   LLM available: {} ✅", self.llm_available);
        tracing::info!("   Entity types: {:?}", self.config.entity_types);
        tracing::info!("   Temperature: {}", self.config.temperature);
    }
}
497
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        core::{ChunkId, DocumentId, TextChunk},
        ollama::OllamaConfig,
    };

    /// Build a small two-sentence chunk used by the mention tests.
    fn create_test_chunk() -> TextChunk {
        TextChunk::new(
            ChunkId::new("test_chunk".to_string()),
            DocumentId::new("test_doc".to_string()),
            "Tom Sawyer is a young boy who lives in St. Petersburg with his Aunt Polly. Tom is best friends with Huckleberry Finn.".to_string(),
            0,
            120,
        )
    }

    /// Construction surfaces the default config through `get_statistics`
    /// (no LLM call is made here).
    #[test]
    fn test_gleaning_extractor_creation() {
        let ollama_config = OllamaConfig::default();
        let ollama_client = OllamaClient::new(ollama_config);
        let config = GleaningConfig::default();

        let extractor = GleaningEntityExtractor::new(ollama_client, config);

        let stats = extractor.get_statistics();
        assert_eq!(stats.config.max_gleaning_rounds, 4);
        assert!(stats.llm_available);
    }

    /// Same-name entities merge into one entry (longer description wins);
    /// new names are simply added.
    #[test]
    fn test_merge_entity_data() {
        let ollama_config = OllamaConfig::default();
        let ollama_client = OllamaClient::new(ollama_config);
        let config = GleaningConfig::default();
        let extractor = GleaningEntityExtractor::new(ollama_client, config);

        let existing = vec![EntityData {
            name: "Tom Sawyer".to_string(),
            entity_type: "PERSON".to_string(),
            description: "A boy".to_string(),
        }];

        let new = vec![
            EntityData {
                name: "Tom Sawyer".to_string(),
                entity_type: "PERSON".to_string(),
                // Longer than "A boy", so it should replace the existing entry.
                description: "A young boy who lives in St. Petersburg".to_string(),
            },
            EntityData {
                name: "Huck Finn".to_string(),
                entity_type: "PERSON".to_string(),
                description: "Tom's friend".to_string(),
            },
        ];

        let merged = extractor.merge_entity_data(existing, new);

        // Two distinct names remain, and Tom keeps the longer description.
        assert_eq!(merged.len(), 2);
        let tom = merged.iter().find(|e| e.name == "Tom Sawyer").unwrap();
        assert!(tom.description.len() > 10);
    }

    /// NOTE(review): as written, `normalize_name` filters out spaces BEFORE
    /// its `.replace(' ', "_")` runs, so these expectations ("tom_sawyer",
    /// "st_petersburg") appear unsatisfiable — confirm whether the test or
    /// the function is the intended behavior.
    #[test]
    fn test_normalize_name() {
        let ollama_config = OllamaConfig::default();
        let ollama_client = OllamaClient::new(ollama_config);
        let config = GleaningConfig::default();
        let extractor = GleaningEntityExtractor::new(ollama_client, config);

        assert_eq!(extractor.normalize_name("Tom Sawyer"), "tom_sawyer");
        assert_eq!(extractor.normalize_name("St. Petersburg"), "st_petersburg");
    }

    /// "Tom" occurs multiple times in the test chunk ("Tom Sawyer", "Tom is",
    /// "Tom's" via exact substring), so at least two mentions are expected.
    #[test]
    fn test_find_mentions() {
        let ollama_config = OllamaConfig::default();
        let ollama_client = OllamaClient::new(ollama_config);
        let config = GleaningConfig::default();
        let extractor = GleaningEntityExtractor::new(ollama_client, config);

        let chunk = create_test_chunk();
        let mentions = extractor.find_mentions("Tom", &chunk.id, &chunk.content);

        assert!(!mentions.is_empty());
        assert!(mentions.len() >= 2);
    }

    /// Two FRIENDS_WITH duplicates collapse to one; the LIVES_IN triple
    /// survives — three inputs become two outputs.
    #[test]
    fn test_deduplicate_relationships() {
        let ollama_config = OllamaConfig::default();
        let ollama_client = OllamaClient::new(ollama_config);
        let config = GleaningConfig::default();
        let extractor = GleaningEntityExtractor::new(ollama_client, config);

        let relationships = vec![
            Relationship::new(
                crate::core::EntityId::new("person_tom".to_string()),
                crate::core::EntityId::new("person_huck".to_string()),
                "FRIENDS_WITH".to_string(),
                0.9,
            ),
            // Duplicate of the triple above (only confidence differs).
            Relationship::new(
                crate::core::EntityId::new("person_tom".to_string()),
                crate::core::EntityId::new("person_huck".to_string()),
                "FRIENDS_WITH".to_string(),
                0.85,
            ),
            Relationship::new(
                crate::core::EntityId::new("person_tom".to_string()),
                crate::core::EntityId::new("location_stpetersburg".to_string()),
                "LIVES_IN".to_string(),
                0.8,
            ),
        ];

        let deduplicated = extractor.deduplicate_relationships(relationships);

        assert_eq!(deduplicated.len(), 2);
    }
}