1use crate::{
10 core::{Entity, Relationship, Result, TextChunk},
11 entity::{
12 llm_extractor::LLMEntityExtractor,
13 prompts::{EntityData, RelationshipData},
14 },
15 ollama::OllamaClient,
16};
17use serde::{Deserialize, Serialize};
18use std::collections::HashMap;
19
/// Tunables for multi-round ("gleaning") LLM entity extraction.
#[derive(Debug, Clone)]
pub struct GleaningConfig {
    /// Upper bound on extraction rounds, counting the initial pass.
    pub max_gleaning_rounds: usize,
    /// Completion-confidence threshold.
    /// NOTE(review): not read anywhere in this file — presumably consumed by
    /// the completion-check prompt; confirm against `prompts`.
    pub completion_threshold: f64,
    /// Per-entity confidence threshold.
    /// NOTE(review): also unused in this file — confirm intended consumer.
    pub entity_confidence_threshold: f64,
    /// When set, ask the LLM after each round whether extraction is done.
    pub use_llm_completion_check: bool,
    /// Entity ontology handed to the LLM extractor.
    pub entity_types: Vec<String>,
    /// Sampling temperature passed to the LLM.
    pub temperature: f32,
    /// Token budget passed to the LLM.
    pub max_tokens: usize,
}

impl Default for GleaningConfig {
    fn default() -> Self {
        // Default ontology used when the caller does not supply one.
        let entity_types = ["PERSON", "ORGANIZATION", "LOCATION", "EVENT", "CONCEPT"]
            .iter()
            .map(|s| s.to_string())
            .collect();

        Self {
            max_gleaning_rounds: 4,
            completion_threshold: 0.85,
            entity_confidence_threshold: 0.7,
            use_llm_completion_check: true,
            entity_types,
            // Low temperature: extraction should be deterministic-ish.
            temperature: 0.1,
            max_tokens: 1500,
        }
    }
}
58
/// Outcome of asking the LLM whether extraction for a chunk is complete.
///
/// NOTE(review): this type is neither constructed nor read in this file —
/// presumably it mirrors the JSON payload returned by the completion-check
/// prompt (hence the serde derives); confirm against the prompt definitions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionCompletionStatus {
    /// Whether the LLM judged the current extraction complete.
    pub is_complete: bool,
    /// LLM-reported confidence in that judgement.
    pub confidence: f64,
    /// Aspects of the text the LLM believes are still uncovered.
    pub missing_aspects: Vec<String>,
    /// LLM suggestions for what to extract next.
    pub suggestions: Vec<String>,
}
71
/// Entity extractor that runs multiple LLM "gleaning" rounds per chunk,
/// merging entities and deduplicating relationships across rounds.
pub struct GleaningEntityExtractor {
    /// Underlying single-pass LLM extractor used for the initial pass,
    /// continuation passes, and completion checks.
    llm_extractor: LLMEntityExtractor,
    /// Round limits, thresholds, and LLM parameters.
    config: GleaningConfig,
}
80
81impl GleaningEntityExtractor {
82 pub fn new(ollama_client: OllamaClient, config: GleaningConfig) -> Self {
88 let llm_extractor = LLMEntityExtractor::new(
90 ollama_client,
91 config.entity_types.clone(),
92 )
93 .with_temperature(config.temperature)
94 .with_max_tokens(config.max_tokens);
95
96 Self {
97 llm_extractor,
98 config,
99 }
100 }
101
    /// Run multi-round "gleaning" extraction over a single text chunk.
    ///
    /// Round 1 is a plain LLM extraction pass. Each subsequent round (up to
    /// `config.max_gleaning_rounds`) optionally asks the LLM whether the
    /// extraction is already complete and, if not, requests additional
    /// entities/relationships it missed. Gleaning stops early when the LLM
    /// reports completion or a round produces nothing new.
    ///
    /// Returns the accumulated entities plus the deduplicated relationships.
    ///
    /// # Errors
    /// Propagates any failure from the underlying LLM extractor calls.
    #[cfg(feature = "async")]
    pub async fn extract_with_gleaning(
        &self,
        chunk: &TextChunk,
    ) -> Result<(Vec<Entity>, Vec<Relationship>)> {
        tracing::info!(
            "🔍 Starting REAL LLM gleaning extraction for chunk: {} ({} chars)",
            chunk.id,
            chunk.content.len()
        );

        let start_time = std::time::Instant::now();

        // Accumulators carried across rounds, in the prompt-facing data form.
        let mut all_entity_data: Vec<EntityData> = Vec::new();
        let mut all_relationship_data: Vec<RelationshipData> = Vec::new();

        // --- Round 1: initial extraction pass ---
        tracing::info!("📝 Round 1: Initial LLM extraction...");
        let round_start = std::time::Instant::now();

        let (initial_entities, initial_relationships) =
            self.llm_extractor.extract_from_chunk(chunk).await?;

        tracing::info!(
            "✅ Round 1 complete: {} entities, {} relationships ({:.1}s)",
            initial_entities.len(),
            initial_relationships.len(),
            round_start.elapsed().as_secs_f32()
        );

        // Convert domain types into the data structs the prompts consume.
        let mut entity_data = self.convert_entities_to_data(&initial_entities);
        let mut relationship_data = self.convert_relationships_to_data(&initial_relationships);

        all_entity_data.append(&mut entity_data);
        all_relationship_data.append(&mut relationship_data);

        // --- Rounds 2..=N: gleaning continuation passes ---
        for round in 2..=self.config.max_gleaning_rounds {
            tracing::info!("📝 Round {}: Gleaning continuation...", round);
            let round_start = std::time::Instant::now();

            // Optionally let the LLM decide whether to stop early.
            if self.config.use_llm_completion_check {
                let is_complete = self
                    .llm_extractor
                    .check_completion(chunk, &all_entity_data, &all_relationship_data)
                    .await?;

                if is_complete {
                    tracing::info!(
                        "✅ LLM determined extraction is COMPLETE after {} rounds ({:.1}s total)",
                        round - 1,
                        start_time.elapsed().as_secs_f32()
                    );
                    break;
                }

                tracing::debug!("⚠️ LLM determined extraction is INCOMPLETE, continuing...");
            }

            // Ask for anything missed, given everything found so far.
            let (additional_entities, additional_relationships) = self
                .llm_extractor
                .extract_additional(chunk, &all_entity_data, &all_relationship_data)
                .await?;

            tracing::info!(
                "✅ Round {} complete: {} new entities, {} new relationships ({:.1}s)",
                round,
                additional_entities.len(),
                additional_relationships.len(),
                round_start.elapsed().as_secs_f32()
            );

            // A round that yields nothing new means the extraction converged.
            if additional_entities.is_empty() && additional_relationships.is_empty() {
                tracing::info!(
                    "🛑 No additional entities found in round {}, stopping gleaning",
                    round
                );
                break;
            }

            let new_entity_data = self.convert_entities_to_data(&additional_entities);
            let mut new_relationship_data =
                self.convert_relationships_to_data(&additional_relationships);

            // Entities merge by name (longer description wins); relationships
            // are concatenated here and deduplicated once at the end.
            all_entity_data = self.merge_entity_data(all_entity_data, new_entity_data);
            all_relationship_data.append(&mut new_relationship_data);
        }

        // Convert accumulated prompt data back into domain types.
        let final_entities = self.convert_data_to_entities(&all_entity_data, &chunk.id, &chunk.content)?;
        let final_relationships = self.convert_data_to_relationships(&all_relationship_data, &final_entities)?;

        // Drop exact (source, target, relation) duplicates across rounds.
        let deduplicated_relationships = self.deduplicate_relationships(final_relationships);

        let total_time = start_time.elapsed().as_secs_f32();

        tracing::info!(
            "🎉 REAL LLM gleaning complete: {} entities, {} relationships ({:.1}s total)",
            final_entities.len(),
            deduplicated_relationships.len(),
            total_time
        );

        Ok((final_entities, deduplicated_relationships))
    }
224
225 fn merge_entity_data(
230 &self,
231 existing: Vec<EntityData>,
232 new: Vec<EntityData>,
233 ) -> Vec<EntityData> {
234 let mut merged: HashMap<String, EntityData> = HashMap::new();
235
236 for entity in existing {
238 let key = entity.name.to_lowercase();
239 merged.insert(key, entity);
240 }
241
242 for new_entity in new {
244 let key = new_entity.name.to_lowercase();
245
246 match merged.get(&key) {
247 Some(existing_entity) => {
248 if new_entity.description.len() > existing_entity.description.len() {
250 tracing::debug!(
251 "📝 Merging entity '{}': keeping longer description ({} chars vs {} chars)",
252 new_entity.name,
253 new_entity.description.len(),
254 existing_entity.description.len()
255 );
256 merged.insert(key, new_entity);
257 } else {
258 tracing::debug!(
259 "📝 Entity '{}' already exists with longer description, keeping existing",
260 new_entity.name
261 );
262 }
263 }
264 None => {
265 merged.insert(key, new_entity);
267 }
268 }
269 }
270
271 merged.into_values().collect()
272 }
273
274 fn convert_entities_to_data(&self, entities: &[Entity]) -> Vec<EntityData> {
276 entities
277 .iter()
278 .map(|e| EntityData {
279 name: e.name.clone(),
280 entity_type: e.entity_type.clone(),
281 description: format!("{} (confidence: {:.2})", e.entity_type, e.confidence),
282 })
283 .collect()
284 }
285
286 fn convert_relationships_to_data(&self, relationships: &[Relationship]) -> Vec<RelationshipData> {
288 relationships
289 .iter()
290 .map(|r| RelationshipData {
291 source: r.source.0.clone(),
292 target: r.target.0.clone(),
293 description: r.relation_type.clone(),
294 strength: r.confidence as f64,
295 })
296 .collect()
297 }
298
299 fn convert_data_to_entities(
301 &self,
302 entity_data: &[EntityData],
303 chunk_id: &crate::core::ChunkId,
304 chunk_text: &str,
305 ) -> Result<Vec<Entity>> {
306 let mut entities = Vec::new();
307
308 for data in entity_data {
309 let entity_id = crate::core::EntityId::new(format!(
311 "{}_{}",
312 data.entity_type,
313 self.normalize_name(&data.name)
314 ));
315
316 let mentions = self.find_mentions(&data.name, chunk_id, chunk_text);
318
319 let entity = Entity::new(
321 entity_id,
322 data.name.clone(),
323 data.entity_type.clone(),
324 0.9, )
326 .with_mentions(mentions);
327
328 entities.push(entity);
329 }
330
331 Ok(entities)
332 }
333
334 fn find_mentions(
336 &self,
337 name: &str,
338 chunk_id: &crate::core::ChunkId,
339 text: &str,
340 ) -> Vec<crate::core::EntityMention> {
341 let mut mentions = Vec::new();
342 let mut start = 0;
343
344 while let Some(pos) = text[start..].find(name) {
345 let actual_pos = start + pos;
346 mentions.push(crate::core::EntityMention {
347 chunk_id: chunk_id.clone(),
348 start_offset: actual_pos,
349 end_offset: actual_pos + name.len(),
350 confidence: 0.9,
351 });
352 start = actual_pos + name.len();
353 }
354
355 if mentions.is_empty() {
357 let name_lower = name.to_lowercase();
358 let text_lower = text.to_lowercase();
359 let mut start = 0;
360
361 while let Some(pos) = text_lower[start..].find(&name_lower) {
362 let actual_pos = start + pos;
363 mentions.push(crate::core::EntityMention {
364 chunk_id: chunk_id.clone(),
365 start_offset: actual_pos,
366 end_offset: actual_pos + name.len(),
367 confidence: 0.85,
368 });
369 start = actual_pos + name.len();
370 }
371 }
372
373 mentions
374 }
375
376 fn convert_data_to_relationships(
378 &self,
379 relationship_data: &[RelationshipData],
380 entities: &[Entity],
381 ) -> Result<Vec<Relationship>> {
382 let mut relationships = Vec::new();
383
384 let mut name_to_entity: HashMap<String, &Entity> = HashMap::new();
386 for entity in entities {
387 name_to_entity.insert(entity.name.to_lowercase(), entity);
388 }
389
390 for data in relationship_data {
391 let source_entity = name_to_entity.get(&data.source.to_lowercase());
393 let target_entity = name_to_entity.get(&data.target.to_lowercase());
394
395 if let (Some(source), Some(target)) = (source_entity, target_entity) {
396 let relationship = Relationship {
397 source: source.id.clone(),
398 target: target.id.clone(),
399 relation_type: data.description.clone(),
400 confidence: data.strength as f32,
401 context: vec![],
402 };
403
404 relationships.push(relationship);
405 } else {
406 tracing::warn!(
407 "Skipping relationship: entity not found. Source: {}, Target: {}",
408 data.source,
409 data.target
410 );
411 }
412 }
413
414 Ok(relationships)
415 }
416
417 fn deduplicate_relationships(&self, relationships: Vec<Relationship>) -> Vec<Relationship> {
419 let mut seen = std::collections::HashSet::new();
420 let mut deduplicated = Vec::new();
421
422 for relationship in relationships {
423 let key = format!(
424 "{}->{}:{}",
425 relationship.source, relationship.target, relationship.relation_type
426 );
427
428 if !seen.contains(&key) {
429 seen.insert(key);
430 deduplicated.push(relationship);
431 }
432 }
433
434 deduplicated
435 }
436
437 fn normalize_name(&self, name: &str) -> String {
439 name.to_lowercase()
440 .chars()
441 .filter(|c| c.is_alphanumeric() || *c == '_')
442 .collect::<String>()
443 .replace(' ', "_")
444 }
445
446 pub fn get_statistics(&self) -> GleaningStatistics {
448 GleaningStatistics {
449 config: self.config.clone(),
450 llm_available: true, }
452 }
453}
454
/// Diagnostic snapshot of a [`GleaningEntityExtractor`]'s configuration.
#[derive(Debug, Clone)]
pub struct GleaningStatistics {
    /// The configuration the extractor was built with.
    pub config: GleaningConfig,
    /// Whether an LLM extractor is wired up (always `true` as produced by
    /// `get_statistics`, since the extractor is constructed unconditionally).
    pub llm_available: bool,
}
463
464impl GleaningStatistics {
465 #[allow(dead_code)]
467 pub fn print(&self) {
468 tracing::info!("🔍 REAL LLM Gleaning Extraction Statistics");
469 tracing::info!(" Max rounds: {}", self.config.max_gleaning_rounds);
470 tracing::info!(
471 " Completion threshold: {:.2}",
472 self.config.completion_threshold
473 );
474 tracing::info!(
475 " Entity confidence threshold: {:.2}",
476 self.config.entity_confidence_threshold
477 );
478 tracing::info!(
479 " Uses LLM completion check: {}",
480 self.config.use_llm_completion_check
481 );
482 tracing::info!(" LLM available: {} ✅", self.llm_available);
483 tracing::info!(" Entity types: {:?}", self.config.entity_types);
484 tracing::info!(" Temperature: {}", self.config.temperature);
485 }
486}
487
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        core::{ChunkId, DocumentId, TextChunk},
        ollama::OllamaConfig,
    };

    /// Build an extractor wired to a default Ollama client and default
    /// gleaning configuration (no network traffic occurs in these tests).
    fn create_extractor() -> GleaningEntityExtractor {
        let ollama_client = OllamaClient::new(OllamaConfig::default());
        GleaningEntityExtractor::new(ollama_client, GleaningConfig::default())
    }

    /// Fixture chunk containing repeated entity mentions.
    fn create_test_chunk() -> TextChunk {
        TextChunk::new(
            ChunkId::new("test_chunk".to_string()),
            DocumentId::new("test_doc".to_string()),
            "Tom Sawyer is a young boy who lives in St. Petersburg with his Aunt Polly. Tom is best friends with Huckleberry Finn.".to_string(),
            0,
            120,
        )
    }

    #[test]
    fn test_gleaning_extractor_creation() {
        let stats = create_extractor().get_statistics();

        assert_eq!(stats.config.max_gleaning_rounds, 4);
        assert!(stats.llm_available);
    }

    #[test]
    fn test_merge_entity_data() {
        let extractor = create_extractor();

        let existing = vec![EntityData {
            name: "Tom Sawyer".to_string(),
            entity_type: "PERSON".to_string(),
            description: "A boy".to_string(),
        }];

        let new = vec![
            EntityData {
                name: "Tom Sawyer".to_string(),
                entity_type: "PERSON".to_string(),
                description: "A young boy who lives in St. Petersburg".to_string(),
            },
            EntityData {
                name: "Huck Finn".to_string(),
                entity_type: "PERSON".to_string(),
                description: "Tom's friend".to_string(),
            },
        ];

        let merged = extractor.merge_entity_data(existing, new);

        // Duplicate "Tom Sawyer" collapses; the longer description survives.
        assert_eq!(merged.len(), 2);
        let tom = merged.iter().find(|e| e.name == "Tom Sawyer").unwrap();
        assert!(tom.description.len() > 10);
    }

    #[test]
    fn test_normalize_name() {
        let extractor = create_extractor();

        assert_eq!(extractor.normalize_name("Tom Sawyer"), "tom_sawyer");
        assert_eq!(extractor.normalize_name("St. Petersburg"), "st_petersburg");
    }

    #[test]
    fn test_find_mentions() {
        let extractor = create_extractor();
        let chunk = create_test_chunk();

        let mentions = extractor.find_mentions("Tom", &chunk.id, &chunk.content);

        // "Tom" appears at least twice in the fixture text.
        assert!(!mentions.is_empty());
        assert!(mentions.len() >= 2);
    }

    #[test]
    fn test_deduplicate_relationships() {
        let extractor = create_extractor();

        // Local builder to keep the fixture compact.
        fn rel(source: &str, target: &str, relation: &str, confidence: f32) -> Relationship {
            Relationship {
                source: crate::core::EntityId::new(source.to_string()),
                target: crate::core::EntityId::new(target.to_string()),
                relation_type: relation.to_string(),
                confidence,
                context: vec![],
            }
        }

        let relationships = vec![
            rel("person_tom", "person_huck", "FRIENDS_WITH", 0.9),
            rel("person_tom", "person_huck", "FRIENDS_WITH", 0.85),
            rel("person_tom", "location_stpetersburg", "LIVES_IN", 0.8),
        ];

        // The duplicate FRIENDS_WITH edge collapses; LIVES_IN survives.
        let deduplicated = extractor.deduplicate_relationships(relationships);
        assert_eq!(deduplicated.len(), 2);
    }
}