// graphrag_core/entity/gleaning_extractor.rs
//! Gleaning-based entity extraction with TRUE LLM inference
//!
//! This module implements iterative gleaning refinement using actual LLM calls,
//! not pattern matching. Based on Microsoft GraphRAG and LightRAG research.
//!
//! Expected performance: 15-30 seconds per chunk per round. For a 1000-page book
//! with 4 gleaning rounds, expect 2-4 hours of processing time.
use crate::{
    core::{Entity, Relationship, Result, TextChunk},
    entity::{
        llm_extractor::LLMEntityExtractor,
        prompts::{EntityData, RelationshipData},
    },
    ollama::OllamaClient,
};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

20/// Configuration for gleaning-based entity extraction
21#[derive(Debug, Clone)]
22pub struct GleaningConfig {
23    /// Maximum number of gleaning rounds (typically 3-4)
24    pub max_gleaning_rounds: usize,
25    /// Threshold for extraction completion (0.0-1.0)
26    pub completion_threshold: f64,
27    /// Minimum confidence for extracted entities (0.0-1.0)
28    pub entity_confidence_threshold: f64,
29    /// Whether to use LLM for completion checking (always true for real gleaning)
30    pub use_llm_completion_check: bool,
31    /// Entity types to extract
32    pub entity_types: Vec<String>,
33    /// LLM temperature for extraction (lower = more consistent)
34    pub temperature: f32,
35    /// Maximum tokens for LLM responses
36    pub max_tokens: usize,
37}
38
39impl Default for GleaningConfig {
40    fn default() -> Self {
41        Self {
42            max_gleaning_rounds: 4, // Microsoft GraphRAG uses 4 rounds
43            completion_threshold: 0.85,
44            entity_confidence_threshold: 0.7,
45            use_llm_completion_check: true, // Always use LLM for real gleaning
46            entity_types: vec![
47                "PERSON".to_string(),
48                "ORGANIZATION".to_string(),
49                "LOCATION".to_string(),
50                "EVENT".to_string(),
51                "CONCEPT".to_string(),
52            ],
53            temperature: 0.1, // Low temperature for consistent extraction
54            max_tokens: 1500,
55        }
56    }
57}
58
59/// Status of entity extraction completion
60#[derive(Debug, Clone, Serialize, Deserialize)]
61pub struct ExtractionCompletionStatus {
62    /// Whether extraction is considered complete
63    pub is_complete: bool,
64    /// Confidence score for completeness (0.0-1.0)
65    pub confidence: f64,
66    /// Aspects that may be missing from extraction
67    pub missing_aspects: Vec<String>,
68    /// Suggestions for improving extraction
69    pub suggestions: Vec<String>,
70}
71
72/// Entity extractor with iterative gleaning refinement using TRUE LLM calls
73///
74/// This is the REAL implementation that makes actual LLM API calls for every extraction.
75/// It replaces the fake pattern-based extraction with genuine language model inference.
76pub struct GleaningEntityExtractor {
77    llm_extractor: LLMEntityExtractor,
78    config: GleaningConfig,
79}
80
81impl GleaningEntityExtractor {
82    /// Create a new gleaning entity extractor with LLM client
83    ///
84    /// # Arguments
85    /// * `ollama_client` - Ollama client for LLM inference (REQUIRED)
86    /// * `config` - Gleaning configuration
87    pub fn new(ollama_client: OllamaClient, config: GleaningConfig) -> Self {
88        // Create LLM extractor with configured entity types
89        let llm_extractor = LLMEntityExtractor::new(
90            ollama_client,
91            config.entity_types.clone(),
92        )
93        .with_temperature(config.temperature)
94        .with_max_tokens(config.max_tokens);
95
96        Self {
97            llm_extractor,
98            config,
99        }
100    }
101
102    /// Extract entities with iterative refinement (gleaning) using TRUE LLM calls
103    ///
104    /// This is the REAL implementation that makes actual LLM API calls.
105    /// Expected time: 15-30 seconds per round = 60-120 seconds total for 4 rounds per chunk.
106    ///
107    /// # Performance
108    /// - 1 chunk, 4 rounds: ~2 minutes
109    /// - 100 chunks, 4 rounds: ~3-4 hours
110    /// - Tom Sawyer (1000 pages): ~2-4 hours
111    #[cfg(feature = "async")]
112    pub async fn extract_with_gleaning(
113        &self,
114        chunk: &TextChunk,
115    ) -> Result<(Vec<Entity>, Vec<Relationship>)> {
116        tracing::info!(
117            "🔍 Starting REAL LLM gleaning extraction for chunk: {} ({} chars)",
118            chunk.id,
119            chunk.content.len()
120        );
121
122        let start_time = std::time::Instant::now();
123
124        // Track all extracted entities and relationships across rounds
125        let mut all_entity_data: Vec<EntityData> = Vec::new();
126        let mut all_relationship_data: Vec<RelationshipData> = Vec::new();
127
128        // Round 1: Initial extraction (THIS IS A REAL LLM CALL!)
129        tracing::info!("📝 Round 1: Initial LLM extraction...");
130        let round_start = std::time::Instant::now();
131
132        let (initial_entities, initial_relationships) =
133            self.llm_extractor.extract_from_chunk(chunk).await?;
134
135        tracing::info!(
136            "✅ Round 1 complete: {} entities, {} relationships ({:.1}s)",
137            initial_entities.len(),
138            initial_relationships.len(),
139            round_start.elapsed().as_secs_f32()
140        );
141
142        // Convert to EntityData for tracking
143        let mut entity_data = self.convert_entities_to_data(&initial_entities);
144        let mut relationship_data = self.convert_relationships_to_data(&initial_relationships);
145
146        all_entity_data.append(&mut entity_data);
147        all_relationship_data.append(&mut relationship_data);
148
149        // Rounds 2-N: Gleaning continuation rounds
150        for round in 2..=self.config.max_gleaning_rounds {
151            tracing::info!("📝 Round {}: Gleaning continuation...", round);
152            let round_start = std::time::Instant::now();
153
154            // Check if extraction is complete using LLM (REAL LLM CALL!)
155            if self.config.use_llm_completion_check {
156                let is_complete = self
157                    .llm_extractor
158                    .check_completion(chunk, &all_entity_data, &all_relationship_data)
159                    .await?;
160
161                if is_complete {
162                    tracing::info!(
163                        "✅ LLM determined extraction is COMPLETE after {} rounds ({:.1}s total)",
164                        round - 1,
165                        start_time.elapsed().as_secs_f32()
166                    );
167                    break;
168                }
169
170                tracing::debug!("⚠️  LLM determined extraction is INCOMPLETE, continuing...");
171            }
172
173            // Perform additional extraction round (REAL LLM CALL!)
174            let (additional_entities, additional_relationships) = self
175                .llm_extractor
176                .extract_additional(chunk, &all_entity_data, &all_relationship_data)
177                .await?;
178
179            tracing::info!(
180                "✅ Round {} complete: {} new entities, {} new relationships ({:.1}s)",
181                round,
182                additional_entities.len(),
183                additional_relationships.len(),
184                round_start.elapsed().as_secs_f32()
185            );
186
187            // If no new entities found, stop gleaning
188            if additional_entities.is_empty() && additional_relationships.is_empty() {
189                tracing::info!(
190                    "🛑 No additional entities found in round {}, stopping gleaning",
191                    round
192                );
193                break;
194            }
195
196            // Convert and merge new results
197            let new_entity_data = self.convert_entities_to_data(&additional_entities);
198            let mut new_relationship_data =
199                self.convert_relationships_to_data(&additional_relationships);
200
201            // Merge with length-based strategy (LightRAG approach)
202            all_entity_data = self.merge_entity_data(all_entity_data, new_entity_data);
203            all_relationship_data.append(&mut new_relationship_data);
204        }
205
206        // Convert back to domain entities and relationships
207        let final_entities = self.convert_data_to_entities(&all_entity_data, &chunk.id, &chunk.content)?;
208        let final_relationships = self.convert_data_to_relationships(&all_relationship_data, &final_entities)?;
209
210        // Deduplicate relationships
211        let deduplicated_relationships = self.deduplicate_relationships(final_relationships);
212
213        let total_time = start_time.elapsed().as_secs_f32();
214
215        tracing::info!(
216            "🎉 REAL LLM gleaning complete: {} entities, {} relationships ({:.1}s total)",
217            final_entities.len(),
218            deduplicated_relationships.len(),
219            total_time
220        );
221
222        Ok((final_entities, deduplicated_relationships))
223    }
224
225    /// Merge entity data using length-based strategy (LightRAG approach)
226    ///
227    /// When multiple rounds produce the same entity, keep the version with the longer description
228    /// as it likely contains more information.
229    fn merge_entity_data(
230        &self,
231        existing: Vec<EntityData>,
232        new: Vec<EntityData>,
233    ) -> Vec<EntityData> {
234        let mut merged: HashMap<String, EntityData> = HashMap::new();
235
236        // Add existing entities to map (normalized by lowercase name)
237        for entity in existing {
238            let key = entity.name.to_lowercase();
239            merged.insert(key, entity);
240        }
241
242        // Merge new entities - keep longer descriptions
243        for new_entity in new {
244            let key = new_entity.name.to_lowercase();
245
246            match merged.get(&key) {
247                Some(existing_entity) => {
248                    // Keep the entity with the longer description (more information)
249                    if new_entity.description.len() > existing_entity.description.len() {
250                        tracing::debug!(
251                            "📝 Merging entity '{}': keeping longer description ({} chars vs {} chars)",
252                            new_entity.name,
253                            new_entity.description.len(),
254                            existing_entity.description.len()
255                        );
256                        merged.insert(key, new_entity);
257                    } else {
258                        tracing::debug!(
259                            "📝 Entity '{}' already exists with longer description, keeping existing",
260                            new_entity.name
261                        );
262                    }
263                }
264                None => {
265                    // New entity, add it
266                    merged.insert(key, new_entity);
267                }
268            }
269        }
270
271        merged.into_values().collect()
272    }
273
274    /// Convert domain entities to EntityData
275    fn convert_entities_to_data(&self, entities: &[Entity]) -> Vec<EntityData> {
276        entities
277            .iter()
278            .map(|e| EntityData {
279                name: e.name.clone(),
280                entity_type: e.entity_type.clone(),
281                description: format!("{} (confidence: {:.2})", e.entity_type, e.confidence),
282            })
283            .collect()
284    }
285
286    /// Convert domain relationships to RelationshipData
287    fn convert_relationships_to_data(&self, relationships: &[Relationship]) -> Vec<RelationshipData> {
288        relationships
289            .iter()
290            .map(|r| RelationshipData {
291                source: r.source.0.clone(),
292                target: r.target.0.clone(),
293                description: r.relation_type.clone(),
294                strength: r.confidence as f64,
295            })
296            .collect()
297    }
298
299    /// Convert EntityData back to domain entities
300    fn convert_data_to_entities(
301        &self,
302        entity_data: &[EntityData],
303        chunk_id: &crate::core::ChunkId,
304        chunk_text: &str,
305    ) -> Result<Vec<Entity>> {
306        let mut entities = Vec::new();
307
308        for data in entity_data {
309            // Generate entity ID
310            let entity_id = crate::core::EntityId::new(format!(
311                "{}_{}",
312                data.entity_type,
313                self.normalize_name(&data.name)
314            ));
315
316            // Find mentions in chunk
317            let mentions = self.find_mentions(&data.name, chunk_id, chunk_text);
318
319            // Create entity with mentions
320            let entity = Entity::new(
321                entity_id,
322                data.name.clone(),
323                data.entity_type.clone(),
324                0.9, // High confidence since LLM-extracted
325            )
326            .with_mentions(mentions);
327
328            entities.push(entity);
329        }
330
331        Ok(entities)
332    }
333
334    /// Find all mentions of an entity name in the chunk text
335    fn find_mentions(
336        &self,
337        name: &str,
338        chunk_id: &crate::core::ChunkId,
339        text: &str,
340    ) -> Vec<crate::core::EntityMention> {
341        let mut mentions = Vec::new();
342        let mut start = 0;
343
344        while let Some(pos) = text[start..].find(name) {
345            let actual_pos = start + pos;
346            mentions.push(crate::core::EntityMention {
347                chunk_id: chunk_id.clone(),
348                start_offset: actual_pos,
349                end_offset: actual_pos + name.len(),
350                confidence: 0.9,
351            });
352            start = actual_pos + name.len();
353        }
354
355        // If no exact matches, try case-insensitive
356        if mentions.is_empty() {
357            let name_lower = name.to_lowercase();
358            let text_lower = text.to_lowercase();
359            let mut start = 0;
360
361            while let Some(pos) = text_lower[start..].find(&name_lower) {
362                let actual_pos = start + pos;
363                mentions.push(crate::core::EntityMention {
364                    chunk_id: chunk_id.clone(),
365                    start_offset: actual_pos,
366                    end_offset: actual_pos + name.len(),
367                    confidence: 0.85,
368                });
369                start = actual_pos + name.len();
370            }
371        }
372
373        mentions
374    }
375
376    /// Convert RelationshipData to domain Relationships
377    fn convert_data_to_relationships(
378        &self,
379        relationship_data: &[RelationshipData],
380        entities: &[Entity],
381    ) -> Result<Vec<Relationship>> {
382        let mut relationships = Vec::new();
383
384        // Build entity name to entity mapping
385        let mut name_to_entity: HashMap<String, &Entity> = HashMap::new();
386        for entity in entities {
387            name_to_entity.insert(entity.name.to_lowercase(), entity);
388        }
389
390        for data in relationship_data {
391            // Find source and target entities
392            let source_entity = name_to_entity.get(&data.source.to_lowercase());
393            let target_entity = name_to_entity.get(&data.target.to_lowercase());
394
395            if let (Some(source), Some(target)) = (source_entity, target_entity) {
396                let relationship = Relationship {
397                    source: source.id.clone(),
398                    target: target.id.clone(),
399                    relation_type: data.description.clone(),
400                    confidence: data.strength as f32,
401                    context: vec![],
402                };
403
404                relationships.push(relationship);
405            } else {
406                tracing::warn!(
407                    "Skipping relationship: entity not found. Source: {}, Target: {}",
408                    data.source,
409                    data.target
410                );
411            }
412        }
413
414        Ok(relationships)
415    }
416
417    /// Deduplicate relationships by source-target-type combination
418    fn deduplicate_relationships(&self, relationships: Vec<Relationship>) -> Vec<Relationship> {
419        let mut seen = std::collections::HashSet::new();
420        let mut deduplicated = Vec::new();
421
422        for relationship in relationships {
423            let key = format!(
424                "{}->{}:{}",
425                relationship.source, relationship.target, relationship.relation_type
426            );
427
428            if !seen.contains(&key) {
429                seen.insert(key);
430                deduplicated.push(relationship);
431            }
432        }
433
434        deduplicated
435    }
436
437    /// Normalize entity name for ID generation
438    fn normalize_name(&self, name: &str) -> String {
439        name.to_lowercase()
440            .chars()
441            .filter(|c| c.is_alphanumeric() || *c == '_')
442            .collect::<String>()
443            .replace(' ', "_")
444    }
445
446    /// Get extraction statistics
447    pub fn get_statistics(&self) -> GleaningStatistics {
448        GleaningStatistics {
449            config: self.config.clone(),
450            llm_available: true, // Always true for real gleaning
451        }
452    }
453}
454
455/// Statistics for gleaning extraction process
456#[derive(Debug, Clone)]
457pub struct GleaningStatistics {
458    /// Gleaning configuration used
459    pub config: GleaningConfig,
460    /// Whether LLM is available for completion checking
461    pub llm_available: bool,
462}
463
464impl GleaningStatistics {
465    /// Print statistics to stdout
466    #[allow(dead_code)]
467    pub fn print(&self) {
468        tracing::info!("🔍 REAL LLM Gleaning Extraction Statistics");
469        tracing::info!("  Max rounds: {}", self.config.max_gleaning_rounds);
470        tracing::info!(
471            "  Completion threshold: {:.2}",
472            self.config.completion_threshold
473        );
474        tracing::info!(
475            "  Entity confidence threshold: {:.2}",
476            self.config.entity_confidence_threshold
477        );
478        tracing::info!(
479            "  Uses LLM completion check: {}",
480            self.config.use_llm_completion_check
481        );
482        tracing::info!("  LLM available: {} ✅", self.llm_available);
483        tracing::info!("  Entity types: {:?}", self.config.entity_types);
484        tracing::info!("  Temperature: {}", self.config.temperature);
485    }
486}
487
488#[cfg(test)]
489mod tests {
490    use super::*;
491    use crate::{
492        core::{ChunkId, DocumentId, TextChunk},
493        ollama::OllamaConfig,
494    };
495
496    fn create_test_chunk() -> TextChunk {
497        TextChunk::new(
498            ChunkId::new("test_chunk".to_string()),
499            DocumentId::new("test_doc".to_string()),
500            "Tom Sawyer is a young boy who lives in St. Petersburg with his Aunt Polly. Tom is best friends with Huckleberry Finn.".to_string(),
501            0,
502            120,
503        )
504    }
505
506    #[test]
507    fn test_gleaning_extractor_creation() {
508        let ollama_config = OllamaConfig::default();
509        let ollama_client = OllamaClient::new(ollama_config);
510        let config = GleaningConfig::default();
511
512        let extractor = GleaningEntityExtractor::new(ollama_client, config);
513
514        let stats = extractor.get_statistics();
515        assert_eq!(stats.config.max_gleaning_rounds, 4);
516        assert!(stats.llm_available);
517    }
518
519    #[test]
520    fn test_merge_entity_data() {
521        let ollama_config = OllamaConfig::default();
522        let ollama_client = OllamaClient::new(ollama_config);
523        let config = GleaningConfig::default();
524        let extractor = GleaningEntityExtractor::new(ollama_client, config);
525
526        let existing = vec![
527            EntityData {
528                name: "Tom Sawyer".to_string(),
529                entity_type: "PERSON".to_string(),
530                description: "A boy".to_string(),
531            },
532        ];
533
534        let new = vec![
535            EntityData {
536                name: "Tom Sawyer".to_string(),
537                entity_type: "PERSON".to_string(),
538                description: "A young boy who lives in St. Petersburg".to_string(), // Longer description
539            },
540            EntityData {
541                name: "Huck Finn".to_string(),
542                entity_type: "PERSON".to_string(),
543                description: "Tom's friend".to_string(),
544            },
545        ];
546
547        let merged = extractor.merge_entity_data(existing, new);
548
549        assert_eq!(merged.len(), 2); // Tom (merged) and Huck
550        let tom = merged.iter().find(|e| e.name == "Tom Sawyer").unwrap();
551        assert!(tom.description.len() > 10); // Should have the longer description
552    }
553
554    #[test]
555    fn test_normalize_name() {
556        let ollama_config = OllamaConfig::default();
557        let ollama_client = OllamaClient::new(ollama_config);
558        let config = GleaningConfig::default();
559        let extractor = GleaningEntityExtractor::new(ollama_client, config);
560
561        assert_eq!(extractor.normalize_name("Tom Sawyer"), "tom_sawyer");
562        assert_eq!(extractor.normalize_name("St. Petersburg"), "st_petersburg");
563    }
564
565    #[test]
566    fn test_find_mentions() {
567        let ollama_config = OllamaConfig::default();
568        let ollama_client = OllamaClient::new(ollama_config);
569        let config = GleaningConfig::default();
570        let extractor = GleaningEntityExtractor::new(ollama_client, config);
571
572        let chunk = create_test_chunk();
573        let mentions = extractor.find_mentions("Tom", &chunk.id, &chunk.content);
574
575        assert!(!mentions.is_empty());
576        assert!(mentions.len() >= 2); // "Tom Sawyer" and "Tom is best friends"
577    }
578
579    #[test]
580    fn test_deduplicate_relationships() {
581        let ollama_config = OllamaConfig::default();
582        let ollama_client = OllamaClient::new(ollama_config);
583        let config = GleaningConfig::default();
584        let extractor = GleaningEntityExtractor::new(ollama_client, config);
585
586        let relationships = vec![
587            Relationship {
588                source: crate::core::EntityId::new("person_tom".to_string()),
589                target: crate::core::EntityId::new("person_huck".to_string()),
590                relation_type: "FRIENDS_WITH".to_string(),
591                confidence: 0.9,
592                context: vec![],
593            },
594            Relationship {
595                source: crate::core::EntityId::new("person_tom".to_string()),
596                target: crate::core::EntityId::new("person_huck".to_string()),
597                relation_type: "FRIENDS_WITH".to_string(), // Duplicate
598                confidence: 0.85,
599                context: vec![],
600            },
601            Relationship {
602                source: crate::core::EntityId::new("person_tom".to_string()),
603                target: crate::core::EntityId::new("location_stpetersburg".to_string()),
604                relation_type: "LIVES_IN".to_string(),
605                confidence: 0.8,
606                context: vec![],
607            },
608        ];
609
610        let deduplicated = extractor.deduplicate_relationships(relationships);
611
612        assert_eq!(deduplicated.len(), 2); // Duplicate FRIENDS_WITH removed
613    }
614}