//! Entity extraction module (`graphrag_core/entity/mod.rs`).
//!
//! Declares the extractor submodules, re-exports their public types, and
//! implements the heuristic, pattern-based `EntityExtractor`.
1/// ATOM atomic fact extraction module (Phase 1.3)
2pub mod atomic_fact_extractor;
3/// GLiNER-Relex joint NER + RE extractor (feature-gated: `gliner`)
4#[cfg(feature = "gliner")]
5mod gliner_extractor;
6/// Bidirectional entity-chunk index for fast lookups
7pub mod bidirectional_index;
8/// Gleaning-based entity extraction module
9pub mod gleaning_extractor;
10/// LLM-based entity extractor (TRUE LLM extraction, not pattern-based)
11pub mod llm_extractor;
12/// LLM-based relationship extraction module
13pub mod llm_relationship_extractor;
14/// Prompt templates for LLM-based extraction
15pub mod prompts;
16/// Semantic entity merging module
17pub mod semantic_merging;
18/// String similarity-based entity linking module
19pub mod string_similarity_linker;
20
21pub use atomic_fact_extractor::{AtomicFact, AtomicFactExtractor};
22#[cfg(feature = "gliner")]
23pub use gliner_extractor::GLiNERExtractor;
24pub use bidirectional_index::{BidirectionalIndex, IndexStatistics};
25pub use gleaning_extractor::{ExtractionCompletionStatus, GleaningConfig, GleaningEntityExtractor};
26pub use llm_extractor::LLMEntityExtractor;
27pub use llm_relationship_extractor::{
28    ExtractedEntity, ExtractedRelationship, ExtractionResult, LLMRelationshipExtractor,
29    TripleValidation,
30};
31pub use semantic_merging::{EntityMergeDecision, MergingStatistics, SemanticEntityMerger};
32pub use string_similarity_linker::{EntityLinkingConfig, StringSimilarityLinker};
33
34use crate::{
35    config::setconfig::EntityExtractionConfig,
36    core::{ChunkId, Entity, EntityId, EntityMention, TextChunk},
37    Result,
38};
39use regex::Regex;
40use std::collections::{HashMap, HashSet};
41
/// Entity extraction system with dynamic configuration support
pub struct EntityExtractor {
    // Minimum confidence an extracted entity must reach to be kept.
    min_confidence: f32,
    // Optional extraction configuration (entity types, filters);
    // `None` when constructed via `EntityExtractor::new`.
    config: Option<EntityExtractionConfig>,
    // Compiled allow-list patterns; when non-empty, an entity name must
    // match at least one of them to survive filtering.
    allowed_patterns: Vec<Regex>,
    // Compiled deny-list patterns; an entity name matching any is dropped.
    excluded_patterns: Vec<Regex>,
}
49
50impl EntityExtractor {
51    /// Create a new entity extractor
52    pub fn new(min_confidence: f32) -> Result<Self> {
53        Ok(Self {
54            min_confidence,
55            config: None,
56            allowed_patterns: Vec::new(),
57            excluded_patterns: Vec::new(),
58        })
59    }
60
61    /// Create a new entity extractor with configuration
62    pub fn with_config(config: EntityExtractionConfig) -> Result<Self> {
63        let mut allowed_patterns = Vec::new();
64        let mut excluded_patterns = Vec::new();
65
66        // Compile allowed patterns from config
67        if let Some(filters) = &config.filters {
68            if let Some(patterns) = &filters.allowed_patterns {
69                for pattern in patterns {
70                    match Regex::new(pattern) {
71                        Ok(regex) => allowed_patterns.push(regex),
72                        Err(e) => {
73                            tracing::warn!("Invalid allowed pattern '{pattern}': {e}");
74                        },
75                    }
76                }
77            }
78
79            if let Some(patterns) = &filters.excluded_patterns {
80                for pattern in patterns {
81                    match Regex::new(pattern) {
82                        Ok(regex) => excluded_patterns.push(regex),
83                        Err(e) => {
84                            tracing::warn!("Invalid excluded pattern '{pattern}': {e}");
85                        },
86                    }
87                }
88            }
89        }
90
91        let min_confidence = config
92            .filters
93            .as_ref()
94            .map(|f| f.confidence_threshold)
95            .unwrap_or(config.confidence_threshold);
96
97        Ok(Self {
98            min_confidence,
99            config: Some(config),
100            allowed_patterns,
101            excluded_patterns,
102        })
103    }
104
105    /// Extract entities from a text chunk using dynamic entity types
106    pub fn extract_from_chunk(&self, chunk: &TextChunk) -> Result<Vec<Entity>> {
107        let mut entities = Vec::new();
108        let text = &chunk.content;
109
110        // Get entity types from config or use defaults
111        let entity_types = if let Some(config) = &self.config {
112            config.entity_types.as_ref().cloned().unwrap_or_else(|| {
113                vec![
114                    "PERSON".to_string(),
115                    "ORGANIZATION".to_string(),
116                    "LOCATION".to_string(),
117                ]
118            })
119        } else {
120            vec![
121                "PERSON".to_string(),
122                "ORGANIZATION".to_string(),
123                "LOCATION".to_string(),
124            ]
125        };
126
127        // Extract entities based on configured types
128        for entity_type in &entity_types {
129            match entity_type.as_str() {
130                "PERSON" | "CHARACTER" | "RESEARCHER" | "SPEAKER" | "DIALOGUE_SPEAKER" => {
131                    entities.extend(self.extract_persons(text, &chunk.id)?);
132                },
133                "ORGANIZATION" | "INSTITUTION" | "BRAND" | "COMPANY" => {
134                    entities.extend(self.extract_organizations(text, &chunk.id)?);
135                },
136                "LOCATION" | "SETTING" | "PLACE" => {
137                    entities.extend(self.extract_locations(text, &chunk.id)?);
138                },
139                "CONCEPT" | "THEORY" | "THEME" | "ARGUMENT" | "IDEA" => {
140                    entities.extend(self.extract_concepts(text, &chunk.id, entity_type)?);
141                },
142                "EVENT" | "EXPERIMENT" | "HAPPENING" => {
143                    entities.extend(self.extract_events(text, &chunk.id)?);
144                },
145                "OBJECT" | "TOOL" | "ARTIFACT" | "ITEM" => {
146                    entities.extend(self.extract_objects(text, &chunk.id)?);
147                },
148                _ => {
149                    // For any other entity type, use generic extraction
150                    entities.extend(self.extract_generic_entities(text, &chunk.id, entity_type)?);
151                },
152            }
153        }
154
155        // Apply pattern filtering
156        entities = self.apply_pattern_filtering(entities);
157
158        // Deduplicate entities by name and type
159        entities = self.deduplicate_entities(entities);
160
161        // Filter by confidence
162        entities.retain(|e| e.confidence >= self.min_confidence);
163
164        Ok(entities)
165    }
166
    /// Extract person entities using enhanced capitalization and context heuristics
    ///
    /// Runs four ordered passes over the whitespace-split words. All passes
    /// share one `processed_indices` set, so a word consumed by an earlier
    /// pass is skipped by every later pass:
    /// 1. known multi-word character names (prevents concatenation),
    /// 2. title-prefixed names ("Dr. Smith"),
    /// 3. capitalized First-Last pairs,
    /// 4. remaining single capitalized, name-like words.
    fn extract_persons(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();
        let words: Vec<&str> = text.split_whitespace().collect();
        let mut processed_indices = HashSet::new();

        // Known titles and honorifics that indicate a person follows
        let person_titles = [
            "mr",
            "mrs",
            "ms",
            "dr",
            "prof",
            "professor",
            "sir",
            "lady",
            "lord",
            "captain",
            "major",
            "colonel",
            "general",
            "admiral",
            "judge",
            "father",
            "mother",
            "brother",
            "sister",
            "aunt",
            "uncle",
            "grandfather",
            "grandmother",
        ];

        // Common words that are NOT person names (to avoid false positives).
        // Mix of stopwords, weekdays, months, and corpus-specific noise words.
        let non_person_words = [
            "chapter",
            "the",
            "and",
            "but",
            "or",
            "in",
            "on",
            "at",
            "to",
            "for",
            "with",
            "by",
            "from",
            "about",
            "into",
            "through",
            "during",
            "before",
            "after",
            "above",
            "below",
            "up",
            "down",
            "out",
            "off",
            "over",
            "under",
            "again",
            "further",
            "then",
            "once",
            "here",
            "there",
            "when",
            "where",
            "why",
            "how",
            "all",
            "any",
            "both",
            "each",
            "few",
            "more",
            "most",
            "other",
            "some",
            "such",
            "only",
            "own",
            "same",
            "so",
            "than",
            "too",
            "very",
            "can",
            "will",
            "just",
            "should",
            "now",
            "temptations",
            "strategic",
            "movements",
            "decides",
            "upon",
            "whitewashing",
            "saturday",
            "monday",
            "tuesday",
            "wednesday",
            "thursday",
            "friday",
            "sunday",
            "january",
            "february",
            "march",
            "april",
            "may",
            "june",
            "july",
            "august",
            "september",
            "october",
            "november",
            "december",
            "adventures",
            "complete",
        ];

        // PHASE 1: Extract well-known character names first (prevent concatenation)
        entities.extend(self.extract_known_names(
            &words,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // PHASE 2: Extract title-based names (Dr. Smith, Guardian Entity)
        entities.extend(self.extract_title_based_names(
            &words,
            &person_titles,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // PHASE 3: Extract two-word names (First Last pattern)
        entities.extend(self.extract_two_word_names(
            &words,
            &non_person_words,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // PHASE 4: Extract remaining single-word names (only if not processed yet)
        for (i, &word_ref) in words.iter().enumerate() {
            if processed_indices.contains(&i) {
                continue;
            }

            let word = self.clean_word(word_ref);

            // Skip if word is too short or is a known non-person word
            if word.len() < 2 || non_person_words.contains(&word.to_lowercase().as_str()) {
                continue;
            }

            // Look for capitalized words that could be single names.
            // Capitalization is checked on the RAW token (punctuation intact),
            // name-likeness on the cleaned word.
            if self.is_capitalized(words[i]) && self.is_likely_person_word(&word) {
                let confidence = self.calculate_confidence(&word, "PERSON");
                if confidence >= self.min_confidence {
                    entities.push(self.create_entity(word, "PERSON", confidence, chunk_id, text)?);
                }
            }
        }

        Ok(entities)
    }
340
341    /// Extract well-known character names to prevent concatenation
342    fn extract_known_names(
343        &self,
344        words: &[&str],
345        processed: &mut std::collections::HashSet<usize>,
346        chunk_id: &ChunkId,
347        text: &str,
348    ) -> Result<Vec<Entity>> {
349        let mut entities = Vec::new();
350        let known_names = [
351            ("Entity Name", 2),
352            ("Second Entity", 2),
353            ("Guardian Entity", 2),
354            ("Friend Entity", 2),
355            ("Companion Entity", 2),
356            ("Third Entity", 2),
357            ("Fourth Entity", 2),
358            ("Fifth Entity", 2),
359            ("Sixth Entity", 2),
360            ("Seventh Entity", 2),
361            ("Eighth Entity", 2),
362            ("Ninth Entity", 2),
363        ];
364
365        for i in 0..words.len() {
366            if processed.contains(&i) {
367                continue;
368            }
369
370            for &(name, word_count) in &known_names {
371                let name_words: Vec<&str> = name.split_whitespace().collect();
372                if i + name_words.len() <= words.len() {
373                    let matches = name_words.iter().enumerate().all(|(j, &expected)| {
374                        let actual = self.clean_word(words[i + j]);
375                        actual.to_lowercase() == expected.to_lowercase()
376                    });
377
378                    if matches {
379                        let confidence = 0.95;
380                        if confidence >= self.min_confidence {
381                            entities.push(self.create_entity(
382                                name.to_string(),
383                                "PERSON",
384                                confidence,
385                                chunk_id,
386                                text,
387                            )?);
388                        }
389                        // Mark these indices as processed
390                        for j in 0..word_count {
391                            processed.insert(i + j);
392                        }
393                        break;
394                    }
395                }
396            }
397        }
398        Ok(entities)
399    }
400
    /// Extract title-based names (Dr. Smith, Guardian Entity)
    ///
    /// Scans for a known title word followed by one (or two) capitalized,
    /// name-like words. Consumed indices are recorded in `processed` so other
    /// passes do not reuse them. Note: the title and name indices are marked
    /// processed even when the fixed 0.9 confidence falls below
    /// `min_confidence` — the tokens are still "spent" by this pass.
    fn extract_title_based_names(
        &self,
        words: &[&str],
        person_titles: &[&str],
        processed: &mut std::collections::HashSet<usize>,
        chunk_id: &ChunkId,
        text: &str,
    ) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();

        for i in 0..words.len() {
            if processed.contains(&i) {
                continue;
            }

            // Titles are matched case-insensitively on the cleaned token.
            let word_clean = self.clean_word(words[i]).to_lowercase();
            if person_titles.contains(&word_clean.as_str())
                && i + 1 < words.len()
                && !processed.contains(&(i + 1))
            {
                let next_word = self.clean_word(words[i + 1]);
                if self.is_capitalized(words[i + 1]) && self.is_likely_person_word(&next_word) {
                    // Greedily absorb a second capitalized name word
                    // ("Dr. John Smith" -> "John Smith").
                    let name = if i + 2 < words.len() && !processed.contains(&(i + 2)) {
                        let third_word = self.clean_word(words[i + 2]);
                        if self.is_capitalized(words[i + 2])
                            && self.is_likely_person_word(&third_word)
                        {
                            processed.insert(i + 2);
                            format!("{next_word} {third_word}")
                        } else {
                            next_word
                        }
                    } else {
                        next_word
                    };

                    // Fixed confidence for title-anchored matches.
                    let confidence = 0.9;
                    if confidence >= self.min_confidence {
                        entities
                            .push(self.create_entity(name, "PERSON", confidence, chunk_id, text)?);
                    }
                    processed.insert(i);
                    processed.insert(i + 1);
                }
            }
        }
        Ok(entities)
    }
450
451    /// Extract two-word names (First Last pattern)
452    fn extract_two_word_names(
453        &self,
454        words: &[&str],
455        non_person_words: &[&str],
456        processed: &mut std::collections::HashSet<usize>,
457        chunk_id: &ChunkId,
458        text: &str,
459    ) -> Result<Vec<Entity>> {
460        let mut entities = Vec::new();
461
462        for i in 0..words.len() {
463            if processed.contains(&i) || i + 1 >= words.len() || processed.contains(&(i + 1)) {
464                continue;
465            }
466
467            let first_word = self.clean_word(words[i]);
468            let second_word = self.clean_word(words[i + 1]);
469
470            // Check if both words are capitalized and look like names
471            if self.is_capitalized(words[i])
472                && self.is_capitalized(words[i + 1])
473                && self.is_likely_person_word(&first_word)
474                && self.is_likely_person_word(&second_word)
475                && !non_person_words.contains(&first_word.to_lowercase().as_str())
476                && !non_person_words.contains(&second_word.to_lowercase().as_str())
477            {
478                let name = format!("{first_word} {second_word}");
479                if self.is_likely_person_name(&name) {
480                    let confidence = self.calculate_confidence(&name, "PERSON");
481                    if confidence >= self.min_confidence {
482                        entities
483                            .push(self.create_entity(name, "PERSON", confidence, chunk_id, text)?);
484                    }
485                    processed.insert(i);
486                    processed.insert(i + 1);
487                }
488            }
489        }
490        Ok(entities)
491    }
492
493    /// Extract organization entities
494    fn extract_organizations(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
495        let mut entities = Vec::new();
496        let org_suffixes = [
497            "Inc",
498            "Corp",
499            "LLC",
500            "Ltd",
501            "Company",
502            "Corporation",
503            "Group",
504            "Solutions",
505            "Technologies",
506        ];
507        let org_prefixes = ["University of", "Institute of", "Department of"];
508
509        // Look for org suffixes
510        for suffix in &org_suffixes {
511            if let Some(pos) = text.find(suffix) {
512                // Extract potential organization name
513                let start = text[..pos].rfind(' ').map(|i| i + 1).unwrap_or(0);
514                let end = pos + suffix.len();
515                let name = text[start..end].trim().to_string();
516
517                if !name.is_empty() && self.is_likely_organization(&name) {
518                    let confidence = self.calculate_confidence(&name, "ORGANIZATION");
519                    if confidence >= self.min_confidence {
520                        entities.push(self.create_entity(
521                            name,
522                            "ORGANIZATION",
523                            confidence,
524                            chunk_id,
525                            text,
526                        )?);
527                    }
528                }
529            }
530        }
531
532        // Look for org prefixes
533        for prefix in &org_prefixes {
534            if let Some(pos) = text.find(prefix) {
535                let start = pos;
536                let end = text[pos..]
537                    .find('.')
538                    .map(|i| pos + i)
539                    .unwrap_or(text.len().min(pos + 50));
540                let name = text[start..end].trim().to_string();
541
542                if !name.is_empty() && name.len() > prefix.len() {
543                    let confidence = self.calculate_confidence(&name, "ORGANIZATION");
544                    if confidence >= self.min_confidence {
545                        entities.push(self.create_entity(
546                            name,
547                            "ORGANIZATION",
548                            confidence,
549                            chunk_id,
550                            text,
551                        )?);
552                    }
553                }
554            }
555        }
556
557        Ok(entities)
558    }
559
560    /// Extract location entities
561    fn extract_locations(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
562        let mut entities = Vec::new();
563        let known_locations = [
564            "United States",
565            "New York",
566            "California",
567            "London",
568            "Paris",
569            "Tokyo",
570            "Berlin",
571            "Washington",
572            "Boston",
573            "Chicago",
574        ];
575
576        for location in &known_locations {
577            if text.contains(location) {
578                let confidence = self.calculate_confidence(location, "LOCATION");
579                if confidence >= self.min_confidence {
580                    entities.push(self.create_entity(
581                        location.to_string(),
582                        "LOCATION",
583                        confidence,
584                        chunk_id,
585                        text,
586                    )?);
587                }
588            }
589        }
590
591        Ok(entities)
592    }
593
594    /// Create an entity with mentions
595    fn create_entity(
596        &self,
597        name: String,
598        entity_type: &str,
599        confidence: f32,
600        chunk_id: &ChunkId,
601        text: &str,
602    ) -> Result<Entity> {
603        let entity_id = EntityId::new(format!("{}_{}", entity_type, self.normalize_name(&name)));
604
605        // Find all occurrences of the name in text for mentions
606        let mut mentions = Vec::new();
607        let mut start = 0;
608        while let Some(pos) = text[start..].find(&name) {
609            let actual_pos = start + pos;
610            mentions.push(EntityMention {
611                chunk_id: chunk_id.clone(),
612                start_offset: actual_pos,
613                end_offset: actual_pos + name.len(),
614                confidence,
615            });
616            start = actual_pos + name.len();
617        }
618
619        Ok(
620            Entity::new(entity_id, name, entity_type.to_string(), confidence)
621                .with_mentions(mentions),
622        )
623    }
624
625    /// Check if a word is capitalized
626    fn is_capitalized(&self, word: &str) -> bool {
627        word.chars().next().is_some_and(|c| c.is_uppercase())
628    }
629
630    /// Clean word by removing punctuation
631    fn clean_word(&self, word: &str) -> String {
632        word.chars()
633            .filter(|c| c.is_alphabetic() || *c == '\'') // Keep apostrophes for names like O'Connor
634            .collect::<String>()
635            .trim_end_matches('\'') // Remove trailing apostrophes
636            .to_string()
637    }
638
639    /// Enhanced check if a word could be part of a person's name
640    fn is_likely_person_word(&self, word: &str) -> bool {
641        if word.len() < 2 {
642            return false;
643        }
644
645        // Check for common name patterns
646        let word_lower = word.to_lowercase();
647
648        // Common name endings that suggest it's a person name
649        let name_endings = [
650            "son", "sen", "ton", "ham", "ford", "ley", "ment", "ard", "ert",
651        ];
652        let has_name_ending = name_endings
653            .iter()
654            .any(|&ending| word_lower.ends_with(ending));
655
656        // Common name prefixes
657        let name_prefixes = ["mc", "mac", "o'", "de", "van", "von", "la", "le"];
658        let has_name_prefix = name_prefixes
659            .iter()
660            .any(|&prefix| word_lower.starts_with(prefix));
661
662        // Must start with uppercase and be alphabetic
663        let is_proper_format = word.chars().next().unwrap().is_uppercase()
664            && word.chars().all(|c| c.is_alphabetic() || c == '\'');
665
666        // Common short words that are rarely names
667        let short_non_names = [
668            "it", "is", "as", "at", "be", "by", "do", "go", "he", "if", "in", "me", "my", "no",
669            "of", "on", "or", "so", "to", "up", "us", "we",
670        ];
671
672        if word.len() <= 2 && short_non_names.contains(&word_lower.as_str()) {
673            return false;
674        }
675
676        is_proper_format && (word.len() >= 3 || has_name_ending || has_name_prefix)
677    }
678
679    /// Check if a word is a title
680    #[allow(dead_code)]
681    fn is_title(&self, word: &str) -> bool {
682        matches!(word, "Dr." | "Mr." | "Ms." | "Mrs." | "Prof.")
683    }
684
685    /// Check if a name is likely a person name
686    fn is_likely_person_name(&self, name: &str) -> bool {
687        let parts: Vec<&str> = name.split_whitespace().collect();
688        parts.len() == 2 && parts.iter().all(|part| self.is_capitalized(part))
689    }
690
691    /// Check if a name is likely an organization
692    fn is_likely_organization(&self, name: &str) -> bool {
693        let org_indicators = [
694            "Inc",
695            "Corp",
696            "LLC",
697            "Ltd",
698            "Company",
699            "Corporation",
700            "University",
701            "Institute",
702        ];
703        org_indicators
704            .iter()
705            .any(|indicator| name.contains(indicator))
706    }
707
708    /// Calculate confidence score for an entity
709    fn calculate_confidence(&self, name: &str, entity_type: &str) -> f32 {
710        let mut confidence: f32 = 0.5; // Base confidence
711
712        // Adjust based on entity type patterns
713        match entity_type {
714            "PERSON" => {
715                if name.contains("Dr.") || name.contains("Prof.") {
716                    confidence += 0.3;
717                }
718                if name.split_whitespace().count() == 2 {
719                    confidence += 0.2;
720                }
721            },
722            "ORGANIZATION" => {
723                if name.contains("Inc") || name.contains("Corp") || name.contains("LLC") {
724                    confidence += 0.3;
725                }
726                if name.contains("University") || name.contains("Institute") {
727                    confidence += 0.2;
728                }
729            },
730            "LOCATION" => {
731                if name.contains(',') {
732                    confidence += 0.2;
733                }
734                if self.is_known_location(name) {
735                    confidence += 0.3;
736                }
737            },
738            _ => {},
739        }
740
741        // Adjust based on capitalization
742        if name.chars().next().is_some_and(|c| c.is_uppercase()) {
743            confidence += 0.1;
744        }
745
746        confidence.min(1.0)
747    }
748
749    /// Check if a name is a known location
750    fn is_known_location(&self, name: &str) -> bool {
751        const KNOWN_LOCATIONS: &[&str] = &[
752            "United States",
753            "New York",
754            "California",
755            "London",
756            "Paris",
757            "Tokyo",
758            "Berlin",
759            "Washington",
760            "Boston",
761            "Chicago",
762        ];
763        KNOWN_LOCATIONS.iter().any(|&loc| name.contains(loc))
764    }
765
766    /// Normalize entity name for ID generation
767    fn normalize_name(&self, name: &str) -> String {
768        name.to_lowercase()
769            .chars()
770            .filter(|c| c.is_alphanumeric() || *c == '_')
771            .collect::<String>()
772            .replace(' ', "_")
773    }
774
775    /// Deduplicate entities by name and type
776    fn deduplicate_entities(&self, entities: Vec<Entity>) -> Vec<Entity> {
777        let mut unique_entities: HashMap<(String, String), Entity> = HashMap::new();
778
779        for entity in entities {
780            let key = (entity.name.clone(), entity.entity_type.clone());
781
782            match unique_entities.get_mut(&key) {
783                Some(existing) => {
784                    // Merge mentions and take highest confidence
785                    existing.mentions.extend(entity.mentions);
786                    if entity.confidence > existing.confidence {
787                        existing.confidence = entity.confidence;
788                    }
789                },
790                None => {
791                    unique_entities.insert(key, entity);
792                },
793            }
794        }
795
796        unique_entities.into_values().collect()
797    }
798
799    /// Extract relationships between entities in the same chunk
800    pub fn extract_relationships(
801        &self,
802        entities: &[Entity],
803        chunk: &TextChunk,
804    ) -> Result<Vec<(EntityId, EntityId, String)>> {
805        let mut relationships = Vec::new();
806
807        // Simple co-occurrence based relationship extraction
808        for i in 0..entities.len() {
809            for j in (i + 1)..entities.len() {
810                let entity1 = &entities[i];
811                let entity2 = &entities[j];
812
813                // Check if both entities appear in the same chunk
814                let entity1_in_chunk = entity1.mentions.iter().any(|m| m.chunk_id == chunk.id);
815                let entity2_in_chunk = entity2.mentions.iter().any(|m| m.chunk_id == chunk.id);
816
817                if entity1_in_chunk && entity2_in_chunk {
818                    let relation_type =
819                        self.infer_relationship_type(entity1, entity2, &chunk.content);
820                    relationships.push((entity1.id.clone(), entity2.id.clone(), relation_type));
821                }
822            }
823        }
824
825        Ok(relationships)
826    }
827
828    /// Infer relationship type between two entities
829    fn infer_relationship_type(&self, entity1: &Entity, entity2: &Entity, context: &str) -> String {
830        match (&entity1.entity_type[..], &entity2.entity_type[..]) {
831            ("PERSON", "ORGANIZATION") | ("ORGANIZATION", "PERSON") => {
832                if context.contains("works for") || context.contains("employed by") {
833                    "WORKS_FOR".to_string()
834                } else if context.contains("founded") || context.contains("CEO") {
835                    "LEADS".to_string()
836                } else {
837                    "ASSOCIATED_WITH".to_string()
838                }
839            },
840            ("PERSON", "LOCATION") | ("LOCATION", "PERSON") => {
841                if context.contains("born in") || context.contains("from") {
842                    "BORN_IN".to_string()
843                } else if context.contains("lives in") || context.contains("based in") {
844                    "LOCATED_IN".to_string()
845                } else {
846                    "ASSOCIATED_WITH".to_string()
847                }
848            },
849            ("ORGANIZATION", "LOCATION") | ("LOCATION", "ORGANIZATION") => {
850                if context.contains("headquartered") || context.contains("based in") {
851                    "HEADQUARTERED_IN".to_string()
852                } else {
853                    "LOCATED_IN".to_string()
854                }
855            },
856            ("PERSON", "PERSON") => {
857                if context.contains("married") || context.contains("spouse") {
858                    "MARRIED_TO".to_string()
859                } else if context.contains("colleague") || context.contains("partner") {
860                    "COLLEAGUE_OF".to_string()
861                } else {
862                    "KNOWS".to_string()
863                }
864            },
865            _ => "RELATED_TO".to_string(),
866        }
867    }
868
869    /// Apply pattern filtering to entities based on configured patterns
870    fn apply_pattern_filtering(&self, entities: Vec<Entity>) -> Vec<Entity> {
871        if self.allowed_patterns.is_empty() && self.excluded_patterns.is_empty() {
872            return entities;
873        }
874
875        entities
876            .into_iter()
877            .filter(|entity| {
878                // If we have allowed patterns, entity must match at least one
879                if !self.allowed_patterns.is_empty() {
880                    let matches_allowed = self
881                        .allowed_patterns
882                        .iter()
883                        .any(|pattern| pattern.is_match(&entity.name));
884                    if !matches_allowed {
885                        return false;
886                    }
887                }
888
889                // Entity must not match any excluded patterns
890                if !self.excluded_patterns.is_empty() {
891                    let matches_excluded = self
892                        .excluded_patterns
893                        .iter()
894                        .any(|pattern| pattern.is_match(&entity.name));
895                    if matches_excluded {
896                        return false;
897                    }
898                }
899
900                true
901            })
902            .collect()
903    }
904
905    /// Extract concept entities (themes, ideas, theories)
906    fn extract_concepts(
907        &self,
908        text: &str,
909        chunk_id: &ChunkId,
910        entity_type: &str,
911    ) -> Result<Vec<Entity>> {
912        let mut entities = Vec::new();
913        let words: Vec<&str> = text.split_whitespace().collect();
914
915        // Look for conceptual terms that are typically capitalized
916        let concept_indicators = [
917            "Theory",
918            "Concept",
919            "Principle",
920            "Philosophy",
921            "Doctrine",
922            "Idea",
923            "Method",
924            "Approach",
925            "Framework",
926            "Model",
927            "Paradigm",
928            "Thesis",
929        ];
930
931        for &word in words.iter() {
932            let clean_word = self.clean_word(word);
933
934            // Check if this word indicates a concept
935            if concept_indicators
936                .iter()
937                .any(|&indicator| clean_word.contains(indicator))
938            {
939                let confidence = 0.75;
940                if confidence >= self.min_confidence {
941                    entities.push(self.create_entity(
942                        clean_word,
943                        entity_type,
944                        confidence,
945                        chunk_id,
946                        text,
947                    )?);
948                }
949            }
950
951            // Look for capitalized terms that might be concepts
952            if self.is_capitalized(word) && word.len() > 4 {
953                let clean_word = self.clean_word(word);
954                if !self.is_common_word(&clean_word) {
955                    let confidence = 0.6;
956                    if confidence >= self.min_confidence {
957                        entities.push(self.create_entity(
958                            clean_word,
959                            entity_type,
960                            confidence,
961                            chunk_id,
962                            text,
963                        )?);
964                    }
965                }
966            }
967        }
968
969        Ok(entities)
970    }
971
972    /// Extract event entities
973    fn extract_events(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
974        let mut entities = Vec::new();
975
976        // Event indicators
977        let event_words = [
978            "meeting",
979            "conference",
980            "ceremony",
981            "celebration",
982            "festival",
983            "competition",
984            "war",
985            "battle",
986            "expedition",
987            "journey",
988            "trial",
989        ];
990
991        for event_word in &event_words {
992            if text.to_lowercase().contains(event_word) {
993                let confidence = 0.7;
994                if confidence >= self.min_confidence {
995                    entities.push(self.create_entity(
996                        event_word.to_string(),
997                        "EVENT",
998                        confidence,
999                        chunk_id,
1000                        text,
1001                    )?);
1002                }
1003            }
1004        }
1005
1006        Ok(entities)
1007    }
1008
1009    /// Extract object entities (tools, artifacts, items)
1010    fn extract_objects(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
1011        let mut entities = Vec::new();
1012
1013        // Object indicators
1014        let object_words = [
1015            "sword",
1016            "shield",
1017            "book",
1018            "manuscript",
1019            "scroll",
1020            "tablet",
1021            "ring",
1022            "crown",
1023            "treasure",
1024            "coin",
1025            "tool",
1026            "weapon",
1027        ];
1028
1029        for object_word in &object_words {
1030            if text.to_lowercase().contains(object_word) {
1031                let confidence = 0.65;
1032                if confidence >= self.min_confidence {
1033                    entities.push(self.create_entity(
1034                        object_word.to_string(),
1035                        "OBJECT",
1036                        confidence,
1037                        chunk_id,
1038                        text,
1039                    )?);
1040                }
1041            }
1042        }
1043
1044        Ok(entities)
1045    }
1046
1047    /// Generic entity extraction for any configured entity type
1048    fn extract_generic_entities(
1049        &self,
1050        text: &str,
1051        chunk_id: &ChunkId,
1052        entity_type: &str,
1053    ) -> Result<Vec<Entity>> {
1054        let mut entities = Vec::new();
1055        let words: Vec<&str> = text.split_whitespace().collect();
1056
1057        // For generic entity types, look for capitalized words that might be entities
1058        for &word in &words {
1059            if self.is_capitalized(word) && word.len() > 3 {
1060                let clean_word = self.clean_word(word);
1061                if !self.is_common_word(&clean_word) {
1062                    let confidence = 0.5; // Lower confidence for generic extraction
1063                    if confidence >= self.min_confidence {
1064                        entities.push(self.create_entity(
1065                            clean_word,
1066                            entity_type,
1067                            confidence,
1068                            chunk_id,
1069                            text,
1070                        )?);
1071                    }
1072                }
1073            }
1074        }
1075
1076        Ok(entities)
1077    }
1078
1079    /// Check if a word is a common word that shouldn't be extracted as an entity
1080    fn is_common_word(&self, word: &str) -> bool {
1081        let common_words = [
1082            "the", "and", "but", "or", "in", "on", "at", "to", "for", "with", "by", "from",
1083            "about", "into", "through", "during", "before", "after", "above", "below", "up",
1084            "down", "out", "off", "over", "under", "again", "further", "then", "once", "here",
1085            "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
1086            "most", "other", "some", "such", "only", "own", "same", "so", "than", "too", "very",
1087            "can", "will", "just", "should", "now", "could", "would", "said", "says", "told",
1088            "asked", "went", "came", "come", "going", "Chapter", "Page", "Section", "Part", "Book",
1089            "Volume",
1090        ];
1091
1092        common_words
1093            .iter()
1094            .any(|&common| word.eq_ignore_ascii_case(common))
1095    }
1096}
1097
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::{ChunkId, DocumentId};

    /// Build a chunk with the fixture ids used by every test in this module.
    fn chunk_with(text: &str, end: usize) -> TextChunk {
        TextChunk::new(
            ChunkId::new("test_chunk".to_string()),
            DocumentId::new("test_doc".to_string()),
            text.to_string(),
            0,
            end,
        )
    }

    #[test]
    fn test_person_extraction() {
        let extractor = EntityExtractor::new(0.5).unwrap();
        let chunk = chunk_with(
            "Entity Name works at Test Corp. Dr. Second Entity is a professor.",
            59,
        );

        let extracted = extractor.extract_from_chunk(&chunk).unwrap();

        // Should extract something, and at least one PERSON among it.
        assert!(!extracted.is_empty());
        assert!(extracted.iter().any(|e| e.entity_type == "PERSON"));
    }

    #[test]
    fn test_relationship_extraction() {
        let extractor = EntityExtractor::new(0.5).unwrap();
        let chunk = chunk_with("Entity Name works for Test Corp in Test City.", 44);

        let extracted = extractor.extract_from_chunk(&chunk).unwrap();
        let relationships = extractor.extract_relationships(&extracted, &chunk).unwrap();

        assert!(!relationships.is_empty());
    }
}