1pub mod bidirectional_index;
3pub mod gleaning_extractor;
5pub mod llm_extractor;
7pub mod llm_relationship_extractor;
9pub mod prompts;
11pub mod semantic_merging;
13pub mod string_similarity_linker;
15
16pub use bidirectional_index::{BidirectionalIndex, IndexStatistics};
17pub use gleaning_extractor::{ExtractionCompletionStatus, GleaningConfig, GleaningEntityExtractor};
18pub use llm_extractor::LLMEntityExtractor;
19pub use llm_relationship_extractor::{
20 ExtractedEntity, ExtractedRelationship, ExtractionResult, LLMRelationshipExtractor,
21};
22pub use semantic_merging::{EntityMergeDecision, MergingStatistics, SemanticEntityMerger};
23pub use string_similarity_linker::{EntityLinkingConfig, StringSimilarityLinker};
24
25use crate::{
26 config::setconfig::EntityExtractionConfig,
27 core::{ChunkId, Entity, EntityId, EntityMention, TextChunk},
28 Result,
29};
30use regex::Regex;
31use std::collections::{HashMap, HashSet};
32
/// Rule-based entity extractor that scans text chunks for people,
/// organizations, locations, concepts, events, and objects using
/// heuristics (capitalization, title words, gazetteers, keyword lists).
pub struct EntityExtractor {
    /// Minimum confidence an extracted entity must reach to be kept.
    min_confidence: f32,
    /// Optional extraction configuration (entity types, filters); `None`
    /// when constructed via `new`.
    config: Option<EntityExtractionConfig>,
    /// Compiled regexes an entity name must match (when non-empty).
    allowed_patterns: Vec<Regex>,
    /// Compiled regexes that reject an entity name on match.
    excluded_patterns: Vec<Regex>,
}
40
41impl EntityExtractor {
42 pub fn new(min_confidence: f32) -> Result<Self> {
44 Ok(Self {
45 min_confidence,
46 config: None,
47 allowed_patterns: Vec::new(),
48 excluded_patterns: Vec::new(),
49 })
50 }
51
52 pub fn with_config(config: EntityExtractionConfig) -> Result<Self> {
54 let mut allowed_patterns = Vec::new();
55 let mut excluded_patterns = Vec::new();
56
57 if let Some(filters) = &config.filters {
59 if let Some(patterns) = &filters.allowed_patterns {
60 for pattern in patterns {
61 match Regex::new(pattern) {
62 Ok(regex) => allowed_patterns.push(regex),
63 Err(e) => {
64 tracing::warn!("Invalid allowed pattern '{pattern}': {e}");
65 }
66 }
67 }
68 }
69
70 if let Some(patterns) = &filters.excluded_patterns {
71 for pattern in patterns {
72 match Regex::new(pattern) {
73 Ok(regex) => excluded_patterns.push(regex),
74 Err(e) => {
75 tracing::warn!("Invalid excluded pattern '{pattern}': {e}");
76 }
77 }
78 }
79 }
80 }
81
82 let min_confidence = config
83 .filters
84 .as_ref()
85 .map(|f| f.confidence_threshold)
86 .unwrap_or(config.confidence_threshold);
87
88 Ok(Self {
89 min_confidence,
90 config: Some(config),
91 allowed_patterns,
92 excluded_patterns,
93 })
94 }
95
    /// Extracts entities of every configured type from a single text chunk.
    ///
    /// Pipeline: dispatch each configured entity type (and its aliases) to a
    /// specialized extractor, then apply regex allow/deny filtering, merge
    /// duplicates, and drop anything below the confidence floor.
    pub fn extract_from_chunk(&self, chunk: &TextChunk) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();
        let text = &chunk.content;

        // Entity types come from config when present; otherwise default to
        // the basic PERSON/ORGANIZATION/LOCATION trio.
        let entity_types = if let Some(config) = &self.config {
            config.entity_types.as_ref().cloned().unwrap_or_else(|| {
                vec![
                    "PERSON".to_string(),
                    "ORGANIZATION".to_string(),
                    "LOCATION".to_string(),
                ]
            })
        } else {
            vec![
                "PERSON".to_string(),
                "ORGANIZATION".to_string(),
                "LOCATION".to_string(),
            ]
        };

        // Route each requested type to its extractor; unknown types fall
        // through to the generic capitalized-word extractor.
        for entity_type in &entity_types {
            match entity_type.as_str() {
                "PERSON" | "CHARACTER" | "RESEARCHER" | "SPEAKER" | "DIALOGUE_SPEAKER" => {
                    entities.extend(self.extract_persons(text, &chunk.id)?);
                }
                "ORGANIZATION" | "INSTITUTION" | "BRAND" | "COMPANY" => {
                    entities.extend(self.extract_organizations(text, &chunk.id)?);
                }
                "LOCATION" | "SETTING" | "PLACE" => {
                    entities.extend(self.extract_locations(text, &chunk.id)?);
                }
                "CONCEPT" | "THEORY" | "THEME" | "ARGUMENT" | "IDEA" => {
                    entities.extend(self.extract_concepts(text, &chunk.id, entity_type)?);
                }
                "EVENT" | "EXPERIMENT" | "HAPPENING" => {
                    entities.extend(self.extract_events(text, &chunk.id)?);
                }
                "OBJECT" | "TOOL" | "ARTIFACT" | "ITEM" => {
                    entities.extend(self.extract_objects(text, &chunk.id)?);
                }
                _ => {
                    entities.extend(self.extract_generic_entities(text, &chunk.id, entity_type)?);
                }
            }
        }

        // Post-processing: regex filters, then merge (name, type) duplicates,
        // then enforce the confidence floor.
        entities = self.apply_pattern_filtering(entities);

        entities = self.deduplicate_entities(entities);

        entities.retain(|e| e.confidence >= self.min_confidence);

        Ok(entities)
    }
157
    /// Extracts PERSON entities from `text` using several passes over the
    /// whitespace-split word list:
    /// 1. known multi-word names, 2. title-prefixed names ("Dr. X"),
    /// 3. adjacent capitalized word pairs, 4. remaining single capitalized
    /// name-like words. Each pass marks consumed word indices in
    /// `processed_indices` so later passes skip them.
    fn extract_persons(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();
        let words: Vec<&str> = text.split_whitespace().collect();
        let mut processed_indices = HashSet::new();

        // Honorifics and kinship words that commonly precede a person name
        // (compared lowercase against cleaned words).
        let person_titles = [
            "mr",
            "mrs",
            "ms",
            "dr",
            "prof",
            "professor",
            "sir",
            "lady",
            "lord",
            "captain",
            "major",
            "colonel",
            "general",
            "admiral",
            "judge",
            "father",
            "mother",
            "brother",
            "sister",
            "aunt",
            "uncle",
            "grandfather",
            "grandmother",
        ];

        // Stop words (plus weekday/month names and some document-specific
        // noise) that disqualify a word from being part of a person name.
        let non_person_words = [
            "chapter",
            "the",
            "and",
            "but",
            "or",
            "in",
            "on",
            "at",
            "to",
            "for",
            "with",
            "by",
            "from",
            "about",
            "into",
            "through",
            "during",
            "before",
            "after",
            "above",
            "below",
            "up",
            "down",
            "out",
            "off",
            "over",
            "under",
            "again",
            "further",
            "then",
            "once",
            "here",
            "there",
            "when",
            "where",
            "why",
            "how",
            "all",
            "any",
            "both",
            "each",
            "few",
            "more",
            "most",
            "other",
            "some",
            "such",
            "only",
            "own",
            "same",
            "so",
            "than",
            "too",
            "very",
            "can",
            "will",
            "just",
            "should",
            "now",
            "temptations",
            "strategic",
            "movements",
            "decides",
            "upon",
            "whitewashing",
            "saturday",
            "monday",
            "tuesday",
            "wednesday",
            "thursday",
            "friday",
            "sunday",
            "january",
            "february",
            "march",
            "april",
            "may",
            "june",
            "july",
            "august",
            "september",
            "october",
            "november",
            "december",
            "adventures",
            "complete",
        ];

        // Pass 1: whitelisted multi-word names (highest confidence).
        entities.extend(self.extract_known_names(
            &words,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // Pass 2: names introduced by a title ("Dr. Second Entity").
        entities.extend(self.extract_title_based_names(
            &words,
            &person_titles,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // Pass 3: adjacent capitalized word pairs ("First Last").
        entities.extend(self.extract_two_word_names(
            &words,
            &non_person_words,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // Pass 4: leftover single capitalized words that look name-like.
        for (i, &word_ref) in words.iter().enumerate() {
            if processed_indices.contains(&i) {
                continue;
            }

            let word = self.clean_word(word_ref);

            // Skip very short words and stop words.
            if word.len() < 2 || non_person_words.contains(&word.to_lowercase().as_str()) {
                continue;
            }

            // Capitalization check uses the raw token (punctuation intact);
            // the name heuristic uses the cleaned word.
            if self.is_capitalized(words[i]) && self.is_likely_person_word(&word) {
                let confidence = self.calculate_confidence(&word, "PERSON");
                if confidence >= self.min_confidence {
                    entities.push(self.create_entity(word, "PERSON", confidence, chunk_id, text)?);
                }
            }
        }

        Ok(entities)
    }
331
332 fn extract_known_names(
334 &self,
335 words: &[&str],
336 processed: &mut std::collections::HashSet<usize>,
337 chunk_id: &ChunkId,
338 text: &str,
339 ) -> Result<Vec<Entity>> {
340 let mut entities = Vec::new();
341 let known_names = [
342 ("Entity Name", 2),
343 ("Second Entity", 2),
344 ("Guardian Entity", 2),
345 ("Friend Entity", 2),
346 ("Companion Entity", 2),
347 ("Third Entity", 2),
348 ("Fourth Entity", 2),
349 ("Fifth Entity", 2),
350 ("Sixth Entity", 2),
351 ("Seventh Entity", 2),
352 ("Eighth Entity", 2),
353 ("Ninth Entity", 2),
354 ];
355
356 for i in 0..words.len() {
357 if processed.contains(&i) {
358 continue;
359 }
360
361 for &(name, word_count) in &known_names {
362 let name_words: Vec<&str> = name.split_whitespace().collect();
363 if i + name_words.len() <= words.len() {
364 let matches = name_words.iter().enumerate().all(|(j, &expected)| {
365 let actual = self.clean_word(words[i + j]);
366 actual.to_lowercase() == expected.to_lowercase()
367 });
368
369 if matches {
370 let confidence = 0.95;
371 if confidence >= self.min_confidence {
372 entities.push(self.create_entity(
373 name.to_string(),
374 "PERSON",
375 confidence,
376 chunk_id,
377 text,
378 )?);
379 }
380 for j in 0..word_count {
382 processed.insert(i + j);
383 }
384 break;
385 }
386 }
387 }
388 }
389 Ok(entities)
390 }
391
    /// Extracts PERSON entities introduced by an honorific ("Dr. First" or
    /// "Dr. First Last"), consuming the title word and the name word(s).
    ///
    /// The title itself is not included in the extracted name; a fixed 0.9
    /// confidence is used. Indices are marked processed only when a name is
    /// actually accepted after the title.
    fn extract_title_based_names(
        &self,
        words: &[&str],
        person_titles: &[&str],
        processed: &mut std::collections::HashSet<usize>,
        chunk_id: &ChunkId,
        text: &str,
    ) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();

        for i in 0..words.len() {
            if processed.contains(&i) {
                continue;
            }

            // Titles are matched lowercase against the cleaned token.
            let word_clean = self.clean_word(words[i]).to_lowercase();
            if person_titles.contains(&word_clean.as_str())
                && i + 1 < words.len()
                && !processed.contains(&(i + 1))
            {
                let next_word = self.clean_word(words[i + 1]);
                if self.is_capitalized(words[i + 1]) && self.is_likely_person_word(&next_word) {
                    // Greedily take a third capitalized name-like word to form
                    // "First Last"; note it is marked processed inside this
                    // branch, before the entity is pushed.
                    let name = if i + 2 < words.len() && !processed.contains(&(i + 2)) {
                        let third_word = self.clean_word(words[i + 2]);
                        if self.is_capitalized(words[i + 2])
                            && self.is_likely_person_word(&third_word)
                        {
                            processed.insert(i + 2);
                            format!("{next_word} {third_word}")
                        } else {
                            next_word
                        }
                    } else {
                        next_word
                    };

                    let confidence = 0.9;
                    if confidence >= self.min_confidence {
                        entities
                            .push(self.create_entity(name, "PERSON", confidence, chunk_id, text)?);
                    }
                    // Consume the title and first name word regardless of the
                    // confidence check above.
                    processed.insert(i);
                    processed.insert(i + 1);
                }
            }
        }
        Ok(entities)
    }
441
442 fn extract_two_word_names(
444 &self,
445 words: &[&str],
446 non_person_words: &[&str],
447 processed: &mut std::collections::HashSet<usize>,
448 chunk_id: &ChunkId,
449 text: &str,
450 ) -> Result<Vec<Entity>> {
451 let mut entities = Vec::new();
452
453 for i in 0..words.len() {
454 if processed.contains(&i) || i + 1 >= words.len() || processed.contains(&(i + 1)) {
455 continue;
456 }
457
458 let first_word = self.clean_word(words[i]);
459 let second_word = self.clean_word(words[i + 1]);
460
461 if self.is_capitalized(words[i])
463 && self.is_capitalized(words[i + 1])
464 && self.is_likely_person_word(&first_word)
465 && self.is_likely_person_word(&second_word)
466 && !non_person_words.contains(&first_word.to_lowercase().as_str())
467 && !non_person_words.contains(&second_word.to_lowercase().as_str())
468 {
469 let name = format!("{first_word} {second_word}");
470 if self.is_likely_person_name(&name) {
471 let confidence = self.calculate_confidence(&name, "PERSON");
472 if confidence >= self.min_confidence {
473 entities
474 .push(self.create_entity(name, "PERSON", confidence, chunk_id, text)?);
475 }
476 processed.insert(i);
477 processed.insert(i + 1);
478 }
479 }
480 }
481 Ok(entities)
482 }
483
484 fn extract_organizations(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
486 let mut entities = Vec::new();
487 let org_suffixes = [
488 "Inc",
489 "Corp",
490 "LLC",
491 "Ltd",
492 "Company",
493 "Corporation",
494 "Group",
495 "Solutions",
496 "Technologies",
497 ];
498 let org_prefixes = ["University of", "Institute of", "Department of"];
499
500 for suffix in &org_suffixes {
502 if let Some(pos) = text.find(suffix) {
503 let start = text[..pos].rfind(' ').map(|i| i + 1).unwrap_or(0);
505 let end = pos + suffix.len();
506 let name = text[start..end].trim().to_string();
507
508 if !name.is_empty() && self.is_likely_organization(&name) {
509 let confidence = self.calculate_confidence(&name, "ORGANIZATION");
510 if confidence >= self.min_confidence {
511 entities.push(self.create_entity(
512 name,
513 "ORGANIZATION",
514 confidence,
515 chunk_id,
516 text,
517 )?);
518 }
519 }
520 }
521 }
522
523 for prefix in &org_prefixes {
525 if let Some(pos) = text.find(prefix) {
526 let start = pos;
527 let end = text[pos..]
528 .find('.')
529 .map(|i| pos + i)
530 .unwrap_or(text.len().min(pos + 50));
531 let name = text[start..end].trim().to_string();
532
533 if !name.is_empty() && name.len() > prefix.len() {
534 let confidence = self.calculate_confidence(&name, "ORGANIZATION");
535 if confidence >= self.min_confidence {
536 entities.push(self.create_entity(
537 name,
538 "ORGANIZATION",
539 confidence,
540 chunk_id,
541 text,
542 )?);
543 }
544 }
545 }
546 }
547
548 Ok(entities)
549 }
550
551 fn extract_locations(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
553 let mut entities = Vec::new();
554 let known_locations = [
555 "United States",
556 "New York",
557 "California",
558 "London",
559 "Paris",
560 "Tokyo",
561 "Berlin",
562 "Washington",
563 "Boston",
564 "Chicago",
565 ];
566
567 for location in &known_locations {
568 if text.contains(location) {
569 let confidence = self.calculate_confidence(location, "LOCATION");
570 if confidence >= self.min_confidence {
571 entities.push(self.create_entity(
572 location.to_string(),
573 "LOCATION",
574 confidence,
575 chunk_id,
576 text,
577 )?);
578 }
579 }
580 }
581
582 Ok(entities)
583 }
584
585 fn create_entity(
587 &self,
588 name: String,
589 entity_type: &str,
590 confidence: f32,
591 chunk_id: &ChunkId,
592 text: &str,
593 ) -> Result<Entity> {
594 let entity_id = EntityId::new(format!("{}_{}", entity_type, self.normalize_name(&name)));
595
596 let mut mentions = Vec::new();
598 let mut start = 0;
599 while let Some(pos) = text[start..].find(&name) {
600 let actual_pos = start + pos;
601 mentions.push(EntityMention {
602 chunk_id: chunk_id.clone(),
603 start_offset: actual_pos,
604 end_offset: actual_pos + name.len(),
605 confidence,
606 });
607 start = actual_pos + name.len();
608 }
609
610 Ok(
611 Entity::new(entity_id, name, entity_type.to_string(), confidence)
612 .with_mentions(mentions),
613 )
614 }
615
616 fn is_capitalized(&self, word: &str) -> bool {
618 word.chars().next().is_some_and(|c| c.is_uppercase())
619 }
620
621 fn clean_word(&self, word: &str) -> String {
623 word.chars()
624 .filter(|c| c.is_alphabetic() || *c == '\'') .collect::<String>()
626 .trim_end_matches('\'') .to_string()
628 }
629
630 fn is_likely_person_word(&self, word: &str) -> bool {
632 if word.len() < 2 {
633 return false;
634 }
635
636 let word_lower = word.to_lowercase();
638
639 let name_endings = [
641 "son", "sen", "ton", "ham", "ford", "ley", "ment", "ard", "ert",
642 ];
643 let has_name_ending = name_endings
644 .iter()
645 .any(|&ending| word_lower.ends_with(ending));
646
647 let name_prefixes = ["mc", "mac", "o'", "de", "van", "von", "la", "le"];
649 let has_name_prefix = name_prefixes
650 .iter()
651 .any(|&prefix| word_lower.starts_with(prefix));
652
653 let is_proper_format = word.chars().next().unwrap().is_uppercase()
655 && word.chars().all(|c| c.is_alphabetic() || c == '\'');
656
657 let short_non_names = [
659 "it", "is", "as", "at", "be", "by", "do", "go", "he", "if", "in", "me", "my", "no",
660 "of", "on", "or", "so", "to", "up", "us", "we",
661 ];
662
663 if word.len() <= 2 && short_non_names.contains(&word_lower.as_str()) {
664 return false;
665 }
666
667 is_proper_format && (word.len() >= 3 || has_name_ending || has_name_prefix)
668 }
669
670 #[allow(dead_code)]
672 fn is_title(&self, word: &str) -> bool {
673 matches!(word, "Dr." | "Mr." | "Ms." | "Mrs." | "Prof.")
674 }
675
676 fn is_likely_person_name(&self, name: &str) -> bool {
678 let parts: Vec<&str> = name.split_whitespace().collect();
679 parts.len() == 2 && parts.iter().all(|part| self.is_capitalized(part))
680 }
681
682 fn is_likely_organization(&self, name: &str) -> bool {
684 let org_indicators = [
685 "Inc",
686 "Corp",
687 "LLC",
688 "Ltd",
689 "Company",
690 "Corporation",
691 "University",
692 "Institute",
693 ];
694 org_indicators
695 .iter()
696 .any(|indicator| name.contains(indicator))
697 }
698
699 fn calculate_confidence(&self, name: &str, entity_type: &str) -> f32 {
701 let mut confidence: f32 = 0.5; match entity_type {
705 "PERSON" => {
706 if name.contains("Dr.") || name.contains("Prof.") {
707 confidence += 0.3;
708 }
709 if name.split_whitespace().count() == 2 {
710 confidence += 0.2;
711 }
712 }
713 "ORGANIZATION" => {
714 if name.contains("Inc") || name.contains("Corp") || name.contains("LLC") {
715 confidence += 0.3;
716 }
717 if name.contains("University") || name.contains("Institute") {
718 confidence += 0.2;
719 }
720 }
721 "LOCATION" => {
722 if name.contains(',') {
723 confidence += 0.2;
724 }
725 if self.is_known_location(name) {
726 confidence += 0.3;
727 }
728 }
729 _ => {}
730 }
731
732 if name.chars().next().is_some_and(|c| c.is_uppercase()) {
734 confidence += 0.1;
735 }
736
737 confidence.min(1.0)
738 }
739
740 fn is_known_location(&self, name: &str) -> bool {
742 const KNOWN_LOCATIONS: &[&str] = &[
743 "United States",
744 "New York",
745 "California",
746 "London",
747 "Paris",
748 "Tokyo",
749 "Berlin",
750 "Washington",
751 "Boston",
752 "Chicago",
753 ];
754 KNOWN_LOCATIONS.iter().any(|&loc| name.contains(loc))
755 }
756
757 fn normalize_name(&self, name: &str) -> String {
759 name.to_lowercase()
760 .chars()
761 .filter(|c| c.is_alphanumeric() || *c == '_')
762 .collect::<String>()
763 .replace(' ', "_")
764 }
765
766 fn deduplicate_entities(&self, entities: Vec<Entity>) -> Vec<Entity> {
768 let mut unique_entities: HashMap<(String, String), Entity> = HashMap::new();
769
770 for entity in entities {
771 let key = (entity.name.clone(), entity.entity_type.clone());
772
773 match unique_entities.get_mut(&key) {
774 Some(existing) => {
775 existing.mentions.extend(entity.mentions);
777 if entity.confidence > existing.confidence {
778 existing.confidence = entity.confidence;
779 }
780 }
781 None => {
782 unique_entities.insert(key, entity);
783 }
784 }
785 }
786
787 unique_entities.into_values().collect()
788 }
789
790 pub fn extract_relationships(
792 &self,
793 entities: &[Entity],
794 chunk: &TextChunk,
795 ) -> Result<Vec<(EntityId, EntityId, String)>> {
796 let mut relationships = Vec::new();
797
798 for i in 0..entities.len() {
800 for j in (i + 1)..entities.len() {
801 let entity1 = &entities[i];
802 let entity2 = &entities[j];
803
804 let entity1_in_chunk = entity1.mentions.iter().any(|m| m.chunk_id == chunk.id);
806 let entity2_in_chunk = entity2.mentions.iter().any(|m| m.chunk_id == chunk.id);
807
808 if entity1_in_chunk && entity2_in_chunk {
809 let relation_type =
810 self.infer_relationship_type(entity1, entity2, &chunk.content);
811 relationships.push((entity1.id.clone(), entity2.id.clone(), relation_type));
812 }
813 }
814 }
815
816 Ok(relationships)
817 }
818
    /// Picks a relationship label for an entity pair based on the pair's
    /// types and simple keyword cues in the chunk text.
    ///
    /// The pair is matched symmetrically (both orders), but the emitted
    /// label does not encode direction; callers get the same label either
    /// way. Unknown type combinations fall back to "RELATED_TO".
    fn infer_relationship_type(&self, entity1: &Entity, entity2: &Entity, context: &str) -> String {
        match (&entity1.entity_type[..], &entity2.entity_type[..]) {
            ("PERSON", "ORGANIZATION") | ("ORGANIZATION", "PERSON") => {
                if context.contains("works for") || context.contains("employed by") {
                    "WORKS_FOR".to_string()
                } else if context.contains("founded") || context.contains("CEO") {
                    "LEADS".to_string()
                } else {
                    "ASSOCIATED_WITH".to_string()
                }
            }
            ("PERSON", "LOCATION") | ("LOCATION", "PERSON") => {
                if context.contains("born in") || context.contains("from") {
                    "BORN_IN".to_string()
                } else if context.contains("lives in") || context.contains("based in") {
                    "LOCATED_IN".to_string()
                } else {
                    "ASSOCIATED_WITH".to_string()
                }
            }
            ("ORGANIZATION", "LOCATION") | ("LOCATION", "ORGANIZATION") => {
                if context.contains("headquartered") || context.contains("based in") {
                    "HEADQUARTERED_IN".to_string()
                } else {
                    "LOCATED_IN".to_string()
                }
            }
            ("PERSON", "PERSON") => {
                if context.contains("married") || context.contains("spouse") {
                    "MARRIED_TO".to_string()
                } else if context.contains("colleague") || context.contains("partner") {
                    "COLLEAGUE_OF".to_string()
                } else {
                    "KNOWS".to_string()
                }
            }
            _ => "RELATED_TO".to_string(),
        }
    }
859
860 fn apply_pattern_filtering(&self, entities: Vec<Entity>) -> Vec<Entity> {
862 if self.allowed_patterns.is_empty() && self.excluded_patterns.is_empty() {
863 return entities;
864 }
865
866 entities
867 .into_iter()
868 .filter(|entity| {
869 if !self.allowed_patterns.is_empty() {
871 let matches_allowed = self
872 .allowed_patterns
873 .iter()
874 .any(|pattern| pattern.is_match(&entity.name));
875 if !matches_allowed {
876 return false;
877 }
878 }
879
880 if !self.excluded_patterns.is_empty() {
882 let matches_excluded = self
883 .excluded_patterns
884 .iter()
885 .any(|pattern| pattern.is_match(&entity.name));
886 if matches_excluded {
887 return false;
888 }
889 }
890
891 true
892 })
893 .collect()
894 }
895
896 fn extract_concepts(
898 &self,
899 text: &str,
900 chunk_id: &ChunkId,
901 entity_type: &str,
902 ) -> Result<Vec<Entity>> {
903 let mut entities = Vec::new();
904 let words: Vec<&str> = text.split_whitespace().collect();
905
906 let concept_indicators = [
908 "Theory",
909 "Concept",
910 "Principle",
911 "Philosophy",
912 "Doctrine",
913 "Idea",
914 "Method",
915 "Approach",
916 "Framework",
917 "Model",
918 "Paradigm",
919 "Thesis",
920 ];
921
922 for &word in words.iter() {
923 let clean_word = self.clean_word(word);
924
925 if concept_indicators
927 .iter()
928 .any(|&indicator| clean_word.contains(indicator))
929 {
930 let confidence = 0.75;
931 if confidence >= self.min_confidence {
932 entities.push(self.create_entity(
933 clean_word,
934 entity_type,
935 confidence,
936 chunk_id,
937 text,
938 )?);
939 }
940 }
941
942 if self.is_capitalized(word) && word.len() > 4 {
944 let clean_word = self.clean_word(word);
945 if !self.is_common_word(&clean_word) {
946 let confidence = 0.6;
947 if confidence >= self.min_confidence {
948 entities.push(self.create_entity(
949 clean_word,
950 entity_type,
951 confidence,
952 chunk_id,
953 text,
954 )?);
955 }
956 }
957 }
958 }
959
960 Ok(entities)
961 }
962
963 fn extract_events(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
965 let mut entities = Vec::new();
966
967 let event_words = [
969 "meeting",
970 "conference",
971 "ceremony",
972 "celebration",
973 "festival",
974 "competition",
975 "war",
976 "battle",
977 "expedition",
978 "journey",
979 "trial",
980 ];
981
982 for event_word in &event_words {
983 if text.to_lowercase().contains(event_word) {
984 let confidence = 0.7;
985 if confidence >= self.min_confidence {
986 entities.push(self.create_entity(
987 event_word.to_string(),
988 "EVENT",
989 confidence,
990 chunk_id,
991 text,
992 )?);
993 }
994 }
995 }
996
997 Ok(entities)
998 }
999
1000 fn extract_objects(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
1002 let mut entities = Vec::new();
1003
1004 let object_words = [
1006 "sword",
1007 "shield",
1008 "book",
1009 "manuscript",
1010 "scroll",
1011 "tablet",
1012 "ring",
1013 "crown",
1014 "treasure",
1015 "coin",
1016 "tool",
1017 "weapon",
1018 ];
1019
1020 for object_word in &object_words {
1021 if text.to_lowercase().contains(object_word) {
1022 let confidence = 0.65;
1023 if confidence >= self.min_confidence {
1024 entities.push(self.create_entity(
1025 object_word.to_string(),
1026 "OBJECT",
1027 confidence,
1028 chunk_id,
1029 text,
1030 )?);
1031 }
1032 }
1033 }
1034
1035 Ok(entities)
1036 }
1037
1038 fn extract_generic_entities(
1040 &self,
1041 text: &str,
1042 chunk_id: &ChunkId,
1043 entity_type: &str,
1044 ) -> Result<Vec<Entity>> {
1045 let mut entities = Vec::new();
1046 let words: Vec<&str> = text.split_whitespace().collect();
1047
1048 for &word in &words {
1050 if self.is_capitalized(word) && word.len() > 3 {
1051 let clean_word = self.clean_word(word);
1052 if !self.is_common_word(&clean_word) {
1053 let confidence = 0.5; if confidence >= self.min_confidence {
1055 entities.push(self.create_entity(
1056 clean_word,
1057 entity_type,
1058 confidence,
1059 chunk_id,
1060 text,
1061 )?);
1062 }
1063 }
1064 }
1065 }
1066
1067 Ok(entities)
1068 }
1069
1070 fn is_common_word(&self, word: &str) -> bool {
1072 let common_words = [
1073 "the", "and", "but", "or", "in", "on", "at", "to", "for", "with", "by", "from",
1074 "about", "into", "through", "during", "before", "after", "above", "below", "up",
1075 "down", "out", "off", "over", "under", "again", "further", "then", "once", "here",
1076 "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
1077 "most", "other", "some", "such", "only", "own", "same", "so", "than", "too", "very",
1078 "can", "will", "just", "should", "now", "could", "would", "said", "says", "told",
1079 "asked", "went", "came", "come", "going", "Chapter", "Page", "Section", "Part", "Book",
1080 "Volume",
1081 ];
1082
1083 common_words
1084 .iter()
1085 .any(|&common| word.eq_ignore_ascii_case(common))
1086 }
1087}
1088
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::{ChunkId, DocumentId};

    /// The default extractor should find at least one PERSON in a chunk
    /// containing a known name and a title-prefixed name.
    #[test]
    fn test_person_extraction() {
        let extractor = EntityExtractor::new(0.5).unwrap();
        let chunk = TextChunk::new(
            ChunkId::new("test_chunk".to_string()),
            DocumentId::new("test_doc".to_string()),
            "Entity Name works at Test Corp. Dr. Second Entity is a professor.".to_string(),
            0,
            59,
        );

        let entities = extractor.extract_from_chunk(&chunk).unwrap();

        assert!(!entities.is_empty());

        let person_entities: Vec<_> = entities
            .iter()
            .filter(|e| e.entity_type == "PERSON")
            .collect();
        assert!(!person_entities.is_empty());
    }

    /// Co-occurring entities within one chunk should yield at least one
    /// pairwise relationship.
    #[test]
    fn test_relationship_extraction() {
        let extractor = EntityExtractor::new(0.5).unwrap();
        let chunk = TextChunk::new(
            ChunkId::new("test_chunk".to_string()),
            DocumentId::new("test_doc".to_string()),
            "Entity Name works for Test Corp in Test City.".to_string(),
            0,
            44,
        );

        let entities = extractor.extract_from_chunk(&chunk).unwrap();
        let relationships = extractor.extract_relationships(&entities, &chunk).unwrap();

        assert!(!relationships.is_empty());
    }
}