1pub mod atomic_fact_extractor;
3#[cfg(feature = "gliner")]
5mod gliner_extractor;
6pub mod bidirectional_index;
8pub mod gleaning_extractor;
10pub mod llm_extractor;
12pub mod llm_relationship_extractor;
14pub mod prompts;
16pub mod semantic_merging;
18pub mod string_similarity_linker;
20
21pub use atomic_fact_extractor::{AtomicFact, AtomicFactExtractor};
22#[cfg(feature = "gliner")]
23pub use gliner_extractor::GLiNERExtractor;
24pub use bidirectional_index::{BidirectionalIndex, IndexStatistics};
25pub use gleaning_extractor::{ExtractionCompletionStatus, GleaningConfig, GleaningEntityExtractor};
26pub use llm_extractor::LLMEntityExtractor;
27pub use llm_relationship_extractor::{
28 ExtractedEntity, ExtractedRelationship, ExtractionResult, LLMRelationshipExtractor,
29 TripleValidation,
30};
31pub use semantic_merging::{EntityMergeDecision, MergingStatistics, SemanticEntityMerger};
32pub use string_similarity_linker::{EntityLinkingConfig, StringSimilarityLinker};
33
34use crate::{
35 config::setconfig::EntityExtractionConfig,
36 core::{ChunkId, Entity, EntityId, EntityMention, TextChunk},
37 Result,
38};
39use regex::Regex;
40use std::collections::{HashMap, HashSet};
41
/// Heuristic, rule-based entity extractor.
///
/// Extracts persons, organizations, locations, concepts, events, and objects
/// from text chunks using capitalization rules, keyword lists, and optional
/// regex allow/deny filters from an [`EntityExtractionConfig`].
pub struct EntityExtractor {
    /// Minimum confidence an entity must reach to be kept.
    min_confidence: f32,
    /// Optional configuration (entity types, filters); `None` when built via `new`.
    config: Option<EntityExtractionConfig>,
    /// Compiled allow-list regexes from the config's filters (empty = allow all).
    allowed_patterns: Vec<Regex>,
    /// Compiled deny-list regexes from the config's filters.
    excluded_patterns: Vec<Regex>,
}
49
50impl EntityExtractor {
51 pub fn new(min_confidence: f32) -> Result<Self> {
53 Ok(Self {
54 min_confidence,
55 config: None,
56 allowed_patterns: Vec::new(),
57 excluded_patterns: Vec::new(),
58 })
59 }
60
61 pub fn with_config(config: EntityExtractionConfig) -> Result<Self> {
63 let mut allowed_patterns = Vec::new();
64 let mut excluded_patterns = Vec::new();
65
66 if let Some(filters) = &config.filters {
68 if let Some(patterns) = &filters.allowed_patterns {
69 for pattern in patterns {
70 match Regex::new(pattern) {
71 Ok(regex) => allowed_patterns.push(regex),
72 Err(e) => {
73 tracing::warn!("Invalid allowed pattern '{pattern}': {e}");
74 },
75 }
76 }
77 }
78
79 if let Some(patterns) = &filters.excluded_patterns {
80 for pattern in patterns {
81 match Regex::new(pattern) {
82 Ok(regex) => excluded_patterns.push(regex),
83 Err(e) => {
84 tracing::warn!("Invalid excluded pattern '{pattern}': {e}");
85 },
86 }
87 }
88 }
89 }
90
91 let min_confidence = config
92 .filters
93 .as_ref()
94 .map(|f| f.confidence_threshold)
95 .unwrap_or(config.confidence_threshold);
96
97 Ok(Self {
98 min_confidence,
99 config: Some(config),
100 allowed_patterns,
101 excluded_patterns,
102 })
103 }
104
105 pub fn extract_from_chunk(&self, chunk: &TextChunk) -> Result<Vec<Entity>> {
107 let mut entities = Vec::new();
108 let text = &chunk.content;
109
110 let entity_types = if let Some(config) = &self.config {
112 config.entity_types.as_ref().cloned().unwrap_or_else(|| {
113 vec![
114 "PERSON".to_string(),
115 "ORGANIZATION".to_string(),
116 "LOCATION".to_string(),
117 ]
118 })
119 } else {
120 vec![
121 "PERSON".to_string(),
122 "ORGANIZATION".to_string(),
123 "LOCATION".to_string(),
124 ]
125 };
126
127 for entity_type in &entity_types {
129 match entity_type.as_str() {
130 "PERSON" | "CHARACTER" | "RESEARCHER" | "SPEAKER" | "DIALOGUE_SPEAKER" => {
131 entities.extend(self.extract_persons(text, &chunk.id)?);
132 },
133 "ORGANIZATION" | "INSTITUTION" | "BRAND" | "COMPANY" => {
134 entities.extend(self.extract_organizations(text, &chunk.id)?);
135 },
136 "LOCATION" | "SETTING" | "PLACE" => {
137 entities.extend(self.extract_locations(text, &chunk.id)?);
138 },
139 "CONCEPT" | "THEORY" | "THEME" | "ARGUMENT" | "IDEA" => {
140 entities.extend(self.extract_concepts(text, &chunk.id, entity_type)?);
141 },
142 "EVENT" | "EXPERIMENT" | "HAPPENING" => {
143 entities.extend(self.extract_events(text, &chunk.id)?);
144 },
145 "OBJECT" | "TOOL" | "ARTIFACT" | "ITEM" => {
146 entities.extend(self.extract_objects(text, &chunk.id)?);
147 },
148 _ => {
149 entities.extend(self.extract_generic_entities(text, &chunk.id, entity_type)?);
151 },
152 }
153 }
154
155 entities = self.apply_pattern_filtering(entities);
157
158 entities = self.deduplicate_entities(entities);
160
161 entities.retain(|e| e.confidence >= self.min_confidence);
163
164 Ok(entities)
165 }
166
    /// Extracts PERSON entities from `text` using layered heuristics applied
    /// in priority order: known full names, title-prefixed names ("Dr. X"),
    /// adjacent capitalized word pairs, and finally single capitalized words.
    /// `processed_indices` is shared across the passes so a word claimed by a
    /// higher-priority pass is not re-used by a later one.
    fn extract_persons(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();
        let words: Vec<&str> = text.split_whitespace().collect();
        // Word indices already consumed by an earlier (higher-priority) pass.
        let mut processed_indices = HashSet::new();

        // Honorifics and kinship titles that signal the following word is a
        // name (compared against lowercased, punctuation-stripped words).
        let person_titles = [
            "mr",
            "mrs",
            "ms",
            "dr",
            "prof",
            "professor",
            "sir",
            "lady",
            "lord",
            "captain",
            "major",
            "colonel",
            "general",
            "admiral",
            "judge",
            "father",
            "mother",
            "brother",
            "sister",
            "aunt",
            "uncle",
            "grandfather",
            "grandmother",
        ];

        // Stop words (plus weekday/month names and some corpus-specific
        // words) that should never be treated as part of a person name.
        let non_person_words = [
            "chapter",
            "the",
            "and",
            "but",
            "or",
            "in",
            "on",
            "at",
            "to",
            "for",
            "with",
            "by",
            "from",
            "about",
            "into",
            "through",
            "during",
            "before",
            "after",
            "above",
            "below",
            "up",
            "down",
            "out",
            "off",
            "over",
            "under",
            "again",
            "further",
            "then",
            "once",
            "here",
            "there",
            "when",
            "where",
            "why",
            "how",
            "all",
            "any",
            "both",
            "each",
            "few",
            "more",
            "most",
            "other",
            "some",
            "such",
            "only",
            "own",
            "same",
            "so",
            "than",
            "too",
            "very",
            "can",
            "will",
            "just",
            "should",
            "now",
            "temptations",
            "strategic",
            "movements",
            "decides",
            "upon",
            "whitewashing",
            "saturday",
            "monday",
            "tuesday",
            "wednesday",
            "thursday",
            "friday",
            "sunday",
            "january",
            "february",
            "march",
            "april",
            "may",
            "june",
            "july",
            "august",
            "september",
            "october",
            "november",
            "december",
            "adventures",
            "complete",
        ];

        // Pass 1: hard-coded known multi-word names (highest confidence).
        entities.extend(self.extract_known_names(
            &words,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // Pass 2: names introduced by an honorific ("Dr. Second Entity").
        entities.extend(self.extract_title_based_names(
            &words,
            &person_titles,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // Pass 3: adjacent capitalized pairs that look like "First Last".
        entities.extend(self.extract_two_word_names(
            &words,
            &non_person_words,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // Pass 4: any remaining single capitalized, name-like word.
        for (i, &word_ref) in words.iter().enumerate() {
            if processed_indices.contains(&i) {
                continue;
            }

            let word = self.clean_word(word_ref);

            // Skip very short words and known stop words.
            if word.len() < 2 || non_person_words.contains(&word.to_lowercase().as_str()) {
                continue;
            }

            if self.is_capitalized(words[i]) && self.is_likely_person_word(&word) {
                let confidence = self.calculate_confidence(&word, "PERSON");
                if confidence >= self.min_confidence {
                    entities.push(self.create_entity(word, "PERSON", confidence, chunk_id, text)?);
                }
            }
        }

        Ok(entities)
    }
340
341 fn extract_known_names(
343 &self,
344 words: &[&str],
345 processed: &mut std::collections::HashSet<usize>,
346 chunk_id: &ChunkId,
347 text: &str,
348 ) -> Result<Vec<Entity>> {
349 let mut entities = Vec::new();
350 let known_names = [
351 ("Entity Name", 2),
352 ("Second Entity", 2),
353 ("Guardian Entity", 2),
354 ("Friend Entity", 2),
355 ("Companion Entity", 2),
356 ("Third Entity", 2),
357 ("Fourth Entity", 2),
358 ("Fifth Entity", 2),
359 ("Sixth Entity", 2),
360 ("Seventh Entity", 2),
361 ("Eighth Entity", 2),
362 ("Ninth Entity", 2),
363 ];
364
365 for i in 0..words.len() {
366 if processed.contains(&i) {
367 continue;
368 }
369
370 for &(name, word_count) in &known_names {
371 let name_words: Vec<&str> = name.split_whitespace().collect();
372 if i + name_words.len() <= words.len() {
373 let matches = name_words.iter().enumerate().all(|(j, &expected)| {
374 let actual = self.clean_word(words[i + j]);
375 actual.to_lowercase() == expected.to_lowercase()
376 });
377
378 if matches {
379 let confidence = 0.95;
380 if confidence >= self.min_confidence {
381 entities.push(self.create_entity(
382 name.to_string(),
383 "PERSON",
384 confidence,
385 chunk_id,
386 text,
387 )?);
388 }
389 for j in 0..word_count {
391 processed.insert(i + j);
392 }
393 break;
394 }
395 }
396 }
397 }
398 Ok(entities)
399 }
400
    /// Finds names introduced by an honorific title: "Dr. Jane" or
    /// "Dr. Jane Smith". When the two words after the title are both
    /// capitalized and name-like, they are joined into a two-word name.
    /// Consumed indices (title + name words) are added to `processed`.
    fn extract_title_based_names(
        &self,
        words: &[&str],
        person_titles: &[&str],
        processed: &mut std::collections::HashSet<usize>,
        chunk_id: &ChunkId,
        text: &str,
    ) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();

        for i in 0..words.len() {
            if processed.contains(&i) {
                continue;
            }

            // Titles are matched case-insensitively on the cleaned word
            // ("Dr." cleans to "Dr" -> "dr").
            let word_clean = self.clean_word(words[i]).to_lowercase();
            if person_titles.contains(&word_clean.as_str())
                && i + 1 < words.len()
                && !processed.contains(&(i + 1))
            {
                let next_word = self.clean_word(words[i + 1]);
                if self.is_capitalized(words[i + 1]) && self.is_likely_person_word(&next_word) {
                    // Greedily extend to a "First Last" form when a third
                    // capitalized, name-like word follows the title.
                    let name = if i + 2 < words.len() && !processed.contains(&(i + 2)) {
                        let third_word = self.clean_word(words[i + 2]);
                        if self.is_capitalized(words[i + 2])
                            && self.is_likely_person_word(&third_word)
                        {
                            // NOTE: i + 2 is claimed here, before the
                            // confidence check below, mirroring i and i + 1.
                            processed.insert(i + 2);
                            format!("{next_word} {third_word}")
                        } else {
                            next_word
                        }
                    } else {
                        next_word
                    };

                    // Title-backed names get a fixed high confidence.
                    let confidence = 0.9;
                    if confidence >= self.min_confidence {
                        entities
                            .push(self.create_entity(name, "PERSON", confidence, chunk_id, text)?);
                    }
                    processed.insert(i);
                    processed.insert(i + 1);
                }
            }
        }
        Ok(entities)
    }
450
451 fn extract_two_word_names(
453 &self,
454 words: &[&str],
455 non_person_words: &[&str],
456 processed: &mut std::collections::HashSet<usize>,
457 chunk_id: &ChunkId,
458 text: &str,
459 ) -> Result<Vec<Entity>> {
460 let mut entities = Vec::new();
461
462 for i in 0..words.len() {
463 if processed.contains(&i) || i + 1 >= words.len() || processed.contains(&(i + 1)) {
464 continue;
465 }
466
467 let first_word = self.clean_word(words[i]);
468 let second_word = self.clean_word(words[i + 1]);
469
470 if self.is_capitalized(words[i])
472 && self.is_capitalized(words[i + 1])
473 && self.is_likely_person_word(&first_word)
474 && self.is_likely_person_word(&second_word)
475 && !non_person_words.contains(&first_word.to_lowercase().as_str())
476 && !non_person_words.contains(&second_word.to_lowercase().as_str())
477 {
478 let name = format!("{first_word} {second_word}");
479 if self.is_likely_person_name(&name) {
480 let confidence = self.calculate_confidence(&name, "PERSON");
481 if confidence >= self.min_confidence {
482 entities
483 .push(self.create_entity(name, "PERSON", confidence, chunk_id, text)?);
484 }
485 processed.insert(i);
486 processed.insert(i + 1);
487 }
488 }
489 }
490 Ok(entities)
491 }
492
493 fn extract_organizations(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
495 let mut entities = Vec::new();
496 let org_suffixes = [
497 "Inc",
498 "Corp",
499 "LLC",
500 "Ltd",
501 "Company",
502 "Corporation",
503 "Group",
504 "Solutions",
505 "Technologies",
506 ];
507 let org_prefixes = ["University of", "Institute of", "Department of"];
508
509 for suffix in &org_suffixes {
511 if let Some(pos) = text.find(suffix) {
512 let start = text[..pos].rfind(' ').map(|i| i + 1).unwrap_or(0);
514 let end = pos + suffix.len();
515 let name = text[start..end].trim().to_string();
516
517 if !name.is_empty() && self.is_likely_organization(&name) {
518 let confidence = self.calculate_confidence(&name, "ORGANIZATION");
519 if confidence >= self.min_confidence {
520 entities.push(self.create_entity(
521 name,
522 "ORGANIZATION",
523 confidence,
524 chunk_id,
525 text,
526 )?);
527 }
528 }
529 }
530 }
531
532 for prefix in &org_prefixes {
534 if let Some(pos) = text.find(prefix) {
535 let start = pos;
536 let end = text[pos..]
537 .find('.')
538 .map(|i| pos + i)
539 .unwrap_or(text.len().min(pos + 50));
540 let name = text[start..end].trim().to_string();
541
542 if !name.is_empty() && name.len() > prefix.len() {
543 let confidence = self.calculate_confidence(&name, "ORGANIZATION");
544 if confidence >= self.min_confidence {
545 entities.push(self.create_entity(
546 name,
547 "ORGANIZATION",
548 confidence,
549 chunk_id,
550 text,
551 )?);
552 }
553 }
554 }
555 }
556
557 Ok(entities)
558 }
559
560 fn extract_locations(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
562 let mut entities = Vec::new();
563 let known_locations = [
564 "United States",
565 "New York",
566 "California",
567 "London",
568 "Paris",
569 "Tokyo",
570 "Berlin",
571 "Washington",
572 "Boston",
573 "Chicago",
574 ];
575
576 for location in &known_locations {
577 if text.contains(location) {
578 let confidence = self.calculate_confidence(location, "LOCATION");
579 if confidence >= self.min_confidence {
580 entities.push(self.create_entity(
581 location.to_string(),
582 "LOCATION",
583 confidence,
584 chunk_id,
585 text,
586 )?);
587 }
588 }
589 }
590
591 Ok(entities)
592 }
593
594 fn create_entity(
596 &self,
597 name: String,
598 entity_type: &str,
599 confidence: f32,
600 chunk_id: &ChunkId,
601 text: &str,
602 ) -> Result<Entity> {
603 let entity_id = EntityId::new(format!("{}_{}", entity_type, self.normalize_name(&name)));
604
605 let mut mentions = Vec::new();
607 let mut start = 0;
608 while let Some(pos) = text[start..].find(&name) {
609 let actual_pos = start + pos;
610 mentions.push(EntityMention {
611 chunk_id: chunk_id.clone(),
612 start_offset: actual_pos,
613 end_offset: actual_pos + name.len(),
614 confidence,
615 });
616 start = actual_pos + name.len();
617 }
618
619 Ok(
620 Entity::new(entity_id, name, entity_type.to_string(), confidence)
621 .with_mentions(mentions),
622 )
623 }
624
625 fn is_capitalized(&self, word: &str) -> bool {
627 word.chars().next().is_some_and(|c| c.is_uppercase())
628 }
629
630 fn clean_word(&self, word: &str) -> String {
632 word.chars()
633 .filter(|c| c.is_alphabetic() || *c == '\'') .collect::<String>()
635 .trim_end_matches('\'') .to_string()
637 }
638
639 fn is_likely_person_word(&self, word: &str) -> bool {
641 if word.len() < 2 {
642 return false;
643 }
644
645 let word_lower = word.to_lowercase();
647
648 let name_endings = [
650 "son", "sen", "ton", "ham", "ford", "ley", "ment", "ard", "ert",
651 ];
652 let has_name_ending = name_endings
653 .iter()
654 .any(|&ending| word_lower.ends_with(ending));
655
656 let name_prefixes = ["mc", "mac", "o'", "de", "van", "von", "la", "le"];
658 let has_name_prefix = name_prefixes
659 .iter()
660 .any(|&prefix| word_lower.starts_with(prefix));
661
662 let is_proper_format = word.chars().next().unwrap().is_uppercase()
664 && word.chars().all(|c| c.is_alphabetic() || c == '\'');
665
666 let short_non_names = [
668 "it", "is", "as", "at", "be", "by", "do", "go", "he", "if", "in", "me", "my", "no",
669 "of", "on", "or", "so", "to", "up", "us", "we",
670 ];
671
672 if word.len() <= 2 && short_non_names.contains(&word_lower.as_str()) {
673 return false;
674 }
675
676 is_proper_format && (word.len() >= 3 || has_name_ending || has_name_prefix)
677 }
678
679 #[allow(dead_code)]
681 fn is_title(&self, word: &str) -> bool {
682 matches!(word, "Dr." | "Mr." | "Ms." | "Mrs." | "Prof.")
683 }
684
685 fn is_likely_person_name(&self, name: &str) -> bool {
687 let parts: Vec<&str> = name.split_whitespace().collect();
688 parts.len() == 2 && parts.iter().all(|part| self.is_capitalized(part))
689 }
690
691 fn is_likely_organization(&self, name: &str) -> bool {
693 let org_indicators = [
694 "Inc",
695 "Corp",
696 "LLC",
697 "Ltd",
698 "Company",
699 "Corporation",
700 "University",
701 "Institute",
702 ];
703 org_indicators
704 .iter()
705 .any(|indicator| name.contains(indicator))
706 }
707
708 fn calculate_confidence(&self, name: &str, entity_type: &str) -> f32 {
710 let mut confidence: f32 = 0.5; match entity_type {
714 "PERSON" => {
715 if name.contains("Dr.") || name.contains("Prof.") {
716 confidence += 0.3;
717 }
718 if name.split_whitespace().count() == 2 {
719 confidence += 0.2;
720 }
721 },
722 "ORGANIZATION" => {
723 if name.contains("Inc") || name.contains("Corp") || name.contains("LLC") {
724 confidence += 0.3;
725 }
726 if name.contains("University") || name.contains("Institute") {
727 confidence += 0.2;
728 }
729 },
730 "LOCATION" => {
731 if name.contains(',') {
732 confidence += 0.2;
733 }
734 if self.is_known_location(name) {
735 confidence += 0.3;
736 }
737 },
738 _ => {},
739 }
740
741 if name.chars().next().is_some_and(|c| c.is_uppercase()) {
743 confidence += 0.1;
744 }
745
746 confidence.min(1.0)
747 }
748
749 fn is_known_location(&self, name: &str) -> bool {
751 const KNOWN_LOCATIONS: &[&str] = &[
752 "United States",
753 "New York",
754 "California",
755 "London",
756 "Paris",
757 "Tokyo",
758 "Berlin",
759 "Washington",
760 "Boston",
761 "Chicago",
762 ];
763 KNOWN_LOCATIONS.iter().any(|&loc| name.contains(loc))
764 }
765
766 fn normalize_name(&self, name: &str) -> String {
768 name.to_lowercase()
769 .chars()
770 .filter(|c| c.is_alphanumeric() || *c == '_')
771 .collect::<String>()
772 .replace(' ', "_")
773 }
774
775 fn deduplicate_entities(&self, entities: Vec<Entity>) -> Vec<Entity> {
777 let mut unique_entities: HashMap<(String, String), Entity> = HashMap::new();
778
779 for entity in entities {
780 let key = (entity.name.clone(), entity.entity_type.clone());
781
782 match unique_entities.get_mut(&key) {
783 Some(existing) => {
784 existing.mentions.extend(entity.mentions);
786 if entity.confidence > existing.confidence {
787 existing.confidence = entity.confidence;
788 }
789 },
790 None => {
791 unique_entities.insert(key, entity);
792 },
793 }
794 }
795
796 unique_entities.into_values().collect()
797 }
798
799 pub fn extract_relationships(
801 &self,
802 entities: &[Entity],
803 chunk: &TextChunk,
804 ) -> Result<Vec<(EntityId, EntityId, String)>> {
805 let mut relationships = Vec::new();
806
807 for i in 0..entities.len() {
809 for j in (i + 1)..entities.len() {
810 let entity1 = &entities[i];
811 let entity2 = &entities[j];
812
813 let entity1_in_chunk = entity1.mentions.iter().any(|m| m.chunk_id == chunk.id);
815 let entity2_in_chunk = entity2.mentions.iter().any(|m| m.chunk_id == chunk.id);
816
817 if entity1_in_chunk && entity2_in_chunk {
818 let relation_type =
819 self.infer_relationship_type(entity1, entity2, &chunk.content);
820 relationships.push((entity1.id.clone(), entity2.id.clone(), relation_type));
821 }
822 }
823 }
824
825 Ok(relationships)
826 }
827
828 fn infer_relationship_type(&self, entity1: &Entity, entity2: &Entity, context: &str) -> String {
830 match (&entity1.entity_type[..], &entity2.entity_type[..]) {
831 ("PERSON", "ORGANIZATION") | ("ORGANIZATION", "PERSON") => {
832 if context.contains("works for") || context.contains("employed by") {
833 "WORKS_FOR".to_string()
834 } else if context.contains("founded") || context.contains("CEO") {
835 "LEADS".to_string()
836 } else {
837 "ASSOCIATED_WITH".to_string()
838 }
839 },
840 ("PERSON", "LOCATION") | ("LOCATION", "PERSON") => {
841 if context.contains("born in") || context.contains("from") {
842 "BORN_IN".to_string()
843 } else if context.contains("lives in") || context.contains("based in") {
844 "LOCATED_IN".to_string()
845 } else {
846 "ASSOCIATED_WITH".to_string()
847 }
848 },
849 ("ORGANIZATION", "LOCATION") | ("LOCATION", "ORGANIZATION") => {
850 if context.contains("headquartered") || context.contains("based in") {
851 "HEADQUARTERED_IN".to_string()
852 } else {
853 "LOCATED_IN".to_string()
854 }
855 },
856 ("PERSON", "PERSON") => {
857 if context.contains("married") || context.contains("spouse") {
858 "MARRIED_TO".to_string()
859 } else if context.contains("colleague") || context.contains("partner") {
860 "COLLEAGUE_OF".to_string()
861 } else {
862 "KNOWS".to_string()
863 }
864 },
865 _ => "RELATED_TO".to_string(),
866 }
867 }
868
869 fn apply_pattern_filtering(&self, entities: Vec<Entity>) -> Vec<Entity> {
871 if self.allowed_patterns.is_empty() && self.excluded_patterns.is_empty() {
872 return entities;
873 }
874
875 entities
876 .into_iter()
877 .filter(|entity| {
878 if !self.allowed_patterns.is_empty() {
880 let matches_allowed = self
881 .allowed_patterns
882 .iter()
883 .any(|pattern| pattern.is_match(&entity.name));
884 if !matches_allowed {
885 return false;
886 }
887 }
888
889 if !self.excluded_patterns.is_empty() {
891 let matches_excluded = self
892 .excluded_patterns
893 .iter()
894 .any(|pattern| pattern.is_match(&entity.name));
895 if matches_excluded {
896 return false;
897 }
898 }
899
900 true
901 })
902 .collect()
903 }
904
    /// Extracts concept-like entities, tagged with the caller's requested
    /// `entity_type` (CONCEPT, THEORY, THEME, ...). Two independent rules are
    /// applied per word, so one word can yield two entities here; duplicates
    /// are merged later by `deduplicate_entities` in `extract_from_chunk`.
    fn extract_concepts(
        &self,
        text: &str,
        chunk_id: &ChunkId,
        entity_type: &str,
    ) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();
        let words: Vec<&str> = text.split_whitespace().collect();

        // Words that explicitly signal an abstract concept.
        let concept_indicators = [
            "Theory",
            "Concept",
            "Principle",
            "Philosophy",
            "Doctrine",
            "Idea",
            "Method",
            "Approach",
            "Framework",
            "Model",
            "Paradigm",
            "Thesis",
        ];

        for &word in words.iter() {
            let clean_word = self.clean_word(word);

            // Rule 1: the word contains an explicit concept indicator
            // (case-sensitive substring match, e.g. "Metatheory").
            if concept_indicators
                .iter()
                .any(|&indicator| clean_word.contains(indicator))
            {
                let confidence = 0.75;
                if confidence >= self.min_confidence {
                    entities.push(self.create_entity(
                        clean_word,
                        entity_type,
                        confidence,
                        chunk_id,
                        text,
                    )?);
                }
            }

            // Rule 2: any capitalized, longer-than-4-char, non-common word is
            // a weaker concept candidate.
            if self.is_capitalized(word) && word.len() > 4 {
                let clean_word = self.clean_word(word);
                if !self.is_common_word(&clean_word) {
                    let confidence = 0.6;
                    if confidence >= self.min_confidence {
                        entities.push(self.create_entity(
                            clean_word,
                            entity_type,
                            confidence,
                            chunk_id,
                            text,
                        )?);
                    }
                }
            }
        }

        Ok(entities)
    }
971
972 fn extract_events(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
974 let mut entities = Vec::new();
975
976 let event_words = [
978 "meeting",
979 "conference",
980 "ceremony",
981 "celebration",
982 "festival",
983 "competition",
984 "war",
985 "battle",
986 "expedition",
987 "journey",
988 "trial",
989 ];
990
991 for event_word in &event_words {
992 if text.to_lowercase().contains(event_word) {
993 let confidence = 0.7;
994 if confidence >= self.min_confidence {
995 entities.push(self.create_entity(
996 event_word.to_string(),
997 "EVENT",
998 confidence,
999 chunk_id,
1000 text,
1001 )?);
1002 }
1003 }
1004 }
1005
1006 Ok(entities)
1007 }
1008
1009 fn extract_objects(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
1011 let mut entities = Vec::new();
1012
1013 let object_words = [
1015 "sword",
1016 "shield",
1017 "book",
1018 "manuscript",
1019 "scroll",
1020 "tablet",
1021 "ring",
1022 "crown",
1023 "treasure",
1024 "coin",
1025 "tool",
1026 "weapon",
1027 ];
1028
1029 for object_word in &object_words {
1030 if text.to_lowercase().contains(object_word) {
1031 let confidence = 0.65;
1032 if confidence >= self.min_confidence {
1033 entities.push(self.create_entity(
1034 object_word.to_string(),
1035 "OBJECT",
1036 confidence,
1037 chunk_id,
1038 text,
1039 )?);
1040 }
1041 }
1042 }
1043
1044 Ok(entities)
1045 }
1046
1047 fn extract_generic_entities(
1049 &self,
1050 text: &str,
1051 chunk_id: &ChunkId,
1052 entity_type: &str,
1053 ) -> Result<Vec<Entity>> {
1054 let mut entities = Vec::new();
1055 let words: Vec<&str> = text.split_whitespace().collect();
1056
1057 for &word in &words {
1059 if self.is_capitalized(word) && word.len() > 3 {
1060 let clean_word = self.clean_word(word);
1061 if !self.is_common_word(&clean_word) {
1062 let confidence = 0.5; if confidence >= self.min_confidence {
1064 entities.push(self.create_entity(
1065 clean_word,
1066 entity_type,
1067 confidence,
1068 chunk_id,
1069 text,
1070 )?);
1071 }
1072 }
1073 }
1074 }
1075
1076 Ok(entities)
1077 }
1078
1079 fn is_common_word(&self, word: &str) -> bool {
1081 let common_words = [
1082 "the", "and", "but", "or", "in", "on", "at", "to", "for", "with", "by", "from",
1083 "about", "into", "through", "during", "before", "after", "above", "below", "up",
1084 "down", "out", "off", "over", "under", "again", "further", "then", "once", "here",
1085 "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
1086 "most", "other", "some", "such", "only", "own", "same", "so", "than", "too", "very",
1087 "can", "will", "just", "should", "now", "could", "would", "said", "says", "told",
1088 "asked", "went", "came", "come", "going", "Chapter", "Page", "Section", "Part", "Book",
1089 "Volume",
1090 ];
1091
1092 common_words
1093 .iter()
1094 .any(|&common| word.eq_ignore_ascii_case(common))
1095 }
1096}
1097
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::{ChunkId, DocumentId};

    /// The default extractor should find at least one PERSON entity in text
    /// containing known names and a title-prefixed name.
    #[test]
    fn test_person_extraction() {
        let extractor = EntityExtractor::new(0.5).unwrap();
        let chunk = TextChunk::new(
            ChunkId::new("test_chunk".to_string()),
            DocumentId::new("test_doc".to_string()),
            "Entity Name works at Test Corp. Dr. Second Entity is a professor.".to_string(),
            0,
            59,
        );

        let entities = extractor.extract_from_chunk(&chunk).unwrap();

        // Something must be extracted at all...
        assert!(!entities.is_empty());

        // ...and specifically at least one PERSON.
        let person_entities: Vec<_> = entities
            .iter()
            .filter(|e| e.entity_type == "PERSON")
            .collect();
        assert!(!person_entities.is_empty());
    }

    /// Entities co-occurring in one chunk should yield at least one pairwise
    /// relationship ("works for" text should produce typed pairs).
    #[test]
    fn test_relationship_extraction() {
        let extractor = EntityExtractor::new(0.5).unwrap();
        let chunk = TextChunk::new(
            ChunkId::new("test_chunk".to_string()),
            DocumentId::new("test_doc".to_string()),
            "Entity Name works for Test Corp in Test City.".to_string(),
            0,
            44,
        );

        let entities = extractor.extract_from_chunk(&chunk).unwrap();
        let relationships = extractor.extract_relationships(&entities, &chunk).unwrap();

        assert!(!relationships.is_empty());
    }
}