1pub mod atomic_fact_extractor;
3#[cfg(feature = "gliner")]
5mod gliner_extractor;
6pub mod bidirectional_index;
8pub mod gleaning_extractor;
10pub mod llm_extractor;
12pub mod llm_relationship_extractor;
14pub mod prompts;
16pub mod semantic_merging;
18pub mod string_similarity_linker;
20
21pub use atomic_fact_extractor::{AtomicFact, AtomicFactExtractor};
22#[cfg(feature = "gliner")]
23pub use gliner_extractor::GLiNERExtractor;
24pub use bidirectional_index::{BidirectionalIndex, IndexStatistics};
25pub use gleaning_extractor::{ExtractionCompletionStatus, GleaningConfig, GleaningEntityExtractor};
26pub use llm_extractor::LLMEntityExtractor;
27pub use llm_relationship_extractor::{
28 ExtractedEntity, ExtractedRelationship, ExtractionResult, LLMRelationshipExtractor,
29 TripleValidation,
30};
31pub use semantic_merging::{EntityMergeDecision, MergingStatistics, SemanticEntityMerger};
32pub use string_similarity_linker::{EntityLinkingConfig, StringSimilarityLinker};
33
34use crate::{
35 config::setconfig::EntityExtractionConfig,
36 core::{ChunkId, Entity, EntityId, EntityMention, TextChunk},
37 Result,
38};
39use regex::Regex;
40use std::collections::{HashMap, HashSet};
41
/// Heuristic, rule-based entity extractor.
///
/// Extracts persons, organizations, locations, concepts, events, and objects
/// from text chunks using capitalization rules, keyword lists, and optional
/// regex allow/deny filters from an [`EntityExtractionConfig`].
pub struct EntityExtractor {
    /// Minimum confidence an entity must reach to be kept.
    min_confidence: f32,
    /// Optional configuration (entity types, filters); `None` when built via `new`.
    config: Option<EntityExtractionConfig>,
    /// Compiled allow-list regexes from the config's filters (empty = allow all).
    allowed_patterns: Vec<Regex>,
    /// Compiled deny-list regexes from the config's filters.
    excluded_patterns: Vec<Regex>,
}
49
50impl EntityExtractor {
51 pub fn new(min_confidence: f32) -> Result<Self> {
53 Ok(Self {
54 min_confidence,
55 config: None,
56 allowed_patterns: Vec::new(),
57 excluded_patterns: Vec::new(),
58 })
59 }
60
61 pub fn with_config(config: EntityExtractionConfig) -> Result<Self> {
63 let mut allowed_patterns = Vec::new();
64 let mut excluded_patterns = Vec::new();
65
66 if let Some(filters) = &config.filters {
68 if let Some(patterns) = &filters.allowed_patterns {
69 for pattern in patterns {
70 match Regex::new(pattern) {
71 Ok(regex) => allowed_patterns.push(regex),
72 Err(e) => {
73 tracing::warn!("Invalid allowed pattern '{pattern}': {e}");
74 },
75 }
76 }
77 }
78
79 if let Some(patterns) = &filters.excluded_patterns {
80 for pattern in patterns {
81 match Regex::new(pattern) {
82 Ok(regex) => excluded_patterns.push(regex),
83 Err(e) => {
84 tracing::warn!("Invalid excluded pattern '{pattern}': {e}");
85 },
86 }
87 }
88 }
89 }
90
91 let min_confidence = config
92 .filters
93 .as_ref()
94 .map(|f| f.confidence_threshold)
95 .unwrap_or(config.confidence_threshold);
96
97 Ok(Self {
98 min_confidence,
99 config: Some(config),
100 allowed_patterns,
101 excluded_patterns,
102 })
103 }
104
105 pub fn extract_from_chunk(&self, chunk: &TextChunk) -> Result<Vec<Entity>> {
107 let mut entities = Vec::new();
108 let text = &chunk.content;
109
110 let entity_types = if let Some(config) = &self.config {
112 config.entity_types.as_ref().cloned().unwrap_or_else(|| {
113 vec![
114 "PERSON".to_string(),
115 "ORGANIZATION".to_string(),
116 "LOCATION".to_string(),
117 ]
118 })
119 } else {
120 vec![
121 "PERSON".to_string(),
122 "ORGANIZATION".to_string(),
123 "LOCATION".to_string(),
124 ]
125 };
126
127 for entity_type in &entity_types {
129 match entity_type.as_str() {
130 "PERSON" | "CHARACTER" | "RESEARCHER" | "SPEAKER" | "DIALOGUE_SPEAKER" => {
131 entities.extend(self.extract_persons(text, &chunk.id)?);
132 },
133 "ORGANIZATION" | "INSTITUTION" | "BRAND" | "COMPANY" => {
134 entities.extend(self.extract_organizations(text, &chunk.id)?);
135 },
136 "LOCATION" | "SETTING" | "PLACE" => {
137 entities.extend(self.extract_locations(text, &chunk.id)?);
138 },
139 "CONCEPT" | "THEORY" | "THEME" | "ARGUMENT" | "IDEA" => {
140 entities.extend(self.extract_concepts(text, &chunk.id, entity_type)?);
141 },
142 "EVENT" | "EXPERIMENT" | "HAPPENING" => {
143 entities.extend(self.extract_events(text, &chunk.id)?);
144 },
145 "OBJECT" | "TOOL" | "ARTIFACT" | "ITEM" => {
146 entities.extend(self.extract_objects(text, &chunk.id)?);
147 },
148 _ => {
149 entities.extend(self.extract_generic_entities(text, &chunk.id, entity_type)?);
151 },
152 }
153 }
154
155 entities = self.apply_pattern_filtering(entities);
157
158 entities = self.deduplicate_entities(entities);
160
161 entities.retain(|e| e.confidence >= self.min_confidence);
163
164 Ok(entities)
165 }
166
    /// Extracts PERSON entities from `text` using layered heuristics applied
    /// in priority order: known full names, title-prefixed names ("Dr. X"),
    /// adjacent capitalized word pairs, and finally single capitalized words.
    /// `processed_indices` is shared across the passes so a word claimed by a
    /// higher-priority pass is not re-used by a later one.
    fn extract_persons(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();
        let words: Vec<&str> = text.split_whitespace().collect();
        // Word indices already consumed by an earlier (higher-priority) pass.
        let mut processed_indices = HashSet::new();

        // Honorifics and kinship titles that signal the following word is a
        // name (compared against lowercased, punctuation-stripped words).
        let person_titles = [
            "mr",
            "mrs",
            "ms",
            "dr",
            "prof",
            "professor",
            "sir",
            "lady",
            "lord",
            "captain",
            "major",
            "colonel",
            "general",
            "admiral",
            "judge",
            "father",
            "mother",
            "brother",
            "sister",
            "aunt",
            "uncle",
            "grandfather",
            "grandmother",
        ];

        // Stop words (plus weekday/month names and some corpus-specific
        // words) that should never be treated as part of a person name.
        let non_person_words = [
            "chapter",
            "the",
            "and",
            "but",
            "or",
            "in",
            "on",
            "at",
            "to",
            "for",
            "with",
            "by",
            "from",
            "about",
            "into",
            "through",
            "during",
            "before",
            "after",
            "above",
            "below",
            "up",
            "down",
            "out",
            "off",
            "over",
            "under",
            "again",
            "further",
            "then",
            "once",
            "here",
            "there",
            "when",
            "where",
            "why",
            "how",
            "all",
            "any",
            "both",
            "each",
            "few",
            "more",
            "most",
            "other",
            "some",
            "such",
            "only",
            "own",
            "same",
            "so",
            "than",
            "too",
            "very",
            "can",
            "will",
            "just",
            "should",
            "now",
            "temptations",
            "strategic",
            "movements",
            "decides",
            "upon",
            "whitewashing",
            "saturday",
            "monday",
            "tuesday",
            "wednesday",
            "thursday",
            "friday",
            "sunday",
            "january",
            "february",
            "march",
            "april",
            "may",
            "june",
            "july",
            "august",
            "september",
            "october",
            "november",
            "december",
            "adventures",
            "complete",
        ];

        // Pass 1: hard-coded known multi-word names (highest confidence).
        entities.extend(self.extract_known_names(
            &words,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // Pass 2: names introduced by an honorific ("Dr. Second Entity").
        entities.extend(self.extract_title_based_names(
            &words,
            &person_titles,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // Pass 3: adjacent capitalized pairs that look like "First Last".
        entities.extend(self.extract_two_word_names(
            &words,
            &non_person_words,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // Pass 4: any remaining single capitalized, name-like word.
        for (i, &word_ref) in words.iter().enumerate() {
            if processed_indices.contains(&i) {
                continue;
            }

            let word = self.clean_word(word_ref);

            // Skip very short words and known stop words.
            if word.len() < 2 || non_person_words.contains(&word.to_lowercase().as_str()) {
                continue;
            }

            if self.is_capitalized(words[i]) && self.is_likely_person_word(&word) {
                let confidence = self.calculate_confidence(&word, "PERSON");
                if confidence >= self.min_confidence {
                    entities.push(self.create_entity(word, "PERSON", confidence, chunk_id, text)?);
                }
            }
        }

        Ok(entities)
    }
340
341 fn extract_known_names(
343 &self,
344 words: &[&str],
345 processed: &mut std::collections::HashSet<usize>,
346 chunk_id: &ChunkId,
347 text: &str,
348 ) -> Result<Vec<Entity>> {
349 let mut entities = Vec::new();
350 let known_names = [
351 ("Entity Name", 2),
352 ("Second Entity", 2),
353 ("Guardian Entity", 2),
354 ("Friend Entity", 2),
355 ("Companion Entity", 2),
356 ("Third Entity", 2),
357 ("Fourth Entity", 2),
358 ("Fifth Entity", 2),
359 ("Sixth Entity", 2),
360 ("Seventh Entity", 2),
361 ("Eighth Entity", 2),
362 ("Ninth Entity", 2),
363 ];
364
365 for i in 0..words.len() {
366 if processed.contains(&i) {
367 continue;
368 }
369
370 for &(name, word_count) in &known_names {
371 let name_words: Vec<&str> = name.split_whitespace().collect();
372 if i + name_words.len() <= words.len() {
373 let matches = name_words.iter().enumerate().all(|(j, &expected)| {
374 let actual = self.clean_word(words[i + j]);
375 actual.to_lowercase() == expected.to_lowercase()
376 });
377
378 if matches {
379 let confidence = 0.95;
380 if confidence >= self.min_confidence {
381 entities.push(self.create_entity(
382 name.to_string(),
383 "PERSON",
384 confidence,
385 chunk_id,
386 text,
387 )?);
388 }
389 for j in 0..word_count {
391 processed.insert(i + j);
392 }
393 break;
394 }
395 }
396 }
397 }
398 Ok(entities)
399 }
400
    /// Finds names introduced by an honorific title: "Dr. Jane" or
    /// "Dr. Jane Smith". When the two words after the title are both
    /// capitalized and name-like, they are joined into a two-word name.
    /// Consumed indices (title + name words) are added to `processed`.
    fn extract_title_based_names(
        &self,
        words: &[&str],
        person_titles: &[&str],
        processed: &mut std::collections::HashSet<usize>,
        chunk_id: &ChunkId,
        text: &str,
    ) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();

        for i in 0..words.len() {
            if processed.contains(&i) {
                continue;
            }

            // Titles are matched case-insensitively on the cleaned word
            // ("Dr." cleans to "Dr" -> "dr").
            let word_clean = self.clean_word(words[i]).to_lowercase();
            if person_titles.contains(&word_clean.as_str())
                && i + 1 < words.len()
                && !processed.contains(&(i + 1))
            {
                let next_word = self.clean_word(words[i + 1]);
                if self.is_capitalized(words[i + 1]) && self.is_likely_person_word(&next_word) {
                    // Greedily extend to a "First Last" form when a third
                    // capitalized, name-like word follows the title.
                    let name = if i + 2 < words.len() && !processed.contains(&(i + 2)) {
                        let third_word = self.clean_word(words[i + 2]);
                        if self.is_capitalized(words[i + 2])
                            && self.is_likely_person_word(&third_word)
                        {
                            // NOTE: i + 2 is claimed here, before the
                            // confidence check below, mirroring i and i + 1.
                            processed.insert(i + 2);
                            format!("{next_word} {third_word}")
                        } else {
                            next_word
                        }
                    } else {
                        next_word
                    };

                    // Title-backed names get a fixed high confidence.
                    let confidence = 0.9;
                    if confidence >= self.min_confidence {
                        entities
                            .push(self.create_entity(name, "PERSON", confidence, chunk_id, text)?);
                    }
                    processed.insert(i);
                    processed.insert(i + 1);
                }
            }
        }
        Ok(entities)
    }
450
451 fn extract_two_word_names(
453 &self,
454 words: &[&str],
455 non_person_words: &[&str],
456 processed: &mut std::collections::HashSet<usize>,
457 chunk_id: &ChunkId,
458 text: &str,
459 ) -> Result<Vec<Entity>> {
460 let mut entities = Vec::new();
461
462 for i in 0..words.len() {
463 if processed.contains(&i) || i + 1 >= words.len() || processed.contains(&(i + 1)) {
464 continue;
465 }
466
467 let first_word = self.clean_word(words[i]);
468 let second_word = self.clean_word(words[i + 1]);
469
470 if self.is_capitalized(words[i])
472 && self.is_capitalized(words[i + 1])
473 && self.is_likely_person_word(&first_word)
474 && self.is_likely_person_word(&second_word)
475 && !non_person_words.contains(&first_word.to_lowercase().as_str())
476 && !non_person_words.contains(&second_word.to_lowercase().as_str())
477 {
478 let name = format!("{first_word} {second_word}");
479 if self.is_likely_person_name(&name) {
480 let confidence = self.calculate_confidence(&name, "PERSON");
481 if confidence >= self.min_confidence {
482 entities
483 .push(self.create_entity(name, "PERSON", confidence, chunk_id, text)?);
484 }
485 processed.insert(i);
486 processed.insert(i + 1);
487 }
488 }
489 }
490 Ok(entities)
491 }
492
493 fn extract_organizations(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
495 let mut entities = Vec::new();
496 let org_suffixes = [
497 "Inc",
498 "Corp",
499 "LLC",
500 "Ltd",
501 "Company",
502 "Corporation",
503 "Group",
504 "Solutions",
505 "Technologies",
506 ];
507 let org_prefixes = ["University of", "Institute of", "Department of"];
508
509 for suffix in &org_suffixes {
511 if let Some(pos) = text.find(suffix) {
512 let start = text[..pos].rfind(' ').map(|i| i + 1).unwrap_or(0);
514 let end = pos + suffix.len();
515 let name = text[start..end].trim().to_string();
516
517 if !name.is_empty() && self.is_likely_organization(&name) {
518 let confidence = self.calculate_confidence(&name, "ORGANIZATION");
519 if confidence >= self.min_confidence {
520 entities.push(self.create_entity(
521 name,
522 "ORGANIZATION",
523 confidence,
524 chunk_id,
525 text,
526 )?);
527 }
528 }
529 }
530 }
531
532 for prefix in &org_prefixes {
534 if let Some(pos) = text.find(prefix) {
535 let start = pos;
536 let end = text[pos..]
537 .find('.')
538 .map(|i| pos + i)
539 .unwrap_or(text.len().min(pos + 50));
540 let name = text[start..end].trim().to_string();
541
542 if !name.is_empty() && name.len() > prefix.len() {
543 let confidence = self.calculate_confidence(&name, "ORGANIZATION");
544 if confidence >= self.min_confidence {
545 entities.push(self.create_entity(
546 name,
547 "ORGANIZATION",
548 confidence,
549 chunk_id,
550 text,
551 )?);
552 }
553 }
554 }
555 }
556
557 Ok(entities)
558 }
559
560 fn extract_locations(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
562 let mut entities = Vec::new();
563 let known_locations = [
564 "United States",
565 "New York",
566 "California",
567 "London",
568 "Paris",
569 "Tokyo",
570 "Berlin",
571 "Washington",
572 "Boston",
573 "Chicago",
574 ];
575
576 for location in &known_locations {
577 if text.contains(location) {
578 let confidence = self.calculate_confidence(location, "LOCATION");
579 if confidence >= self.min_confidence {
580 entities.push(self.create_entity(
581 location.to_string(),
582 "LOCATION",
583 confidence,
584 chunk_id,
585 text,
586 )?);
587 }
588 }
589 }
590
591 Ok(entities)
592 }
593
594 fn create_entity(
596 &self,
597 name: String,
598 entity_type: &str,
599 confidence: f32,
600 chunk_id: &ChunkId,
601 text: &str,
602 ) -> Result<Entity> {
603 let entity_id = EntityId::new(format!("{}_{}", entity_type, self.normalize_name(&name)));
604
605 let mut mentions = Vec::new();
607 let mut start = 0;
608 while let Some(pos) = text[start..].find(&name) {
609 let actual_pos = start + pos;
610 mentions.push(EntityMention {
611 chunk_id: chunk_id.clone(),
612 start_offset: actual_pos,
613 end_offset: actual_pos + name.len(),
614 confidence,
615 });
616 start = actual_pos + name.len();
617 }
618
619 Ok(
620 Entity::new(entity_id, name, entity_type.to_string(), confidence)
621 .with_mentions(mentions),
622 )
623 }
624
625 fn is_capitalized(&self, word: &str) -> bool {
627 word.chars().next().is_some_and(|c| c.is_uppercase())
628 }
629
630 fn clean_word(&self, word: &str) -> String {
632 word.chars()
633 .filter(|c| c.is_alphabetic() || *c == '\'') .collect::<String>()
635 .trim_end_matches('\'') .to_string()
637 }
638
639 fn is_likely_person_word(&self, word: &str) -> bool {
641 if word.len() < 2 {
642 return false;
643 }
644
645 let word_lower = word.to_lowercase();
647
648 let name_endings = [
650 "son", "sen", "ton", "ham", "ford", "ley", "ment", "ard", "ert",
651 ];
652 let has_name_ending = name_endings
653 .iter()
654 .any(|&ending| word_lower.ends_with(ending));
655
656 let name_prefixes = ["mc", "mac", "o'", "de", "van", "von", "la", "le"];
658 let has_name_prefix = name_prefixes
659 .iter()
660 .any(|&prefix| word_lower.starts_with(prefix));
661
662 let is_proper_format = word.chars().next().unwrap().is_uppercase()
664 && word.chars().all(|c| c.is_alphabetic() || c == '\'');
665
666 let short_non_names = [
668 "it", "is", "as", "at", "be", "by", "do", "go", "he", "if", "in", "me", "my", "no",
669 "of", "on", "or", "so", "to", "up", "us", "we",
670 ];
671
672 if word.len() <= 2 && short_non_names.contains(&word_lower.as_str()) {
673 return false;
674 }
675
676 is_proper_format && (word.len() >= 3 || has_name_ending || has_name_prefix)
677 }
678
679 #[allow(dead_code)]
681 fn is_title(&self, word: &str) -> bool {
682 matches!(word, "Dr." | "Mr." | "Ms." | "Mrs." | "Prof.")
683 }
684
685 fn is_likely_person_name(&self, name: &str) -> bool {
687 let parts: Vec<&str> = name.split_whitespace().collect();
688 parts.len() == 2 && parts.iter().all(|part| self.is_capitalized(part))
689 }
690
691 fn is_likely_organization(&self, name: &str) -> bool {
693 let org_indicators = [
694 "Inc",
695 "Corp",
696 "LLC",
697 "Ltd",
698 "Company",
699 "Corporation",
700 "University",
701 "Institute",
702 ];
703 org_indicators
704 .iter()
705 .any(|indicator| name.contains(indicator))
706 }
707
708 fn calculate_confidence(&self, name: &str, entity_type: &str) -> f32 {
710 let mut confidence: f32 = 0.5; match entity_type {
714 "PERSON" => {
715 if name.contains("Dr.") || name.contains("Prof.") {
716 confidence += 0.3;
717 }
718 if name.split_whitespace().count() == 2 {
719 confidence += 0.2;
720 }
721 },
722 "ORGANIZATION" => {
723 if name.contains("Inc") || name.contains("Corp") || name.contains("LLC") {
724 confidence += 0.3;
725 }
726 if name.contains("University") || name.contains("Institute") {
727 confidence += 0.2;
728 }
729 },
730 "LOCATION" => {
731 if name.contains(',') {
732 confidence += 0.2;
733 }
734 if self.is_known_location(name) {
735 confidence += 0.3;
736 }
737 },
738 _ => {},
739 }
740
741 if name.chars().next().is_some_and(|c| c.is_uppercase()) {
743 confidence += 0.1;
744 }
745
746 confidence.min(1.0)
747 }
748
749 fn is_known_location(&self, name: &str) -> bool {
751 const KNOWN_LOCATIONS: &[&str] = &[
752 "United States",
753 "New York",
754 "California",
755 "London",
756 "Paris",
757 "Tokyo",
758 "Berlin",
759 "Washington",
760 "Boston",
761 "Chicago",
762 ];
763 KNOWN_LOCATIONS.iter().any(|&loc| name.contains(loc))
764 }
765
766 fn normalize_name(&self, name: &str) -> String {
768 name.to_lowercase()
769 .chars()
770 .filter(|c| c.is_alphanumeric() || *c == '_')
771 .collect::<String>()
772 .replace(' ', "_")
773 }
774
775 fn deduplicate_entities(&self, entities: Vec<Entity>) -> Vec<Entity> {
777 let mut unique_entities: HashMap<(String, String), Entity> = HashMap::new();
778
779 for entity in entities {
780 let key = (entity.name.clone(), entity.entity_type.clone());
781
782 match unique_entities.get_mut(&key) {
783 Some(existing) => {
784 existing.mentions.extend(entity.mentions);
786 if entity.confidence > existing.confidence {
787 existing.confidence = entity.confidence;
788 }
789 },
790 None => {
791 unique_entities.insert(key, entity);
792 },
793 }
794 }
795
796 unique_entities.into_values().collect()
797 }
798
799 pub fn extract_relationships(
801 &self,
802 entities: &[Entity],
803 chunk: &TextChunk,
804 ) -> Result<Vec<(EntityId, EntityId, String)>> {
805 let mut relationships = Vec::new();
806
807 for i in 0..entities.len() {
809 for j in (i + 1)..entities.len() {
810 let entity1 = &entities[i];
811 let entity2 = &entities[j];
812
813 let entity1_in_chunk = entity1.mentions.iter().any(|m| m.chunk_id == chunk.id);
815 let entity2_in_chunk = entity2.mentions.iter().any(|m| m.chunk_id == chunk.id);
816
817 if entity1_in_chunk && entity2_in_chunk {
818 let relation_type =
819 self.infer_relationship_type(entity1, entity2, &chunk.content);
820 relationships.push((entity1.id.clone(), entity2.id.clone(), relation_type));
821 }
822 }
823 }
824
825 Ok(relationships)
826 }
827
828 fn infer_relationship_type(&self, entity1: &Entity, entity2: &Entity, context: &str) -> String {
830 match (&entity1.entity_type[..], &entity2.entity_type[..]) {
831 ("PERSON", "ORGANIZATION") | ("ORGANIZATION", "PERSON") => {
832 if context.contains("works for") || context.contains("employed by") {
833 "WORKS_FOR".to_string()
834 } else if context.contains("founded") || context.contains("CEO") {
835 "LEADS".to_string()
836 } else {
837 "ASSOCIATED_WITH".to_string()
838 }
839 },
840 ("PERSON", "LOCATION") | ("LOCATION", "PERSON") => {
841 if context.contains("born in") || context.contains("from") {
842 "BORN_IN".to_string()
843 } else if context.contains("lives in") || context.contains("based in") {
844 "LOCATED_IN".to_string()
845 } else {
846 "ASSOCIATED_WITH".to_string()
847 }
848 },
849 ("ORGANIZATION", "LOCATION") | ("LOCATION", "ORGANIZATION") => {
850 if context.contains("headquartered") || context.contains("based in") {
851 "HEADQUARTERED_IN".to_string()
852 } else {
853 "LOCATED_IN".to_string()
854 }
855 },
856 ("PERSON", "PERSON") => {
857 if context.contains("married") || context.contains("spouse") {
858 "MARRIED_TO".to_string()
859 } else if context.contains("colleague") || context.contains("partner") {
860 "COLLEAGUE_OF".to_string()
861 } else {
862 "KNOWS".to_string()
863 }
864 },
865 _ => "RELATED_TO".to_string(),
866 }
867 }
868
869 fn apply_pattern_filtering(&self, entities: Vec<Entity>) -> Vec<Entity> {
871 if self.allowed_patterns.is_empty() && self.excluded_patterns.is_empty() {
872 return entities;
873 }
874
875 entities
876 .into_iter()
877 .filter(|entity| {
878 if !self.allowed_patterns.is_empty() {
880 let matches_allowed = self
881 .allowed_patterns
882 .iter()
883 .any(|pattern| pattern.is_match(&entity.name));
884 if !matches_allowed {
885 return false;
886 }
887 }
888
889 if !self.excluded_patterns.is_empty() {
891 let matches_excluded = self
892 .excluded_patterns
893 .iter()
894 .any(|pattern| pattern.is_match(&entity.name));
895 if matches_excluded {
896 return false;
897 }
898 }
899
900 true
901 })
902 .collect()
903 }
904
    /// Extracts concept-like entities, tagged with the caller's requested
    /// `entity_type` (CONCEPT, THEORY, THEME, ...). Two independent rules are
    /// applied per word, so one word can yield two entities here; duplicates
    /// are merged later by `deduplicate_entities` in `extract_from_chunk`.
    fn extract_concepts(
        &self,
        text: &str,
        chunk_id: &ChunkId,
        entity_type: &str,
    ) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();
        let words: Vec<&str> = text.split_whitespace().collect();

        // Words that explicitly signal an abstract concept.
        let concept_indicators = [
            "Theory",
            "Concept",
            "Principle",
            "Philosophy",
            "Doctrine",
            "Idea",
            "Method",
            "Approach",
            "Framework",
            "Model",
            "Paradigm",
            "Thesis",
        ];

        for &word in words.iter() {
            let clean_word = self.clean_word(word);

            // Rule 1: the word contains an explicit concept indicator
            // (case-sensitive substring match, e.g. "Metatheory").
            if concept_indicators
                .iter()
                .any(|&indicator| clean_word.contains(indicator))
            {
                let confidence = 0.75;
                if confidence >= self.min_confidence {
                    entities.push(self.create_entity(
                        clean_word,
                        entity_type,
                        confidence,
                        chunk_id,
                        text,
                    )?);
                }
            }

            // Rule 2: any capitalized, longer-than-4-char, non-common word is
            // a weaker concept candidate.
            if self.is_capitalized(word) && word.len() > 4 {
                let clean_word = self.clean_word(word);
                if !self.is_common_word(&clean_word) {
                    let confidence = 0.6;
                    if confidence >= self.min_confidence {
                        entities.push(self.create_entity(
                            clean_word,
                            entity_type,
                            confidence,
                            chunk_id,
                            text,
                        )?);
                    }
                }
            }
        }

        Ok(entities)
    }
971
972 fn extract_events(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
974 let mut entities = Vec::new();
975
976 let event_words = [
978 "meeting",
979 "conference",
980 "ceremony",
981 "celebration",
982 "festival",
983 "competition",
984 "war",
985 "battle",
986 "expedition",
987 "journey",
988 "trial",
989 ];
990
991 for event_word in &event_words {
992 if text.to_lowercase().contains(event_word) {
993 let confidence = 0.7;
994 if confidence >= self.min_confidence {
995 entities.push(self.create_entity(
996 event_word.to_string(),
997 "EVENT",
998 confidence,
999 chunk_id,
1000 text,
1001 )?);
1002 }
1003 }
1004 }
1005
1006 Ok(entities)
1007 }
1008
1009 fn extract_objects(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
1011 let mut entities = Vec::new();
1012
1013 let object_words = [
1015 "sword",
1016 "shield",
1017 "book",
1018 "manuscript",
1019 "scroll",
1020 "tablet",
1021 "ring",
1022 "crown",
1023 "treasure",
1024 "coin",
1025 "tool",
1026 "weapon",
1027 ];
1028
1029 for object_word in &object_words {
1030 if text.to_lowercase().contains(object_word) {
1031 let confidence = 0.65;
1032 if confidence >= self.min_confidence {
1033 entities.push(self.create_entity(
1034 object_word.to_string(),
1035 "OBJECT",
1036 confidence,
1037 chunk_id,
1038 text,
1039 )?);
1040 }
1041 }
1042 }
1043
1044 Ok(entities)
1045 }
1046
1047 fn extract_generic_entities(
1049 &self,
1050 text: &str,
1051 chunk_id: &ChunkId,
1052 entity_type: &str,
1053 ) -> Result<Vec<Entity>> {
1054 let mut entities = Vec::new();
1055 let words: Vec<&str> = text.split_whitespace().collect();
1056
1057 for &word in &words {
1059 if self.is_capitalized(word) && word.len() > 3 {
1060 let clean_word = self.clean_word(word);
1061 if !self.is_common_word(&clean_word) {
1062 let confidence = 0.5; if confidence >= self.min_confidence {
1064 entities.push(self.create_entity(
1065 clean_word,
1066 entity_type,
1067 confidence,
1068 chunk_id,
1069 text,
1070 )?);
1071 }
1072 }
1073 }
1074 }
1075
1076 Ok(entities)
1077 }
1078
1079 fn is_common_word(&self, word: &str) -> bool {
1081 let common_words = [
1082 "the", "and", "but", "or", "in", "on", "at", "to", "for", "with", "by", "from",
1083 "about", "into", "through", "during", "before", "after", "above", "below", "up",
1084 "down", "out", "off", "over", "under", "again", "further", "then", "once", "here",
1085 "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
1086 "most", "other", "some", "such", "only", "own", "same", "so", "than", "too", "very",
1087 "can", "will", "just", "should", "now", "could", "would", "said", "says", "told",
1088 "asked", "went", "came", "come", "going", "Chapter", "Page", "Section", "Part", "Book",
1089 "Volume",
1090 ];
1091
1092 common_words
1093 .iter()
1094 .any(|&common| word.eq_ignore_ascii_case(common))
1095 }
1096}
1097
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::{ChunkId, DocumentId};

    /// The default extractor should find at least one PERSON entity in text
    /// containing known names and a title-prefixed name.
    #[test]
    fn test_person_extraction() {
        let extractor = EntityExtractor::new(0.5).unwrap();
        let chunk = TextChunk::new(
            ChunkId::new("test_chunk".to_string()),
            DocumentId::new("test_doc".to_string()),
            "Entity Name works at Test Corp. Dr. Second Entity is a professor.".to_string(),
            0,
            59,
        );

        let entities = extractor.extract_from_chunk(&chunk).unwrap();

        // Something must be extracted at all...
        assert!(!entities.is_empty());

        // ...and specifically at least one PERSON.
        let person_entities: Vec<_> = entities
            .iter()
            .filter(|e| e.entity_type == "PERSON")
            .collect();
        assert!(!person_entities.is_empty());
    }

    /// Entities co-occurring in one chunk should yield at least one pairwise
    /// relationship ("works for" text should produce typed pairs).
    #[test]
    fn test_relationship_extraction() {
        let extractor = EntityExtractor::new(0.5).unwrap();
        let chunk = TextChunk::new(
            ChunkId::new("test_chunk".to_string()),
            DocumentId::new("test_doc".to_string()),
            "Entity Name works for Test Corp in Test City.".to_string(),
            0,
            44,
        );

        let entities = extractor.extract_from_chunk(&chunk).unwrap();
        let relationships = extractor.extract_relationships(&entities, &chunk).unwrap();

        assert!(!relationships.is_empty());
    }
}