1pub mod atomic_fact_extractor;
9pub mod bidirectional_index;
11pub mod gleaning_extractor;
13#[cfg(feature = "gliner")]
15mod gliner_extractor;
16pub mod llm_extractor;
18pub mod llm_relationship_extractor;
20pub mod prompts;
22pub mod semantic_merging;
24pub mod string_similarity_linker;
26
27pub use atomic_fact_extractor::{AtomicFact, AtomicFactExtractor};
28pub use bidirectional_index::{BidirectionalIndex, IndexStatistics};
29pub use gleaning_extractor::{ExtractionCompletionStatus, GleaningConfig, GleaningEntityExtractor};
30#[cfg(feature = "gliner")]
31pub use gliner_extractor::GLiNERExtractor;
32pub use llm_extractor::LLMEntityExtractor;
33pub use llm_relationship_extractor::{
34 ExtractedEntity, ExtractedRelationship, ExtractionResult, LLMRelationshipExtractor,
35 TripleValidation,
36};
37pub use semantic_merging::{EntityMergeDecision, MergingStatistics, SemanticEntityMerger};
38pub use string_similarity_linker::{EntityLinkingConfig, StringSimilarityLinker};
39
40use crate::{
41 config::setconfig::EntityExtractionConfig,
42 core::{ChunkId, Entity, EntityId, EntityMention, TextChunk},
43 Result,
44};
45use regex::Regex;
46use std::collections::{HashMap, HashSet};
47
48pub struct EntityExtractor {
50 min_confidence: f32,
51 config: Option<EntityExtractionConfig>,
52 allowed_patterns: Vec<Regex>,
53 excluded_patterns: Vec<Regex>,
54}
55
56impl EntityExtractor {
57 pub fn new(min_confidence: f32) -> Result<Self> {
59 Ok(Self {
60 min_confidence,
61 config: None,
62 allowed_patterns: Vec::new(),
63 excluded_patterns: Vec::new(),
64 })
65 }
66
67 pub fn with_config(config: EntityExtractionConfig) -> Result<Self> {
69 let mut allowed_patterns = Vec::new();
70 let mut excluded_patterns = Vec::new();
71
72 if let Some(filters) = &config.filters {
74 if let Some(patterns) = &filters.allowed_patterns {
75 for pattern in patterns {
76 match Regex::new(pattern) {
77 Ok(regex) => allowed_patterns.push(regex),
78 Err(_e) => {
79 #[cfg(feature = "tracing")]
80 tracing::warn!("Invalid allowed pattern '{pattern}': {_e}");
81 },
82 }
83 }
84 }
85
86 if let Some(patterns) = &filters.excluded_patterns {
87 for pattern in patterns {
88 match Regex::new(pattern) {
89 Ok(regex) => excluded_patterns.push(regex),
90 Err(_e) => {
91 #[cfg(feature = "tracing")]
92 tracing::warn!("Invalid excluded pattern '{pattern}': {_e}");
93 },
94 }
95 }
96 }
97 }
98
99 let min_confidence = config
100 .filters
101 .as_ref()
102 .map(|f| f.confidence_threshold)
103 .unwrap_or(config.confidence_threshold);
104
105 Ok(Self {
106 min_confidence,
107 config: Some(config),
108 allowed_patterns,
109 excluded_patterns,
110 })
111 }
112
113 pub fn extract_from_chunk(&self, chunk: &TextChunk) -> Result<Vec<Entity>> {
115 let mut entities = Vec::new();
116 let text = &chunk.content;
117
118 let entity_types = if let Some(config) = &self.config {
120 config.entity_types.as_ref().cloned().unwrap_or_else(|| {
121 vec![
122 "PERSON".to_string(),
123 "ORGANIZATION".to_string(),
124 "LOCATION".to_string(),
125 ]
126 })
127 } else {
128 vec![
129 "PERSON".to_string(),
130 "ORGANIZATION".to_string(),
131 "LOCATION".to_string(),
132 ]
133 };
134
135 for entity_type in &entity_types {
137 match entity_type.as_str() {
138 "PERSON" | "CHARACTER" | "RESEARCHER" | "SPEAKER" | "DIALOGUE_SPEAKER" => {
139 entities.extend(self.extract_persons(text, &chunk.id)?);
140 },
141 "ORGANIZATION" | "INSTITUTION" | "BRAND" | "COMPANY" => {
142 entities.extend(self.extract_organizations(text, &chunk.id)?);
143 },
144 "LOCATION" | "SETTING" | "PLACE" => {
145 entities.extend(self.extract_locations(text, &chunk.id)?);
146 },
147 "CONCEPT" | "THEORY" | "THEME" | "ARGUMENT" | "IDEA" => {
148 entities.extend(self.extract_concepts(text, &chunk.id, entity_type)?);
149 },
150 "EVENT" | "EXPERIMENT" | "HAPPENING" => {
151 entities.extend(self.extract_events(text, &chunk.id)?);
152 },
153 "OBJECT" | "TOOL" | "ARTIFACT" | "ITEM" => {
154 entities.extend(self.extract_objects(text, &chunk.id)?);
155 },
156 _ => {
157 entities.extend(self.extract_generic_entities(text, &chunk.id, entity_type)?);
159 },
160 }
161 }
162
163 entities = self.apply_pattern_filtering(entities);
165
166 entities = self.deduplicate_entities(entities);
168
169 entities.retain(|e| e.confidence >= self.min_confidence);
171
172 Ok(entities)
173 }
174
175 fn extract_persons(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
177 let mut entities = Vec::new();
178 let words: Vec<&str> = text.split_whitespace().collect();
179 let mut processed_indices = HashSet::new();
180
181 let person_titles = [
183 "mr",
184 "mrs",
185 "ms",
186 "dr",
187 "prof",
188 "professor",
189 "sir",
190 "lady",
191 "lord",
192 "captain",
193 "major",
194 "colonel",
195 "general",
196 "admiral",
197 "judge",
198 "father",
199 "mother",
200 "brother",
201 "sister",
202 "aunt",
203 "uncle",
204 "grandfather",
205 "grandmother",
206 ];
207
208 let non_person_words = [
210 "chapter",
211 "the",
212 "and",
213 "but",
214 "or",
215 "in",
216 "on",
217 "at",
218 "to",
219 "for",
220 "with",
221 "by",
222 "from",
223 "about",
224 "into",
225 "through",
226 "during",
227 "before",
228 "after",
229 "above",
230 "below",
231 "up",
232 "down",
233 "out",
234 "off",
235 "over",
236 "under",
237 "again",
238 "further",
239 "then",
240 "once",
241 "here",
242 "there",
243 "when",
244 "where",
245 "why",
246 "how",
247 "all",
248 "any",
249 "both",
250 "each",
251 "few",
252 "more",
253 "most",
254 "other",
255 "some",
256 "such",
257 "only",
258 "own",
259 "same",
260 "so",
261 "than",
262 "too",
263 "very",
264 "can",
265 "will",
266 "just",
267 "should",
268 "now",
269 "temptations",
270 "strategic",
271 "movements",
272 "decides",
273 "upon",
274 "whitewashing",
275 "saturday",
276 "monday",
277 "tuesday",
278 "wednesday",
279 "thursday",
280 "friday",
281 "sunday",
282 "january",
283 "february",
284 "march",
285 "april",
286 "may",
287 "june",
288 "july",
289 "august",
290 "september",
291 "october",
292 "november",
293 "december",
294 "adventures",
295 "complete",
296 ];
297
298 entities.extend(self.extract_known_names(
300 &words,
301 &mut processed_indices,
302 chunk_id,
303 text,
304 )?);
305
306 entities.extend(self.extract_title_based_names(
308 &words,
309 &person_titles,
310 &mut processed_indices,
311 chunk_id,
312 text,
313 )?);
314
315 entities.extend(self.extract_two_word_names(
317 &words,
318 &non_person_words,
319 &mut processed_indices,
320 chunk_id,
321 text,
322 )?);
323
324 for (i, &word_ref) in words.iter().enumerate() {
326 if processed_indices.contains(&i) {
327 continue;
328 }
329
330 let word = self.clean_word(word_ref);
331
332 if word.len() < 2 || non_person_words.contains(&word.to_lowercase().as_str()) {
334 continue;
335 }
336
337 if self.is_capitalized(words[i]) && self.is_likely_person_word(&word) {
339 let confidence = self.calculate_confidence(&word, "PERSON");
340 if confidence >= self.min_confidence {
341 entities.push(self.create_entity(word, "PERSON", confidence, chunk_id, text)?);
342 }
343 }
344 }
345
346 Ok(entities)
347 }
348
349 fn extract_known_names(
351 &self,
352 words: &[&str],
353 processed: &mut std::collections::HashSet<usize>,
354 chunk_id: &ChunkId,
355 text: &str,
356 ) -> Result<Vec<Entity>> {
357 let mut entities = Vec::new();
358 let known_names = [
359 ("Entity Name", 2),
360 ("Second Entity", 2),
361 ("Guardian Entity", 2),
362 ("Friend Entity", 2),
363 ("Companion Entity", 2),
364 ("Third Entity", 2),
365 ("Fourth Entity", 2),
366 ("Fifth Entity", 2),
367 ("Sixth Entity", 2),
368 ("Seventh Entity", 2),
369 ("Eighth Entity", 2),
370 ("Ninth Entity", 2),
371 ];
372
373 for i in 0..words.len() {
374 if processed.contains(&i) {
375 continue;
376 }
377
378 for &(name, word_count) in &known_names {
379 let name_words: Vec<&str> = name.split_whitespace().collect();
380 if i + name_words.len() <= words.len() {
381 let matches = name_words.iter().enumerate().all(|(j, &expected)| {
382 let actual = self.clean_word(words[i + j]);
383 actual.to_lowercase() == expected.to_lowercase()
384 });
385
386 if matches {
387 let confidence = 0.95;
388 if confidence >= self.min_confidence {
389 entities.push(self.create_entity(
390 name.to_string(),
391 "PERSON",
392 confidence,
393 chunk_id,
394 text,
395 )?);
396 }
397 for j in 0..word_count {
399 processed.insert(i + j);
400 }
401 break;
402 }
403 }
404 }
405 }
406 Ok(entities)
407 }
408
409 fn extract_title_based_names(
411 &self,
412 words: &[&str],
413 person_titles: &[&str],
414 processed: &mut std::collections::HashSet<usize>,
415 chunk_id: &ChunkId,
416 text: &str,
417 ) -> Result<Vec<Entity>> {
418 let mut entities = Vec::new();
419
420 for i in 0..words.len() {
421 if processed.contains(&i) {
422 continue;
423 }
424
425 let word_clean = self.clean_word(words[i]).to_lowercase();
426 if person_titles.contains(&word_clean.as_str())
427 && i + 1 < words.len()
428 && !processed.contains(&(i + 1))
429 {
430 let next_word = self.clean_word(words[i + 1]);
431 if self.is_capitalized(words[i + 1]) && self.is_likely_person_word(&next_word) {
432 let name = if i + 2 < words.len() && !processed.contains(&(i + 2)) {
433 let third_word = self.clean_word(words[i + 2]);
434 if self.is_capitalized(words[i + 2])
435 && self.is_likely_person_word(&third_word)
436 {
437 processed.insert(i + 2);
438 format!("{next_word} {third_word}")
439 } else {
440 next_word
441 }
442 } else {
443 next_word
444 };
445
446 let confidence = 0.9;
447 if confidence >= self.min_confidence {
448 entities
449 .push(self.create_entity(name, "PERSON", confidence, chunk_id, text)?);
450 }
451 processed.insert(i);
452 processed.insert(i + 1);
453 }
454 }
455 }
456 Ok(entities)
457 }
458
459 fn extract_two_word_names(
461 &self,
462 words: &[&str],
463 non_person_words: &[&str],
464 processed: &mut std::collections::HashSet<usize>,
465 chunk_id: &ChunkId,
466 text: &str,
467 ) -> Result<Vec<Entity>> {
468 let mut entities = Vec::new();
469
470 for i in 0..words.len() {
471 if processed.contains(&i) || i + 1 >= words.len() || processed.contains(&(i + 1)) {
472 continue;
473 }
474
475 let first_word = self.clean_word(words[i]);
476 let second_word = self.clean_word(words[i + 1]);
477
478 if self.is_capitalized(words[i])
480 && self.is_capitalized(words[i + 1])
481 && self.is_likely_person_word(&first_word)
482 && self.is_likely_person_word(&second_word)
483 && !non_person_words.contains(&first_word.to_lowercase().as_str())
484 && !non_person_words.contains(&second_word.to_lowercase().as_str())
485 {
486 let name = format!("{first_word} {second_word}");
487 if self.is_likely_person_name(&name) {
488 let confidence = self.calculate_confidence(&name, "PERSON");
489 if confidence >= self.min_confidence {
490 entities
491 .push(self.create_entity(name, "PERSON", confidence, chunk_id, text)?);
492 }
493 processed.insert(i);
494 processed.insert(i + 1);
495 }
496 }
497 }
498 Ok(entities)
499 }
500
501 fn extract_organizations(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
503 let mut entities = Vec::new();
504 let org_suffixes = [
505 "Inc",
506 "Corp",
507 "LLC",
508 "Ltd",
509 "Company",
510 "Corporation",
511 "Group",
512 "Solutions",
513 "Technologies",
514 ];
515 let org_prefixes = ["University of", "Institute of", "Department of"];
516
517 for suffix in &org_suffixes {
519 if let Some(pos) = text.find(suffix) {
520 let start = text[..pos].rfind(' ').map(|i| i + 1).unwrap_or(0);
522 let end = pos + suffix.len();
523 let name = text[start..end].trim().to_string();
524
525 if !name.is_empty() && self.is_likely_organization(&name) {
526 let confidence = self.calculate_confidence(&name, "ORGANIZATION");
527 if confidence >= self.min_confidence {
528 entities.push(self.create_entity(
529 name,
530 "ORGANIZATION",
531 confidence,
532 chunk_id,
533 text,
534 )?);
535 }
536 }
537 }
538 }
539
540 for prefix in &org_prefixes {
542 if let Some(pos) = text.find(prefix) {
543 let start = pos;
544 let end = text[pos..]
545 .find('.')
546 .map(|i| pos + i)
547 .unwrap_or(text.len().min(pos + 50));
548 let name = text[start..end].trim().to_string();
549
550 if !name.is_empty() && name.len() > prefix.len() {
551 let confidence = self.calculate_confidence(&name, "ORGANIZATION");
552 if confidence >= self.min_confidence {
553 entities.push(self.create_entity(
554 name,
555 "ORGANIZATION",
556 confidence,
557 chunk_id,
558 text,
559 )?);
560 }
561 }
562 }
563 }
564
565 Ok(entities)
566 }
567
568 fn extract_locations(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
570 let mut entities = Vec::new();
571 let known_locations = [
572 "United States",
573 "New York",
574 "California",
575 "London",
576 "Paris",
577 "Tokyo",
578 "Berlin",
579 "Washington",
580 "Boston",
581 "Chicago",
582 ];
583
584 for location in &known_locations {
585 if text.contains(location) {
586 let confidence = self.calculate_confidence(location, "LOCATION");
587 if confidence >= self.min_confidence {
588 entities.push(self.create_entity(
589 location.to_string(),
590 "LOCATION",
591 confidence,
592 chunk_id,
593 text,
594 )?);
595 }
596 }
597 }
598
599 Ok(entities)
600 }
601
602 fn create_entity(
604 &self,
605 name: String,
606 entity_type: &str,
607 confidence: f32,
608 chunk_id: &ChunkId,
609 text: &str,
610 ) -> Result<Entity> {
611 let entity_id = EntityId::new(format!("{}_{}", entity_type, self.normalize_name(&name)));
612
613 let mut mentions = Vec::new();
615 let mut start = 0;
616 while let Some(pos) = text[start..].find(&name) {
617 let actual_pos = start + pos;
618 mentions.push(EntityMention {
619 chunk_id: chunk_id.clone(),
620 start_offset: actual_pos,
621 end_offset: actual_pos + name.len(),
622 confidence,
623 });
624 start = actual_pos + name.len();
625 }
626
627 Ok(
628 Entity::new(entity_id, name, entity_type.to_string(), confidence)
629 .with_mentions(mentions),
630 )
631 }
632
633 fn is_capitalized(&self, word: &str) -> bool {
635 word.chars().next().is_some_and(|c| c.is_uppercase())
636 }
637
638 fn clean_word(&self, word: &str) -> String {
640 word.chars()
641 .filter(|c| c.is_alphabetic() || *c == '\'') .collect::<String>()
643 .trim_end_matches('\'') .to_string()
645 }
646
647 fn is_likely_person_word(&self, word: &str) -> bool {
649 if word.len() < 2 {
650 return false;
651 }
652
653 let word_lower = word.to_lowercase();
655
656 let name_endings = [
658 "son", "sen", "ton", "ham", "ford", "ley", "ment", "ard", "ert",
659 ];
660 let has_name_ending = name_endings
661 .iter()
662 .any(|&ending| word_lower.ends_with(ending));
663
664 let name_prefixes = ["mc", "mac", "o'", "de", "van", "von", "la", "le"];
666 let has_name_prefix = name_prefixes
667 .iter()
668 .any(|&prefix| word_lower.starts_with(prefix));
669
670 let is_proper_format = word
672 .chars()
673 .next()
674 .expect("non-empty string")
675 .is_uppercase()
676 && word.chars().all(|c| c.is_alphabetic() || c == '\'');
677
678 let short_non_names = [
680 "it", "is", "as", "at", "be", "by", "do", "go", "he", "if", "in", "me", "my", "no",
681 "of", "on", "or", "so", "to", "up", "us", "we",
682 ];
683
684 if word.len() <= 2 && short_non_names.contains(&word_lower.as_str()) {
685 return false;
686 }
687
688 is_proper_format && (word.len() >= 3 || has_name_ending || has_name_prefix)
689 }
690
691 #[allow(dead_code)]
693 fn is_title(&self, word: &str) -> bool {
694 matches!(word, "Dr." | "Mr." | "Ms." | "Mrs." | "Prof.")
695 }
696
697 fn is_likely_person_name(&self, name: &str) -> bool {
699 let parts: Vec<&str> = name.split_whitespace().collect();
700 parts.len() == 2 && parts.iter().all(|part| self.is_capitalized(part))
701 }
702
703 fn is_likely_organization(&self, name: &str) -> bool {
705 let org_indicators = [
706 "Inc",
707 "Corp",
708 "LLC",
709 "Ltd",
710 "Company",
711 "Corporation",
712 "University",
713 "Institute",
714 ];
715 org_indicators
716 .iter()
717 .any(|indicator| name.contains(indicator))
718 }
719
720 fn calculate_confidence(&self, name: &str, entity_type: &str) -> f32 {
722 let mut confidence: f32 = 0.5; match entity_type {
726 "PERSON" => {
727 if name.contains("Dr.") || name.contains("Prof.") {
728 confidence += 0.3;
729 }
730 if name.split_whitespace().count() == 2 {
731 confidence += 0.2;
732 }
733 },
734 "ORGANIZATION" => {
735 if name.contains("Inc") || name.contains("Corp") || name.contains("LLC") {
736 confidence += 0.3;
737 }
738 if name.contains("University") || name.contains("Institute") {
739 confidence += 0.2;
740 }
741 },
742 "LOCATION" => {
743 if name.contains(',') {
744 confidence += 0.2;
745 }
746 if self.is_known_location(name) {
747 confidence += 0.3;
748 }
749 },
750 _ => {},
751 }
752
753 if name.chars().next().is_some_and(|c| c.is_uppercase()) {
755 confidence += 0.1;
756 }
757
758 confidence.min(1.0)
759 }
760
761 fn is_known_location(&self, name: &str) -> bool {
763 const KNOWN_LOCATIONS: &[&str] = &[
764 "United States",
765 "New York",
766 "California",
767 "London",
768 "Paris",
769 "Tokyo",
770 "Berlin",
771 "Washington",
772 "Boston",
773 "Chicago",
774 ];
775 KNOWN_LOCATIONS.iter().any(|&loc| name.contains(loc))
776 }
777
778 fn normalize_name(&self, name: &str) -> String {
780 name.to_lowercase()
781 .chars()
782 .filter(|c| c.is_alphanumeric() || *c == '_')
783 .collect::<String>()
784 .replace(' ', "_")
785 }
786
787 fn deduplicate_entities(&self, entities: Vec<Entity>) -> Vec<Entity> {
789 let mut unique_entities: HashMap<(String, String), Entity> = HashMap::new();
790
791 for entity in entities {
792 let key = (entity.name.clone(), entity.entity_type.clone());
793
794 match unique_entities.get_mut(&key) {
795 Some(existing) => {
796 existing.mentions.extend(entity.mentions);
798 if entity.confidence > existing.confidence {
799 existing.confidence = entity.confidence;
800 }
801 },
802 None => {
803 unique_entities.insert(key, entity);
804 },
805 }
806 }
807
808 unique_entities.into_values().collect()
809 }
810
811 pub fn extract_relationships(
813 &self,
814 entities: &[Entity],
815 chunk: &TextChunk,
816 ) -> Result<Vec<(EntityId, EntityId, String)>> {
817 let mut relationships = Vec::new();
818
819 for i in 0..entities.len() {
821 for j in (i + 1)..entities.len() {
822 let entity1 = &entities[i];
823 let entity2 = &entities[j];
824
825 let entity1_in_chunk = entity1.mentions.iter().any(|m| m.chunk_id == chunk.id);
827 let entity2_in_chunk = entity2.mentions.iter().any(|m| m.chunk_id == chunk.id);
828
829 if entity1_in_chunk && entity2_in_chunk {
830 let relation_type =
831 self.infer_relationship_type(entity1, entity2, &chunk.content);
832 relationships.push((entity1.id.clone(), entity2.id.clone(), relation_type));
833 }
834 }
835 }
836
837 Ok(relationships)
838 }
839
840 fn infer_relationship_type(&self, entity1: &Entity, entity2: &Entity, context: &str) -> String {
842 match (&entity1.entity_type[..], &entity2.entity_type[..]) {
843 ("PERSON", "ORGANIZATION") | ("ORGANIZATION", "PERSON") => {
844 if context.contains("works for") || context.contains("employed by") {
845 "WORKS_FOR".to_string()
846 } else if context.contains("founded") || context.contains("CEO") {
847 "LEADS".to_string()
848 } else {
849 "ASSOCIATED_WITH".to_string()
850 }
851 },
852 ("PERSON", "LOCATION") | ("LOCATION", "PERSON") => {
853 if context.contains("born in") || context.contains("from") {
854 "BORN_IN".to_string()
855 } else if context.contains("lives in") || context.contains("based in") {
856 "LOCATED_IN".to_string()
857 } else {
858 "ASSOCIATED_WITH".to_string()
859 }
860 },
861 ("ORGANIZATION", "LOCATION") | ("LOCATION", "ORGANIZATION") => {
862 if context.contains("headquartered") || context.contains("based in") {
863 "HEADQUARTERED_IN".to_string()
864 } else {
865 "LOCATED_IN".to_string()
866 }
867 },
868 ("PERSON", "PERSON") => {
869 if context.contains("married") || context.contains("spouse") {
870 "MARRIED_TO".to_string()
871 } else if context.contains("colleague") || context.contains("partner") {
872 "COLLEAGUE_OF".to_string()
873 } else {
874 "KNOWS".to_string()
875 }
876 },
877 _ => "RELATED_TO".to_string(),
878 }
879 }
880
881 fn apply_pattern_filtering(&self, entities: Vec<Entity>) -> Vec<Entity> {
883 if self.allowed_patterns.is_empty() && self.excluded_patterns.is_empty() {
884 return entities;
885 }
886
887 entities
888 .into_iter()
889 .filter(|entity| {
890 if !self.allowed_patterns.is_empty() {
892 let matches_allowed = self
893 .allowed_patterns
894 .iter()
895 .any(|pattern| pattern.is_match(&entity.name));
896 if !matches_allowed {
897 return false;
898 }
899 }
900
901 if !self.excluded_patterns.is_empty() {
903 let matches_excluded = self
904 .excluded_patterns
905 .iter()
906 .any(|pattern| pattern.is_match(&entity.name));
907 if matches_excluded {
908 return false;
909 }
910 }
911
912 true
913 })
914 .collect()
915 }
916
917 fn extract_concepts(
919 &self,
920 text: &str,
921 chunk_id: &ChunkId,
922 entity_type: &str,
923 ) -> Result<Vec<Entity>> {
924 let mut entities = Vec::new();
925 let words: Vec<&str> = text.split_whitespace().collect();
926
927 let concept_indicators = [
929 "Theory",
930 "Concept",
931 "Principle",
932 "Philosophy",
933 "Doctrine",
934 "Idea",
935 "Method",
936 "Approach",
937 "Framework",
938 "Model",
939 "Paradigm",
940 "Thesis",
941 ];
942
943 for &word in words.iter() {
944 let clean_word = self.clean_word(word);
945
946 if concept_indicators
948 .iter()
949 .any(|&indicator| clean_word.contains(indicator))
950 {
951 let confidence = 0.75;
952 if confidence >= self.min_confidence {
953 entities.push(self.create_entity(
954 clean_word,
955 entity_type,
956 confidence,
957 chunk_id,
958 text,
959 )?);
960 }
961 }
962
963 if self.is_capitalized(word) && word.len() > 4 {
965 let clean_word = self.clean_word(word);
966 if !self.is_common_word(&clean_word) {
967 let confidence = 0.6;
968 if confidence >= self.min_confidence {
969 entities.push(self.create_entity(
970 clean_word,
971 entity_type,
972 confidence,
973 chunk_id,
974 text,
975 )?);
976 }
977 }
978 }
979 }
980
981 Ok(entities)
982 }
983
984 fn extract_events(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
986 let mut entities = Vec::new();
987
988 let event_words = [
990 "meeting",
991 "conference",
992 "ceremony",
993 "celebration",
994 "festival",
995 "competition",
996 "war",
997 "battle",
998 "expedition",
999 "journey",
1000 "trial",
1001 ];
1002
1003 for event_word in &event_words {
1004 if text.to_lowercase().contains(event_word) {
1005 let confidence = 0.7;
1006 if confidence >= self.min_confidence {
1007 entities.push(self.create_entity(
1008 event_word.to_string(),
1009 "EVENT",
1010 confidence,
1011 chunk_id,
1012 text,
1013 )?);
1014 }
1015 }
1016 }
1017
1018 Ok(entities)
1019 }
1020
1021 fn extract_objects(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
1023 let mut entities = Vec::new();
1024
1025 let object_words = [
1027 "sword",
1028 "shield",
1029 "book",
1030 "manuscript",
1031 "scroll",
1032 "tablet",
1033 "ring",
1034 "crown",
1035 "treasure",
1036 "coin",
1037 "tool",
1038 "weapon",
1039 ];
1040
1041 for object_word in &object_words {
1042 if text.to_lowercase().contains(object_word) {
1043 let confidence = 0.65;
1044 if confidence >= self.min_confidence {
1045 entities.push(self.create_entity(
1046 object_word.to_string(),
1047 "OBJECT",
1048 confidence,
1049 chunk_id,
1050 text,
1051 )?);
1052 }
1053 }
1054 }
1055
1056 Ok(entities)
1057 }
1058
1059 fn extract_generic_entities(
1061 &self,
1062 text: &str,
1063 chunk_id: &ChunkId,
1064 entity_type: &str,
1065 ) -> Result<Vec<Entity>> {
1066 let mut entities = Vec::new();
1067 let words: Vec<&str> = text.split_whitespace().collect();
1068
1069 for &word in &words {
1071 if self.is_capitalized(word) && word.len() > 3 {
1072 let clean_word = self.clean_word(word);
1073 if !self.is_common_word(&clean_word) {
1074 let confidence = 0.5; if confidence >= self.min_confidence {
1076 entities.push(self.create_entity(
1077 clean_word,
1078 entity_type,
1079 confidence,
1080 chunk_id,
1081 text,
1082 )?);
1083 }
1084 }
1085 }
1086 }
1087
1088 Ok(entities)
1089 }
1090
1091 fn is_common_word(&self, word: &str) -> bool {
1093 let common_words = [
1094 "the", "and", "but", "or", "in", "on", "at", "to", "for", "with", "by", "from",
1095 "about", "into", "through", "during", "before", "after", "above", "below", "up",
1096 "down", "out", "off", "over", "under", "again", "further", "then", "once", "here",
1097 "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
1098 "most", "other", "some", "such", "only", "own", "same", "so", "than", "too", "very",
1099 "can", "will", "just", "should", "now", "could", "would", "said", "says", "told",
1100 "asked", "went", "came", "come", "going", "Chapter", "Page", "Section", "Part", "Book",
1101 "Volume",
1102 ];
1103
1104 common_words
1105 .iter()
1106 .any(|&common| word.eq_ignore_ascii_case(common))
1107 }
1108}
1109
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::{ChunkId, DocumentId};

    /// A chunk that mentions a known name yields at least one `PERSON` entity.
    #[test]
    fn test_person_extraction() {
        let content = "Entity Name works at Test Corp. Dr. Second Entity is a professor.";
        let chunk = TextChunk::new(
            ChunkId::new("test_chunk".to_string()),
            DocumentId::new("test_doc".to_string()),
            content.to_string(),
            0,
            59,
        );
        let extractor = EntityExtractor::new(0.5).unwrap();

        let entities = extractor.extract_from_chunk(&chunk).unwrap();

        assert!(!entities.is_empty());
        assert!(entities.iter().any(|e| e.entity_type == "PERSON"));
    }

    /// Entities extracted from the same chunk produce at least one relationship.
    #[test]
    fn test_relationship_extraction() {
        let content = "Entity Name works for Test Corp in Test City.";
        let chunk = TextChunk::new(
            ChunkId::new("test_chunk".to_string()),
            DocumentId::new("test_doc".to_string()),
            content.to_string(),
            0,
            44,
        );
        let extractor = EntityExtractor::new(0.5).unwrap();

        let entities = extractor.extract_from_chunk(&chunk).unwrap();
        let relationships = extractor.extract_relationships(&entities, &chunk).unwrap();

        assert!(!relationships.is_empty());
    }
}