1pub mod bidirectional_index;
3pub mod gleaning_extractor;
5pub mod llm_extractor;
7pub mod llm_relationship_extractor;
9pub mod prompts;
11pub mod semantic_merging;
13pub mod string_similarity_linker;
15
16pub use bidirectional_index::{BidirectionalIndex, IndexStatistics};
17pub use gleaning_extractor::{ExtractionCompletionStatus, GleaningConfig, GleaningEntityExtractor};
18pub use llm_extractor::LLMEntityExtractor;
19pub use llm_relationship_extractor::{
20 ExtractedEntity, ExtractedRelationship, ExtractionResult, LLMRelationshipExtractor,
21};
22pub use semantic_merging::{EntityMergeDecision, MergingStatistics, SemanticEntityMerger};
23pub use string_similarity_linker::{EntityLinkingConfig, StringSimilarityLinker};
24
25use crate::{
26 config::setconfig::EntityExtractionConfig,
27 core::{ChunkId, Entity, EntityId, EntityMention, TextChunk},
28 Result,
29};
30use regex::Regex;
31use std::collections::{HashMap, HashSet};
32
/// Rule-based entity extractor that scans text chunks for people,
/// organizations, locations, concepts, events, and objects using
/// heuristics (capitalization, title words, gazetteers, keyword lists).
pub struct EntityExtractor {
    /// Minimum confidence an extracted entity must reach to be kept.
    min_confidence: f32,
    /// Optional extraction configuration (entity types, filters); `None`
    /// when constructed via `new`.
    config: Option<EntityExtractionConfig>,
    /// Compiled regexes an entity name must match (when non-empty).
    allowed_patterns: Vec<Regex>,
    /// Compiled regexes that reject an entity name on match.
    excluded_patterns: Vec<Regex>,
}
40
41impl EntityExtractor {
42 pub fn new(min_confidence: f32) -> Result<Self> {
44 Ok(Self {
45 min_confidence,
46 config: None,
47 allowed_patterns: Vec::new(),
48 excluded_patterns: Vec::new(),
49 })
50 }
51
52 pub fn with_config(config: EntityExtractionConfig) -> Result<Self> {
54 let mut allowed_patterns = Vec::new();
55 let mut excluded_patterns = Vec::new();
56
57 if let Some(filters) = &config.filters {
59 if let Some(patterns) = &filters.allowed_patterns {
60 for pattern in patterns {
61 match Regex::new(pattern) {
62 Ok(regex) => allowed_patterns.push(regex),
63 Err(e) => {
64 tracing::warn!("Invalid allowed pattern '{pattern}': {e}");
65 }
66 }
67 }
68 }
69
70 if let Some(patterns) = &filters.excluded_patterns {
71 for pattern in patterns {
72 match Regex::new(pattern) {
73 Ok(regex) => excluded_patterns.push(regex),
74 Err(e) => {
75 tracing::warn!("Invalid excluded pattern '{pattern}': {e}");
76 }
77 }
78 }
79 }
80 }
81
82 let min_confidence = config
83 .filters
84 .as_ref()
85 .map(|f| f.confidence_threshold)
86 .unwrap_or(config.confidence_threshold);
87
88 Ok(Self {
89 min_confidence,
90 config: Some(config),
91 allowed_patterns,
92 excluded_patterns,
93 })
94 }
95
    /// Extracts entities of every configured type from a single text chunk.
    ///
    /// Pipeline: dispatch each configured entity type (and its aliases) to a
    /// specialized extractor, then apply regex allow/deny filtering, merge
    /// duplicates, and drop anything below the confidence floor.
    pub fn extract_from_chunk(&self, chunk: &TextChunk) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();
        let text = &chunk.content;

        // Entity types come from config when present; otherwise default to
        // the basic PERSON/ORGANIZATION/LOCATION trio.
        let entity_types = if let Some(config) = &self.config {
            config.entity_types.as_ref().cloned().unwrap_or_else(|| {
                vec![
                    "PERSON".to_string(),
                    "ORGANIZATION".to_string(),
                    "LOCATION".to_string(),
                ]
            })
        } else {
            vec![
                "PERSON".to_string(),
                "ORGANIZATION".to_string(),
                "LOCATION".to_string(),
            ]
        };

        // Route each requested type to its extractor; unknown types fall
        // through to the generic capitalized-word extractor.
        for entity_type in &entity_types {
            match entity_type.as_str() {
                "PERSON" | "CHARACTER" | "RESEARCHER" | "SPEAKER" | "DIALOGUE_SPEAKER" => {
                    entities.extend(self.extract_persons(text, &chunk.id)?);
                }
                "ORGANIZATION" | "INSTITUTION" | "BRAND" | "COMPANY" => {
                    entities.extend(self.extract_organizations(text, &chunk.id)?);
                }
                "LOCATION" | "SETTING" | "PLACE" => {
                    entities.extend(self.extract_locations(text, &chunk.id)?);
                }
                "CONCEPT" | "THEORY" | "THEME" | "ARGUMENT" | "IDEA" => {
                    entities.extend(self.extract_concepts(text, &chunk.id, entity_type)?);
                }
                "EVENT" | "EXPERIMENT" | "HAPPENING" => {
                    entities.extend(self.extract_events(text, &chunk.id)?);
                }
                "OBJECT" | "TOOL" | "ARTIFACT" | "ITEM" => {
                    entities.extend(self.extract_objects(text, &chunk.id)?);
                }
                _ => {
                    entities.extend(self.extract_generic_entities(text, &chunk.id, entity_type)?);
                }
            }
        }

        // Post-processing: regex filters, then merge (name, type) duplicates,
        // then enforce the confidence floor.
        entities = self.apply_pattern_filtering(entities);

        entities = self.deduplicate_entities(entities);

        entities.retain(|e| e.confidence >= self.min_confidence);

        Ok(entities)
    }
157
    /// Extracts PERSON entities from `text` using several passes over the
    /// whitespace-split word list:
    /// 1. known multi-word names, 2. title-prefixed names ("Dr. X"),
    /// 3. adjacent capitalized word pairs, 4. remaining single capitalized
    /// name-like words. Each pass marks consumed word indices in
    /// `processed_indices` so later passes skip them.
    fn extract_persons(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();
        let words: Vec<&str> = text.split_whitespace().collect();
        let mut processed_indices = HashSet::new();

        // Honorifics and kinship words that commonly precede a person name
        // (compared lowercase against cleaned words).
        let person_titles = [
            "mr",
            "mrs",
            "ms",
            "dr",
            "prof",
            "professor",
            "sir",
            "lady",
            "lord",
            "captain",
            "major",
            "colonel",
            "general",
            "admiral",
            "judge",
            "father",
            "mother",
            "brother",
            "sister",
            "aunt",
            "uncle",
            "grandfather",
            "grandmother",
        ];

        // Stop words (plus weekday/month names and some document-specific
        // noise) that disqualify a word from being part of a person name.
        let non_person_words = [
            "chapter",
            "the",
            "and",
            "but",
            "or",
            "in",
            "on",
            "at",
            "to",
            "for",
            "with",
            "by",
            "from",
            "about",
            "into",
            "through",
            "during",
            "before",
            "after",
            "above",
            "below",
            "up",
            "down",
            "out",
            "off",
            "over",
            "under",
            "again",
            "further",
            "then",
            "once",
            "here",
            "there",
            "when",
            "where",
            "why",
            "how",
            "all",
            "any",
            "both",
            "each",
            "few",
            "more",
            "most",
            "other",
            "some",
            "such",
            "only",
            "own",
            "same",
            "so",
            "than",
            "too",
            "very",
            "can",
            "will",
            "just",
            "should",
            "now",
            "temptations",
            "strategic",
            "movements",
            "decides",
            "upon",
            "whitewashing",
            "saturday",
            "monday",
            "tuesday",
            "wednesday",
            "thursday",
            "friday",
            "sunday",
            "january",
            "february",
            "march",
            "april",
            "may",
            "june",
            "july",
            "august",
            "september",
            "october",
            "november",
            "december",
            "adventures",
            "complete",
        ];

        // Pass 1: whitelisted multi-word names (highest confidence).
        entities.extend(self.extract_known_names(
            &words,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // Pass 2: names introduced by a title ("Dr. Second Entity").
        entities.extend(self.extract_title_based_names(
            &words,
            &person_titles,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // Pass 3: adjacent capitalized word pairs ("First Last").
        entities.extend(self.extract_two_word_names(
            &words,
            &non_person_words,
            &mut processed_indices,
            chunk_id,
            text,
        )?);

        // Pass 4: leftover single capitalized words that look name-like.
        for (i, &word_ref) in words.iter().enumerate() {
            if processed_indices.contains(&i) {
                continue;
            }

            let word = self.clean_word(word_ref);

            // Skip very short words and stop words.
            if word.len() < 2 || non_person_words.contains(&word.to_lowercase().as_str()) {
                continue;
            }

            // Capitalization check uses the raw token (punctuation intact);
            // the name heuristic uses the cleaned word.
            if self.is_capitalized(words[i]) && self.is_likely_person_word(&word) {
                let confidence = self.calculate_confidence(&word, "PERSON");
                if confidence >= self.min_confidence {
                    entities.push(self.create_entity(word, "PERSON", confidence, chunk_id, text)?);
                }
            }
        }

        Ok(entities)
    }
331
332 fn extract_known_names(
334 &self,
335 words: &[&str],
336 processed: &mut std::collections::HashSet<usize>,
337 chunk_id: &ChunkId,
338 text: &str,
339 ) -> Result<Vec<Entity>> {
340 let mut entities = Vec::new();
341 let known_names = [
342 ("Entity Name", 2),
343 ("Second Entity", 2),
344 ("Guardian Entity", 2),
345 ("Friend Entity", 2),
346 ("Companion Entity", 2),
347 ("Third Entity", 2),
348 ("Fourth Entity", 2),
349 ("Fifth Entity", 2),
350 ("Sixth Entity", 2),
351 ("Seventh Entity", 2),
352 ("Eighth Entity", 2),
353 ("Ninth Entity", 2),
354 ];
355
356 for i in 0..words.len() {
357 if processed.contains(&i) {
358 continue;
359 }
360
361 for &(name, word_count) in &known_names {
362 let name_words: Vec<&str> = name.split_whitespace().collect();
363 if i + name_words.len() <= words.len() {
364 let matches = name_words.iter().enumerate().all(|(j, &expected)| {
365 let actual = self.clean_word(words[i + j]);
366 actual.to_lowercase() == expected.to_lowercase()
367 });
368
369 if matches {
370 let confidence = 0.95;
371 if confidence >= self.min_confidence {
372 entities.push(self.create_entity(
373 name.to_string(),
374 "PERSON",
375 confidence,
376 chunk_id,
377 text,
378 )?);
379 }
380 for j in 0..word_count {
382 processed.insert(i + j);
383 }
384 break;
385 }
386 }
387 }
388 }
389 Ok(entities)
390 }
391
    /// Extracts PERSON entities introduced by an honorific ("Dr. First" or
    /// "Dr. First Last"), consuming the title word and the name word(s).
    ///
    /// The title itself is not included in the extracted name; a fixed 0.9
    /// confidence is used. Indices are marked processed only when a name is
    /// actually accepted after the title.
    fn extract_title_based_names(
        &self,
        words: &[&str],
        person_titles: &[&str],
        processed: &mut std::collections::HashSet<usize>,
        chunk_id: &ChunkId,
        text: &str,
    ) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();

        for i in 0..words.len() {
            if processed.contains(&i) {
                continue;
            }

            // Titles are matched lowercase against the cleaned token.
            let word_clean = self.clean_word(words[i]).to_lowercase();
            if person_titles.contains(&word_clean.as_str())
                && i + 1 < words.len()
                && !processed.contains(&(i + 1))
            {
                let next_word = self.clean_word(words[i + 1]);
                if self.is_capitalized(words[i + 1]) && self.is_likely_person_word(&next_word) {
                    // Greedily take a third capitalized name-like word to form
                    // "First Last"; note it is marked processed inside this
                    // branch, before the entity is pushed.
                    let name = if i + 2 < words.len() && !processed.contains(&(i + 2)) {
                        let third_word = self.clean_word(words[i + 2]);
                        if self.is_capitalized(words[i + 2])
                            && self.is_likely_person_word(&third_word)
                        {
                            processed.insert(i + 2);
                            format!("{next_word} {third_word}")
                        } else {
                            next_word
                        }
                    } else {
                        next_word
                    };

                    let confidence = 0.9;
                    if confidence >= self.min_confidence {
                        entities
                            .push(self.create_entity(name, "PERSON", confidence, chunk_id, text)?);
                    }
                    // Consume the title and first name word regardless of the
                    // confidence check above.
                    processed.insert(i);
                    processed.insert(i + 1);
                }
            }
        }
        Ok(entities)
    }
441
442 fn extract_two_word_names(
444 &self,
445 words: &[&str],
446 non_person_words: &[&str],
447 processed: &mut std::collections::HashSet<usize>,
448 chunk_id: &ChunkId,
449 text: &str,
450 ) -> Result<Vec<Entity>> {
451 let mut entities = Vec::new();
452
453 for i in 0..words.len() {
454 if processed.contains(&i) || i + 1 >= words.len() || processed.contains(&(i + 1)) {
455 continue;
456 }
457
458 let first_word = self.clean_word(words[i]);
459 let second_word = self.clean_word(words[i + 1]);
460
461 if self.is_capitalized(words[i])
463 && self.is_capitalized(words[i + 1])
464 && self.is_likely_person_word(&first_word)
465 && self.is_likely_person_word(&second_word)
466 && !non_person_words.contains(&first_word.to_lowercase().as_str())
467 && !non_person_words.contains(&second_word.to_lowercase().as_str())
468 {
469 let name = format!("{first_word} {second_word}");
470 if self.is_likely_person_name(&name) {
471 let confidence = self.calculate_confidence(&name, "PERSON");
472 if confidence >= self.min_confidence {
473 entities
474 .push(self.create_entity(name, "PERSON", confidence, chunk_id, text)?);
475 }
476 processed.insert(i);
477 processed.insert(i + 1);
478 }
479 }
480 }
481 Ok(entities)
482 }
483
484 fn extract_organizations(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
486 let mut entities = Vec::new();
487 let org_suffixes = [
488 "Inc",
489 "Corp",
490 "LLC",
491 "Ltd",
492 "Company",
493 "Corporation",
494 "Group",
495 "Solutions",
496 "Technologies",
497 ];
498 let org_prefixes = ["University of", "Institute of", "Department of"];
499
500 for suffix in &org_suffixes {
502 if let Some(pos) = text.find(suffix) {
503 let start = text[..pos].rfind(' ').map(|i| i + 1).unwrap_or(0);
505 let end = pos + suffix.len();
506 let name = text[start..end].trim().to_string();
507
508 if !name.is_empty() && self.is_likely_organization(&name) {
509 let confidence = self.calculate_confidence(&name, "ORGANIZATION");
510 if confidence >= self.min_confidence {
511 entities.push(self.create_entity(
512 name,
513 "ORGANIZATION",
514 confidence,
515 chunk_id,
516 text,
517 )?);
518 }
519 }
520 }
521 }
522
523 for prefix in &org_prefixes {
525 if let Some(pos) = text.find(prefix) {
526 let start = pos;
527 let end = text[pos..]
528 .find('.')
529 .map(|i| pos + i)
530 .unwrap_or(text.len().min(pos + 50));
531 let name = text[start..end].trim().to_string();
532
533 if !name.is_empty() && name.len() > prefix.len() {
534 let confidence = self.calculate_confidence(&name, "ORGANIZATION");
535 if confidence >= self.min_confidence {
536 entities.push(self.create_entity(
537 name,
538 "ORGANIZATION",
539 confidence,
540 chunk_id,
541 text,
542 )?);
543 }
544 }
545 }
546 }
547
548 Ok(entities)
549 }
550
551 fn extract_locations(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
553 let mut entities = Vec::new();
554 let known_locations = [
555 "United States",
556 "New York",
557 "California",
558 "London",
559 "Paris",
560 "Tokyo",
561 "Berlin",
562 "Washington",
563 "Boston",
564 "Chicago",
565 ];
566
567 for location in &known_locations {
568 if text.contains(location) {
569 let confidence = self.calculate_confidence(location, "LOCATION");
570 if confidence >= self.min_confidence {
571 entities.push(self.create_entity(
572 location.to_string(),
573 "LOCATION",
574 confidence,
575 chunk_id,
576 text,
577 )?);
578 }
579 }
580 }
581
582 Ok(entities)
583 }
584
585 fn create_entity(
587 &self,
588 name: String,
589 entity_type: &str,
590 confidence: f32,
591 chunk_id: &ChunkId,
592 text: &str,
593 ) -> Result<Entity> {
594 let entity_id = EntityId::new(format!("{}_{}", entity_type, self.normalize_name(&name)));
595
596 let mut mentions = Vec::new();
598 let mut start = 0;
599 while let Some(pos) = text[start..].find(&name) {
600 let actual_pos = start + pos;
601 mentions.push(EntityMention {
602 chunk_id: chunk_id.clone(),
603 start_offset: actual_pos,
604 end_offset: actual_pos + name.len(),
605 confidence,
606 });
607 start = actual_pos + name.len();
608 }
609
610 Ok(
611 Entity::new(entity_id, name, entity_type.to_string(), confidence)
612 .with_mentions(mentions),
613 )
614 }
615
616 fn is_capitalized(&self, word: &str) -> bool {
618 word.chars().next().is_some_and(|c| c.is_uppercase())
619 }
620
621 fn clean_word(&self, word: &str) -> String {
623 word.chars()
624 .filter(|c| c.is_alphabetic() || *c == '\'') .collect::<String>()
626 .trim_end_matches('\'') .to_string()
628 }
629
630 fn is_likely_person_word(&self, word: &str) -> bool {
632 if word.len() < 2 {
633 return false;
634 }
635
636 let word_lower = word.to_lowercase();
638
639 let name_endings = [
641 "son", "sen", "ton", "ham", "ford", "ley", "ment", "ard", "ert",
642 ];
643 let has_name_ending = name_endings
644 .iter()
645 .any(|&ending| word_lower.ends_with(ending));
646
647 let name_prefixes = ["mc", "mac", "o'", "de", "van", "von", "la", "le"];
649 let has_name_prefix = name_prefixes
650 .iter()
651 .any(|&prefix| word_lower.starts_with(prefix));
652
653 let is_proper_format = word.chars().next().unwrap().is_uppercase()
655 && word.chars().all(|c| c.is_alphabetic() || c == '\'');
656
657 let short_non_names = [
659 "it", "is", "as", "at", "be", "by", "do", "go", "he", "if", "in", "me", "my", "no",
660 "of", "on", "or", "so", "to", "up", "us", "we",
661 ];
662
663 if word.len() <= 2 && short_non_names.contains(&word_lower.as_str()) {
664 return false;
665 }
666
667 is_proper_format && (word.len() >= 3 || has_name_ending || has_name_prefix)
668 }
669
670 #[allow(dead_code)]
672 fn is_title(&self, word: &str) -> bool {
673 matches!(word, "Dr." | "Mr." | "Ms." | "Mrs." | "Prof.")
674 }
675
676 fn is_likely_person_name(&self, name: &str) -> bool {
678 let parts: Vec<&str> = name.split_whitespace().collect();
679 parts.len() == 2 && parts.iter().all(|part| self.is_capitalized(part))
680 }
681
682 fn is_likely_organization(&self, name: &str) -> bool {
684 let org_indicators = [
685 "Inc",
686 "Corp",
687 "LLC",
688 "Ltd",
689 "Company",
690 "Corporation",
691 "University",
692 "Institute",
693 ];
694 org_indicators
695 .iter()
696 .any(|indicator| name.contains(indicator))
697 }
698
699 fn calculate_confidence(&self, name: &str, entity_type: &str) -> f32 {
701 let mut confidence: f32 = 0.5; match entity_type {
705 "PERSON" => {
706 if name.contains("Dr.") || name.contains("Prof.") {
707 confidence += 0.3;
708 }
709 if name.split_whitespace().count() == 2 {
710 confidence += 0.2;
711 }
712 }
713 "ORGANIZATION" => {
714 if name.contains("Inc") || name.contains("Corp") || name.contains("LLC") {
715 confidence += 0.3;
716 }
717 if name.contains("University") || name.contains("Institute") {
718 confidence += 0.2;
719 }
720 }
721 "LOCATION" => {
722 if name.contains(',') {
723 confidence += 0.2;
724 }
725 if self.is_known_location(name) {
726 confidence += 0.3;
727 }
728 }
729 _ => {}
730 }
731
732 if name.chars().next().is_some_and(|c| c.is_uppercase()) {
734 confidence += 0.1;
735 }
736
737 confidence.min(1.0)
738 }
739
740 fn is_known_location(&self, name: &str) -> bool {
742 const KNOWN_LOCATIONS: &[&str] = &[
743 "United States",
744 "New York",
745 "California",
746 "London",
747 "Paris",
748 "Tokyo",
749 "Berlin",
750 "Washington",
751 "Boston",
752 "Chicago",
753 ];
754 KNOWN_LOCATIONS.iter().any(|&loc| name.contains(loc))
755 }
756
757 fn normalize_name(&self, name: &str) -> String {
759 name.to_lowercase()
760 .chars()
761 .filter(|c| c.is_alphanumeric() || *c == '_')
762 .collect::<String>()
763 .replace(' ', "_")
764 }
765
766 fn deduplicate_entities(&self, entities: Vec<Entity>) -> Vec<Entity> {
768 let mut unique_entities: HashMap<(String, String), Entity> = HashMap::new();
769
770 for entity in entities {
771 let key = (entity.name.clone(), entity.entity_type.clone());
772
773 match unique_entities.get_mut(&key) {
774 Some(existing) => {
775 existing.mentions.extend(entity.mentions);
777 if entity.confidence > existing.confidence {
778 existing.confidence = entity.confidence;
779 }
780 }
781 None => {
782 unique_entities.insert(key, entity);
783 }
784 }
785 }
786
787 unique_entities.into_values().collect()
788 }
789
790 pub fn extract_relationships(
792 &self,
793 entities: &[Entity],
794 chunk: &TextChunk,
795 ) -> Result<Vec<(EntityId, EntityId, String)>> {
796 let mut relationships = Vec::new();
797
798 for i in 0..entities.len() {
800 for j in (i + 1)..entities.len() {
801 let entity1 = &entities[i];
802 let entity2 = &entities[j];
803
804 let entity1_in_chunk = entity1.mentions.iter().any(|m| m.chunk_id == chunk.id);
806 let entity2_in_chunk = entity2.mentions.iter().any(|m| m.chunk_id == chunk.id);
807
808 if entity1_in_chunk && entity2_in_chunk {
809 let relation_type =
810 self.infer_relationship_type(entity1, entity2, &chunk.content);
811 relationships.push((entity1.id.clone(), entity2.id.clone(), relation_type));
812 }
813 }
814 }
815
816 Ok(relationships)
817 }
818
    /// Picks a relationship label for an entity pair based on the pair's
    /// types and simple keyword cues in the chunk text.
    ///
    /// The pair is matched symmetrically (both orders), but the emitted
    /// label does not encode direction; callers get the same label either
    /// way. Unknown type combinations fall back to "RELATED_TO".
    fn infer_relationship_type(&self, entity1: &Entity, entity2: &Entity, context: &str) -> String {
        match (&entity1.entity_type[..], &entity2.entity_type[..]) {
            ("PERSON", "ORGANIZATION") | ("ORGANIZATION", "PERSON") => {
                if context.contains("works for") || context.contains("employed by") {
                    "WORKS_FOR".to_string()
                } else if context.contains("founded") || context.contains("CEO") {
                    "LEADS".to_string()
                } else {
                    "ASSOCIATED_WITH".to_string()
                }
            }
            ("PERSON", "LOCATION") | ("LOCATION", "PERSON") => {
                if context.contains("born in") || context.contains("from") {
                    "BORN_IN".to_string()
                } else if context.contains("lives in") || context.contains("based in") {
                    "LOCATED_IN".to_string()
                } else {
                    "ASSOCIATED_WITH".to_string()
                }
            }
            ("ORGANIZATION", "LOCATION") | ("LOCATION", "ORGANIZATION") => {
                if context.contains("headquartered") || context.contains("based in") {
                    "HEADQUARTERED_IN".to_string()
                } else {
                    "LOCATED_IN".to_string()
                }
            }
            ("PERSON", "PERSON") => {
                if context.contains("married") || context.contains("spouse") {
                    "MARRIED_TO".to_string()
                } else if context.contains("colleague") || context.contains("partner") {
                    "COLLEAGUE_OF".to_string()
                } else {
                    "KNOWS".to_string()
                }
            }
            _ => "RELATED_TO".to_string(),
        }
    }
859
860 fn apply_pattern_filtering(&self, entities: Vec<Entity>) -> Vec<Entity> {
862 if self.allowed_patterns.is_empty() && self.excluded_patterns.is_empty() {
863 return entities;
864 }
865
866 entities
867 .into_iter()
868 .filter(|entity| {
869 if !self.allowed_patterns.is_empty() {
871 let matches_allowed = self
872 .allowed_patterns
873 .iter()
874 .any(|pattern| pattern.is_match(&entity.name));
875 if !matches_allowed {
876 return false;
877 }
878 }
879
880 if !self.excluded_patterns.is_empty() {
882 let matches_excluded = self
883 .excluded_patterns
884 .iter()
885 .any(|pattern| pattern.is_match(&entity.name));
886 if matches_excluded {
887 return false;
888 }
889 }
890
891 true
892 })
893 .collect()
894 }
895
896 fn extract_concepts(
898 &self,
899 text: &str,
900 chunk_id: &ChunkId,
901 entity_type: &str,
902 ) -> Result<Vec<Entity>> {
903 let mut entities = Vec::new();
904 let words: Vec<&str> = text.split_whitespace().collect();
905
906 let concept_indicators = [
908 "Theory",
909 "Concept",
910 "Principle",
911 "Philosophy",
912 "Doctrine",
913 "Idea",
914 "Method",
915 "Approach",
916 "Framework",
917 "Model",
918 "Paradigm",
919 "Thesis",
920 ];
921
922 for &word in words.iter() {
923 let clean_word = self.clean_word(word);
924
925 if concept_indicators
927 .iter()
928 .any(|&indicator| clean_word.contains(indicator))
929 {
930 let confidence = 0.75;
931 if confidence >= self.min_confidence {
932 entities.push(self.create_entity(
933 clean_word,
934 entity_type,
935 confidence,
936 chunk_id,
937 text,
938 )?);
939 }
940 }
941
942 if self.is_capitalized(word) && word.len() > 4 {
944 let clean_word = self.clean_word(word);
945 if !self.is_common_word(&clean_word) {
946 let confidence = 0.6;
947 if confidence >= self.min_confidence {
948 entities.push(self.create_entity(
949 clean_word,
950 entity_type,
951 confidence,
952 chunk_id,
953 text,
954 )?);
955 }
956 }
957 }
958 }
959
960 Ok(entities)
961 }
962
963 fn extract_events(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
965 let mut entities = Vec::new();
966
967 let event_words = [
969 "meeting",
970 "conference",
971 "ceremony",
972 "celebration",
973 "festival",
974 "competition",
975 "war",
976 "battle",
977 "expedition",
978 "journey",
979 "trial",
980 ];
981
982 for event_word in &event_words {
983 if text.to_lowercase().contains(event_word) {
984 let confidence = 0.7;
985 if confidence >= self.min_confidence {
986 entities.push(self.create_entity(
987 event_word.to_string(),
988 "EVENT",
989 confidence,
990 chunk_id,
991 text,
992 )?);
993 }
994 }
995 }
996
997 Ok(entities)
998 }
999
1000 fn extract_objects(&self, text: &str, chunk_id: &ChunkId) -> Result<Vec<Entity>> {
1002 let mut entities = Vec::new();
1003
1004 let object_words = [
1006 "sword",
1007 "shield",
1008 "book",
1009 "manuscript",
1010 "scroll",
1011 "tablet",
1012 "ring",
1013 "crown",
1014 "treasure",
1015 "coin",
1016 "tool",
1017 "weapon",
1018 ];
1019
1020 for object_word in &object_words {
1021 if text.to_lowercase().contains(object_word) {
1022 let confidence = 0.65;
1023 if confidence >= self.min_confidence {
1024 entities.push(self.create_entity(
1025 object_word.to_string(),
1026 "OBJECT",
1027 confidence,
1028 chunk_id,
1029 text,
1030 )?);
1031 }
1032 }
1033 }
1034
1035 Ok(entities)
1036 }
1037
1038 fn extract_generic_entities(
1040 &self,
1041 text: &str,
1042 chunk_id: &ChunkId,
1043 entity_type: &str,
1044 ) -> Result<Vec<Entity>> {
1045 let mut entities = Vec::new();
1046 let words: Vec<&str> = text.split_whitespace().collect();
1047
1048 for &word in &words {
1050 if self.is_capitalized(word) && word.len() > 3 {
1051 let clean_word = self.clean_word(word);
1052 if !self.is_common_word(&clean_word) {
1053 let confidence = 0.5; if confidence >= self.min_confidence {
1055 entities.push(self.create_entity(
1056 clean_word,
1057 entity_type,
1058 confidence,
1059 chunk_id,
1060 text,
1061 )?);
1062 }
1063 }
1064 }
1065 }
1066
1067 Ok(entities)
1068 }
1069
1070 fn is_common_word(&self, word: &str) -> bool {
1072 let common_words = [
1073 "the", "and", "but", "or", "in", "on", "at", "to", "for", "with", "by", "from",
1074 "about", "into", "through", "during", "before", "after", "above", "below", "up",
1075 "down", "out", "off", "over", "under", "again", "further", "then", "once", "here",
1076 "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
1077 "most", "other", "some", "such", "only", "own", "same", "so", "than", "too", "very",
1078 "can", "will", "just", "should", "now", "could", "would", "said", "says", "told",
1079 "asked", "went", "came", "come", "going", "Chapter", "Page", "Section", "Part", "Book",
1080 "Volume",
1081 ];
1082
1083 common_words
1084 .iter()
1085 .any(|&common| word.eq_ignore_ascii_case(common))
1086 }
1087}
1088
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::{ChunkId, DocumentId};

    /// The default extractor should find at least one PERSON in a chunk
    /// containing a known name and a title-prefixed name.
    #[test]
    fn test_person_extraction() {
        let extractor = EntityExtractor::new(0.5).unwrap();
        let chunk = TextChunk::new(
            ChunkId::new("test_chunk".to_string()),
            DocumentId::new("test_doc".to_string()),
            "Entity Name works at Test Corp. Dr. Second Entity is a professor.".to_string(),
            0,
            59,
        );

        let entities = extractor.extract_from_chunk(&chunk).unwrap();

        assert!(!entities.is_empty());

        let person_entities: Vec<_> = entities
            .iter()
            .filter(|e| e.entity_type == "PERSON")
            .collect();
        assert!(!person_entities.is_empty());
    }

    /// Co-occurring entities within one chunk should yield at least one
    /// pairwise relationship.
    #[test]
    fn test_relationship_extraction() {
        let extractor = EntityExtractor::new(0.5).unwrap();
        let chunk = TextChunk::new(
            ChunkId::new("test_chunk".to_string()),
            DocumentId::new("test_doc".to_string()),
            "Entity Name works for Test Corp in Test City.".to_string(),
            0,
            44,
        );

        let entities = extractor.extract_from_chunk(&chunk).unwrap();
        let relationships = extractor.extract_relationships(&entities, &chunk).unwrap();

        assert!(!relationships.is_empty());
    }
}