use anyhow::Result;
use chrono::{DateTime, Utc};
use oxirs_core::model::{triple::Triple, NamedNode, Object, Predicate, Subject};
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tracing::{debug, info, warn};
use uuid::Uuid;

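/// Configuration flags and thresholds for the knowledge extraction pipeline.
///
/// A minimal construction sketch; the field values shown are illustrative
/// overrides, not the defaults (marked `ignore` because the crate path is not
/// visible from this module alone):
///
/// ```ignore
/// let config = KnowledgeExtractionConfig {
///     confidence_threshold: 0.9,
///     max_extraction_depth: 5,
///     ..Default::default()
/// };
/// ```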
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KnowledgeExtractionConfig {
    pub enable_entity_extraction: bool,
    pub enable_relationship_extraction: bool,
    pub enable_schema_discovery: bool,
    pub enable_fact_validation: bool,
    pub enable_temporal_extraction: bool,
    pub enable_multilingual_extraction: bool,
    pub confidence_threshold: f64,
    pub max_extraction_depth: usize,
    pub language_models: Vec<String>,
}

impl Default for KnowledgeExtractionConfig {
    fn default() -> Self {
        Self {
            enable_entity_extraction: true,
            enable_relationship_extraction: true,
            enable_schema_discovery: true,
            enable_fact_validation: true,
            enable_temporal_extraction: true,
            enable_multilingual_extraction: false,
            confidence_threshold: 0.8,
            max_extraction_depth: 3,
            language_models: vec!["en".to_string()],
        }
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedKnowledge {
    pub knowledge_id: String,
    pub source_text: String,
    pub extracted_triples: Vec<Triple>,
    pub extracted_entities: Vec<ExtractedEntity>,
    pub extracted_relationships: Vec<ExtractedRelationship>,
    pub schema_elements: Vec<SchemaElement>,
    pub temporal_facts: Vec<TemporalFact>,
    pub confidence_score: f64,
    pub extraction_metadata: ExtractionMetadata,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedEntity {
    pub entity_id: String,
    pub entity_text: String,
    pub entity_type: EntityType,
    pub canonical_form: String,
    pub aliases: Vec<String>,
    pub properties: HashMap<String, String>,
    pub confidence: f64,
    pub source_position: TextPosition,
    pub linked_entities: Vec<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedRelationship {
    pub relationship_id: String,
    pub subject_entity: String,
    pub predicate: String,
    pub object_entity: String,
    pub relationship_type: RelationshipType,
    pub confidence: f64,
    pub evidence_text: String,
    pub temporal_context: Option<TemporalContext>,
    pub source_position: TextPosition,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaElement {
    pub element_id: String,
    pub element_type: SchemaElementType,
    pub name: String,
    pub description: String,
    pub properties: Vec<SchemaProperty>,
    pub hierarchical_relations: Vec<HierarchicalRelation>,
    pub constraints: Vec<SchemaConstraint>,
    pub confidence: f64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TemporalFact {
    pub fact_id: String,
    pub subject: String,
    pub predicate: String,
    pub object: String,
    pub temporal_qualifier: TemporalQualifier,
    pub confidence: f64,
    pub source_text: String,
}

#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum EntityType {
    Person,
    Organization,
    Location,
    Event,
    Concept,
    Product,
    Technology,
    Scientific,
    Temporal,
    Numerical,
    Unknown,
}

#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum RelationshipType {
    IsA,
    PartOf,
    LocatedIn,
    OwnedBy,
    CreatedBy,
    CausedBy,
    TemporalSequence,
    Similarity,
    Dependency,
    Custom(String),
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum SchemaElementType {
    Class,
    Property,
    Relationship,
    Constraint,
    Rule,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextPosition {
    pub start_offset: usize,
    pub end_offset: usize,
    pub line_number: usize,
    pub column_number: usize,
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct TemporalContext {
    pub start_time: Option<DateTime<Utc>>,
    pub end_time: Option<DateTime<Utc>>,
    pub duration: Option<std::time::Duration>,
    pub temporal_relation: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TemporalQualifier {
    pub qualifier_type: TemporalType,
    pub time_point: Option<DateTime<Utc>>,
    pub time_interval: Option<TimeInterval>,
    pub frequency: Option<String>,
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum TemporalType {
    PointInTime,
    TimeInterval,
    Frequency,
    Duration,
    Relative,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimeInterval {
    pub start: DateTime<Utc>,
    pub end: DateTime<Utc>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaProperty {
    pub property_name: String,
    pub property_type: String,
    pub cardinality: Cardinality,
    pub domain: Option<String>,
    pub range: Option<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HierarchicalRelation {
    pub relation_type: HierarchyType,
    pub parent: String,
    pub child: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaConstraint {
    pub constraint_type: ConstraintType,
    pub description: String,
    pub enforcement_level: EnforcementLevel,
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum Cardinality {
    ZeroOrOne,
    ExactlyOne,
    ZeroOrMore,
    OneOrMore,
    Exact(usize),
    Range(usize, usize),
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum HierarchyType {
    SubClassOf,
    SubPropertyOf,
    PartOf,
    InstanceOf,
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum ConstraintType {
    UniqueValue,
    RequiredProperty,
    ValueRange,
    DataType,
    Pattern,
    Cardinality,
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum EnforcementLevel {
    Strict,
    Warning,
    Suggestion,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionMetadata {
    pub extraction_timestamp: DateTime<Utc>,
    pub extraction_method: String,
    pub processing_time_ms: u64,
    pub language_detected: String,
    pub text_length: usize,
    pub extraction_statistics: ExtractionStatistics,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionStatistics {
    pub entities_extracted: usize,
    pub relationships_extracted: usize,
    pub triples_generated: usize,
    pub schema_elements_discovered: usize,
    pub temporal_facts_extracted: usize,
    pub average_confidence: f64,
}

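/// Pattern-based engine that turns unstructured text into entities,
/// relationships, RDF triples, schema elements, and temporal facts.
///
/// A minimal usage sketch, assuming an async runtime such as tokio (the same
/// call pattern as the tests at the bottom of this file):
///
/// ```ignore
/// let mut engine = KnowledgeExtractionEngine::new(KnowledgeExtractionConfig::default())?;
/// let knowledge = engine.extract_knowledge("Dr. John Smith works at Microsoft Corp.").await?;
/// println!("{} entities extracted", knowledge.extracted_entities.len());
/// ```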
pub struct KnowledgeExtractionEngine {
    config: KnowledgeExtractionConfig,
    entity_patterns: HashMap<EntityType, Vec<Regex>>,
    relationship_patterns: HashMap<RelationshipType, Vec<Regex>>,
    temporal_patterns: Vec<Regex>,
    schema_inference_rules: Vec<SchemaInferenceRule>,
    language_detectors: HashMap<String, LanguageDetector>,
}

#[derive(Debug, Clone)]
struct SchemaInferenceRule {
    rule_id: String,
    pattern: Regex,
    inferred_type: SchemaElementType,
    confidence_modifier: f64,
}

#[derive(Debug, Clone)]
struct LanguageDetector {
    language_code: String,
    detection_patterns: Vec<Regex>,
    confidence_threshold: f64,
}

impl KnowledgeExtractionEngine {
    pub fn new(config: KnowledgeExtractionConfig) -> Result<Self> {
        let mut engine = Self {
            config,
            entity_patterns: HashMap::new(),
            relationship_patterns: HashMap::new(),
            temporal_patterns: Vec::new(),
            schema_inference_rules: Vec::new(),
            language_detectors: HashMap::new(),
        };

        engine.initialize_extraction_patterns()?;
        engine.initialize_schema_rules()?;
        engine.initialize_language_detectors()?;

        Ok(engine)
    }

    fn initialize_extraction_patterns(&mut self) -> Result<()> {
        // Person entities: capitalized first/last names and honorific forms.
        let person_patterns = vec![
            Regex::new(r"\b[A-Z][a-z]+ [A-Z][a-z]+\b")?,
            Regex::new(r"\bDr\. [A-Z][a-z]+\b")?,
            Regex::new(r"\bProf\. [A-Z][a-z]+\b")?,
        ];
        self.entity_patterns
            .insert(EntityType::Person, person_patterns);

        // Organization entities: corporate suffixes and institutional names.
        let org_patterns = vec![
            Regex::new(r"\b[A-Z][a-z]+ (Inc|Corp|Ltd|LLC)\b")?,
            Regex::new(r"\bUniversity of [A-Z][a-z]+\b")?,
            Regex::new(r"\b[A-Z][A-Z]+ Corporation\b")?,
        ];
        self.entity_patterns
            .insert(EntityType::Organization, org_patterns);

        // Location entities: "City, ST" style and capitalized place names.
        let location_patterns = vec![
            Regex::new(r"\b[A-Z][a-z]+, [A-Z][A-Z]\b")?,
            Regex::new(r"\b[A-Z][a-z]+ [A-Z][a-z]+\b")?,
        ];
        self.entity_patterns
            .insert(EntityType::Location, location_patterns);

        // Temporal expressions: ISO dates, written-out dates, and relative references.
        self.temporal_patterns = vec![
            Regex::new(r"\b\d{4}-\d{2}-\d{2}\b")?,
            Regex::new(
                r"\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b",
            )?,
            Regex::new(r"\b(before|after|during|since|until) \d{4}\b")?,
        ];

        // "is-a" relationships.
        let isa_patterns = vec![
            Regex::new(r"(.+) is an? (.+)")?,
            Regex::new(r"(.+) type of (.+)")?,
        ];
        self.relationship_patterns
            .insert(RelationshipType::IsA, isa_patterns);

        // "part-of" relationships.
        let partof_patterns = vec![
            Regex::new(r"(.+) part of (.+)")?,
            Regex::new(r"(.+) component of (.+)")?,
        ];
        self.relationship_patterns
            .insert(RelationshipType::PartOf, partof_patterns);

        Ok(())
    }

    fn initialize_schema_rules(&mut self) -> Result<()> {
        self.schema_inference_rules = vec![
            SchemaInferenceRule {
                rule_id: "class_definition".to_string(),
                pattern: Regex::new(r"(.+) is a type of (.+)")?,
                inferred_type: SchemaElementType::Class,
                confidence_modifier: 0.9,
            },
            SchemaInferenceRule {
                rule_id: "property_definition".to_string(),
                pattern: Regex::new(r"(.+) has (.+)")?,
                inferred_type: SchemaElementType::Property,
                confidence_modifier: 0.8,
            },
        ];

        Ok(())
    }

    fn initialize_language_detectors(&mut self) -> Result<()> {
        self.language_detectors.insert(
            "en".to_string(),
            LanguageDetector {
                language_code: "en".to_string(),
                detection_patterns: vec![
                    Regex::new(r"\b(the|and|or|but|if|when|where)\b")?,
                    Regex::new(r"\b(is|are|was|were|have|has|had)\b")?,
                ],
                confidence_threshold: 0.7,
            },
        );

        Ok(())
    }

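    /// Runs the full extraction pipeline over `text`: language detection,
    /// entity and relationship extraction, triple generation, schema
    /// discovery, temporal-fact extraction, and (optionally) fact validation,
    /// each step gated by the corresponding flag in [`KnowledgeExtractionConfig`].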
    pub async fn extract_knowledge(&mut self, text: &str) -> Result<ExtractedKnowledge> {
        let start_time = std::time::Instant::now();
        info!(
            "Starting knowledge extraction from text of length: {}",
            text.len()
        );

        let knowledge_id = Uuid::new_v4().to_string();
        let mut extracted_triples = Vec::new();
        let mut extracted_entities = Vec::new();
        let mut extracted_relationships = Vec::new();
        let mut schema_elements = Vec::new();
        let mut temporal_facts = Vec::new();

        // Detect the dominant language before running any extractors.
        let detected_language = self.detect_language(text).await?;
        debug!("Detected language: {}", detected_language);

        if self.config.enable_entity_extraction {
            extracted_entities = self.extract_entities(text).await?;
            debug!("Extracted {} entities", extracted_entities.len());
        }

        if self.config.enable_relationship_extraction {
            extracted_relationships = self
                .extract_relationships(text, &extracted_entities)
                .await?;
            debug!("Extracted {} relationships", extracted_relationships.len());
        }

        // Convert extracted relationships into RDF triples.
        for relationship in &extracted_relationships {
            if let Ok(triple) = self.relationship_to_triple(relationship) {
                extracted_triples.push(triple);
            }
        }

        if self.config.enable_schema_discovery {
            schema_elements = self
                .discover_schema_elements(text, &extracted_entities)
                .await?;
            debug!("Discovered {} schema elements", schema_elements.len());
        }

        if self.config.enable_temporal_extraction {
            temporal_facts = self
                .extract_temporal_facts(text, &extracted_entities)
                .await?;
            debug!("Extracted {} temporal facts", temporal_facts.len());
        }

        if self.config.enable_fact_validation {
            self.validate_extracted_facts(&mut extracted_triples, &extracted_relationships)
                .await?;
        }

        let confidence_score = self.calculate_extraction_confidence(
            &extracted_entities,
            &extracted_relationships,
            &schema_elements,
        );

        let processing_time = start_time.elapsed().as_millis() as u64;

        let extraction_statistics = ExtractionStatistics {
            entities_extracted: extracted_entities.len(),
            relationships_extracted: extracted_relationships.len(),
            triples_generated: extracted_triples.len(),
            schema_elements_discovered: schema_elements.len(),
            temporal_facts_extracted: temporal_facts.len(),
            average_confidence: confidence_score,
        };

        let extraction_metadata = ExtractionMetadata {
            extraction_timestamp: Utc::now(),
            extraction_method: "Pattern-based + LLM-enhanced".to_string(),
            processing_time_ms: processing_time,
            language_detected: detected_language,
            text_length: text.len(),
            extraction_statistics,
        };

        info!("Knowledge extraction completed in {}ms", processing_time);

        Ok(ExtractedKnowledge {
            knowledge_id,
            source_text: text.to_string(),
            extracted_triples,
            extracted_entities,
            extracted_relationships,
            schema_elements,
            temporal_facts,
            confidence_score,
            extraction_metadata,
        })
    }

    async fn detect_language(&self, text: &str) -> Result<String> {
        for (lang_code, detector) in &self.language_detectors {
            let mut matches = 0;
            let mut total_patterns = 0;

            for pattern in &detector.detection_patterns {
                total_patterns += 1;
                if pattern.is_match(text) {
                    matches += 1;
                }
            }

            let confidence = matches as f64 / total_patterns as f64;
            if confidence >= detector.confidence_threshold {
                return Ok(lang_code.clone());
            }
        }

        Ok("unknown".to_string())
    }

    async fn extract_entities(&self, text: &str) -> Result<Vec<ExtractedEntity>> {
        let mut entities = Vec::new();

        for (entity_type, patterns) in &self.entity_patterns {
            for pattern in patterns {
                for capture in pattern.find_iter(text) {
                    let entity_text = capture.as_str();
                    let start_pos = capture.start();
                    let end_pos = capture.end();

                    let entity = ExtractedEntity {
                        entity_id: Uuid::new_v4().to_string(),
                        entity_text: entity_text.to_string(),
                        entity_type: entity_type.clone(),
                        canonical_form: self.canonicalize_entity(entity_text),
                        aliases: Vec::new(),
                        properties: HashMap::new(),
                        // Default confidence for a plain pattern match.
                        confidence: 0.8,
                        source_position: TextPosition {
                            start_offset: start_pos,
                            end_offset: end_pos,
                            line_number: self.get_line_number(text, start_pos),
                            column_number: self.get_column_number(text, start_pos),
                        },
                        linked_entities: Vec::new(),
                    };

                    entities.push(entity);
                }
            }
        }

        // Deduplicate entities that share the same canonical form.
        entities.sort_by(|a, b| a.canonical_form.cmp(&b.canonical_form));
        entities.dedup_by(|a, b| a.canonical_form == b.canonical_form);

        Ok(entities)
    }

    async fn extract_relationships(
        &self,
        text: &str,
        entities: &[ExtractedEntity],
    ) -> Result<Vec<ExtractedRelationship>> {
        let mut relationships = Vec::new();

        for (relationship_type, patterns) in &self.relationship_patterns {
            for pattern in patterns {
                if let Some(captures) = pattern.captures(text) {
                    if captures.len() >= 3 {
                        let subject = captures.get(1).unwrap().as_str();
                        let object = captures.get(2).unwrap().as_str();

                        // Only keep relationships whose endpoints match extracted entities.
                        let subject_entity = self.find_matching_entity(subject, entities);
                        let object_entity = self.find_matching_entity(object, entities);

                        if let (Some(subj), Some(obj)) = (subject_entity, object_entity) {
                            let relationship = ExtractedRelationship {
                                relationship_id: Uuid::new_v4().to_string(),
                                subject_entity: subj.entity_id.clone(),
                                predicate: self.relationship_type_to_predicate(relationship_type),
                                object_entity: obj.entity_id.clone(),
                                relationship_type: relationship_type.clone(),
                                confidence: 0.8,
                                evidence_text: captures.get(0).unwrap().as_str().to_string(),
                                temporal_context: None,
                                source_position: TextPosition {
                                    start_offset: captures.get(0).unwrap().start(),
                                    end_offset: captures.get(0).unwrap().end(),
                                    // Line/column tracking is not computed for relationship matches.
                                    line_number: 1,
                                    column_number: 1,
                                },
                            };

                            relationships.push(relationship);
                        }
                    }
                }
            }
        }

        Ok(relationships)
    }

    async fn discover_schema_elements(
        &self,
        text: &str,
        _entities: &[ExtractedEntity],
    ) -> Result<Vec<SchemaElement>> {
        let mut schema_elements = Vec::new();

        for rule in &self.schema_inference_rules {
            for capture in rule.pattern.find_iter(text) {
                let element = SchemaElement {
                    element_id: Uuid::new_v4().to_string(),
                    element_type: rule.inferred_type.clone(),
                    name: capture.as_str().to_string(),
                    description: format!("Inferred from: {}", capture.as_str()),
                    properties: Vec::new(),
                    hierarchical_relations: Vec::new(),
                    constraints: Vec::new(),
                    confidence: rule.confidence_modifier,
                };

                schema_elements.push(element);
            }
        }

        Ok(schema_elements)
    }

    async fn extract_temporal_facts(
        &self,
        text: &str,
        _entities: &[ExtractedEntity],
    ) -> Result<Vec<TemporalFact>> {
        let mut temporal_facts = Vec::new();

        for pattern in &self.temporal_patterns {
            for capture in pattern.find_iter(text) {
                let temporal_text = capture.as_str();

                let temporal_fact = TemporalFact {
                    fact_id: Uuid::new_v4().to_string(),
                    // Placeholder subject: the match is not yet linked to a specific entity.
                    subject: "temporal_entity".to_string(),
                    predicate: "occurs_at".to_string(),
                    object: temporal_text.to_string(),
                    temporal_qualifier: TemporalQualifier {
                        qualifier_type: TemporalType::PointInTime,
                        time_point: self.parse_temporal_expression(temporal_text),
                        time_interval: None,
                        frequency: None,
                    },
                    confidence: 0.8,
                    source_text: temporal_text.to_string(),
                };

                temporal_facts.push(temporal_fact);
            }
        }

        Ok(temporal_facts)
    }

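    /// Drops relationships below the configured confidence threshold, then
    /// removes facts that contradict each other (conflicting values for
    /// functional predicates, temporal inconsistencies, cyclic hierarchies,
    /// and domain/range violations), rebuilding `triples` from the survivors.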
    async fn validate_extracted_facts(
        &self,
        triples: &mut Vec<Triple>,
        relationships: &[ExtractedRelationship],
    ) -> Result<()> {
        // Keep only relationships that meet the configured confidence threshold.
        let valid_relationships: Vec<_> = relationships
            .iter()
            .filter(|r| r.confidence >= self.config.confidence_threshold)
            .collect();

        let mut contradictions_found = 0;
        let mut validated_triples = Vec::new();

        // Index relationships by subject and by predicate for contradiction checks.
        let mut subject_predicates: HashMap<String, Vec<&ExtractedRelationship>> = HashMap::new();
        let mut predicate_pairs: HashMap<String, Vec<(&str, &str)>> = HashMap::new();

        for relationship in &valid_relationships {
            subject_predicates
                .entry(relationship.subject_entity.clone())
                .or_default()
                .push(relationship);

            predicate_pairs
                .entry(relationship.predicate.clone())
                .or_default()
                .push((&relationship.subject_entity, &relationship.object_entity));
        }

        // Detect subjects with multiple conflicting values for functional predicates.
        for (subject, relationships) in &subject_predicates {
            let mut predicate_values: HashMap<String, Vec<&str>> = HashMap::new();

            for rel in relationships {
                predicate_values
                    .entry(rel.predicate.clone())
                    .or_default()
                    .push(&rel.object_entity);
            }

            for (predicate, values) in predicate_values {
                if values.len() > 1 {
                    let unique_values: std::collections::HashSet<_> = values.into_iter().collect();
                    if unique_values.len() > 1 && self.is_contradictory_predicate(&predicate) {
                        warn!(
                            "Contradiction detected for {}: {} has multiple {} values: {:?}",
                            subject, subject, predicate, unique_values
                        );
                        contradictions_found += 1;

                        // Keep only the highest-confidence relationship for this predicate.
                        if let Some(best_rel) = relationships
                            .iter()
                            .filter(|r| r.predicate == predicate)
                            .max_by(|a, b| {
                                a.confidence
                                    .partial_cmp(&b.confidence)
                                    .unwrap_or(std::cmp::Ordering::Equal)
                            })
                        {
                            if let Ok(triple) = self.relationship_to_triple(best_rel) {
                                validated_triples.push(triple);
                            }
                        }
                        continue;
                    }
                }
            }
        }

        // Check temporal consistency and convert surviving relationships to triples.
        for relationship in &valid_relationships {
            if let Some(temporal_context) = &relationship.temporal_context {
                if !self.validate_temporal_consistency(temporal_context, &valid_relationships) {
                    warn!(
                        "Temporal inconsistency detected for relationship: {} {} {}",
                        relationship.subject_entity,
                        relationship.predicate,
                        relationship.object_entity
                    );
                    contradictions_found += 1;
                    continue;
                }
            }

            if let Ok(triple) = self.relationship_to_triple(relationship) {
                validated_triples.push(triple);
            }
        }

        self.validate_logical_consistency(&valid_relationships, &mut contradictions_found)?;

        // Replace the caller's triples with the validated set.
        triples.clear();
        triples.extend(validated_triples);

        if contradictions_found > 0 {
            warn!(
                "Found {} contradictions during fact validation",
                contradictions_found
            );
        }

        debug!("Validated {} relationships", valid_relationships.len());
        Ok(())
    }

    fn is_contradictory_predicate(&self, predicate: &str) -> bool {
        // Functional predicates that should only ever take a single value.
        let functional_predicates = [
            "birthDate",
            "deathDate",
            "age",
            "height",
            "weight",
            "hasGender",
            "isA",
            "type",
            "hasCapital",
            "hasPopulation",
            "hasArea",
            "founded",
            "established",
            "created",
            "died",
            "born",
        ];

        functional_predicates.iter().any(|&fp| {
            predicate.to_lowercase().contains(&fp.to_lowercase()) || predicate.ends_with(&fp)
        })
    }

    fn validate_temporal_consistency(
        &self,
        temporal_context: &TemporalContext,
        all_relationships: &[&ExtractedRelationship],
    ) -> bool {
        // A context whose start is not strictly before its end is inconsistent.
        if let (Some(start), Some(end)) = (&temporal_context.start_time, &temporal_context.end_time)
        {
            if start >= end {
                return false;
            }
        }

        // Compare against the temporal contexts of all other relationships.
        for other_rel in all_relationships {
            if let Some(other_temporal) = &other_rel.temporal_context {
                if temporal_context != other_temporal
                    && self.temporal_periods_conflict(temporal_context, other_temporal)
                {
                    return false;
                }
            }
        }

        true
    }

    fn temporal_periods_conflict(
        &self,
        context1: &TemporalContext,
        context2: &TemporalContext,
    ) -> bool {
        // Two fully specified periods are flagged as conflicting when they are
        // disjoint (no overlap at all); partially specified periods never conflict.
        match (
            (&context1.start_time, &context1.end_time),
            (&context2.start_time, &context2.end_time),
        ) {
            ((Some(start1), Some(end1)), (Some(start2), Some(end2))) => {
                end1 < start2 || end2 < start1
            }
            _ => false,
        }
    }

    fn validate_logical_consistency(
        &self,
        relationships: &[&ExtractedRelationship],
        contradictions_found: &mut usize,
    ) -> Result<()> {
        // Collect is-a and part-of edges so hierarchies can be checked for cycles.
        let mut is_a_relationships: HashMap<String, String> = HashMap::new();
        let mut part_of_relationships: HashMap<String, String> = HashMap::new();

        for rel in relationships {
            let pred_lower = rel.predicate.to_lowercase();
            if pred_lower.contains("isa")
                || pred_lower.contains("instanceof")
                || pred_lower.contains("type")
            {
                is_a_relationships.insert(rel.subject_entity.clone(), rel.object_entity.clone());
            } else if pred_lower.contains("partof")
                || pred_lower.contains("contains")
                || pred_lower.contains("within")
            {
                part_of_relationships.insert(rel.subject_entity.clone(), rel.object_entity.clone());
            }
        }

        for (subject, object) in &is_a_relationships {
            if self.has_cycle_in_hierarchy(subject, object, &is_a_relationships) {
                warn!(
                    "Logical contradiction: Cycle detected in is-a relationship for {}",
                    subject
                );
                *contradictions_found += 1;
            }
        }

        for (subject, object) in &part_of_relationships {
            if self.has_cycle_in_hierarchy(subject, object, &part_of_relationships) {
                warn!(
                    "Logical contradiction: Cycle detected in part-of relationship for {}",
                    subject
                );
                *contradictions_found += 1;
            }
        }

        self.validate_domain_range_constraints(relationships, contradictions_found)?;

        Ok(())
    }

    fn has_cycle_in_hierarchy(
        &self,
        start: &str,
        current: &str,
        hierarchy: &HashMap<String, String>,
    ) -> bool {
        // A self-loop is trivially a cycle.
        if start == current {
            return true;
        }

        // Walk up the hierarchy from `current`; revisiting a node or reaching
        // `start` again means the chain loops back on itself.
        let mut visited = std::collections::HashSet::new();
        let mut current_node = current;

        while let Some(parent) = hierarchy.get(current_node) {
            if visited.contains(current_node) || current_node == start {
                return true;
            }
            visited.insert(current_node.to_string());
            current_node = parent;
        }

        false
    }

    fn validate_domain_range_constraints(
        &self,
        relationships: &[&ExtractedRelationship],
        contradictions_found: &mut usize,
    ) -> Result<()> {
        // (predicate, expected subject type, expected object type)
        let constraints = [
            ("age", "Person", "Number"),
            ("birthDate", "Person", "Date"),
            ("hasCapital", "Country", "City"),
            ("hasPopulation", "Place", "Number"),
            ("authorOf", "Person", "Book"),
            ("marriedTo", "Person", "Person"),
        ];

        for rel in relationships {
            for (predicate, expected_domain, expected_range) in &constraints {
                if rel
                    .predicate
                    .to_lowercase()
                    .contains(&predicate.to_lowercase())
                {
                    if !self.entity_matches_type(
                        &rel.subject_entity,
                        expected_domain,
                        relationships,
                    ) {
                        warn!(
                            "Domain constraint violation: {} should be of type {} for predicate {}",
                            rel.subject_entity, expected_domain, rel.predicate
                        );
                        *contradictions_found += 1;
                    }

                    if !self.entity_matches_type(&rel.object_entity, expected_range, relationships)
                    {
                        warn!(
                            "Range constraint violation: {} should be of type {} for predicate {}",
                            rel.object_entity, expected_range, rel.predicate
                        );
                        *contradictions_found += 1;
                    }
                }
            }
        }

        Ok(())
    }

    fn entity_matches_type(
        &self,
        entity: &str,
        expected_type: &str,
        relationships: &[&ExtractedRelationship],
    ) -> bool {
        // Lightweight heuristics: check the entity string itself, then fall back
        // to explicit type assertions found among the extracted relationships.
        let entity_lower = entity.to_lowercase();
        let type_lower = expected_type.to_lowercase();

        match type_lower.as_str() {
            "person" => {
                entity_lower.contains("person")
                    || entity_lower.contains("author")
                    || entity_lower.contains("writer")
                    || entity_lower.contains("scientist")
                    || entity.chars().next().is_some_and(|c| c.is_uppercase())
            }
            "number" => {
                entity.parse::<f64>().is_ok()
                    || entity_lower.contains("million")
                    || entity_lower.contains("thousand")
                    || entity_lower.contains("year")
            }
            "date" => {
                // Century prefixes ("19xx"/"20xx") or month names.
                entity_lower.contains("19")
                    || entity_lower.contains("20")
                    || entity_lower.contains("january")
                    || entity_lower.contains("february")
                    || entity_lower.contains("march")
                    || entity_lower.contains("april")
                    || entity_lower.contains("may")
                    || entity_lower.contains("june")
                    || entity_lower.contains("july")
                    || entity_lower.contains("august")
                    || entity_lower.contains("september")
                    || entity_lower.contains("october")
                    || entity_lower.contains("november")
                    || entity_lower.contains("december")
            }
            "country" => {
                entity_lower.contains("country")
                    || entity_lower.contains("nation")
                    || relationships.iter().any(|r| {
                        r.subject_entity == entity
                            && r.predicate.to_lowercase().contains("type")
                            && r.object_entity.to_lowercase().contains("country")
                    })
            }
            "city" => {
                entity_lower.contains("city")
                    || entity_lower.contains("town")
                    || relationships.iter().any(|r| {
                        r.subject_entity == entity
                            && r.predicate.to_lowercase().contains("type")
                            && r.object_entity.to_lowercase().contains("city")
                    })
            }
            "book" => {
                entity_lower.contains("book")
                    || entity_lower.contains("novel")
                    || entity_lower.contains("publication")
                    || relationships.iter().any(|r| {
                        r.subject_entity == entity
                            && r.predicate.to_lowercase().contains("type")
                            && r.object_entity.to_lowercase().contains("book")
                    })
            }
            // Unknown expected types are not checked.
            _ => true,
        }
    }

    fn relationship_to_triple(&self, relationship: &ExtractedRelationship) -> Result<Triple> {
        // Mint URIs for the extracted entities and predicate under the example namespace.
        let subject = NamedNode::new(format!(
            "http://example.org/entity/{}",
            relationship.subject_entity
        ))?;
        let predicate = NamedNode::new(format!(
            "http://example.org/predicate/{}",
            relationship.predicate
        ))?;
        let object = NamedNode::new(format!(
            "http://example.org/entity/{}",
            relationship.object_entity
        ))?;

        Ok(Triple::new(
            Subject::NamedNode(subject),
            Predicate::NamedNode(predicate),
            Object::NamedNode(object),
        ))
    }

    fn canonicalize_entity(&self, entity: &str) -> String {
        entity.trim().to_lowercase()
    }

    fn get_line_number(&self, text: &str, offset: usize) -> usize {
        text[..offset].chars().filter(|&c| c == '\n').count() + 1
    }

    fn get_column_number(&self, text: &str, offset: usize) -> usize {
        text[..offset]
            .chars()
            .rev()
            .take_while(|&c| c != '\n')
            .count()
            + 1
    }

    fn find_matching_entity<'a>(
        &self,
        text: &str,
        entities: &'a [ExtractedEntity],
    ) -> Option<&'a ExtractedEntity> {
        entities
            .iter()
            .find(|e| e.entity_text == text || e.canonical_form == self.canonicalize_entity(text))
    }

    fn relationship_type_to_predicate(&self, rel_type: &RelationshipType) -> String {
        match rel_type {
            RelationshipType::IsA => "rdf:type".to_string(),
            RelationshipType::PartOf => "part_of".to_string(),
            RelationshipType::LocatedIn => "located_in".to_string(),
            RelationshipType::OwnedBy => "owned_by".to_string(),
            RelationshipType::CreatedBy => "created_by".to_string(),
            RelationshipType::CausedBy => "caused_by".to_string(),
            RelationshipType::TemporalSequence => "temporal_sequence".to_string(),
            RelationshipType::Similarity => "similar_to".to_string(),
            RelationshipType::Dependency => "depends_on".to_string(),
            RelationshipType::Custom(pred) => pred.clone(),
        }
    }

    fn parse_temporal_expression(&self, temporal_text: &str) -> Option<DateTime<Utc>> {
        // `DateTime::parse_from_str` needs a time and offset, so bare ISO dates
        // like "2024-03-15" are parsed as a NaiveDate and promoted to midnight UTC.
        chrono::NaiveDate::parse_from_str(temporal_text, "%Y-%m-%d")
            .ok()
            .and_then(|date| date.and_hms_opt(0, 0, 0))
            .map(|naive| DateTime::from_naive_utc_and_offset(naive, Utc))
    }

    fn calculate_extraction_confidence(
        &self,
        entities: &[ExtractedEntity],
        relationships: &[ExtractedRelationship],
        schema_elements: &[SchemaElement],
    ) -> f64 {
        let mut total_confidence = 0.0;
        let mut count = 0;

        for entity in entities {
            total_confidence += entity.confidence;
            count += 1;
        }

        for relationship in relationships {
            total_confidence += relationship.confidence;
            count += 1;
        }

        for schema_element in schema_elements {
            total_confidence += schema_element.confidence;
            count += 1;
        }

        if count > 0 {
            total_confidence / count as f64
        } else {
            0.0
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_knowledge_extraction_engine_creation() {
        let config = KnowledgeExtractionConfig::default();
        let engine = KnowledgeExtractionEngine::new(config);

        assert!(engine.is_ok());
    }
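
    // A minimal sketch checking that a customized configuration is accepted by
    // the constructor; the field values below are illustrative, not defaults.
    #[test]
    fn test_engine_creation_with_custom_config() {
        let config = KnowledgeExtractionConfig {
            confidence_threshold: 0.9,
            max_extraction_depth: 5,
            ..Default::default()
        };

        assert!(KnowledgeExtractionEngine::new(config).is_ok());
    }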

    #[tokio::test]
    async fn test_entity_extraction() {
        let config = KnowledgeExtractionConfig::default();
        let mut engine = KnowledgeExtractionEngine::new(config).unwrap();

        let text = "Dr. John Smith works at Microsoft Corp.";
        let result = engine.extract_knowledge(text).await;

        assert!(result.is_ok());
        let knowledge = result.unwrap();
        assert!(!knowledge.extracted_entities.is_empty());
    }
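
    // A small sketch of temporal-fact extraction: the ISO-date pattern should
    // pick up "2024-03-15" as a point-in-time fact. The sentence is an
    // illustrative assumption, not a fixture from the original test suite.
    #[tokio::test]
    async fn test_temporal_fact_extraction() {
        let config = KnowledgeExtractionConfig::default();
        let mut engine = KnowledgeExtractionEngine::new(config).unwrap();

        let knowledge = engine
            .extract_knowledge("The project started on 2024-03-15.")
            .await
            .unwrap();

        assert!(!knowledge.temporal_facts.is_empty());
        assert_eq!(knowledge.temporal_facts[0].object, "2024-03-15");
    }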

    #[test]
    fn test_canonicalize_entity() {
        let config = KnowledgeExtractionConfig::default();
        let engine = KnowledgeExtractionEngine::new(config).unwrap();

        assert_eq!(engine.canonicalize_entity(" John Smith "), "john smith");
    }
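
    // Sketches of the smaller helpers: language detection, predicate mapping,
    // functional-predicate detection, and ISO-date parsing. The input strings
    // are illustrative assumptions chosen to trip (or miss) the patterns above.
    #[tokio::test]
    async fn test_language_detection() {
        let config = KnowledgeExtractionConfig::default();
        let engine = KnowledgeExtractionEngine::new(config).unwrap();

        let english = engine
            .detect_language("The results are stored in the graph and the index.")
            .await
            .unwrap();
        assert_eq!(english, "en");

        let unknown = engine.detect_language("xyzzy plugh").await.unwrap();
        assert_eq!(unknown, "unknown");
    }

    #[test]
    fn test_helper_behaviour() {
        let config = KnowledgeExtractionConfig::default();
        let engine = KnowledgeExtractionEngine::new(config).unwrap();

        // Relationship types map onto stable predicate names.
        assert_eq!(
            engine.relationship_type_to_predicate(&RelationshipType::IsA),
            "rdf:type"
        );
        assert_eq!(
            engine.relationship_type_to_predicate(&RelationshipType::Custom("likes".to_string())),
            "likes"
        );

        // Functional predicates are flagged as potentially contradictory.
        assert!(engine.is_contradictory_predicate("hasBirthDate"));
        assert!(!engine.is_contradictory_predicate("knows"));

        // Bare ISO dates parse to a point in time; other strings do not.
        assert!(engine.parse_temporal_expression("2024-03-15").is_some());
        assert!(engine.parse_temporal_expression("sometime soon").is_none());
    }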
}