1use anyhow::Result;
11use chrono::{DateTime, Utc};
12use oxirs_core::model::{triple::Triple, NamedNode, Object, Predicate, Subject};
13use regex::Regex;
14use serde::{Deserialize, Serialize};
15use std::collections::HashMap;
16use tracing::{debug, info, warn};
17use uuid::Uuid;
18
/// Feature switches and tuning knobs for the knowledge extraction pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KnowledgeExtractionConfig {
    // Extract named entities (people, organizations, locations, ...).
    pub enable_entity_extraction: bool,
    // Extract subject-predicate-object relationships between entities.
    pub enable_relationship_extraction: bool,
    // Infer schema elements (classes, properties) from the text.
    pub enable_schema_discovery: bool,
    // Run contradiction / consistency checks over extracted facts.
    pub enable_fact_validation: bool,
    // Extract temporal facts (dates, periods) from the text.
    pub enable_temporal_extraction: bool,
    // Enable language handling beyond the default detector set.
    pub enable_multilingual_extraction: bool,
    // Minimum confidence a relationship must reach to survive validation.
    pub confidence_threshold: f64,
    // Maximum extraction depth — NOTE(review): not consumed anywhere in this
    // file; confirm where it is enforced.
    pub max_extraction_depth: usize,
    // Language codes of the models/detectors to use (e.g. "en").
    pub language_models: Vec<String>,
}
32
33impl Default for KnowledgeExtractionConfig {
34 fn default() -> Self {
35 Self {
36 enable_entity_extraction: true,
37 enable_relationship_extraction: true,
38 enable_schema_discovery: true,
39 enable_fact_validation: true,
40 enable_temporal_extraction: true,
41 enable_multilingual_extraction: false,
42 confidence_threshold: 0.8,
43 max_extraction_depth: 3,
44 language_models: vec!["en".to_string()],
45 }
46 }
47}
48
/// Everything extracted from one input text, plus run metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedKnowledge {
    // Unique id (UUID v4) for this extraction run.
    pub knowledge_id: String,
    // The original input text, stored verbatim.
    pub source_text: String,
    // RDF triples materialized from the extracted relationships.
    pub extracted_triples: Vec<Triple>,
    pub extracted_entities: Vec<ExtractedEntity>,
    pub extracted_relationships: Vec<ExtractedRelationship>,
    // Schema-level constructs inferred from the text.
    pub schema_elements: Vec<SchemaElement>,
    pub temporal_facts: Vec<TemporalFact>,
    // Mean confidence across entities, relationships and schema elements.
    pub confidence_score: f64,
    pub extraction_metadata: ExtractionMetadata,
}

/// One named entity found in the text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedEntity {
    // Unique id (UUID v4).
    pub entity_id: String,
    // Surface form exactly as matched in the text.
    pub entity_text: String,
    pub entity_type: EntityType,
    // Normalized form (trimmed, lowercased) used for deduplication.
    pub canonical_form: String,
    // Alternative surface forms — not populated by pattern extraction.
    pub aliases: Vec<String>,
    pub properties: HashMap<String, String>,
    // Extraction confidence in [0, 1].
    pub confidence: f64,
    // Where in the source text the entity was matched.
    pub source_position: TextPosition,
    // Ids of linked / co-referent entities — not populated here.
    pub linked_entities: Vec<String>,
}

/// One subject-predicate-object relationship between extracted entities.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedRelationship {
    pub relationship_id: String,
    // `entity_id` of the subject entity.
    pub subject_entity: String,
    // Predicate string (see `relationship_type_to_predicate`).
    pub predicate: String,
    // `entity_id` of the object entity.
    pub object_entity: String,
    pub relationship_type: RelationshipType,
    pub confidence: f64,
    // The text span that evidences this relationship.
    pub evidence_text: String,
    // Optional period during which the relationship holds.
    pub temporal_context: Option<TemporalContext>,
    pub source_position: TextPosition,
}

/// A schema-level construct (class, property, ...) inferred from the text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaElement {
    pub element_id: String,
    pub element_type: SchemaElementType,
    // The matched text that named this element.
    pub name: String,
    pub description: String,
    pub properties: Vec<SchemaProperty>,
    pub hierarchical_relations: Vec<HierarchicalRelation>,
    pub constraints: Vec<SchemaConstraint>,
    // Confidence inherited from the inference rule that fired.
    pub confidence: f64,
}

/// A fact qualified by temporal information (when it holds/occurred).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TemporalFact {
    pub fact_id: String,
    pub subject: String,
    pub predicate: String,
    pub object: String,
    pub temporal_qualifier: TemporalQualifier,
    pub confidence: f64,
    // The temporal expression as it appeared in the text.
    pub source_text: String,
}
115
/// Coarse classification of an extracted entity.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum EntityType {
    Person,
    Organization,
    Location,
    Event,
    Concept,
    Product,
    Technology,
    Scientific,
    Temporal,
    Numerical,
    // Fallback when no pattern category applies.
    Unknown,
}

/// Classification of an extracted relationship.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum RelationshipType {
    IsA,
    PartOf,
    LocatedIn,
    OwnedBy,
    CreatedBy,
    CausedBy,
    TemporalSequence,
    Similarity,
    Dependency,
    // Free-form predicate carried verbatim as the predicate string.
    Custom(String),
}

/// Kind of schema construct a `SchemaElement` represents.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum SchemaElementType {
    Class,
    Property,
    Relationship,
    Constraint,
    Rule,
}
156
/// Location of a match inside the source text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextPosition {
    // Byte offset of the match start within the source string.
    pub start_offset: usize,
    // Byte offset one past the match end.
    pub end_offset: usize,
    // 1-based line number.
    pub line_number: usize,
    // 1-based column number (in characters).
    pub column_number: usize,
}

/// Time period attached to a relationship.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct TemporalContext {
    pub start_time: Option<DateTime<Utc>>,
    pub end_time: Option<DateTime<Utc>>,
    pub duration: Option<std::time::Duration>,
    // Free-form relation label (e.g. "before", "during").
    pub temporal_relation: String,
}

/// Temporal qualification of a fact (point, interval, frequency, ...).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TemporalQualifier {
    pub qualifier_type: TemporalType,
    // Populated for `TemporalType::PointInTime`.
    pub time_point: Option<DateTime<Utc>>,
    // Populated for `TemporalType::TimeInterval`.
    pub time_interval: Option<TimeInterval>,
    // Free-form frequency description (e.g. "weekly").
    pub frequency: Option<String>,
}

/// Kind of temporal qualification.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum TemporalType {
    PointInTime,
    TimeInterval,
    Frequency,
    Duration,
    Relative,
}

/// A closed time interval.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimeInterval {
    pub start: DateTime<Utc>,
    pub end: DateTime<Utc>,
}
200
/// A property attached to a discovered schema element.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaProperty {
    pub property_name: String,
    // Free-form type name (not validated against any vocabulary here).
    pub property_type: String,
    pub cardinality: Cardinality,
    // Optional domain (subject class) restriction.
    pub domain: Option<String>,
    // Optional range (object class/datatype) restriction.
    pub range: Option<String>,
}

/// A parent/child link inside a schema hierarchy.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HierarchicalRelation {
    pub relation_type: HierarchyType,
    pub parent: String,
    pub child: String,
}

/// A constraint attached to a schema element.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaConstraint {
    pub constraint_type: ConstraintType,
    pub description: String,
    pub enforcement_level: EnforcementLevel,
}

/// Allowed number of values for a schema property.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum Cardinality {
    ZeroOrOne,
    ExactlyOne,
    ZeroOrMore,
    OneOrMore,
    // Exactly n values.
    Exact(usize),
    // Between min and max values (inclusive semantics assumed — TODO confirm).
    Range(usize, usize),
}

/// Kind of hierarchical link between schema elements.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum HierarchyType {
    SubClassOf,
    SubPropertyOf,
    PartOf,
    InstanceOf,
}

/// Kind of schema constraint.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum ConstraintType {
    UniqueValue,
    RequiredProperty,
    ValueRange,
    DataType,
    Pattern,
    Cardinality,
}

/// How strictly a schema constraint should be applied.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum EnforcementLevel {
    Strict,
    Warning,
    Suggestion,
}
265
/// Bookkeeping for one extraction run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionMetadata {
    pub extraction_timestamp: DateTime<Utc>,
    // Human-readable description of the extraction strategy used.
    pub extraction_method: String,
    pub processing_time_ms: u64,
    // Language code from `detect_language`, or "unknown".
    pub language_detected: String,
    // Length of the input text in bytes.
    pub text_length: usize,
    pub extraction_statistics: ExtractionStatistics,
}

/// Per-run counters and aggregate confidence.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionStatistics {
    pub entities_extracted: usize,
    pub relationships_extracted: usize,
    pub triples_generated: usize,
    pub schema_elements_discovered: usize,
    pub temporal_facts_extracted: usize,
    // Mean confidence over all extracted items (0.0 when nothing was found).
    pub average_confidence: f64,
}
287
/// Pattern-based knowledge extraction engine.
///
/// All regex tables are compiled once at construction time (`new`) and reused
/// for every extraction call.
pub struct KnowledgeExtractionEngine {
    config: KnowledgeExtractionConfig,
    // Entity-type -> patterns that recognize entities of that type.
    entity_patterns: HashMap<EntityType, Vec<Regex>>,
    // Relationship-type -> patterns with capture groups 1 (subject) and 2 (object).
    relationship_patterns: HashMap<RelationshipType, Vec<Regex>>,
    // Patterns that recognize temporal expressions (dates, "before 1999", ...).
    temporal_patterns: Vec<Regex>,
    schema_inference_rules: Vec<SchemaInferenceRule>,
    // Language code -> detector used by `detect_language`.
    language_detectors: HashMap<String, LanguageDetector>,
}

/// A single pattern-to-schema-element inference rule.
#[derive(Debug, Clone)]
struct SchemaInferenceRule {
    rule_id: String,
    pattern: Regex,
    // Element type assigned to every match of `pattern`.
    inferred_type: SchemaElementType,
    // Confidence assigned to elements produced by this rule.
    confidence_modifier: f64,
}

/// Heuristic language detector: a language "matches" when enough of its
/// patterns hit the text.
#[derive(Debug, Clone)]
struct LanguageDetector {
    language_code: String,
    detection_patterns: Vec<Regex>,
    // Minimum fraction of patterns that must match.
    confidence_threshold: f64,
}
314
315impl KnowledgeExtractionEngine {
316 pub fn new(config: KnowledgeExtractionConfig) -> Result<Self> {
318 let mut engine = Self {
319 config,
320 entity_patterns: HashMap::new(),
321 relationship_patterns: HashMap::new(),
322 temporal_patterns: Vec::new(),
323 schema_inference_rules: Vec::new(),
324 language_detectors: HashMap::new(),
325 };
326
327 engine.initialize_extraction_patterns()?;
328 engine.initialize_schema_rules()?;
329 engine.initialize_language_detectors()?;
330
331 Ok(engine)
332 }
333
334 fn initialize_extraction_patterns(&mut self) -> Result<()> {
336 let person_patterns = vec![
338 Regex::new(r"\b[A-Z][a-z]+ [A-Z][a-z]+\b")?, Regex::new(r"\bDr\. [A-Z][a-z]+\b")?, Regex::new(r"\bProf\. [A-Z][a-z]+\b")?, ];
342 self.entity_patterns
343 .insert(EntityType::Person, person_patterns);
344
345 let org_patterns = vec![
347 Regex::new(r"\b[A-Z][a-z]+ (Inc|Corp|Ltd|LLC)\b")?,
348 Regex::new(r"\bUniversity of [A-Z][a-z]+\b")?,
349 Regex::new(r"\b[A-Z][A-Z]+ Corporation\b")?,
350 ];
351 self.entity_patterns
352 .insert(EntityType::Organization, org_patterns);
353
354 let location_patterns = vec![
356 Regex::new(r"\b[A-Z][a-z]+, [A-Z][A-Z]\b")?, Regex::new(r"\b[A-Z][a-z]+ [A-Z][a-z]+\b")?, ];
359 self.entity_patterns
360 .insert(EntityType::Location, location_patterns);
361
362 self.temporal_patterns = vec![
364 Regex::new(r"\b\d{4}-\d{2}-\d{2}\b")?, Regex::new(
366 r"\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b",
367 )?,
368 Regex::new(r"\b(before|after|during|since|until) \d{4}\b")?,
369 ];
370
371 let isa_patterns = vec![
373 Regex::new(r"(.+) is an? (.+)")?,
374 Regex::new(r"(.+) type of (.+)")?,
375 ];
376 self.relationship_patterns
377 .insert(RelationshipType::IsA, isa_patterns);
378
379 let partof_patterns = vec![
380 Regex::new(r"(.+) part of (.+)")?,
381 Regex::new(r"(.+) component of (.+)")?,
382 ];
383 self.relationship_patterns
384 .insert(RelationshipType::PartOf, partof_patterns);
385
386 Ok(())
387 }
388
389 fn initialize_schema_rules(&mut self) -> Result<()> {
391 self.schema_inference_rules = vec![
392 SchemaInferenceRule {
393 rule_id: "class_definition".to_string(),
394 pattern: Regex::new(r"(.+) is a type of (.+)")?,
395 inferred_type: SchemaElementType::Class,
396 confidence_modifier: 0.9,
397 },
398 SchemaInferenceRule {
399 rule_id: "property_definition".to_string(),
400 pattern: Regex::new(r"(.+) has (.+)")?,
401 inferred_type: SchemaElementType::Property,
402 confidence_modifier: 0.8,
403 },
404 ];
405
406 Ok(())
407 }
408
409 fn initialize_language_detectors(&mut self) -> Result<()> {
411 self.language_detectors.insert(
413 "en".to_string(),
414 LanguageDetector {
415 language_code: "en".to_string(),
416 detection_patterns: vec![
417 Regex::new(r"\b(the|and|or|but|if|when|where)\b")?,
418 Regex::new(r"\b(is|are|was|were|have|has|had)\b")?,
419 ],
420 confidence_threshold: 0.7,
421 },
422 );
423
424 Ok(())
426 }
427
    /// Runs the full extraction pipeline over `text`.
    ///
    /// Stages, each gated by its config flag: language detection, entity
    /// extraction, relationship extraction, triple generation, schema
    /// discovery, temporal extraction, fact validation. Returns everything
    /// found plus run metadata.
    pub async fn extract_knowledge(&mut self, text: &str) -> Result<ExtractedKnowledge> {
        let start_time = std::time::Instant::now();
        info!(
            "Starting knowledge extraction from text of length: {}",
            text.len()
        );

        let knowledge_id = Uuid::new_v4().to_string();
        let mut extracted_triples = Vec::new();
        let mut extracted_entities = Vec::new();
        let mut extracted_relationships = Vec::new();
        let mut schema_elements = Vec::new();
        let mut temporal_facts = Vec::new();

        // Language detection always runs; the result is only recorded in
        // metadata.
        let detected_language = self.detect_language(text).await?;
        debug!("Detected language: {}", detected_language);

        if self.config.enable_entity_extraction {
            extracted_entities = self.extract_entities(text).await?;
            debug!("Extracted {} entities", extracted_entities.len());
        }

        // Relationship extraction depends on the entities found above.
        if self.config.enable_relationship_extraction {
            extracted_relationships = self
                .extract_relationships(text, &extracted_entities)
                .await?;
            debug!("Extracted {} relationships", extracted_relationships.len());
        }

        // Materialize each relationship as an RDF triple; conversion failures
        // (invalid IRIs) are silently skipped.
        for relationship in &extracted_relationships {
            if let Ok(triple) = self.relationship_to_triple(relationship) {
                extracted_triples.push(triple);
            }
        }

        if self.config.enable_schema_discovery {
            schema_elements = self
                .discover_schema_elements(text, &extracted_entities)
                .await?;
            debug!("Discovered {} schema elements", schema_elements.len());
        }

        if self.config.enable_temporal_extraction {
            temporal_facts = self
                .extract_temporal_facts(text, &extracted_entities)
                .await?;
            debug!("Extracted {} temporal facts", temporal_facts.len());
        }

        // Validation may rewrite `extracted_triples` in place.
        if self.config.enable_fact_validation {
            self.validate_extracted_facts(&mut extracted_triples, &extracted_relationships)
                .await?;
        }

        // Overall confidence = mean confidence over everything extracted.
        let confidence_score = self.calculate_extraction_confidence(
            &extracted_entities,
            &extracted_relationships,
            &schema_elements,
        );

        let processing_time = start_time.elapsed().as_millis() as u64;

        let extraction_statistics = ExtractionStatistics {
            entities_extracted: extracted_entities.len(),
            relationships_extracted: extracted_relationships.len(),
            triples_generated: extracted_triples.len(),
            schema_elements_discovered: schema_elements.len(),
            temporal_facts_extracted: temporal_facts.len(),
            average_confidence: confidence_score,
        };

        let extraction_metadata = ExtractionMetadata {
            extraction_timestamp: Utc::now(),
            extraction_method: "Pattern-based + LLM-enhanced".to_string(),
            processing_time_ms: processing_time,
            language_detected: detected_language,
            text_length: text.len(),
            extraction_statistics,
        };

        info!("Knowledge extraction completed in {}ms", processing_time);

        Ok(ExtractedKnowledge {
            knowledge_id,
            source_text: text.to_string(),
            extracted_triples,
            extracted_entities,
            extracted_relationships,
            schema_elements,
            temporal_facts,
            confidence_score,
            extraction_metadata,
        })
    }
531
532 async fn detect_language(&self, text: &str) -> Result<String> {
534 for (lang_code, detector) in &self.language_detectors {
536 let mut matches = 0;
537 let mut total_patterns = 0;
538
539 for pattern in &detector.detection_patterns {
540 total_patterns += 1;
541 if pattern.is_match(text) {
542 matches += 1;
543 }
544 }
545
546 let confidence = matches as f64 / total_patterns as f64;
547 if confidence >= detector.confidence_threshold {
548 return Ok(lang_code.clone());
549 }
550 }
551
552 Ok("unknown".to_string())
553 }
554
555 async fn extract_entities(&self, text: &str) -> Result<Vec<ExtractedEntity>> {
557 let mut entities = Vec::new();
558
559 for (entity_type, patterns) in &self.entity_patterns {
560 for pattern in patterns {
561 for capture in pattern.find_iter(text) {
562 let entity_text = capture.as_str();
563 let start_pos = capture.start();
564 let end_pos = capture.end();
565
566 let entity = ExtractedEntity {
567 entity_id: Uuid::new_v4().to_string(),
568 entity_text: entity_text.to_string(),
569 entity_type: entity_type.clone(),
570 canonical_form: self.canonicalize_entity(entity_text),
571 aliases: Vec::new(),
572 properties: HashMap::new(),
573 confidence: 0.8, source_position: TextPosition {
575 start_offset: start_pos,
576 end_offset: end_pos,
577 line_number: self.get_line_number(text, start_pos),
578 column_number: self.get_column_number(text, start_pos),
579 },
580 linked_entities: Vec::new(),
581 };
582
583 entities.push(entity);
584 }
585 }
586 }
587
588 entities.sort_by(|a, b| a.canonical_form.cmp(&b.canonical_form));
590 entities.dedup_by(|a, b| a.canonical_form == b.canonical_form);
591
592 Ok(entities)
593 }
594
595 async fn extract_relationships(
597 &self,
598 text: &str,
599 entities: &[ExtractedEntity],
600 ) -> Result<Vec<ExtractedRelationship>> {
601 let mut relationships = Vec::new();
602
603 for (relationship_type, patterns) in &self.relationship_patterns {
604 for pattern in patterns {
605 if let Some(captures) = pattern.captures(text) {
606 if captures.len() >= 3 {
607 let subject = captures
608 .get(1)
609 .expect("capture group 1 should exist")
610 .as_str();
611 let object = captures
612 .get(2)
613 .expect("capture group 2 should exist")
614 .as_str();
615
616 let subject_entity = self.find_matching_entity(subject, entities);
618 let object_entity = self.find_matching_entity(object, entities);
619
620 if let (Some(subj), Some(obj)) = (subject_entity, object_entity) {
621 let relationship = ExtractedRelationship {
622 relationship_id: Uuid::new_v4().to_string(),
623 subject_entity: subj.entity_id.clone(),
624 predicate: self.relationship_type_to_predicate(relationship_type),
625 object_entity: obj.entity_id.clone(),
626 relationship_type: relationship_type.clone(),
627 confidence: 0.8,
628 evidence_text: captures
629 .get(0)
630 .expect("capture group 0 should exist")
631 .as_str()
632 .to_string(),
633 temporal_context: None,
634 source_position: TextPosition {
635 start_offset: captures
636 .get(0)
637 .expect("capture group 0 should exist")
638 .start(),
639 end_offset: captures
640 .get(0)
641 .expect("capture group 0 should exist")
642 .end(),
643 line_number: 1, column_number: 1, },
646 };
647
648 relationships.push(relationship);
649 }
650 }
651 }
652 }
653 }
654
655 Ok(relationships)
656 }
657
658 async fn discover_schema_elements(
660 &self,
661 text: &str,
662 _entities: &[ExtractedEntity],
663 ) -> Result<Vec<SchemaElement>> {
664 let mut schema_elements = Vec::new();
665
666 for rule in &self.schema_inference_rules {
667 for capture in rule.pattern.find_iter(text) {
668 let element = SchemaElement {
669 element_id: Uuid::new_v4().to_string(),
670 element_type: rule.inferred_type.clone(),
671 name: capture.as_str().to_string(),
672 description: format!("Inferred from: {}", capture.as_str()),
673 properties: Vec::new(),
674 hierarchical_relations: Vec::new(),
675 constraints: Vec::new(),
676 confidence: rule.confidence_modifier,
677 };
678
679 schema_elements.push(element);
680 }
681 }
682
683 Ok(schema_elements)
684 }
685
686 async fn extract_temporal_facts(
688 &self,
689 text: &str,
690 _entities: &[ExtractedEntity],
691 ) -> Result<Vec<TemporalFact>> {
692 let mut temporal_facts = Vec::new();
693
694 for pattern in &self.temporal_patterns {
695 for capture in pattern.find_iter(text) {
696 let temporal_text = capture.as_str();
697
698 let temporal_fact = TemporalFact {
699 fact_id: Uuid::new_v4().to_string(),
700 subject: "temporal_entity".to_string(), predicate: "occurs_at".to_string(),
702 object: temporal_text.to_string(),
703 temporal_qualifier: TemporalQualifier {
704 qualifier_type: TemporalType::PointInTime,
705 time_point: self.parse_temporal_expression(temporal_text),
706 time_interval: None,
707 frequency: None,
708 },
709 confidence: 0.8,
710 source_text: temporal_text.to_string(),
711 };
712
713 temporal_facts.push(temporal_fact);
714 }
715 }
716
717 Ok(temporal_facts)
718 }
719
720 async fn validate_extracted_facts(
722 &self,
723 triples: &mut Vec<Triple>,
724 relationships: &[ExtractedRelationship],
725 ) -> Result<()> {
726 let valid_relationships: Vec<_> = relationships
728 .iter()
729 .filter(|r| r.confidence >= self.config.confidence_threshold)
730 .collect();
731
732 let mut contradictions_found = 0;
734 let mut validated_triples = Vec::new();
735
736 let mut subject_predicates: HashMap<String, Vec<&ExtractedRelationship>> = HashMap::new();
738 let mut predicate_pairs: HashMap<String, Vec<(&str, &str)>> = HashMap::new();
739
740 for relationship in &valid_relationships {
741 subject_predicates
742 .entry(relationship.subject_entity.clone())
743 .or_default()
744 .push(relationship);
745
746 predicate_pairs
747 .entry(relationship.predicate.clone())
748 .or_default()
749 .push((&relationship.subject_entity, &relationship.object_entity));
750 }
751
752 for (subject, relationships) in &subject_predicates {
754 let mut predicate_values: HashMap<String, Vec<&str>> = HashMap::new();
755
756 for rel in relationships {
757 predicate_values
758 .entry(rel.predicate.clone())
759 .or_default()
760 .push(&rel.object_entity);
761 }
762
763 for (predicate, values) in predicate_values {
764 if values.len() > 1 {
765 let unique_values: std::collections::HashSet<_> = values.into_iter().collect();
767 if unique_values.len() > 1 && self.is_contradictory_predicate(&predicate) {
768 warn!(
769 "Contradiction detected for {}: {} has multiple {} values: {:?}",
770 subject, subject, predicate, unique_values
771 );
772 contradictions_found += 1;
773
774 if let Some(best_rel) = relationships
776 .iter()
777 .filter(|r| r.predicate == predicate)
778 .max_by(|a, b| {
779 a.confidence
780 .partial_cmp(&b.confidence)
781 .unwrap_or(std::cmp::Ordering::Equal)
782 })
783 {
784 if let Ok(triple) = self.relationship_to_triple(best_rel) {
785 validated_triples.push(triple);
786 }
787 }
788 continue;
789 }
790 }
791 }
792 }
793
794 for relationship in &valid_relationships {
796 if let Some(temporal_context) = &relationship.temporal_context {
797 if !self.validate_temporal_consistency(temporal_context, &valid_relationships) {
798 warn!(
799 "Temporal inconsistency detected for relationship: {} {} {}",
800 relationship.subject_entity,
801 relationship.predicate,
802 relationship.object_entity
803 );
804 contradictions_found += 1;
805 continue;
806 }
807 }
808
809 if let Ok(triple) = self.relationship_to_triple(relationship) {
811 validated_triples.push(triple);
812 }
813 }
814
815 self.validate_logical_consistency(&valid_relationships, &mut contradictions_found)?;
817
818 triples.clear();
820 triples.extend(validated_triples);
821
822 if contradictions_found > 0 {
823 warn!(
824 "Found {} contradictions during fact validation",
825 contradictions_found
826 );
827 }
828
829 debug!("Validated {} relationships", valid_relationships.len());
830 Ok(())
831 }
832
833 fn is_contradictory_predicate(&self, predicate: &str) -> bool {
835 let functional_predicates = [
837 "birthDate",
838 "deathDate",
839 "age",
840 "height",
841 "weight",
842 "hasGender",
843 "isA",
844 "type",
845 "hasCapital",
846 "hasPopulation",
847 "hasArea",
848 "founded",
849 "established",
850 "created",
851 "died",
852 "born",
853 ];
854
855 functional_predicates.iter().any(|&fp| {
856 predicate.to_lowercase().contains(&fp.to_lowercase()) || predicate.ends_with(&fp)
857 })
858 }
859
    /// Checks one temporal context for internal validity and for conflicts
    /// with the differing contexts of other relationships.
    ///
    /// Returns `false` when the context's own interval is inverted or empty
    /// (start >= end), or when `temporal_periods_conflict` reports a conflict
    /// with any other relationship's context.
    ///
    /// NOTE(review): every other relationship is compared regardless of
    /// subject — confirm cross-subject comparison is intended.
    fn validate_temporal_consistency(
        &self,
        temporal_context: &TemporalContext,
        all_relationships: &[&ExtractedRelationship],
    ) -> bool {
        // A fully specified interval must have start strictly before end.
        if let (Some(start), Some(end)) = (&temporal_context.start_time, &temporal_context.end_time)
        {
            if start >= end {
                return false;
            }
        }

        for other_rel in all_relationships {
            if let Some(other_temporal) = &other_rel.temporal_context {
                // An identical context never conflicts with itself.
                if temporal_context != other_temporal {
                    if self.temporal_periods_conflict(temporal_context, other_temporal) {
                        return false;
                    }
                }
            }
        }

        true
    }
889
890 fn temporal_periods_conflict(
892 &self,
893 context1: &TemporalContext,
894 context2: &TemporalContext,
895 ) -> bool {
896 match (
899 (&context1.start_time, &context1.end_time),
900 (&context2.start_time, &context2.end_time),
901 ) {
902 ((Some(start1), Some(end1)), (Some(start2), Some(end2))) => {
903 end1 < start2 || end2 < start1
905 }
906 _ => false, }
908 }
909
    /// Structural consistency checks over the validated relationship set:
    /// cycle detection in the is-a and part-of hierarchies, then domain/range
    /// constraint checks. Each violation increments `contradictions_found`.
    fn validate_logical_consistency(
        &self,
        relationships: &[&ExtractedRelationship],
        contradictions_found: &mut usize,
    ) -> Result<()> {
        // child -> parent maps. NOTE(review): `insert` keeps only the LAST
        // parent recorded per subject, so multiple is-a/part-of assertions for
        // one subject are silently collapsed — confirm this is acceptable.
        let mut is_a_relationships: HashMap<String, String> = HashMap::new();
        let mut part_of_relationships: HashMap<String, String> = HashMap::new();

        for rel in relationships {
            // Classification is substring-based on the lowercased predicate.
            let pred_lower = rel.predicate.to_lowercase();
            if pred_lower.contains("isa")
                || pred_lower.contains("instanceof")
                || pred_lower.contains("type")
            {
                is_a_relationships.insert(rel.subject_entity.clone(), rel.object_entity.clone());
            } else if pred_lower.contains("partof")
                || pred_lower.contains("contains")
                || pred_lower.contains("within")
            {
                part_of_relationships.insert(rel.subject_entity.clone(), rel.object_entity.clone());
            }
        }

        // A cycle in an is-a chain is always a modeling contradiction.
        for (subject, object) in &is_a_relationships {
            if self.has_cycle_in_hierarchy(subject, object, &is_a_relationships) {
                warn!(
                    "Logical contradiction: Cycle detected in is-a relationship for {}",
                    subject
                );
                *contradictions_found += 1;
            }
        }

        // Likewise for part-of containment chains.
        for (subject, object) in &part_of_relationships {
            if self.has_cycle_in_hierarchy(subject, object, &part_of_relationships) {
                warn!(
                    "Logical contradiction: Cycle detected in part-of relationship for {}",
                    subject
                );
                *contradictions_found += 1;
            }
        }

        self.validate_domain_range_constraints(relationships, contradictions_found)?;

        Ok(())
    }
963
964 fn has_cycle_in_hierarchy(
966 &self,
967 start: &str,
968 current: &str,
969 hierarchy: &HashMap<String, String>,
970 ) -> bool {
971 if start == current {
972 return true; }
974
975 let mut visited = std::collections::HashSet::new();
977 let mut current_node = current;
978
979 while let Some(parent) = hierarchy.get(current_node) {
980 if visited.contains(current_node) || current_node == start {
981 return true; }
983 visited.insert(current_node.to_string());
984 current_node = parent;
985 }
986
987 false
988 }
989
990 fn validate_domain_range_constraints(
992 &self,
993 relationships: &[&ExtractedRelationship],
994 contradictions_found: &mut usize,
995 ) -> Result<()> {
996 let constraints = [
998 ("age", "Person", "Number"),
999 ("birthDate", "Person", "Date"),
1000 ("hasCapital", "Country", "City"),
1001 ("hasPopulation", "Place", "Number"),
1002 ("authorOf", "Person", "Book"),
1003 ("marriedTo", "Person", "Person"),
1004 ];
1005
1006 for rel in relationships {
1007 for (predicate, expected_domain, expected_range) in &constraints {
1008 if rel
1009 .predicate
1010 .to_lowercase()
1011 .contains(&predicate.to_lowercase())
1012 {
1013 if !self.entity_matches_type(
1015 &rel.subject_entity,
1016 expected_domain,
1017 relationships,
1018 ) {
1019 warn!(
1020 "Domain constraint violation: {} should be of type {} for predicate {}",
1021 rel.subject_entity, expected_domain, rel.predicate
1022 );
1023 *contradictions_found += 1;
1024 }
1025
1026 if !self.entity_matches_type(&rel.object_entity, expected_range, relationships)
1028 {
1029 warn!(
1030 "Range constraint violation: {} should be of type {} for predicate {}",
1031 rel.object_entity, expected_range, rel.predicate
1032 );
1033 *contradictions_found += 1;
1034 }
1035 }
1036 }
1037 }
1038
1039 Ok(())
1040 }
1041
1042 fn entity_matches_type(
1044 &self,
1045 entity: &str,
1046 expected_type: &str,
1047 relationships: &[&ExtractedRelationship],
1048 ) -> bool {
1049 let entity_lower = entity.to_lowercase();
1051 let type_lower = expected_type.to_lowercase();
1052
1053 match type_lower.as_str() {
1055 "person" => {
1056 entity_lower.contains("person") ||
1057 entity_lower.contains("author") ||
1058 entity_lower.contains("writer") ||
1059 entity_lower.contains("scientist") ||
1060 entity.chars().next().is_some_and(|c| c.is_uppercase())
1062 }
1063 "number" => {
1064 entity.parse::<f64>().is_ok()
1065 || entity_lower.contains("million")
1066 || entity_lower.contains("thousand")
1067 || entity_lower.contains("year")
1068 }
1069 "date" => {
1070 entity_lower.contains("19") || entity_lower.contains("20") || entity_lower.contains("january") || entity_lower.contains("february") ||
1072 entity_lower.contains("march") || entity_lower.contains("april") ||
1073 entity_lower.contains("may") || entity_lower.contains("june") ||
1074 entity_lower.contains("july") || entity_lower.contains("august") ||
1075 entity_lower.contains("september") || entity_lower.contains("october") ||
1076 entity_lower.contains("november") || entity_lower.contains("december")
1077 }
1078 "country" => {
1079 entity_lower.contains("country") ||
1080 entity_lower.contains("nation") ||
1081 relationships.iter().any(|r| r.subject_entity == entity &&
1083 r.predicate.to_lowercase().contains("type") &&
1084 r.object_entity.to_lowercase().contains("country"))
1085 }
1086 "city" => {
1087 entity_lower.contains("city") ||
1088 entity_lower.contains("town") ||
1089 relationships.iter().any(|r| r.subject_entity == entity &&
1091 r.predicate.to_lowercase().contains("type") &&
1092 r.object_entity.to_lowercase().contains("city"))
1093 }
1094 "book" => {
1095 entity_lower.contains("book") ||
1096 entity_lower.contains("novel") ||
1097 entity_lower.contains("publication") ||
1098 relationships.iter().any(|r| r.subject_entity == entity &&
1100 r.predicate.to_lowercase().contains("type") &&
1101 r.object_entity.to_lowercase().contains("book"))
1102 }
1103 _ => true, }
1105 }
1106
1107 fn relationship_to_triple(&self, relationship: &ExtractedRelationship) -> Result<Triple> {
1109 let subject = NamedNode::new(format!(
1111 "http://example.org/entity/{}",
1112 relationship.subject_entity
1113 ))?;
1114 let predicate = NamedNode::new(format!(
1115 "http://example.org/predicate/{}",
1116 relationship.predicate
1117 ))?;
1118 let object = NamedNode::new(format!(
1119 "http://example.org/entity/{}",
1120 relationship.object_entity
1121 ))?;
1122
1123 Ok(Triple::new(
1124 Subject::NamedNode(subject),
1125 Predicate::NamedNode(predicate),
1126 Object::NamedNode(object),
1127 ))
1128 }
1129
1130 fn canonicalize_entity(&self, entity: &str) -> String {
1132 entity.trim().to_lowercase()
1133 }
1134
1135 fn get_line_number(&self, text: &str, offset: usize) -> usize {
1136 text[..offset].chars().filter(|&c| c == '\n').count() + 1
1137 }
1138
1139 fn get_column_number(&self, text: &str, offset: usize) -> usize {
1140 text[..offset]
1141 .chars()
1142 .rev()
1143 .take_while(|&c| c != '\n')
1144 .count()
1145 + 1
1146 }
1147
1148 fn find_matching_entity<'a>(
1149 &self,
1150 text: &str,
1151 entities: &'a [ExtractedEntity],
1152 ) -> Option<&'a ExtractedEntity> {
1153 entities
1154 .iter()
1155 .find(|e| e.entity_text == text || e.canonical_form == self.canonicalize_entity(text))
1156 }
1157
1158 fn relationship_type_to_predicate(&self, rel_type: &RelationshipType) -> String {
1159 match rel_type {
1160 RelationshipType::IsA => "rdf:type".to_string(),
1161 RelationshipType::PartOf => "part_of".to_string(),
1162 RelationshipType::LocatedIn => "located_in".to_string(),
1163 RelationshipType::OwnedBy => "owned_by".to_string(),
1164 RelationshipType::CreatedBy => "created_by".to_string(),
1165 RelationshipType::CausedBy => "caused_by".to_string(),
1166 RelationshipType::TemporalSequence => "temporal_sequence".to_string(),
1167 RelationshipType::Similarity => "similar_to".to_string(),
1168 RelationshipType::Dependency => "depends_on".to_string(),
1169 RelationshipType::Custom(pred) => pred.clone(),
1170 }
1171 }
1172
1173 fn parse_temporal_expression(&self, temporal_text: &str) -> Option<DateTime<Utc>> {
1174 if let Ok(dt) = chrono::DateTime::parse_from_str(temporal_text, "%Y-%m-%d") {
1176 Some(dt.with_timezone(&Utc))
1177 } else {
1178 None
1179 }
1180 }
1181
1182 fn calculate_extraction_confidence(
1183 &self,
1184 entities: &[ExtractedEntity],
1185 relationships: &[ExtractedRelationship],
1186 schema_elements: &[SchemaElement],
1187 ) -> f64 {
1188 let mut total_confidence = 0.0;
1189 let mut count = 0;
1190
1191 for entity in entities {
1192 total_confidence += entity.confidence;
1193 count += 1;
1194 }
1195
1196 for relationship in relationships {
1197 total_confidence += relationship.confidence;
1198 count += 1;
1199 }
1200
1201 for schema_element in schema_elements {
1202 total_confidence += schema_element.confidence;
1203 count += 1;
1204 }
1205
1206 if count > 0 {
1207 total_confidence / count as f64
1208 } else {
1209 0.0
1210 }
1211 }
1212}
1213
#[cfg(test)]
mod tests {
    use super::*;

    // Engine construction compiles every built-in regex; this guards against
    // pattern typos.
    #[tokio::test]
    async fn test_knowledge_extraction_engine_creation() {
        let config = KnowledgeExtractionConfig::default();
        let engine = KnowledgeExtractionEngine::new(config);

        assert!(engine.is_ok());
    }

    // End-to-end smoke test: a sentence containing a person and an
    // organization must yield at least one extracted entity.
    #[tokio::test]
    async fn test_entity_extraction() {
        let config = KnowledgeExtractionConfig::default();
        let mut engine = KnowledgeExtractionEngine::new(config).expect("should succeed");

        let text = "Dr. John Smith works at Microsoft Corp.";
        let result = engine.extract_knowledge(text).await;

        assert!(result.is_ok());
        let knowledge = result.expect("should succeed");
        assert!(!knowledge.extracted_entities.is_empty());
    }

    // Canonicalization trims surrounding whitespace and lowercases.
    #[test]
    fn test_canonicalize_entity() {
        let config = KnowledgeExtractionConfig::default();
        let engine = KnowledgeExtractionEngine::new(config).expect("should succeed");

        assert_eq!(engine.canonicalize_entity(" John Smith "), "john smith");
    }
}