oxirs_chat/rag/
knowledge_extraction.rs

//! Automated Knowledge Extraction Module
//!
//! Implements sophisticated knowledge extraction capabilities including:
//! - Entity and relationship extraction from text
//! - Schema discovery and ontology generation
//! - Fact validation and consistency checking
//! - Temporal knowledge extraction
//! - Multi-lingual knowledge extraction
9
10use anyhow::Result;
11use chrono::{DateTime, Utc};
12use oxirs_core::model::{triple::Triple, NamedNode, Object, Predicate, Subject};
13use regex::Regex;
14use serde::{Deserialize, Serialize};
15use std::collections::HashMap;
16use tracing::{debug, info, warn};
17use uuid::Uuid;
18
19/// Configuration for knowledge extraction
20#[derive(Debug, Clone, Serialize, Deserialize)]
21pub struct KnowledgeExtractionConfig {
22    pub enable_entity_extraction: bool,
23    pub enable_relationship_extraction: bool,
24    pub enable_schema_discovery: bool,
25    pub enable_fact_validation: bool,
26    pub enable_temporal_extraction: bool,
27    pub enable_multilingual_extraction: bool,
28    pub confidence_threshold: f64,
29    pub max_extraction_depth: usize,
30    pub language_models: Vec<String>,
31}
32
33impl Default for KnowledgeExtractionConfig {
34    fn default() -> Self {
35        Self {
36            enable_entity_extraction: true,
37            enable_relationship_extraction: true,
38            enable_schema_discovery: true,
39            enable_fact_validation: true,
40            enable_temporal_extraction: true,
41            enable_multilingual_extraction: false,
42            confidence_threshold: 0.8,
43            max_extraction_depth: 3,
44            language_models: vec!["en".to_string()],
45        }
46    }
47}
48
49/// Extracted knowledge item
50#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct ExtractedKnowledge {
52    pub knowledge_id: String,
53    pub source_text: String,
54    pub extracted_triples: Vec<Triple>,
55    pub extracted_entities: Vec<ExtractedEntity>,
56    pub extracted_relationships: Vec<ExtractedRelationship>,
57    pub schema_elements: Vec<SchemaElement>,
58    pub temporal_facts: Vec<TemporalFact>,
59    pub confidence_score: f64,
60    pub extraction_metadata: ExtractionMetadata,
61}
62
63/// Detailed entity information
64#[derive(Debug, Clone, Serialize, Deserialize)]
65pub struct ExtractedEntity {
66    pub entity_id: String,
67    pub entity_text: String,
68    pub entity_type: EntityType,
69    pub canonical_form: String,
70    pub aliases: Vec<String>,
71    pub properties: HashMap<String, String>,
72    pub confidence: f64,
73    pub source_position: TextPosition,
74    pub linked_entities: Vec<String>,
75}
76
77/// Relationship between entities
78#[derive(Debug, Clone, Serialize, Deserialize)]
79pub struct ExtractedRelationship {
80    pub relationship_id: String,
81    pub subject_entity: String,
82    pub predicate: String,
83    pub object_entity: String,
84    pub relationship_type: RelationshipType,
85    pub confidence: f64,
86    pub evidence_text: String,
87    pub temporal_context: Option<TemporalContext>,
88    pub source_position: TextPosition,
89}
90
91/// Schema element discovered from text
92#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct SchemaElement {
94    pub element_id: String,
95    pub element_type: SchemaElementType,
96    pub name: String,
97    pub description: String,
98    pub properties: Vec<SchemaProperty>,
99    pub hierarchical_relations: Vec<HierarchicalRelation>,
100    pub constraints: Vec<SchemaConstraint>,
101    pub confidence: f64,
102}
103
104/// Temporal fact with time information
105#[derive(Debug, Clone, Serialize, Deserialize)]
106pub struct TemporalFact {
107    pub fact_id: String,
108    pub subject: String,
109    pub predicate: String,
110    pub object: String,
111    pub temporal_qualifier: TemporalQualifier,
112    pub confidence: f64,
113    pub source_text: String,
114}
115
116/// Types of entities that can be extracted
117#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
118pub enum EntityType {
119    Person,
120    Organization,
121    Location,
122    Event,
123    Concept,
124    Product,
125    Technology,
126    Scientific,
127    Temporal,
128    Numerical,
129    Unknown,
130}
131
132/// Types of relationships
133#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
134pub enum RelationshipType {
135    IsA,
136    PartOf,
137    LocatedIn,
138    OwnedBy,
139    CreatedBy,
140    CausedBy,
141    TemporalSequence,
142    Similarity,
143    Dependency,
144    Custom(String),
145}
146
147/// Schema element types
148#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
149pub enum SchemaElementType {
150    Class,
151    Property,
152    Relationship,
153    Constraint,
154    Rule,
155}
156
157/// Position in source text
158#[derive(Debug, Clone, Serialize, Deserialize)]
159pub struct TextPosition {
160    pub start_offset: usize,
161    pub end_offset: usize,
162    pub line_number: usize,
163    pub column_number: usize,
164}
165
166/// Temporal context information
167#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
168pub struct TemporalContext {
169    pub start_time: Option<DateTime<Utc>>,
170    pub end_time: Option<DateTime<Utc>>,
171    pub duration: Option<std::time::Duration>,
172    pub temporal_relation: String,
173}
174
175/// Temporal qualifier for facts
176#[derive(Debug, Clone, Serialize, Deserialize)]
177pub struct TemporalQualifier {
178    pub qualifier_type: TemporalType,
179    pub time_point: Option<DateTime<Utc>>,
180    pub time_interval: Option<TimeInterval>,
181    pub frequency: Option<String>,
182}
183
184/// Types of temporal information
185#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
186pub enum TemporalType {
187    PointInTime,
188    TimeInterval,
189    Frequency,
190    Duration,
191    Relative,
192}
193
194/// Time interval
195#[derive(Debug, Clone, Serialize, Deserialize)]
196pub struct TimeInterval {
197    pub start: DateTime<Utc>,
198    pub end: DateTime<Utc>,
199}
200
201/// Schema property
202#[derive(Debug, Clone, Serialize, Deserialize)]
203pub struct SchemaProperty {
204    pub property_name: String,
205    pub property_type: String,
206    pub cardinality: Cardinality,
207    pub domain: Option<String>,
208    pub range: Option<String>,
209}
210
211/// Hierarchical relation in schema
212#[derive(Debug, Clone, Serialize, Deserialize)]
213pub struct HierarchicalRelation {
214    pub relation_type: HierarchyType,
215    pub parent: String,
216    pub child: String,
217}
218
219/// Schema constraint
220#[derive(Debug, Clone, Serialize, Deserialize)]
221pub struct SchemaConstraint {
222    pub constraint_type: ConstraintType,
223    pub description: String,
224    pub enforcement_level: EnforcementLevel,
225}
226
227/// Property cardinality
228#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
229pub enum Cardinality {
230    ZeroOrOne,
231    ExactlyOne,
232    ZeroOrMore,
233    OneOrMore,
234    Exact(usize),
235    Range(usize, usize),
236}
237
238/// Hierarchy types
239#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
240pub enum HierarchyType {
241    SubClassOf,
242    SubPropertyOf,
243    PartOf,
244    InstanceOf,
245}
246
247/// Constraint types
248#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
249pub enum ConstraintType {
250    UniqueValue,
251    RequiredProperty,
252    ValueRange,
253    DataType,
254    Pattern,
255    Cardinality,
256}
257
258/// Enforcement levels
259#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
260pub enum EnforcementLevel {
261    Strict,
262    Warning,
263    Suggestion,
264}
265
266/// Metadata about the extraction process
267#[derive(Debug, Clone, Serialize, Deserialize)]
268pub struct ExtractionMetadata {
269    pub extraction_timestamp: DateTime<Utc>,
270    pub extraction_method: String,
271    pub processing_time_ms: u64,
272    pub language_detected: String,
273    pub text_length: usize,
274    pub extraction_statistics: ExtractionStatistics,
275}
276
277/// Statistics about extraction results
278#[derive(Debug, Clone, Serialize, Deserialize)]
279pub struct ExtractionStatistics {
280    pub entities_extracted: usize,
281    pub relationships_extracted: usize,
282    pub triples_generated: usize,
283    pub schema_elements_discovered: usize,
284    pub temporal_facts_extracted: usize,
285    pub average_confidence: f64,
286}
287
288/// Knowledge extraction engine
289pub struct KnowledgeExtractionEngine {
290    config: KnowledgeExtractionConfig,
291    entity_patterns: HashMap<EntityType, Vec<Regex>>,
292    relationship_patterns: HashMap<RelationshipType, Vec<Regex>>,
293    temporal_patterns: Vec<Regex>,
294    schema_inference_rules: Vec<SchemaInferenceRule>,
295    language_detectors: HashMap<String, LanguageDetector>,
296}
297
298/// Schema inference rule
299#[derive(Debug, Clone)]
300struct SchemaInferenceRule {
301    rule_id: String,
302    pattern: Regex,
303    inferred_type: SchemaElementType,
304    confidence_modifier: f64,
305}
306
307/// Language detector
308#[derive(Debug, Clone)]
309struct LanguageDetector {
310    language_code: String,
311    detection_patterns: Vec<Regex>,
312    confidence_threshold: f64,
313}
314
315impl KnowledgeExtractionEngine {
316    /// Create a new knowledge extraction engine
317    pub fn new(config: KnowledgeExtractionConfig) -> Result<Self> {
318        let mut engine = Self {
319            config,
320            entity_patterns: HashMap::new(),
321            relationship_patterns: HashMap::new(),
322            temporal_patterns: Vec::new(),
323            schema_inference_rules: Vec::new(),
324            language_detectors: HashMap::new(),
325        };
326
327        engine.initialize_extraction_patterns()?;
328        engine.initialize_schema_rules()?;
329        engine.initialize_language_detectors()?;
330
331        Ok(engine)
332    }
333
334    /// Initialize entity and relationship extraction patterns
335    fn initialize_extraction_patterns(&mut self) -> Result<()> {
336        // Person entity patterns
337        let person_patterns = vec![
338            Regex::new(r"\b[A-Z][a-z]+ [A-Z][a-z]+\b")?, // FirstName LastName
339            Regex::new(r"\bDr\. [A-Z][a-z]+\b")?,        // Dr. Name
340            Regex::new(r"\bProf\. [A-Z][a-z]+\b")?,      // Prof. Name
341        ];
342        self.entity_patterns
343            .insert(EntityType::Person, person_patterns);
344
345        // Organization patterns
346        let org_patterns = vec![
347            Regex::new(r"\b[A-Z][a-z]+ (Inc|Corp|Ltd|LLC)\b")?,
348            Regex::new(r"\bUniversity of [A-Z][a-z]+\b")?,
349            Regex::new(r"\b[A-Z][A-Z]+ Corporation\b")?,
350        ];
351        self.entity_patterns
352            .insert(EntityType::Organization, org_patterns);
353
354        // Location patterns
355        let location_patterns = vec![
356            Regex::new(r"\b[A-Z][a-z]+, [A-Z][A-Z]\b")?, // City, State
357            Regex::new(r"\b[A-Z][a-z]+ [A-Z][a-z]+\b")?, // City Country
358        ];
359        self.entity_patterns
360            .insert(EntityType::Location, location_patterns);
361
362        // Temporal patterns
363        self.temporal_patterns = vec![
364            Regex::new(r"\b\d{4}-\d{2}-\d{2}\b")?, // YYYY-MM-DD
365            Regex::new(
366                r"\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b",
367            )?,
368            Regex::new(r"\b(before|after|during|since|until) \d{4}\b")?,
369        ];
370
371        // Relationship patterns
372        let isa_patterns = vec![
373            Regex::new(r"(.+) is an? (.+)")?,
374            Regex::new(r"(.+) type of (.+)")?,
375        ];
376        self.relationship_patterns
377            .insert(RelationshipType::IsA, isa_patterns);
378
379        let partof_patterns = vec![
380            Regex::new(r"(.+) part of (.+)")?,
381            Regex::new(r"(.+) component of (.+)")?,
382        ];
383        self.relationship_patterns
384            .insert(RelationshipType::PartOf, partof_patterns);
385
386        Ok(())
387    }
388
389    /// Initialize schema inference rules
390    fn initialize_schema_rules(&mut self) -> Result<()> {
391        self.schema_inference_rules = vec![
392            SchemaInferenceRule {
393                rule_id: "class_definition".to_string(),
394                pattern: Regex::new(r"(.+) is a type of (.+)")?,
395                inferred_type: SchemaElementType::Class,
396                confidence_modifier: 0.9,
397            },
398            SchemaInferenceRule {
399                rule_id: "property_definition".to_string(),
400                pattern: Regex::new(r"(.+) has (.+)")?,
401                inferred_type: SchemaElementType::Property,
402                confidence_modifier: 0.8,
403            },
404        ];
405
406        Ok(())
407    }
408
409    /// Initialize language detectors
410    fn initialize_language_detectors(&mut self) -> Result<()> {
411        // English detector
412        self.language_detectors.insert(
413            "en".to_string(),
414            LanguageDetector {
415                language_code: "en".to_string(),
416                detection_patterns: vec![
417                    Regex::new(r"\b(the|and|or|but|if|when|where)\b")?,
418                    Regex::new(r"\b(is|are|was|were|have|has|had)\b")?,
419                ],
420                confidence_threshold: 0.7,
421            },
422        );
423
424        // Add more language detectors as needed
425        Ok(())
426    }
427
428    /// Extract knowledge from text
429    pub async fn extract_knowledge(&mut self, text: &str) -> Result<ExtractedKnowledge> {
430        let start_time = std::time::Instant::now();
431        info!(
432            "Starting knowledge extraction from text of length: {}",
433            text.len()
434        );
435
436        let knowledge_id = Uuid::new_v4().to_string();
437        let mut extracted_triples = Vec::new();
438        let mut extracted_entities = Vec::new();
439        let mut extracted_relationships = Vec::new();
440        let mut schema_elements = Vec::new();
441        let mut temporal_facts = Vec::new();
442
443        // Detect language
444        let detected_language = self.detect_language(text).await?;
445        debug!("Detected language: {}", detected_language);
446
447        // Extract entities
448        if self.config.enable_entity_extraction {
449            extracted_entities = self.extract_entities(text).await?;
450            debug!("Extracted {} entities", extracted_entities.len());
451        }
452
453        // Extract relationships
454        if self.config.enable_relationship_extraction {
455            extracted_relationships = self
456                .extract_relationships(text, &extracted_entities)
457                .await?;
458            debug!("Extracted {} relationships", extracted_relationships.len());
459        }
460
461        // Generate triples from relationships
462        for relationship in &extracted_relationships {
463            if let Ok(triple) = self.relationship_to_triple(relationship) {
464                extracted_triples.push(triple);
465            }
466        }
467
468        // Discover schema elements
469        if self.config.enable_schema_discovery {
470            schema_elements = self
471                .discover_schema_elements(text, &extracted_entities)
472                .await?;
473            debug!("Discovered {} schema elements", schema_elements.len());
474        }
475
476        // Extract temporal facts
477        if self.config.enable_temporal_extraction {
478            temporal_facts = self
479                .extract_temporal_facts(text, &extracted_entities)
480                .await?;
481            debug!("Extracted {} temporal facts", temporal_facts.len());
482        }
483
484        // Validate facts if enabled
485        if self.config.enable_fact_validation {
486            self.validate_extracted_facts(&mut extracted_triples, &extracted_relationships)
487                .await?;
488        }
489
490        // Calculate overall confidence
491        let confidence_score = self.calculate_extraction_confidence(
492            &extracted_entities,
493            &extracted_relationships,
494            &schema_elements,
495        );
496
497        let processing_time = start_time.elapsed().as_millis() as u64;
498
499        let extraction_statistics = ExtractionStatistics {
500            entities_extracted: extracted_entities.len(),
501            relationships_extracted: extracted_relationships.len(),
502            triples_generated: extracted_triples.len(),
503            schema_elements_discovered: schema_elements.len(),
504            temporal_facts_extracted: temporal_facts.len(),
505            average_confidence: confidence_score,
506        };
507
508        let extraction_metadata = ExtractionMetadata {
509            extraction_timestamp: Utc::now(),
510            extraction_method: "Pattern-based + LLM-enhanced".to_string(),
511            processing_time_ms: processing_time,
512            language_detected: detected_language,
513            text_length: text.len(),
514            extraction_statistics,
515        };
516
517        info!("Knowledge extraction completed in {}ms", processing_time);
518
519        Ok(ExtractedKnowledge {
520            knowledge_id,
521            source_text: text.to_string(),
522            extracted_triples,
523            extracted_entities,
524            extracted_relationships,
525            schema_elements,
526            temporal_facts,
527            confidence_score,
528            extraction_metadata,
529        })
530    }
531
532    /// Detect language of text
533    async fn detect_language(&self, text: &str) -> Result<String> {
534        // Simple pattern-based language detection
535        for (lang_code, detector) in &self.language_detectors {
536            let mut matches = 0;
537            let mut total_patterns = 0;
538
539            for pattern in &detector.detection_patterns {
540                total_patterns += 1;
541                if pattern.is_match(text) {
542                    matches += 1;
543                }
544            }
545
546            let confidence = matches as f64 / total_patterns as f64;
547            if confidence >= detector.confidence_threshold {
548                return Ok(lang_code.clone());
549            }
550        }
551
552        Ok("unknown".to_string())
553    }
554
555    /// Extract entities from text
556    async fn extract_entities(&self, text: &str) -> Result<Vec<ExtractedEntity>> {
557        let mut entities = Vec::new();
558
559        for (entity_type, patterns) in &self.entity_patterns {
560            for pattern in patterns {
561                for capture in pattern.find_iter(text) {
562                    let entity_text = capture.as_str();
563                    let start_pos = capture.start();
564                    let end_pos = capture.end();
565
566                    let entity = ExtractedEntity {
567                        entity_id: Uuid::new_v4().to_string(),
568                        entity_text: entity_text.to_string(),
569                        entity_type: entity_type.clone(),
570                        canonical_form: self.canonicalize_entity(entity_text),
571                        aliases: Vec::new(),
572                        properties: HashMap::new(),
573                        confidence: 0.8, // Base confidence for pattern matches
574                        source_position: TextPosition {
575                            start_offset: start_pos,
576                            end_offset: end_pos,
577                            line_number: self.get_line_number(text, start_pos),
578                            column_number: self.get_column_number(text, start_pos),
579                        },
580                        linked_entities: Vec::new(),
581                    };
582
583                    entities.push(entity);
584                }
585            }
586        }
587
588        // Remove duplicates based on canonical form
589        entities.sort_by(|a, b| a.canonical_form.cmp(&b.canonical_form));
590        entities.dedup_by(|a, b| a.canonical_form == b.canonical_form);
591
592        Ok(entities)
593    }
594
595    /// Extract relationships from text
596    async fn extract_relationships(
597        &self,
598        text: &str,
599        entities: &[ExtractedEntity],
600    ) -> Result<Vec<ExtractedRelationship>> {
601        let mut relationships = Vec::new();
602
603        for (relationship_type, patterns) in &self.relationship_patterns {
604            for pattern in patterns {
605                if let Some(captures) = pattern.captures(text) {
606                    if captures.len() >= 3 {
607                        let subject = captures.get(1).unwrap().as_str();
608                        let object = captures.get(2).unwrap().as_str();
609
610                        // Find matching entities
611                        let subject_entity = self.find_matching_entity(subject, entities);
612                        let object_entity = self.find_matching_entity(object, entities);
613
614                        if let (Some(subj), Some(obj)) = (subject_entity, object_entity) {
615                            let relationship = ExtractedRelationship {
616                                relationship_id: Uuid::new_v4().to_string(),
617                                subject_entity: subj.entity_id.clone(),
618                                predicate: self.relationship_type_to_predicate(relationship_type),
619                                object_entity: obj.entity_id.clone(),
620                                relationship_type: relationship_type.clone(),
621                                confidence: 0.8,
622                                evidence_text: captures.get(0).unwrap().as_str().to_string(),
623                                temporal_context: None,
624                                source_position: TextPosition {
625                                    start_offset: captures.get(0).unwrap().start(),
626                                    end_offset: captures.get(0).unwrap().end(),
627                                    line_number: 1,   // Simplified
628                                    column_number: 1, // Simplified
629                                },
630                            };
631
632                            relationships.push(relationship);
633                        }
634                    }
635                }
636            }
637        }
638
639        Ok(relationships)
640    }
641
642    /// Discover schema elements from text
643    async fn discover_schema_elements(
644        &self,
645        text: &str,
646        _entities: &[ExtractedEntity],
647    ) -> Result<Vec<SchemaElement>> {
648        let mut schema_elements = Vec::new();
649
650        for rule in &self.schema_inference_rules {
651            for capture in rule.pattern.find_iter(text) {
652                let element = SchemaElement {
653                    element_id: Uuid::new_v4().to_string(),
654                    element_type: rule.inferred_type.clone(),
655                    name: capture.as_str().to_string(),
656                    description: format!("Inferred from: {}", capture.as_str()),
657                    properties: Vec::new(),
658                    hierarchical_relations: Vec::new(),
659                    constraints: Vec::new(),
660                    confidence: rule.confidence_modifier,
661                };
662
663                schema_elements.push(element);
664            }
665        }
666
667        Ok(schema_elements)
668    }
669
670    /// Extract temporal facts from text
671    async fn extract_temporal_facts(
672        &self,
673        text: &str,
674        _entities: &[ExtractedEntity],
675    ) -> Result<Vec<TemporalFact>> {
676        let mut temporal_facts = Vec::new();
677
678        for pattern in &self.temporal_patterns {
679            for capture in pattern.find_iter(text) {
680                let temporal_text = capture.as_str();
681
682                let temporal_fact = TemporalFact {
683                    fact_id: Uuid::new_v4().to_string(),
684                    subject: "temporal_entity".to_string(), // Would be linked to actual entities
685                    predicate: "occurs_at".to_string(),
686                    object: temporal_text.to_string(),
687                    temporal_qualifier: TemporalQualifier {
688                        qualifier_type: TemporalType::PointInTime,
689                        time_point: self.parse_temporal_expression(temporal_text),
690                        time_interval: None,
691                        frequency: None,
692                    },
693                    confidence: 0.8,
694                    source_text: temporal_text.to_string(),
695                };
696
697                temporal_facts.push(temporal_fact);
698            }
699        }
700
701        Ok(temporal_facts)
702    }
703
704    /// Validate extracted facts for consistency
705    async fn validate_extracted_facts(
706        &self,
707        triples: &mut Vec<Triple>,
708        relationships: &[ExtractedRelationship],
709    ) -> Result<()> {
710        // Remove low-confidence relationships
711        let valid_relationships: Vec<_> = relationships
712            .iter()
713            .filter(|r| r.confidence >= self.config.confidence_threshold)
714            .collect();
715
716        // Check for contradictions and validate facts
717        let mut contradictions_found = 0;
718        let mut validated_triples = Vec::new();
719
720        // Create relationship maps for efficient lookup
721        let mut subject_predicates: HashMap<String, Vec<&ExtractedRelationship>> = HashMap::new();
722        let mut predicate_pairs: HashMap<String, Vec<(&str, &str)>> = HashMap::new();
723
724        for relationship in &valid_relationships {
725            subject_predicates
726                .entry(relationship.subject_entity.clone())
727                .or_default()
728                .push(relationship);
729
730            predicate_pairs
731                .entry(relationship.predicate.clone())
732                .or_default()
733                .push((&relationship.subject_entity, &relationship.object_entity));
734        }
735
736        // Check for direct contradictions (same subject-predicate with different objects)
737        for (subject, relationships) in &subject_predicates {
738            let mut predicate_values: HashMap<String, Vec<&str>> = HashMap::new();
739
740            for rel in relationships {
741                predicate_values
742                    .entry(rel.predicate.clone())
743                    .or_default()
744                    .push(&rel.object_entity);
745            }
746
747            for (predicate, values) in predicate_values {
748                if values.len() > 1 {
749                    // Check if multiple values for the same predicate indicate contradiction
750                    let unique_values: std::collections::HashSet<_> = values.into_iter().collect();
751                    if unique_values.len() > 1 && self.is_contradictory_predicate(&predicate) {
752                        warn!(
753                            "Contradiction detected for {}: {} has multiple {} values: {:?}",
754                            subject, subject, predicate, unique_values
755                        );
756                        contradictions_found += 1;
757
758                        // Keep only the highest confidence relationship for this predicate
759                        if let Some(best_rel) = relationships
760                            .iter()
761                            .filter(|r| r.predicate == predicate)
762                            .max_by(|a, b| {
763                                a.confidence
764                                    .partial_cmp(&b.confidence)
765                                    .unwrap_or(std::cmp::Ordering::Equal)
766                            })
767                        {
768                            if let Ok(triple) = self.relationship_to_triple(best_rel) {
769                                validated_triples.push(triple);
770                            }
771                        }
772                        continue;
773                    }
774                }
775            }
776        }
777
778        // Check temporal consistency
779        for relationship in &valid_relationships {
780            if let Some(temporal_context) = &relationship.temporal_context {
781                if !self.validate_temporal_consistency(temporal_context, &valid_relationships) {
782                    warn!(
783                        "Temporal inconsistency detected for relationship: {} {} {}",
784                        relationship.subject_entity,
785                        relationship.predicate,
786                        relationship.object_entity
787                    );
788                    contradictions_found += 1;
789                    continue;
790                }
791            }
792
793            // Add valid relationship as triple
794            if let Ok(triple) = self.relationship_to_triple(relationship) {
795                validated_triples.push(triple);
796            }
797        }
798
799        // Check logical consistency (e.g., transitive relationships)
800        self.validate_logical_consistency(&valid_relationships, &mut contradictions_found)?;
801
802        // Update triples with validated ones
803        triples.clear();
804        triples.extend(validated_triples);
805
806        if contradictions_found > 0 {
807            warn!(
808                "Found {} contradictions during fact validation",
809                contradictions_found
810            );
811        }
812
813        debug!("Validated {} relationships", valid_relationships.len());
814        Ok(())
815    }
816
817    /// Check if a predicate type indicates contradictory values are not allowed
818    fn is_contradictory_predicate(&self, predicate: &str) -> bool {
819        // Define predicates that should have unique values (functional properties)
820        let functional_predicates = [
821            "birthDate",
822            "deathDate",
823            "age",
824            "height",
825            "weight",
826            "hasGender",
827            "isA",
828            "type",
829            "hasCapital",
830            "hasPopulation",
831            "hasArea",
832            "founded",
833            "established",
834            "created",
835            "died",
836            "born",
837        ];
838
839        functional_predicates.iter().any(|&fp| {
840            predicate.to_lowercase().contains(&fp.to_lowercase()) || predicate.ends_with(&fp)
841        })
842    }
843
844    /// Validate temporal consistency of relationships
845    fn validate_temporal_consistency(
846        &self,
847        temporal_context: &TemporalContext,
848        all_relationships: &[&ExtractedRelationship],
849    ) -> bool {
850        // Check if temporal context makes sense
851        if let (Some(start), Some(end)) = (&temporal_context.start_time, &temporal_context.end_time)
852        {
853            if start >= end {
854                return false; // Start time cannot be after end time
855            }
856        }
857
858        // Check for temporal conflicts with other relationships
859        for other_rel in all_relationships {
860            if let Some(other_temporal) = &other_rel.temporal_context {
861                // If same entities with conflicting time periods
862                if temporal_context != other_temporal {
863                    // Check for overlapping time periods that might indicate conflicts
864                    if self.temporal_periods_conflict(temporal_context, other_temporal) {
865                        return false;
866                    }
867                }
868            }
869        }
870
871        true
872    }
873
874    /// Check if two temporal periods conflict
875    fn temporal_periods_conflict(
876        &self,
877        context1: &TemporalContext,
878        context2: &TemporalContext,
879    ) -> bool {
880        // Simple check - in real implementation, this would be more sophisticated
881        // Check if both have explicit time ranges that don't overlap
882        match (
883            (&context1.start_time, &context1.end_time),
884            (&context2.start_time, &context2.end_time),
885        ) {
886            ((Some(start1), Some(end1)), (Some(start2), Some(end2))) => {
887                // If periods don't overlap, they might be conflicting for certain relationships
888                end1 < start2 || end2 < start1
889            }
890            _ => false, // If we don't have full temporal information, assume no conflict
891        }
892    }
893
894    /// Validate logical consistency across relationships
895    fn validate_logical_consistency(
896        &self,
897        relationships: &[&ExtractedRelationship],
898        contradictions_found: &mut usize,
899    ) -> Result<()> {
900        // Check transitive relationships
901        let mut is_a_relationships: HashMap<String, String> = HashMap::new();
902        let mut part_of_relationships: HashMap<String, String> = HashMap::new();
903
904        // Collect hierarchical relationships
905        for rel in relationships {
906            let pred_lower = rel.predicate.to_lowercase();
907            if pred_lower.contains("isa")
908                || pred_lower.contains("instanceof")
909                || pred_lower.contains("type")
910            {
911                is_a_relationships.insert(rel.subject_entity.clone(), rel.object_entity.clone());
912            } else if pred_lower.contains("partof")
913                || pred_lower.contains("contains")
914                || pred_lower.contains("within")
915            {
916                part_of_relationships.insert(rel.subject_entity.clone(), rel.object_entity.clone());
917            }
918        }
919
920        // Check for cycles in is-a relationships (which would be logical contradictions)
921        for (subject, object) in &is_a_relationships {
922            if self.has_cycle_in_hierarchy(subject, object, &is_a_relationships) {
923                warn!(
924                    "Logical contradiction: Cycle detected in is-a relationship for {}",
925                    subject
926                );
927                *contradictions_found += 1;
928            }
929        }
930
931        // Check for cycles in part-of relationships
932        for (subject, object) in &part_of_relationships {
933            if self.has_cycle_in_hierarchy(subject, object, &part_of_relationships) {
934                warn!(
935                    "Logical contradiction: Cycle detected in part-of relationship for {}",
936                    subject
937                );
938                *contradictions_found += 1;
939            }
940        }
941
942        // Check domain/range constraints
943        self.validate_domain_range_constraints(relationships, contradictions_found)?;
944
945        Ok(())
946    }
947
948    /// Check for cycles in hierarchical relationships
949    fn has_cycle_in_hierarchy(
950        &self,
951        start: &str,
952        current: &str,
953        hierarchy: &HashMap<String, String>,
954    ) -> bool {
955        if start == current {
956            return true; // Direct cycle
957        }
958
959        // Follow the chain to detect cycles
960        let mut visited = std::collections::HashSet::new();
961        let mut current_node = current;
962
963        while let Some(parent) = hierarchy.get(current_node) {
964            if visited.contains(current_node) || current_node == start {
965                return true; // Cycle detected
966            }
967            visited.insert(current_node.to_string());
968            current_node = parent;
969        }
970
971        false
972    }
973
974    /// Validate domain and range constraints for relationships
975    fn validate_domain_range_constraints(
976        &self,
977        relationships: &[&ExtractedRelationship],
978        contradictions_found: &mut usize,
979    ) -> Result<()> {
980        // Define some basic domain/range constraints
981        let constraints = [
982            ("age", "Person", "Number"),
983            ("birthDate", "Person", "Date"),
984            ("hasCapital", "Country", "City"),
985            ("hasPopulation", "Place", "Number"),
986            ("authorOf", "Person", "Book"),
987            ("marriedTo", "Person", "Person"),
988        ];
989
990        for rel in relationships {
991            for (predicate, expected_domain, expected_range) in &constraints {
992                if rel
993                    .predicate
994                    .to_lowercase()
995                    .contains(&predicate.to_lowercase())
996                {
997                    // Check if subject matches expected domain type
998                    if !self.entity_matches_type(
999                        &rel.subject_entity,
1000                        expected_domain,
1001                        relationships,
1002                    ) {
1003                        warn!(
1004                            "Domain constraint violation: {} should be of type {} for predicate {}",
1005                            rel.subject_entity, expected_domain, rel.predicate
1006                        );
1007                        *contradictions_found += 1;
1008                    }
1009
1010                    // Check if object matches expected range type
1011                    if !self.entity_matches_type(&rel.object_entity, expected_range, relationships)
1012                    {
1013                        warn!(
1014                            "Range constraint violation: {} should be of type {} for predicate {}",
1015                            rel.object_entity, expected_range, rel.predicate
1016                        );
1017                        *contradictions_found += 1;
1018                    }
1019                }
1020            }
1021        }
1022
1023        Ok(())
1024    }
1025
1026    /// Check if an entity matches a given type based on other relationships
1027    fn entity_matches_type(
1028        &self,
1029        entity: &str,
1030        expected_type: &str,
1031        relationships: &[&ExtractedRelationship],
1032    ) -> bool {
1033        // Simple heuristic-based type checking
1034        let entity_lower = entity.to_lowercase();
1035        let type_lower = expected_type.to_lowercase();
1036
1037        // Check if entity name suggests the type
1038        match type_lower.as_str() {
1039            "person" => {
1040                entity_lower.contains("person") || 
1041                entity_lower.contains("author") ||
1042                entity_lower.contains("writer") ||
1043                entity_lower.contains("scientist") ||
1044                // Common person name patterns
1045                entity.chars().next().is_some_and(|c| c.is_uppercase())
1046            }
1047            "number" => {
1048                entity.parse::<f64>().is_ok()
1049                    || entity_lower.contains("million")
1050                    || entity_lower.contains("thousand")
1051                    || entity_lower.contains("year")
1052            }
1053            "date" => {
1054                entity_lower.contains("19") || entity_lower.contains("20") || // Years
1055                entity_lower.contains("january") || entity_lower.contains("february") ||
1056                entity_lower.contains("march") || entity_lower.contains("april") ||
1057                entity_lower.contains("may") || entity_lower.contains("june") ||
1058                entity_lower.contains("july") || entity_lower.contains("august") ||
1059                entity_lower.contains("september") || entity_lower.contains("october") ||
1060                entity_lower.contains("november") || entity_lower.contains("december")
1061            }
1062            "country" => {
1063                entity_lower.contains("country") ||
1064                entity_lower.contains("nation") ||
1065                // Check if explicitly typed as country in relationships
1066                relationships.iter().any(|r| r.subject_entity == entity &&
1067                    r.predicate.to_lowercase().contains("type") && 
1068                    r.object_entity.to_lowercase().contains("country"))
1069            }
1070            "city" => {
1071                entity_lower.contains("city") ||
1072                entity_lower.contains("town") ||
1073                // Check if explicitly typed as city in relationships
1074                relationships.iter().any(|r| r.subject_entity == entity &&
1075                    r.predicate.to_lowercase().contains("type") && 
1076                    r.object_entity.to_lowercase().contains("city"))
1077            }
1078            "book" => {
1079                entity_lower.contains("book") ||
1080                entity_lower.contains("novel") ||
1081                entity_lower.contains("publication") ||
1082                // Check if explicitly typed as book in relationships
1083                relationships.iter().any(|r| r.subject_entity == entity &&
1084                    r.predicate.to_lowercase().contains("type") && 
1085                    r.object_entity.to_lowercase().contains("book"))
1086            }
1087            _ => true, // Unknown type, assume valid
1088        }
1089    }
1090
1091    /// Convert relationship to RDF triple
1092    fn relationship_to_triple(&self, relationship: &ExtractedRelationship) -> Result<Triple> {
1093        // This is a simplified conversion - real implementation would be more sophisticated
1094        let subject = NamedNode::new(format!(
1095            "http://example.org/entity/{}",
1096            relationship.subject_entity
1097        ))?;
1098        let predicate = NamedNode::new(format!(
1099            "http://example.org/predicate/{}",
1100            relationship.predicate
1101        ))?;
1102        let object = NamedNode::new(format!(
1103            "http://example.org/entity/{}",
1104            relationship.object_entity
1105        ))?;
1106
1107        Ok(Triple::new(
1108            Subject::NamedNode(subject),
1109            Predicate::NamedNode(predicate),
1110            Object::NamedNode(object),
1111        ))
1112    }
1113
1114    /// Helper functions
1115    fn canonicalize_entity(&self, entity: &str) -> String {
1116        entity.trim().to_lowercase()
1117    }
1118
1119    fn get_line_number(&self, text: &str, offset: usize) -> usize {
1120        text[..offset].chars().filter(|&c| c == '\n').count() + 1
1121    }
1122
1123    fn get_column_number(&self, text: &str, offset: usize) -> usize {
1124        text[..offset]
1125            .chars()
1126            .rev()
1127            .take_while(|&c| c != '\n')
1128            .count()
1129            + 1
1130    }
1131
1132    fn find_matching_entity<'a>(
1133        &self,
1134        text: &str,
1135        entities: &'a [ExtractedEntity],
1136    ) -> Option<&'a ExtractedEntity> {
1137        entities
1138            .iter()
1139            .find(|e| e.entity_text == text || e.canonical_form == self.canonicalize_entity(text))
1140    }
1141
1142    fn relationship_type_to_predicate(&self, rel_type: &RelationshipType) -> String {
1143        match rel_type {
1144            RelationshipType::IsA => "rdf:type".to_string(),
1145            RelationshipType::PartOf => "part_of".to_string(),
1146            RelationshipType::LocatedIn => "located_in".to_string(),
1147            RelationshipType::OwnedBy => "owned_by".to_string(),
1148            RelationshipType::CreatedBy => "created_by".to_string(),
1149            RelationshipType::CausedBy => "caused_by".to_string(),
1150            RelationshipType::TemporalSequence => "temporal_sequence".to_string(),
1151            RelationshipType::Similarity => "similar_to".to_string(),
1152            RelationshipType::Dependency => "depends_on".to_string(),
1153            RelationshipType::Custom(pred) => pred.clone(),
1154        }
1155    }
1156
1157    fn parse_temporal_expression(&self, temporal_text: &str) -> Option<DateTime<Utc>> {
1158        // Simplified temporal parsing - real implementation would be more sophisticated
1159        if let Ok(dt) = chrono::DateTime::parse_from_str(temporal_text, "%Y-%m-%d") {
1160            Some(dt.with_timezone(&Utc))
1161        } else {
1162            None
1163        }
1164    }
1165
1166    fn calculate_extraction_confidence(
1167        &self,
1168        entities: &[ExtractedEntity],
1169        relationships: &[ExtractedRelationship],
1170        schema_elements: &[SchemaElement],
1171    ) -> f64 {
1172        let mut total_confidence = 0.0;
1173        let mut count = 0;
1174
1175        for entity in entities {
1176            total_confidence += entity.confidence;
1177            count += 1;
1178        }
1179
1180        for relationship in relationships {
1181            total_confidence += relationship.confidence;
1182            count += 1;
1183        }
1184
1185        for schema_element in schema_elements {
1186            total_confidence += schema_element.confidence;
1187            count += 1;
1188        }
1189
1190        if count > 0 {
1191            total_confidence / count as f64
1192        } else {
1193            0.0
1194        }
1195    }
1196}
1197
#[cfg(test)]
mod tests {
    use super::*;

    /// The engine can be constructed from the default configuration.
    #[tokio::test]
    async fn test_knowledge_extraction_engine_creation() {
        let creation = KnowledgeExtractionEngine::new(KnowledgeExtractionConfig::default());

        assert!(creation.is_ok());
    }

    /// Extracting from a sentence that mentions a person and a company
    /// succeeds and yields at least one entity.
    #[tokio::test]
    async fn test_entity_extraction() {
        let mut extractor =
            KnowledgeExtractionEngine::new(KnowledgeExtractionConfig::default()).unwrap();

        let outcome = extractor
            .extract_knowledge("Dr. John Smith works at Microsoft Corp.")
            .await;

        assert!(outcome.is_ok());
        let extracted = outcome.unwrap();
        assert!(!extracted.extracted_entities.is_empty());
    }

    /// Canonicalisation trims surrounding whitespace and lower-cases the name.
    #[test]
    fn test_canonicalize_entity() {
        let extractor =
            KnowledgeExtractionEngine::new(KnowledgeExtractionConfig::default()).unwrap();

        let canonical = extractor.canonicalize_entity("  John Smith  ");
        assert_eq!(canonical, "john smith");
    }
}
oxirs_chat/rag/knowledge_extraction.rs

oxirs_chat/rag/
knowledge_extraction.rs