oxirs_chat/rag/
knowledge_extraction.rs

1//! Automated Knowledge Extraction Module
2//!
3//! Implements sophisticated knowledge extraction capabilities including:
4//! - Entity and relationship extraction from text
5//! - Schema discovery and ontology generation
6//! - Fact validation and consistency checking
7//! - Temporal knowledge extraction
8//! - Multi-lingual knowledge extraction
9
10use anyhow::Result;
11use chrono::{DateTime, Utc};
12use oxirs_core::model::{triple::Triple, NamedNode, Object, Predicate, Subject};
13use regex::Regex;
14use serde::{Deserialize, Serialize};
15use std::collections::HashMap;
16use tracing::{debug, info, warn};
17use uuid::Uuid;
18
19/// Configuration for knowledge extraction.
///
/// The `enable_*` stage flags are read by `KnowledgeExtractionEngine::extract_knowledge`
/// to decide which pipeline stages run.
20#[derive(Debug, Clone, Serialize, Deserialize)]
21pub struct KnowledgeExtractionConfig {
    /// When `true`, entity extraction runs; otherwise the entity list stays empty.
22    pub enable_entity_extraction: bool,
    /// When `true`, relationship extraction runs; otherwise the relationship list stays empty.
23    pub enable_relationship_extraction: bool,
    /// When `true`, schema discovery runs; otherwise no schema elements are produced.
24    pub enable_schema_discovery: bool,
    /// When `true`, the generated triples are passed through fact validation.
25    pub enable_fact_validation: bool,
    /// When `true`, temporal fact extraction runs; otherwise no temporal facts are produced.
26    pub enable_temporal_extraction: bool,
    /// NOTE(review): not read by any code visible in this part of the file — confirm where it is used.
27    pub enable_multilingual_extraction: bool,
    /// Minimum relationship confidence for a relationship to take part in fact validation.
28    pub confidence_threshold: f64,
    /// NOTE(review): not read by any code visible in this part of the file — confirm where it is used.
29    pub max_extraction_depth: usize,
    /// Defaults to `["en"]`. NOTE(review): presumably language codes; usage is not visible here.
30    pub language_models: Vec<String>,
31}
32
33impl Default for KnowledgeExtractionConfig {
34    fn default() -> Self {
35        Self {
36            enable_entity_extraction: true,
37            enable_relationship_extraction: true,
38            enable_schema_discovery: true,
39            enable_fact_validation: true,
40            enable_temporal_extraction: true,
41            enable_multilingual_extraction: false,
42            confidence_threshold: 0.8,
43            max_extraction_depth: 3,
44            language_models: vec!["en".to_string()],
45        }
46    }
47}
48
49/// Extracted knowledge item: everything one `extract_knowledge` call produced
/// for a single input text.
50#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct ExtractedKnowledge {
    /// UUID v4 string generated for this extraction run.
52    pub knowledge_id: String,
    /// Copy of the input text the extraction ran on.
53    pub source_text: String,
    /// Triples derived from the extracted relationships.
54    pub extracted_triples: Vec<Triple>,
    /// Entities found in the text (empty when entity extraction is disabled).
55    pub extracted_entities: Vec<ExtractedEntity>,
    /// Relationships found in the text (empty when relationship extraction is disabled).
56    pub extracted_relationships: Vec<ExtractedRelationship>,
    /// Schema elements inferred from the text (empty when schema discovery is disabled).
57    pub schema_elements: Vec<SchemaElement>,
    /// Temporal facts found in the text (empty when temporal extraction is disabled).
58    pub temporal_facts: Vec<TemporalFact>,
    /// Overall confidence computed from the entities, relationships and schema elements.
59    pub confidence_score: f64,
    /// Timing, language and count information about the run.
60    pub extraction_metadata: ExtractionMetadata,
61}
62
63/// Detailed entity information for one entity found in the source text.
64#[derive(Debug, Clone, Serialize, Deserialize)]
65pub struct ExtractedEntity {
    /// UUID v4 string; extracted relationships refer to entities by this id.
66    pub entity_id: String,
    /// The exact text that matched an entity pattern.
67    pub entity_text: String,
    /// Type associated with the pattern table that produced the match.
68    pub entity_type: EntityType,
    /// Result of `canonicalize_entity` on `entity_text`; entity deduplication compares this field.
69    pub canonical_form: String,
    /// Left empty by the pattern-based extractor.
70    pub aliases: Vec<String>,
    /// Left empty by the pattern-based extractor.
71    pub properties: HashMap<String, String>,
    /// The pattern-based extractor assigns a fixed base confidence of 0.8.
72    pub confidence: f64,
    /// Where the match was found in the source text.
73    pub source_position: TextPosition,
    /// Left empty by the pattern-based extractor.
74    pub linked_entities: Vec<String>,
75}
76
77/// Relationship between entities, as found by a relationship pattern.
78#[derive(Debug, Clone, Serialize, Deserialize)]
79pub struct ExtractedRelationship {
    /// UUID v4 string generated for this relationship.
80    pub relationship_id: String,
    /// `entity_id` of the subject entity (an id, not the entity text).
81    pub subject_entity: String,
    /// Predicate string derived from `relationship_type` via `relationship_type_to_predicate`.
82    pub predicate: String,
    /// `entity_id` of the object entity (an id, not the entity text).
83    pub object_entity: String,
    /// The relationship type whose pattern matched.
84    pub relationship_type: RelationshipType,
    /// The pattern-based extractor assigns a fixed 0.8; fact validation compares this
    /// against the configured confidence threshold.
85    pub confidence: f64,
    /// The full text of the pattern match that produced this relationship.
86    pub evidence_text: String,
    /// `None` for relationships produced by the pattern-based extractor.
87    pub temporal_context: Option<TemporalContext>,
    /// Where the evidence text was found in the source text.
88    pub source_position: TextPosition,
89}
90
91/// Schema element discovered from text by a schema inference rule.
92#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct SchemaElement {
    /// UUID v4 string generated for this element.
94    pub element_id: String,
    /// The kind of element the matching rule infers.
95    pub element_type: SchemaElementType,
    /// Schema discovery stores the entire matched text here, not an isolated identifier.
96    pub name: String,
    /// Schema discovery fills this with `"Inferred from: "` followed by the matched text.
97    pub description: String,
    /// Left empty by schema discovery.
98    pub properties: Vec<SchemaProperty>,
    /// Left empty by schema discovery.
99    pub hierarchical_relations: Vec<HierarchicalRelation>,
    /// Left empty by schema discovery.
100    pub constraints: Vec<SchemaConstraint>,
    /// Copied from the matching rule's `confidence_modifier`.
101    pub confidence: f64,
102}
103
104/// Temporal fact with time information.
105#[derive(Debug, Clone, Serialize, Deserialize)]
106pub struct TemporalFact {
    /// UUID v4 string generated for this fact.
107    pub fact_id: String,
    /// The temporal extractor currently stores the placeholder `"temporal_entity"` here.
108    pub subject: String,
    /// The temporal extractor currently stores `"occurs_at"` here.
109    pub predicate: String,
    /// The matched temporal expression text.
110    pub object: String,
    /// Structured time information for the fact.
111    pub temporal_qualifier: TemporalQualifier,
    /// The temporal extractor assigns a fixed 0.8.
112    pub confidence: f64,
    /// The matched temporal expression text (same value the extractor puts in `object`).
113    pub source_text: String,
114}
115
116/// Types of entities that can be extracted.
///
/// Used as the key of the engine's entity pattern table. The built-in patterns
/// registered by `initialize_extraction_patterns` cover only `Person`,
/// `Organization` and `Location`.
117#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
118pub enum EntityType {
119    Person,
120    Organization,
121    Location,
122    Event,
123    Concept,
124    Product,
125    Technology,
126    Scientific,
127    Temporal,
128    Numerical,
129    Unknown,
130}
131
132/// Types of relationships.
///
/// Used as the key of the engine's relationship pattern table. The built-in
/// patterns registered by `initialize_extraction_patterns` cover only `IsA`
/// and `PartOf`.
133#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
134pub enum RelationshipType {
135    IsA,
136    PartOf,
137    LocatedIn,
138    OwnedBy,
139    CreatedBy,
140    CausedBy,
141    TemporalSequence,
142    Similarity,
143    Dependency,
    // NOTE(review): presumably a free-form label for relationships outside the
    // fixed variants — no constructor is visible in this part of the file.
144    Custom(String),
145}
146
147/// Schema element types.
///
/// The built-in inference rules registered by `initialize_schema_rules`
/// produce only `Class` and `Property`.
148#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
149pub enum SchemaElementType {
150    Class,
151    Property,
152    Relationship,
153    Constraint,
154    Rule,
155}
156
157/// Position in source text.
158#[derive(Debug, Clone, Serialize, Deserialize)]
159pub struct TextPosition {
    /// Start of the span, taken from the regex match's `start()`.
160    pub start_offset: usize,
    /// End of the span, taken from the regex match's `end()`.
161    pub end_offset: usize,
    /// For entities this comes from `get_line_number`; the relationship extractor
    /// stores a constant 1. NOTE(review): numbering base is not visible here — confirm.
162    pub line_number: usize,
    /// For entities this comes from `get_column_number`; the relationship extractor
    /// stores a constant 1. NOTE(review): numbering base is not visible here — confirm.
163    pub column_number: usize,
164}
165
166/// Temporal context information attached to a relationship.
///
/// Derives `PartialEq`; temporal validation uses that equality to skip
/// comparing a context against an identical one.
167#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
168pub struct TemporalContext {
    /// Optional start instant (UTC). When both bounds are present, temporal
    /// validation rejects contexts whose start is not strictly before the end.
169    pub start_time: Option<DateTime<Utc>>,
    /// Optional end instant (UTC).
170    pub end_time: Option<DateTime<Utc>>,
    /// Optional duration. Not consulted by the temporal checks visible in this file section.
171    pub duration: Option<std::time::Duration>,
    /// Free-form string. Not interpreted by the temporal checks visible in this file section.
172    pub temporal_relation: String,
173}
174
175/// Temporal qualifier for facts.
176#[derive(Debug, Clone, Serialize, Deserialize)]
177pub struct TemporalQualifier {
    /// Kind of temporal information; the temporal extractor always sets `PointInTime`.
178    pub qualifier_type: TemporalType,
    /// The temporal extractor fills this from `parse_temporal_expression`, which may yield `None`.
179    pub time_point: Option<DateTime<Utc>>,
    /// Left as `None` by the temporal extractor.
180    pub time_interval: Option<TimeInterval>,
    /// Left as `None` by the temporal extractor.
181    pub frequency: Option<String>,
182}
183
184/// Types of temporal information.
///
/// The pattern-based temporal extractor currently tags every fact as
/// `PointInTime`; no visible code produces the other variants.
185#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
186pub enum TemporalType {
187    PointInTime,
188    TimeInterval,
189    Frequency,
190    Duration,
191    Relative,
192}
193
194/// Time interval bounded by two UTC instants.
///
/// NOTE(review): whether the bounds are inclusive, and whether `start <= end`
/// is enforced anywhere, is not visible in this part of the file.
195#[derive(Debug, Clone, Serialize, Deserialize)]
196pub struct TimeInterval {
197    pub start: DateTime<Utc>,
198    pub end: DateTime<Utc>,
199}
200
201/// Schema property attached to a schema element.
///
/// No code visible in this part of the file constructs this type; field
/// meanings below are inferred from names and should be confirmed.
202#[derive(Debug, Clone, Serialize, Deserialize)]
203pub struct SchemaProperty {
204    pub property_name: String,
    // NOTE(review): presumably a datatype or class name — confirm against producers.
205    pub property_type: String,
206    pub cardinality: Cardinality,
    // NOTE(review): presumably the class the property applies to — confirm.
207    pub domain: Option<String>,
    // NOTE(review): presumably the class or datatype of the property's values — confirm.
208    pub range: Option<String>,
209}
210
211/// Hierarchical relation in schema: a typed link between a parent and a child name.
///
/// No code visible in this part of the file constructs this type.
212#[derive(Debug, Clone, Serialize, Deserialize)]
213pub struct HierarchicalRelation {
214    pub relation_type: HierarchyType,
215    pub parent: String,
216    pub child: String,
217}
218
219/// Schema constraint: a constraint kind, a human-readable description and
/// how strictly it is meant to be enforced.
///
/// No code visible in this part of the file constructs or enforces this type.
220#[derive(Debug, Clone, Serialize, Deserialize)]
221pub struct SchemaConstraint {
222    pub constraint_type: ConstraintType,
223    pub description: String,
224    pub enforcement_level: EnforcementLevel,
225}
226
227/// Property cardinality.
228#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
229pub enum Cardinality {
230    ZeroOrOne,
231    ExactlyOne,
232    ZeroOrMore,
233    OneOrMore,
    // NOTE(review): presumably the exact required number of values — confirm.
234    Exact(usize),
    // NOTE(review): presumably (min, max); whether the bounds are inclusive is not
    // visible in this part of the file — confirm.
235    Range(usize, usize),
236}
237
238/// Hierarchy types: the kind of link a `HierarchicalRelation` expresses
/// between its parent and child.
239#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
240pub enum HierarchyType {
241    SubClassOf,
242    SubPropertyOf,
243    PartOf,
244    InstanceOf,
245}
246
247/// Constraint types: the kind of rule a `SchemaConstraint` describes.
///
/// No code visible in this part of the file interprets these variants.
248#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
249pub enum ConstraintType {
250    UniqueValue,
251    RequiredProperty,
252    ValueRange,
253    DataType,
254    Pattern,
255    Cardinality,
256}
257
258/// Enforcement levels recorded on a `SchemaConstraint`.
///
/// NOTE(review): the variants look ordered from strongest to weakest, but no
/// code visible in this part of the file acts on them — confirm.
259#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
260pub enum EnforcementLevel {
261    Strict,
262    Warning,
263    Suggestion,
264}
265
266/// Metadata about the extraction process, filled in at the end of `extract_knowledge`.
267#[derive(Debug, Clone, Serialize, Deserialize)]
268pub struct ExtractionMetadata {
    /// `Utc::now()` taken when the metadata is assembled, i.e. after all stages ran.
269    pub extraction_timestamp: DateTime<Utc>,
    /// Fixed descriptive label written by `extract_knowledge`.
270    pub extraction_method: String,
    /// Elapsed wall-clock time of the `extract_knowledge` call, in milliseconds.
271    pub processing_time_ms: u64,
    /// Language code reported by language detection, or `"unknown"` when no detector matched.
272    pub language_detected: String,
    /// `text.len()` of the input, i.e. its length in bytes.
273    pub text_length: usize,
    /// Per-category counts for the run.
274    pub extraction_statistics: ExtractionStatistics,
275}
276
277/// Statistics about extraction results: the lengths of the result lists of one run.
278#[derive(Debug, Clone, Serialize, Deserialize)]
279pub struct ExtractionStatistics {
280    pub entities_extracted: usize,
281    pub relationships_extracted: usize,
282    pub triples_generated: usize,
283    pub schema_elements_discovered: usize,
284    pub temporal_facts_extracted: usize,
    /// `extract_knowledge` stores the run's overall confidence score here, not a
    /// separately computed mean.
285    pub average_confidence: f64,
286}
287
288/// Knowledge extraction engine.
///
/// Holds the configuration plus the regex tables that drive each stage. All
/// tables start empty and are populated by the `initialize_*` methods called
/// from `new`.
289pub struct KnowledgeExtractionEngine {
    /// Stage switches and thresholds.
290    config: KnowledgeExtractionConfig,
    /// Entity regexes grouped by the entity type they produce.
291    entity_patterns: HashMap<EntityType, Vec<Regex>>,
    /// Relationship regexes grouped by relationship type; capture groups 1 and 2
    /// are read as subject and object.
292    relationship_patterns: HashMap<RelationshipType, Vec<Regex>>,
    /// Regexes whose matches become temporal facts.
293    temporal_patterns: Vec<Regex>,
    /// Rules whose matches become schema elements.
294    schema_inference_rules: Vec<SchemaInferenceRule>,
    /// Language detectors keyed by language code; the key is what detection returns.
295    language_detectors: HashMap<String, LanguageDetector>,
296}
297
298/// Schema inference rule: a regex plus the schema element kind its matches imply.
299#[derive(Debug, Clone)]
300struct SchemaInferenceRule {
    /// Identifier of the rule. Not read by the schema discovery code visible in this file section.
301    rule_id: String,
    /// Every match of this regex in the text yields one schema element.
302    pattern: Regex,
    /// Element type assigned to elements produced by this rule.
303    inferred_type: SchemaElementType,
    /// Used directly as the `confidence` of elements produced by this rule.
304    confidence_modifier: f64,
305}
306
307/// Language detector: a set of regexes and the share of them that must match.
308#[derive(Debug, Clone)]
309struct LanguageDetector {
    /// Language code of this detector. Detection returns the map key it is
    /// registered under, not this field.
310    language_code: String,
    /// Each regex counts as one vote when it matches anywhere in the text.
311    detection_patterns: Vec<Regex>,
    /// Minimum fraction of `detection_patterns` that must match for the language to be reported.
312    confidence_threshold: f64,
313}
314
315impl KnowledgeExtractionEngine {
316    /// Create a new knowledge extraction engine
317    pub fn new(config: KnowledgeExtractionConfig) -> Result<Self> {
318        let mut engine = Self {
319            config,
320            entity_patterns: HashMap::new(),
321            relationship_patterns: HashMap::new(),
322            temporal_patterns: Vec::new(),
323            schema_inference_rules: Vec::new(),
324            language_detectors: HashMap::new(),
325        };
326
327        engine.initialize_extraction_patterns()?;
328        engine.initialize_schema_rules()?;
329        engine.initialize_language_detectors()?;
330
331        Ok(engine)
332    }
333
334    /// Initialize entity and relationship extraction patterns
335    fn initialize_extraction_patterns(&mut self) -> Result<()> {
336        // Person entity patterns
337        let person_patterns = vec![
338            Regex::new(r"\b[A-Z][a-z]+ [A-Z][a-z]+\b")?, // FirstName LastName
339            Regex::new(r"\bDr\. [A-Z][a-z]+\b")?,        // Dr. Name
340            Regex::new(r"\bProf\. [A-Z][a-z]+\b")?,      // Prof. Name
341        ];
342        self.entity_patterns
343            .insert(EntityType::Person, person_patterns);
344
345        // Organization patterns
346        let org_patterns = vec![
347            Regex::new(r"\b[A-Z][a-z]+ (Inc|Corp|Ltd|LLC)\b")?,
348            Regex::new(r"\bUniversity of [A-Z][a-z]+\b")?,
349            Regex::new(r"\b[A-Z][A-Z]+ Corporation\b")?,
350        ];
351        self.entity_patterns
352            .insert(EntityType::Organization, org_patterns);
353
354        // Location patterns
355        let location_patterns = vec![
356            Regex::new(r"\b[A-Z][a-z]+, [A-Z][A-Z]\b")?, // City, State
357            Regex::new(r"\b[A-Z][a-z]+ [A-Z][a-z]+\b")?, // City Country
358        ];
359        self.entity_patterns
360            .insert(EntityType::Location, location_patterns);
361
362        // Temporal patterns
363        self.temporal_patterns = vec![
364            Regex::new(r"\b\d{4}-\d{2}-\d{2}\b")?, // YYYY-MM-DD
365            Regex::new(
366                r"\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b",
367            )?,
368            Regex::new(r"\b(before|after|during|since|until) \d{4}\b")?,
369        ];
370
371        // Relationship patterns
372        let isa_patterns = vec![
373            Regex::new(r"(.+) is an? (.+)")?,
374            Regex::new(r"(.+) type of (.+)")?,
375        ];
376        self.relationship_patterns
377            .insert(RelationshipType::IsA, isa_patterns);
378
379        let partof_patterns = vec![
380            Regex::new(r"(.+) part of (.+)")?,
381            Regex::new(r"(.+) component of (.+)")?,
382        ];
383        self.relationship_patterns
384            .insert(RelationshipType::PartOf, partof_patterns);
385
386        Ok(())
387    }
388
389    /// Initialize schema inference rules
390    fn initialize_schema_rules(&mut self) -> Result<()> {
391        self.schema_inference_rules = vec![
392            SchemaInferenceRule {
393                rule_id: "class_definition".to_string(),
394                pattern: Regex::new(r"(.+) is a type of (.+)")?,
395                inferred_type: SchemaElementType::Class,
396                confidence_modifier: 0.9,
397            },
398            SchemaInferenceRule {
399                rule_id: "property_definition".to_string(),
400                pattern: Regex::new(r"(.+) has (.+)")?,
401                inferred_type: SchemaElementType::Property,
402                confidence_modifier: 0.8,
403            },
404        ];
405
406        Ok(())
407    }
408
409    /// Initialize language detectors
410    fn initialize_language_detectors(&mut self) -> Result<()> {
411        // English detector
412        self.language_detectors.insert(
413            "en".to_string(),
414            LanguageDetector {
415                language_code: "en".to_string(),
416                detection_patterns: vec![
417                    Regex::new(r"\b(the|and|or|but|if|when|where)\b")?,
418                    Regex::new(r"\b(is|are|was|were|have|has|had)\b")?,
419                ],
420                confidence_threshold: 0.7,
421            },
422        );
423
424        // Add more language detectors as needed
425        Ok(())
426    }
427
428    /// Extract knowledge from text
429    pub async fn extract_knowledge(&mut self, text: &str) -> Result<ExtractedKnowledge> {
430        let start_time = std::time::Instant::now();
431        info!(
432            "Starting knowledge extraction from text of length: {}",
433            text.len()
434        );
435
436        let knowledge_id = Uuid::new_v4().to_string();
437        let mut extracted_triples = Vec::new();
438        let mut extracted_entities = Vec::new();
439        let mut extracted_relationships = Vec::new();
440        let mut schema_elements = Vec::new();
441        let mut temporal_facts = Vec::new();
442
443        // Detect language
444        let detected_language = self.detect_language(text).await?;
445        debug!("Detected language: {}", detected_language);
446
447        // Extract entities
448        if self.config.enable_entity_extraction {
449            extracted_entities = self.extract_entities(text).await?;
450            debug!("Extracted {} entities", extracted_entities.len());
451        }
452
453        // Extract relationships
454        if self.config.enable_relationship_extraction {
455            extracted_relationships = self
456                .extract_relationships(text, &extracted_entities)
457                .await?;
458            debug!("Extracted {} relationships", extracted_relationships.len());
459        }
460
461        // Generate triples from relationships
462        for relationship in &extracted_relationships {
463            if let Ok(triple) = self.relationship_to_triple(relationship) {
464                extracted_triples.push(triple);
465            }
466        }
467
468        // Discover schema elements
469        if self.config.enable_schema_discovery {
470            schema_elements = self
471                .discover_schema_elements(text, &extracted_entities)
472                .await?;
473            debug!("Discovered {} schema elements", schema_elements.len());
474        }
475
476        // Extract temporal facts
477        if self.config.enable_temporal_extraction {
478            temporal_facts = self
479                .extract_temporal_facts(text, &extracted_entities)
480                .await?;
481            debug!("Extracted {} temporal facts", temporal_facts.len());
482        }
483
484        // Validate facts if enabled
485        if self.config.enable_fact_validation {
486            self.validate_extracted_facts(&mut extracted_triples, &extracted_relationships)
487                .await?;
488        }
489
490        // Calculate overall confidence
491        let confidence_score = self.calculate_extraction_confidence(
492            &extracted_entities,
493            &extracted_relationships,
494            &schema_elements,
495        );
496
497        let processing_time = start_time.elapsed().as_millis() as u64;
498
499        let extraction_statistics = ExtractionStatistics {
500            entities_extracted: extracted_entities.len(),
501            relationships_extracted: extracted_relationships.len(),
502            triples_generated: extracted_triples.len(),
503            schema_elements_discovered: schema_elements.len(),
504            temporal_facts_extracted: temporal_facts.len(),
505            average_confidence: confidence_score,
506        };
507
508        let extraction_metadata = ExtractionMetadata {
509            extraction_timestamp: Utc::now(),
510            extraction_method: "Pattern-based + LLM-enhanced".to_string(),
511            processing_time_ms: processing_time,
512            language_detected: detected_language,
513            text_length: text.len(),
514            extraction_statistics,
515        };
516
517        info!("Knowledge extraction completed in {}ms", processing_time);
518
519        Ok(ExtractedKnowledge {
520            knowledge_id,
521            source_text: text.to_string(),
522            extracted_triples,
523            extracted_entities,
524            extracted_relationships,
525            schema_elements,
526            temporal_facts,
527            confidence_score,
528            extraction_metadata,
529        })
530    }
531
532    /// Detect language of text
533    async fn detect_language(&self, text: &str) -> Result<String> {
534        // Simple pattern-based language detection
535        for (lang_code, detector) in &self.language_detectors {
536            let mut matches = 0;
537            let mut total_patterns = 0;
538
539            for pattern in &detector.detection_patterns {
540                total_patterns += 1;
541                if pattern.is_match(text) {
542                    matches += 1;
543                }
544            }
545
546            let confidence = matches as f64 / total_patterns as f64;
547            if confidence >= detector.confidence_threshold {
548                return Ok(lang_code.clone());
549            }
550        }
551
552        Ok("unknown".to_string())
553    }
554
555    /// Extract entities from text
556    async fn extract_entities(&self, text: &str) -> Result<Vec<ExtractedEntity>> {
557        let mut entities = Vec::new();
558
559        for (entity_type, patterns) in &self.entity_patterns {
560            for pattern in patterns {
561                for capture in pattern.find_iter(text) {
562                    let entity_text = capture.as_str();
563                    let start_pos = capture.start();
564                    let end_pos = capture.end();
565
566                    let entity = ExtractedEntity {
567                        entity_id: Uuid::new_v4().to_string(),
568                        entity_text: entity_text.to_string(),
569                        entity_type: entity_type.clone(),
570                        canonical_form: self.canonicalize_entity(entity_text),
571                        aliases: Vec::new(),
572                        properties: HashMap::new(),
573                        confidence: 0.8, // Base confidence for pattern matches
574                        source_position: TextPosition {
575                            start_offset: start_pos,
576                            end_offset: end_pos,
577                            line_number: self.get_line_number(text, start_pos),
578                            column_number: self.get_column_number(text, start_pos),
579                        },
580                        linked_entities: Vec::new(),
581                    };
582
583                    entities.push(entity);
584                }
585            }
586        }
587
588        // Remove duplicates based on canonical form
589        entities.sort_by(|a, b| a.canonical_form.cmp(&b.canonical_form));
590        entities.dedup_by(|a, b| a.canonical_form == b.canonical_form);
591
592        Ok(entities)
593    }
594
595    /// Extract relationships from text
596    async fn extract_relationships(
597        &self,
598        text: &str,
599        entities: &[ExtractedEntity],
600    ) -> Result<Vec<ExtractedRelationship>> {
601        let mut relationships = Vec::new();
602
603        for (relationship_type, patterns) in &self.relationship_patterns {
604            for pattern in patterns {
605                if let Some(captures) = pattern.captures(text) {
606                    if captures.len() >= 3 {
607                        let subject = captures
608                            .get(1)
609                            .expect("capture group 1 should exist")
610                            .as_str();
611                        let object = captures
612                            .get(2)
613                            .expect("capture group 2 should exist")
614                            .as_str();
615
616                        // Find matching entities
617                        let subject_entity = self.find_matching_entity(subject, entities);
618                        let object_entity = self.find_matching_entity(object, entities);
619
620                        if let (Some(subj), Some(obj)) = (subject_entity, object_entity) {
621                            let relationship = ExtractedRelationship {
622                                relationship_id: Uuid::new_v4().to_string(),
623                                subject_entity: subj.entity_id.clone(),
624                                predicate: self.relationship_type_to_predicate(relationship_type),
625                                object_entity: obj.entity_id.clone(),
626                                relationship_type: relationship_type.clone(),
627                                confidence: 0.8,
628                                evidence_text: captures
629                                    .get(0)
630                                    .expect("capture group 0 should exist")
631                                    .as_str()
632                                    .to_string(),
633                                temporal_context: None,
634                                source_position: TextPosition {
635                                    start_offset: captures
636                                        .get(0)
637                                        .expect("capture group 0 should exist")
638                                        .start(),
639                                    end_offset: captures
640                                        .get(0)
641                                        .expect("capture group 0 should exist")
642                                        .end(),
643                                    line_number: 1,   // Simplified
644                                    column_number: 1, // Simplified
645                                },
646                            };
647
648                            relationships.push(relationship);
649                        }
650                    }
651                }
652            }
653        }
654
655        Ok(relationships)
656    }
657
658    /// Discover schema elements from text
659    async fn discover_schema_elements(
660        &self,
661        text: &str,
662        _entities: &[ExtractedEntity],
663    ) -> Result<Vec<SchemaElement>> {
664        let mut schema_elements = Vec::new();
665
666        for rule in &self.schema_inference_rules {
667            for capture in rule.pattern.find_iter(text) {
668                let element = SchemaElement {
669                    element_id: Uuid::new_v4().to_string(),
670                    element_type: rule.inferred_type.clone(),
671                    name: capture.as_str().to_string(),
672                    description: format!("Inferred from: {}", capture.as_str()),
673                    properties: Vec::new(),
674                    hierarchical_relations: Vec::new(),
675                    constraints: Vec::new(),
676                    confidence: rule.confidence_modifier,
677                };
678
679                schema_elements.push(element);
680            }
681        }
682
683        Ok(schema_elements)
684    }
685
686    /// Extract temporal facts from text
687    async fn extract_temporal_facts(
688        &self,
689        text: &str,
690        _entities: &[ExtractedEntity],
691    ) -> Result<Vec<TemporalFact>> {
692        let mut temporal_facts = Vec::new();
693
694        for pattern in &self.temporal_patterns {
695            for capture in pattern.find_iter(text) {
696                let temporal_text = capture.as_str();
697
698                let temporal_fact = TemporalFact {
699                    fact_id: Uuid::new_v4().to_string(),
700                    subject: "temporal_entity".to_string(), // Would be linked to actual entities
701                    predicate: "occurs_at".to_string(),
702                    object: temporal_text.to_string(),
703                    temporal_qualifier: TemporalQualifier {
704                        qualifier_type: TemporalType::PointInTime,
705                        time_point: self.parse_temporal_expression(temporal_text),
706                        time_interval: None,
707                        frequency: None,
708                    },
709                    confidence: 0.8,
710                    source_text: temporal_text.to_string(),
711                };
712
713                temporal_facts.push(temporal_fact);
714            }
715        }
716
717        Ok(temporal_facts)
718    }
719
720    /// Validate extracted facts for consistency
721    async fn validate_extracted_facts(
722        &self,
723        triples: &mut Vec<Triple>,
724        relationships: &[ExtractedRelationship],
725    ) -> Result<()> {
726        // Remove low-confidence relationships
727        let valid_relationships: Vec<_> = relationships
728            .iter()
729            .filter(|r| r.confidence >= self.config.confidence_threshold)
730            .collect();
731
732        // Check for contradictions and validate facts
733        let mut contradictions_found = 0;
734        let mut validated_triples = Vec::new();
735
736        // Create relationship maps for efficient lookup
737        let mut subject_predicates: HashMap<String, Vec<&ExtractedRelationship>> = HashMap::new();
738        let mut predicate_pairs: HashMap<String, Vec<(&str, &str)>> = HashMap::new();
739
740        for relationship in &valid_relationships {
741            subject_predicates
742                .entry(relationship.subject_entity.clone())
743                .or_default()
744                .push(relationship);
745
746            predicate_pairs
747                .entry(relationship.predicate.clone())
748                .or_default()
749                .push((&relationship.subject_entity, &relationship.object_entity));
750        }
751
752        // Check for direct contradictions (same subject-predicate with different objects)
753        for (subject, relationships) in &subject_predicates {
754            let mut predicate_values: HashMap<String, Vec<&str>> = HashMap::new();
755
756            for rel in relationships {
757                predicate_values
758                    .entry(rel.predicate.clone())
759                    .or_default()
760                    .push(&rel.object_entity);
761            }
762
763            for (predicate, values) in predicate_values {
764                if values.len() > 1 {
765                    // Check if multiple values for the same predicate indicate contradiction
766                    let unique_values: std::collections::HashSet<_> = values.into_iter().collect();
767                    if unique_values.len() > 1 && self.is_contradictory_predicate(&predicate) {
768                        warn!(
769                            "Contradiction detected for {}: {} has multiple {} values: {:?}",
770                            subject, subject, predicate, unique_values
771                        );
772                        contradictions_found += 1;
773
774                        // Keep only the highest confidence relationship for this predicate
775                        if let Some(best_rel) = relationships
776                            .iter()
777                            .filter(|r| r.predicate == predicate)
778                            .max_by(|a, b| {
779                                a.confidence
780                                    .partial_cmp(&b.confidence)
781                                    .unwrap_or(std::cmp::Ordering::Equal)
782                            })
783                        {
784                            if let Ok(triple) = self.relationship_to_triple(best_rel) {
785                                validated_triples.push(triple);
786                            }
787                        }
788                        continue;
789                    }
790                }
791            }
792        }
793
794        // Check temporal consistency
795        for relationship in &valid_relationships {
796            if let Some(temporal_context) = &relationship.temporal_context {
797                if !self.validate_temporal_consistency(temporal_context, &valid_relationships) {
798                    warn!(
799                        "Temporal inconsistency detected for relationship: {} {} {}",
800                        relationship.subject_entity,
801                        relationship.predicate,
802                        relationship.object_entity
803                    );
804                    contradictions_found += 1;
805                    continue;
806                }
807            }
808
809            // Add valid relationship as triple
810            if let Ok(triple) = self.relationship_to_triple(relationship) {
811                validated_triples.push(triple);
812            }
813        }
814
815        // Check logical consistency (e.g., transitive relationships)
816        self.validate_logical_consistency(&valid_relationships, &mut contradictions_found)?;
817
818        // Update triples with validated ones
819        triples.clear();
820        triples.extend(validated_triples);
821
822        if contradictions_found > 0 {
823            warn!(
824                "Found {} contradictions during fact validation",
825                contradictions_found
826            );
827        }
828
829        debug!("Validated {} relationships", valid_relationships.len());
830        Ok(())
831    }
832
833    /// Check if a predicate type indicates contradictory values are not allowed
834    fn is_contradictory_predicate(&self, predicate: &str) -> bool {
835        // Define predicates that should have unique values (functional properties)
836        let functional_predicates = [
837            "birthDate",
838            "deathDate",
839            "age",
840            "height",
841            "weight",
842            "hasGender",
843            "isA",
844            "type",
845            "hasCapital",
846            "hasPopulation",
847            "hasArea",
848            "founded",
849            "established",
850            "created",
851            "died",
852            "born",
853        ];
854
855        functional_predicates.iter().any(|&fp| {
856            predicate.to_lowercase().contains(&fp.to_lowercase()) || predicate.ends_with(&fp)
857        })
858    }
859
860    /// Validate temporal consistency of relationships
861    fn validate_temporal_consistency(
862        &self,
863        temporal_context: &TemporalContext,
864        all_relationships: &[&ExtractedRelationship],
865    ) -> bool {
866        // Check if temporal context makes sense
867        if let (Some(start), Some(end)) = (&temporal_context.start_time, &temporal_context.end_time)
868        {
869            if start >= end {
870                return false; // Start time cannot be after end time
871            }
872        }
873
874        // Check for temporal conflicts with other relationships
875        for other_rel in all_relationships {
876            if let Some(other_temporal) = &other_rel.temporal_context {
877                // If same entities with conflicting time periods
878                if temporal_context != other_temporal {
879                    // Check for overlapping time periods that might indicate conflicts
880                    if self.temporal_periods_conflict(temporal_context, other_temporal) {
881                        return false;
882                    }
883                }
884            }
885        }
886
887        true
888    }
889
890    /// Check if two temporal periods conflict
891    fn temporal_periods_conflict(
892        &self,
893        context1: &TemporalContext,
894        context2: &TemporalContext,
895    ) -> bool {
896        // Simple check - in real implementation, this would be more sophisticated
897        // Check if both have explicit time ranges that don't overlap
898        match (
899            (&context1.start_time, &context1.end_time),
900            (&context2.start_time, &context2.end_time),
901        ) {
902            ((Some(start1), Some(end1)), (Some(start2), Some(end2))) => {
903                // If periods don't overlap, they might be conflicting for certain relationships
904                end1 < start2 || end2 < start1
905            }
906            _ => false, // If we don't have full temporal information, assume no conflict
907        }
908    }
909
910    /// Validate logical consistency across relationships
911    fn validate_logical_consistency(
912        &self,
913        relationships: &[&ExtractedRelationship],
914        contradictions_found: &mut usize,
915    ) -> Result<()> {
916        // Check transitive relationships
917        let mut is_a_relationships: HashMap<String, String> = HashMap::new();
918        let mut part_of_relationships: HashMap<String, String> = HashMap::new();
919
920        // Collect hierarchical relationships
921        for rel in relationships {
922            let pred_lower = rel.predicate.to_lowercase();
923            if pred_lower.contains("isa")
924                || pred_lower.contains("instanceof")
925                || pred_lower.contains("type")
926            {
927                is_a_relationships.insert(rel.subject_entity.clone(), rel.object_entity.clone());
928            } else if pred_lower.contains("partof")
929                || pred_lower.contains("contains")
930                || pred_lower.contains("within")
931            {
932                part_of_relationships.insert(rel.subject_entity.clone(), rel.object_entity.clone());
933            }
934        }
935
936        // Check for cycles in is-a relationships (which would be logical contradictions)
937        for (subject, object) in &is_a_relationships {
938            if self.has_cycle_in_hierarchy(subject, object, &is_a_relationships) {
939                warn!(
940                    "Logical contradiction: Cycle detected in is-a relationship for {}",
941                    subject
942                );
943                *contradictions_found += 1;
944            }
945        }
946
947        // Check for cycles in part-of relationships
948        for (subject, object) in &part_of_relationships {
949            if self.has_cycle_in_hierarchy(subject, object, &part_of_relationships) {
950                warn!(
951                    "Logical contradiction: Cycle detected in part-of relationship for {}",
952                    subject
953                );
954                *contradictions_found += 1;
955            }
956        }
957
958        // Check domain/range constraints
959        self.validate_domain_range_constraints(relationships, contradictions_found)?;
960
961        Ok(())
962    }
963
964    /// Check for cycles in hierarchical relationships
965    fn has_cycle_in_hierarchy(
966        &self,
967        start: &str,
968        current: &str,
969        hierarchy: &HashMap<String, String>,
970    ) -> bool {
971        if start == current {
972            return true; // Direct cycle
973        }
974
975        // Follow the chain to detect cycles
976        let mut visited = std::collections::HashSet::new();
977        let mut current_node = current;
978
979        while let Some(parent) = hierarchy.get(current_node) {
980            if visited.contains(current_node) || current_node == start {
981                return true; // Cycle detected
982            }
983            visited.insert(current_node.to_string());
984            current_node = parent;
985        }
986
987        false
988    }
989
990    /// Validate domain and range constraints for relationships
991    fn validate_domain_range_constraints(
992        &self,
993        relationships: &[&ExtractedRelationship],
994        contradictions_found: &mut usize,
995    ) -> Result<()> {
996        // Define some basic domain/range constraints
997        let constraints = [
998            ("age", "Person", "Number"),
999            ("birthDate", "Person", "Date"),
1000            ("hasCapital", "Country", "City"),
1001            ("hasPopulation", "Place", "Number"),
1002            ("authorOf", "Person", "Book"),
1003            ("marriedTo", "Person", "Person"),
1004        ];
1005
1006        for rel in relationships {
1007            for (predicate, expected_domain, expected_range) in &constraints {
1008                if rel
1009                    .predicate
1010                    .to_lowercase()
1011                    .contains(&predicate.to_lowercase())
1012                {
1013                    // Check if subject matches expected domain type
1014                    if !self.entity_matches_type(
1015                        &rel.subject_entity,
1016                        expected_domain,
1017                        relationships,
1018                    ) {
1019                        warn!(
1020                            "Domain constraint violation: {} should be of type {} for predicate {}",
1021                            rel.subject_entity, expected_domain, rel.predicate
1022                        );
1023                        *contradictions_found += 1;
1024                    }
1025
1026                    // Check if object matches expected range type
1027                    if !self.entity_matches_type(&rel.object_entity, expected_range, relationships)
1028                    {
1029                        warn!(
1030                            "Range constraint violation: {} should be of type {} for predicate {}",
1031                            rel.object_entity, expected_range, rel.predicate
1032                        );
1033                        *contradictions_found += 1;
1034                    }
1035                }
1036            }
1037        }
1038
1039        Ok(())
1040    }
1041
1042    /// Check if an entity matches a given type based on other relationships
1043    fn entity_matches_type(
1044        &self,
1045        entity: &str,
1046        expected_type: &str,
1047        relationships: &[&ExtractedRelationship],
1048    ) -> bool {
1049        // Simple heuristic-based type checking
1050        let entity_lower = entity.to_lowercase();
1051        let type_lower = expected_type.to_lowercase();
1052
1053        // Check if entity name suggests the type
1054        match type_lower.as_str() {
1055            "person" => {
1056                entity_lower.contains("person") || 
1057                entity_lower.contains("author") ||
1058                entity_lower.contains("writer") ||
1059                entity_lower.contains("scientist") ||
1060                // Common person name patterns
1061                entity.chars().next().is_some_and(|c| c.is_uppercase())
1062            }
1063            "number" => {
1064                entity.parse::<f64>().is_ok()
1065                    || entity_lower.contains("million")
1066                    || entity_lower.contains("thousand")
1067                    || entity_lower.contains("year")
1068            }
1069            "date" => {
1070                entity_lower.contains("19") || entity_lower.contains("20") || // Years
1071                entity_lower.contains("january") || entity_lower.contains("february") ||
1072                entity_lower.contains("march") || entity_lower.contains("april") ||
1073                entity_lower.contains("may") || entity_lower.contains("june") ||
1074                entity_lower.contains("july") || entity_lower.contains("august") ||
1075                entity_lower.contains("september") || entity_lower.contains("october") ||
1076                entity_lower.contains("november") || entity_lower.contains("december")
1077            }
1078            "country" => {
1079                entity_lower.contains("country") ||
1080                entity_lower.contains("nation") ||
1081                // Check if explicitly typed as country in relationships
1082                relationships.iter().any(|r| r.subject_entity == entity &&
1083                    r.predicate.to_lowercase().contains("type") && 
1084                    r.object_entity.to_lowercase().contains("country"))
1085            }
1086            "city" => {
1087                entity_lower.contains("city") ||
1088                entity_lower.contains("town") ||
1089                // Check if explicitly typed as city in relationships
1090                relationships.iter().any(|r| r.subject_entity == entity &&
1091                    r.predicate.to_lowercase().contains("type") && 
1092                    r.object_entity.to_lowercase().contains("city"))
1093            }
1094            "book" => {
1095                entity_lower.contains("book") ||
1096                entity_lower.contains("novel") ||
1097                entity_lower.contains("publication") ||
1098                // Check if explicitly typed as book in relationships
1099                relationships.iter().any(|r| r.subject_entity == entity &&
1100                    r.predicate.to_lowercase().contains("type") && 
1101                    r.object_entity.to_lowercase().contains("book"))
1102            }
1103            _ => true, // Unknown type, assume valid
1104        }
1105    }
1106
1107    /// Convert relationship to RDF triple
1108    fn relationship_to_triple(&self, relationship: &ExtractedRelationship) -> Result<Triple> {
1109        // This is a simplified conversion - real implementation would be more sophisticated
1110        let subject = NamedNode::new(format!(
1111            "http://example.org/entity/{}",
1112            relationship.subject_entity
1113        ))?;
1114        let predicate = NamedNode::new(format!(
1115            "http://example.org/predicate/{}",
1116            relationship.predicate
1117        ))?;
1118        let object = NamedNode::new(format!(
1119            "http://example.org/entity/{}",
1120            relationship.object_entity
1121        ))?;
1122
1123        Ok(Triple::new(
1124            Subject::NamedNode(subject),
1125            Predicate::NamedNode(predicate),
1126            Object::NamedNode(object),
1127        ))
1128    }
1129
1130    /// Helper functions
1131    fn canonicalize_entity(&self, entity: &str) -> String {
1132        entity.trim().to_lowercase()
1133    }
1134
1135    fn get_line_number(&self, text: &str, offset: usize) -> usize {
1136        text[..offset].chars().filter(|&c| c == '\n').count() + 1
1137    }
1138
1139    fn get_column_number(&self, text: &str, offset: usize) -> usize {
1140        text[..offset]
1141            .chars()
1142            .rev()
1143            .take_while(|&c| c != '\n')
1144            .count()
1145            + 1
1146    }
1147
1148    fn find_matching_entity<'a>(
1149        &self,
1150        text: &str,
1151        entities: &'a [ExtractedEntity],
1152    ) -> Option<&'a ExtractedEntity> {
1153        entities
1154            .iter()
1155            .find(|e| e.entity_text == text || e.canonical_form == self.canonicalize_entity(text))
1156    }
1157
1158    fn relationship_type_to_predicate(&self, rel_type: &RelationshipType) -> String {
1159        match rel_type {
1160            RelationshipType::IsA => "rdf:type".to_string(),
1161            RelationshipType::PartOf => "part_of".to_string(),
1162            RelationshipType::LocatedIn => "located_in".to_string(),
1163            RelationshipType::OwnedBy => "owned_by".to_string(),
1164            RelationshipType::CreatedBy => "created_by".to_string(),
1165            RelationshipType::CausedBy => "caused_by".to_string(),
1166            RelationshipType::TemporalSequence => "temporal_sequence".to_string(),
1167            RelationshipType::Similarity => "similar_to".to_string(),
1168            RelationshipType::Dependency => "depends_on".to_string(),
1169            RelationshipType::Custom(pred) => pred.clone(),
1170        }
1171    }
1172
1173    fn parse_temporal_expression(&self, temporal_text: &str) -> Option<DateTime<Utc>> {
1174        // Simplified temporal parsing - real implementation would be more sophisticated
1175        if let Ok(dt) = chrono::DateTime::parse_from_str(temporal_text, "%Y-%m-%d") {
1176            Some(dt.with_timezone(&Utc))
1177        } else {
1178            None
1179        }
1180    }
1181
1182    fn calculate_extraction_confidence(
1183        &self,
1184        entities: &[ExtractedEntity],
1185        relationships: &[ExtractedRelationship],
1186        schema_elements: &[SchemaElement],
1187    ) -> f64 {
1188        let mut total_confidence = 0.0;
1189        let mut count = 0;
1190
1191        for entity in entities {
1192            total_confidence += entity.confidence;
1193            count += 1;
1194        }
1195
1196        for relationship in relationships {
1197            total_confidence += relationship.confidence;
1198            count += 1;
1199        }
1200
1201        for schema_element in schema_elements {
1202            total_confidence += schema_element.confidence;
1203            count += 1;
1204        }
1205
1206        if count > 0 {
1207            total_confidence / count as f64
1208        } else {
1209            0.0
1210        }
1211    }
1212}
1213
#[cfg(test)]
mod tests {
    use super::*;

    /// An engine can be constructed from the default configuration.
    #[tokio::test]
    async fn test_knowledge_extraction_engine_creation() {
        let engine = KnowledgeExtractionEngine::new(KnowledgeExtractionConfig::default());

        assert!(engine.is_ok());
    }

    /// Extraction over a simple sentence succeeds and yields at least one entity.
    #[tokio::test]
    async fn test_entity_extraction() {
        let mut engine = KnowledgeExtractionEngine::new(KnowledgeExtractionConfig::default())
            .expect("should succeed");

        let sentence = "Dr. John Smith works at Microsoft Corp.";
        let outcome = engine.extract_knowledge(sentence).await;

        assert!(outcome.is_ok());
        let knowledge = outcome.expect("should succeed");
        assert!(!knowledge.extracted_entities.is_empty());
    }

    /// Canonicalization trims surrounding whitespace and lower-cases the text.
    #[test]
    fn test_canonicalize_entity() {
        let engine = KnowledgeExtractionEngine::new(KnowledgeExtractionConfig::default())
            .expect("should succeed");

        let canonical = engine.canonicalize_entity("  John Smith  ");

        assert_eq!(canonical, "john smith");
    }
}
oxirs_chat/rag/knowledge_extraction.rs

oxirs_chat/rag/
knowledge_extraction.rs