1use anyhow::{anyhow, Result};
16use serde::{Deserialize, Serialize};
17use std::collections::{HashMap, HashSet};
18use std::sync::Arc;
19use tokio::sync::RwLock;
20use tracing::{debug, info};
21
22use crate::semantic_reasoner::{ReasonerConfig, SemanticReasoner};
26
27#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct AlignmentConfig {
30 pub similarity_threshold: f64,
32 pub enable_property_alignment: bool,
34 pub enable_class_alignment: bool,
36 pub enable_instance_alignment: bool,
38 pub enable_ml_mapping: bool,
40 pub ml_confidence_threshold: f64,
42 pub max_suggestions_per_entity: usize,
44 pub enable_owl_reasoning: bool,
46}
47
48impl Default for AlignmentConfig {
49 fn default() -> Self {
50 Self {
51 similarity_threshold: 0.75,
52 enable_property_alignment: true,
53 enable_class_alignment: true,
54 enable_instance_alignment: true,
55 enable_ml_mapping: true,
56 ml_confidence_threshold: 0.8,
57 max_suggestions_per_entity: 5,
58 enable_owl_reasoning: true,
59 }
60 }
61}
62
63pub struct SchemaAligner {
65 config: AlignmentConfig,
66 property_mappings: Arc<RwLock<HashMap<(String, String), f64>>>,
68 class_mappings: Arc<RwLock<HashMap<(String, String), f64>>>,
70 instance_mappings: Arc<RwLock<HashMap<(String, String), f64>>>,
72 vocabulary_cache: Arc<RwLock<HashMap<String, VocabularyMetadata>>>,
74 ml_predictor: Arc<RwLock<Option<MappingPredictor>>>,
76 reasoner: Arc<RwLock<Option<SemanticReasoner>>>,
78}
79
80impl SchemaAligner {
81 pub fn new(config: AlignmentConfig) -> Self {
83 Self {
84 config,
85 property_mappings: Arc::new(RwLock::new(HashMap::new())),
86 class_mappings: Arc::new(RwLock::new(HashMap::new())),
87 instance_mappings: Arc::new(RwLock::new(HashMap::new())),
88 vocabulary_cache: Arc::new(RwLock::new(HashMap::new())),
89 ml_predictor: Arc::new(RwLock::new(None)),
90 reasoner: Arc::new(RwLock::new(None)),
91 }
92 }
93
94 pub async fn enable_reasoning(&self, reasoner_config: ReasonerConfig) -> Result<()> {
96 let reasoner = SemanticReasoner::new(reasoner_config);
97 let mut reasoner_guard = self.reasoner.write().await;
98 *reasoner_guard = Some(reasoner);
99 info!("Semantic reasoning enabled for schema alignment");
100 Ok(())
101 }
102
103 pub async fn align_vocabularies(
105 &self,
106 source_vocab: &str,
107 target_vocab: &str,
108 ) -> Result<AlignmentResult> {
109 info!(
110 "Aligning vocabularies: {} -> {}",
111 source_vocab, target_vocab
112 );
113
114 let source_meta = self.load_vocabulary_metadata(source_vocab).await?;
116 let target_meta = self.load_vocabulary_metadata(target_vocab).await?;
117
118 let mut property_alignments = Vec::new();
119 let mut class_alignments = Vec::new();
120
121 if self.config.enable_property_alignment {
123 property_alignments = self
124 .align_properties(&source_meta.properties, &target_meta.properties)
125 .await?;
126 }
127
128 if self.config.enable_class_alignment {
130 class_alignments = self
131 .align_classes(&source_meta.classes, &target_meta.classes)
132 .await?;
133 }
134
135 self.store_alignments(&property_alignments, &class_alignments)
137 .await?;
138
139 let overall_confidence =
140 self.calculate_overall_confidence(&property_alignments, &class_alignments);
141
142 Ok(AlignmentResult {
143 source_vocabulary: source_vocab.to_string(),
144 target_vocabulary: target_vocab.to_string(),
145 property_alignments,
146 class_alignments,
147 instance_alignments: vec![],
148 overall_confidence,
149 })
150 }
151
152 async fn align_properties(
154 &self,
155 source_properties: &[PropertyMetadata],
156 target_properties: &[PropertyMetadata],
157 ) -> Result<Vec<Alignment>> {
158 let mut alignments = Vec::new();
159
160 for source_prop in source_properties {
161 let mut candidates = Vec::new();
162
163 for target_prop in target_properties {
164 let similarity = self
166 .calculate_property_similarity(source_prop, target_prop)
167 .await?;
168
169 if similarity >= self.config.similarity_threshold {
170 candidates.push((target_prop, similarity));
171 }
172 }
173
174 candidates.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
176
177 for (target_prop, confidence) in candidates
179 .iter()
180 .take(self.config.max_suggestions_per_entity)
181 {
182 alignments.push(Alignment {
183 source_entity: source_prop.uri.clone(),
184 target_entity: target_prop.uri.clone(),
185 alignment_type: AlignmentType::Property,
186 confidence: *confidence,
187 evidence: vec![format!(
188 "String similarity: {:.3}, Domain/range match",
189 confidence
190 )],
191 });
192 }
193 }
194
195 Ok(alignments)
196 }
197
198 async fn align_classes(
200 &self,
201 source_classes: &[ClassMetadata],
202 target_classes: &[ClassMetadata],
203 ) -> Result<Vec<Alignment>> {
204 let mut alignments = Vec::new();
205
206 for source_class in source_classes {
207 let mut candidates = Vec::new();
208
209 for target_class in target_classes {
210 let similarity = self
212 .calculate_class_similarity(source_class, target_class)
213 .await?;
214
215 if similarity >= self.config.similarity_threshold {
216 candidates.push((target_class, similarity));
217 }
218 }
219
220 candidates.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
222
223 for (target_class, confidence) in candidates
225 .iter()
226 .take(self.config.max_suggestions_per_entity)
227 {
228 alignments.push(Alignment {
229 source_entity: source_class.uri.clone(),
230 target_entity: target_class.uri.clone(),
231 alignment_type: AlignmentType::Class,
232 confidence: *confidence,
233 evidence: vec![format!("Class similarity: {:.3}", confidence)],
234 });
235 }
236 }
237
238 Ok(alignments)
239 }
240
241 async fn calculate_property_similarity(
243 &self,
244 source: &PropertyMetadata,
245 target: &PropertyMetadata,
246 ) -> Result<f64> {
247 let mut similarities = Vec::new();
248
249 let label_sim = self.calculate_string_similarity(&source.label, &target.label);
251 similarities.push(label_sim * 0.4); let local_name_sim = self.calculate_string_similarity(
255 &Self::extract_local_name(&source.uri),
256 &Self::extract_local_name(&target.uri),
257 );
258 similarities.push(local_name_sim * 0.3); let domain_range_sim = self.calculate_domain_range_similarity(source, target);
262 similarities.push(domain_range_sim * 0.2); if let (Some(ref src_desc), Some(ref tgt_desc)) = (&source.description, &target.description)
266 {
267 let desc_sim = self.calculate_string_similarity(src_desc, tgt_desc);
268 similarities.push(desc_sim * 0.1); }
270
271 let total_similarity = similarities.iter().sum::<f64>();
273
274 Ok(total_similarity.min(1.0))
275 }
276
277 async fn calculate_class_similarity(
279 &self,
280 source: &ClassMetadata,
281 target: &ClassMetadata,
282 ) -> Result<f64> {
283 let mut similarities = Vec::new();
284
285 let label_sim = self.calculate_string_similarity(&source.label, &target.label);
287 similarities.push(label_sim * 0.5); let local_name_sim = self.calculate_string_similarity(
291 &Self::extract_local_name(&source.uri),
292 &Self::extract_local_name(&target.uri),
293 );
294 similarities.push(local_name_sim * 0.3); let hierarchy_sim = self.calculate_hierarchy_similarity(source, target);
298 similarities.push(hierarchy_sim * 0.2); let total_similarity = similarities.iter().sum::<f64>();
301
302 Ok(total_similarity.min(1.0))
303 }
304
305 fn calculate_string_similarity(&self, s1: &str, s2: &str) -> f64 {
307 let s1_norm = s1.to_lowercase().trim().to_string();
309 let s2_norm = s2.to_lowercase().trim().to_string();
310
311 if s1_norm == s2_norm {
313 return 1.0;
314 }
315
316 let lev_dist = Self::simple_levenshtein(&s1_norm, &s2_norm);
318 let max_len = s1_norm.len().max(s2_norm.len()) as f64;
319 let lev_sim = 1.0 - (lev_dist as f64 / max_len);
320
321 let tokens1: HashSet<&str> = s1_norm.split_whitespace().collect();
323 let tokens2: HashSet<&str> = s2_norm.split_whitespace().collect();
324 let intersection = tokens1.intersection(&tokens2).count();
325 let union = tokens1.union(&tokens2).count();
326 let jaccard_sim = if union > 0 {
327 intersection as f64 / union as f64
328 } else {
329 0.0
330 };
331
332 (lev_sim * 0.6 + jaccard_sim * 0.4).min(1.0)
334 }
335
336 fn calculate_domain_range_similarity(
338 &self,
339 source: &PropertyMetadata,
340 target: &PropertyMetadata,
341 ) -> f64 {
342 let mut similarity: f64 = 0.0;
343
344 if let (Some(ref src_domain), Some(ref tgt_domain)) = (&source.domain, &target.domain) {
346 if src_domain == tgt_domain {
347 similarity += 0.5;
348 } else {
349 let src_local = Self::extract_local_name(src_domain);
351 let tgt_local = Self::extract_local_name(tgt_domain);
352 if src_local == tgt_local {
353 similarity += 0.3;
354 }
355 }
356 }
357
358 if let (Some(ref src_range), Some(ref tgt_range)) = (&source.range, &target.range) {
360 if src_range == tgt_range {
361 similarity += 0.5;
362 } else {
363 let src_local = Self::extract_local_name(src_range);
364 let tgt_local = Self::extract_local_name(tgt_range);
365 if src_local == tgt_local {
366 similarity += 0.3;
367 }
368 }
369 }
370
371 similarity.min(1.0)
372 }
373
374 fn calculate_hierarchy_similarity(
376 &self,
377 source: &ClassMetadata,
378 target: &ClassMetadata,
379 ) -> f64 {
380 let src_supers: HashSet<_> = source.superclasses.iter().collect();
382 let tgt_supers: HashSet<_> = target.superclasses.iter().collect();
383
384 let intersection = src_supers.intersection(&tgt_supers).count();
385 let union = src_supers.union(&tgt_supers).count();
386
387 if union > 0 {
388 intersection as f64 / union as f64
389 } else {
390 0.0
391 }
392 }
393
/// Returns the part of `uri` after its last `/` or `#`.
///
/// A string without either separator is returned unchanged; a string that
/// ends in a separator yields an empty string.
fn extract_local_name(uri: &str) -> String {
    let local = match uri.rfind(|c: char| c == '/' || c == '#') {
        // Both separators are one byte long, so `sep + 1` is a char boundary.
        Some(sep) => &uri[sep + 1..],
        None => uri,
    };
    local.to_owned()
}
398
/// Levenshtein edit distance between `s1` and `s2`, counted in Unicode
/// scalar values (`char`s).
///
/// Uses the two-row dynamic-programming formulation, so memory is
/// proportional to the length of `s2` only.
fn simple_levenshtein(s1: &str, s2: &str) -> usize {
    // Work on chars throughout: sizing the rows by byte length while
    // iterating chars would leave stale cells for multi-byte text and return
    // a wrong distance.
    let target: Vec<char> = s2.chars().collect();
    let target_len = target.len();

    if s1.is_empty() {
        return target_len;
    }
    if target_len == 0 {
        return s1.chars().count();
    }

    // prev_row[j] = distance between the processed prefix of `s1` and the
    // first `j` chars of `s2`; initially the prefix is empty.
    let mut prev_row: Vec<usize> = (0..=target_len).collect();
    let mut curr_row = vec![0usize; target_len + 1];

    for (i, c1) in s1.chars().enumerate() {
        curr_row[0] = i + 1;

        for (j, &c2) in target.iter().enumerate() {
            let cost = usize::from(c1 != c2);
            curr_row[j + 1] = (curr_row[j] + 1) // insertion
                .min(prev_row[j + 1] + 1) // deletion
                .min(prev_row[j] + cost); // substitution / match
        }

        std::mem::swap(&mut prev_row, &mut curr_row);
    }

    // After the final swap the most recently filled row is `prev_row`.
    prev_row[target_len]
}
429
430 async fn load_vocabulary_metadata(&self, vocab_uri: &str) -> Result<VocabularyMetadata> {
432 let cache = self.vocabulary_cache.read().await;
434 if let Some(meta) = cache.get(vocab_uri) {
435 return Ok(meta.clone());
436 }
437 drop(cache);
438
439 let metadata = VocabularyMetadata {
446 namespace: vocab_uri.to_string(),
447 prefix: Self::extract_local_name(vocab_uri),
448 properties: vec![],
449 classes: vec![],
450 version: None,
451 };
452
453 let mut cache = self.vocabulary_cache.write().await;
455 cache.insert(vocab_uri.to_string(), metadata.clone());
456
457 Ok(metadata)
458 }
459
460 async fn store_alignments(
462 &self,
463 property_alignments: &[Alignment],
464 class_alignments: &[Alignment],
465 ) -> Result<()> {
466 let mut prop_mappings = self.property_mappings.write().await;
467 let mut class_mappings = self.class_mappings.write().await;
468
469 for alignment in property_alignments {
470 prop_mappings.insert(
471 (
472 alignment.source_entity.clone(),
473 alignment.target_entity.clone(),
474 ),
475 alignment.confidence,
476 );
477 }
478
479 for alignment in class_alignments {
480 class_mappings.insert(
481 (
482 alignment.source_entity.clone(),
483 alignment.target_entity.clone(),
484 ),
485 alignment.confidence,
486 );
487 }
488
489 info!(
490 "Stored {} property alignments and {} class alignments",
491 property_alignments.len(),
492 class_alignments.len()
493 );
494
495 Ok(())
496 }
497
498 fn calculate_overall_confidence(
500 &self,
501 property_alignments: &[Alignment],
502 class_alignments: &[Alignment],
503 ) -> f64 {
504 let all_alignments: Vec<f64> = property_alignments
505 .iter()
506 .chain(class_alignments.iter())
507 .map(|a| a.confidence)
508 .collect();
509
510 if all_alignments.is_empty() {
511 return 0.0;
512 }
513
514 all_alignments.iter().sum::<f64>() / all_alignments.len() as f64
516 }
517
518 pub async fn rewrite_query(&self, query: &str, target_vocabulary: &str) -> Result<String> {
520 debug!("Rewriting query for vocabulary: {}", target_vocabulary);
521
522 let mut rewritten = query.to_string();
524
525 let prop_mappings = self.property_mappings.read().await;
527 for ((source, target), _confidence) in prop_mappings.iter() {
528 rewritten = rewritten.replace(source, target);
529 }
530
531 let class_mappings = self.class_mappings.read().await;
533 for ((source, target), _confidence) in class_mappings.iter() {
534 rewritten = rewritten.replace(source, target);
535 }
536
537 Ok(rewritten)
538 }
539
540 pub async fn get_mapping(
542 &self,
543 source_entity: &str,
544 entity_type: AlignmentType,
545 ) -> Result<Option<String>> {
546 match entity_type {
547 AlignmentType::Property => {
548 let mappings = self.property_mappings.read().await;
549 Ok(mappings
550 .iter()
551 .find(|((s, _), _)| s == source_entity)
552 .map(|((_, t), _)| t.clone()))
553 }
554 AlignmentType::Class => {
555 let mappings = self.class_mappings.read().await;
556 Ok(mappings
557 .iter()
558 .find(|((s, _), _)| s == source_entity)
559 .map(|((_, t), _)| t.clone()))
560 }
561 AlignmentType::Instance => {
562 let mappings = self.instance_mappings.read().await;
563 Ok(mappings
564 .iter()
565 .find(|((s, _), _)| s == source_entity)
566 .map(|((_, t), _)| t.clone()))
567 }
568 }
569 }
570
571 pub async fn train_ml_predictor(&self, training_data: Vec<MappingExample>) -> Result<()> {
573 if !self.config.enable_ml_mapping {
574 return Err(anyhow!("ML mapping is disabled"));
575 }
576
577 info!(
578 "Training ML-based mapping predictor with {} examples",
579 training_data.len()
580 );
581
582 let predictor = MappingPredictor::train(training_data)?;
583
584 let mut ml_predictor = self.ml_predictor.write().await;
585 *ml_predictor = Some(predictor);
586
587 Ok(())
588 }
589}
590
591#[derive(Debug, Clone, Serialize, Deserialize)]
593pub struct VocabularyMetadata {
594 pub namespace: String,
595 pub prefix: String,
596 pub properties: Vec<PropertyMetadata>,
597 pub classes: Vec<ClassMetadata>,
598 pub version: Option<String>,
599}
600
601#[derive(Debug, Clone, Serialize, Deserialize)]
603pub struct PropertyMetadata {
604 pub uri: String,
605 pub label: String,
606 pub description: Option<String>,
607 pub domain: Option<String>,
608 pub range: Option<String>,
609}
610
611#[derive(Debug, Clone, Serialize, Deserialize)]
613pub struct ClassMetadata {
614 pub uri: String,
615 pub label: String,
616 pub description: Option<String>,
617 pub superclasses: Vec<String>,
618 pub subclasses: Vec<String>,
619}
620
621#[derive(Debug, Clone, Serialize, Deserialize)]
623pub struct AlignmentResult {
624 pub source_vocabulary: String,
625 pub target_vocabulary: String,
626 pub property_alignments: Vec<Alignment>,
627 pub class_alignments: Vec<Alignment>,
628 pub instance_alignments: Vec<Alignment>,
629 pub overall_confidence: f64,
630}
631
632#[derive(Debug, Clone, Serialize, Deserialize)]
634pub struct Alignment {
635 pub source_entity: String,
636 pub target_entity: String,
637 pub alignment_type: AlignmentType,
638 pub confidence: f64,
639 pub evidence: Vec<String>,
640}
641
642#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
644pub enum AlignmentType {
645 Property,
646 Class,
647 Instance,
648}
649
650#[derive(Debug, Clone, Serialize, Deserialize)]
652pub struct MappingExample {
653 pub source_entity: String,
654 pub target_entity: String,
655 pub is_correct: bool,
656 pub features: HashMap<String, f64>,
657}
658
/// Linear scorer over named features, built by [`MappingPredictor::train`].
#[derive(Debug, Clone)]
pub struct MappingPredictor {
    /// Per-feature weight, keyed by feature name.
    weights: HashMap<String, f64>,
}
665
666impl MappingPredictor {
667 pub fn train(examples: Vec<MappingExample>) -> Result<Self> {
669 let mut weights = HashMap::new();
671
672 for example in &examples {
674 for (feature_name, &feature_value) in &example.features {
675 let weight = if example.is_correct {
676 feature_value
677 } else {
678 -feature_value
679 };
680 *weights.entry(feature_name.clone()).or_insert(0.0) += weight;
681 }
682 }
683
684 let sum: f64 = weights.values().map(|w| w.abs()).sum();
686 if sum > 0.0 {
687 for weight in weights.values_mut() {
688 *weight /= sum;
689 }
690 }
691
692 Ok(Self { weights })
693 }
694
695 pub fn predict(&self, features: &HashMap<String, f64>) -> f64 {
697 let mut score = 0.5; for (feature_name, &feature_value) in features {
700 if let Some(&weight) = self.weights.get(feature_name) {
701 score += weight * feature_value;
702 }
703 }
704
705 score.clamp(0.0, 1.0)
706 }
707}
708
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration keeps the 0.75 threshold and enables the
    /// property and class passes.
    #[test]
    fn test_alignment_config_default() {
        let AlignmentConfig {
            similarity_threshold,
            enable_property_alignment,
            enable_class_alignment,
            ..
        } = AlignmentConfig::default();

        assert_eq!(similarity_threshold, 0.75);
        assert!(enable_property_alignment);
        assert!(enable_class_alignment);
    }

    /// Local names are taken after the last `/` or `#`.
    #[test]
    fn test_extract_local_name() {
        let slash_uri = "http://xmlns.com/foaf/0.1/name";
        let hash_uri = "http://schema.org#Person";

        assert_eq!(SchemaAligner::extract_local_name(slash_uri), "name");
        assert_eq!(SchemaAligner::extract_local_name(hash_uri), "Person");
    }

    /// A freshly built aligner has no stored property or class mappings.
    #[tokio::test]
    async fn test_schema_aligner_creation() {
        let aligner = SchemaAligner::new(AlignmentConfig::default());

        let property_table_empty = aligner.property_mappings.read().await.is_empty();
        assert!(property_table_empty);

        let class_table_empty = aligner.class_mappings.read().await.is_empty();
        assert!(class_table_empty);
    }

    /// Identical strings score 1.0, near matches score high, unrelated
    /// strings score low.
    #[test]
    fn test_string_similarity() {
        let aligner = SchemaAligner::new(AlignmentConfig::default());

        let identical = aligner.calculate_string_similarity("name", "name");
        assert_eq!(identical, 1.0);

        let near_match = aligner.calculate_string_similarity("firstName", "first_name");
        assert!(near_match > 0.5);

        let unrelated = aligner.calculate_string_similarity("name", "age");
        assert!(unrelated < 0.5);
    }

    /// Training on one positive and one negative example yields weights.
    #[test]
    fn test_mapping_predictor_train() {
        let positive = MappingExample {
            source_entity: "foaf:name".to_string(),
            target_entity: "schema:name".to_string(),
            is_correct: true,
            features: HashMap::from([("string_sim".to_string(), 0.9)]),
        };
        let negative = MappingExample {
            source_entity: "foaf:age".to_string(),
            target_entity: "schema:birthDate".to_string(),
            is_correct: false,
            features: HashMap::from([("string_sim".to_string(), 0.3)]),
        };

        let predictor = MappingPredictor::train(vec![positive, negative]).unwrap();
        assert!(!predictor.weights.is_empty());
    }
}