Skip to main content

oxirs_vec/
hierarchical_similarity.rs

1//! Hierarchical Similarity Computation for Knowledge Graphs
2//!
3//! This module provides advanced similarity computation that takes into account
4//! ontological hierarchies, concept relationships, and contextual similarities.
5
6use crate::VectorStore;
7use anyhow::{anyhow, Result};
8use serde::{Deserialize, Serialize};
9use std::collections::{HashMap, HashSet, VecDeque};
10use std::sync::{Arc, RwLock};
11
12/// Configuration for hierarchical similarity computation
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct HierarchicalSimilarityConfig {
15    /// Enable ontology-based similarity
16    pub enable_ontology_similarity: bool,
17    /// Enable concept hierarchy traversal
18    pub enable_hierarchy_traversal: bool,
19    /// Enable contextual similarity computation
20    pub enable_contextual_similarity: bool,
21    /// Enable adaptive similarity learning
22    pub enable_adaptive_similarity: bool,
23    /// Maximum hierarchy traversal depth
24    pub max_hierarchy_depth: usize,
25    /// Weight for direct similarity
26    pub direct_similarity_weight: f32,
27    /// Weight for hierarchical similarity
28    pub hierarchical_similarity_weight: f32,
29    /// Weight for contextual similarity
30    pub contextual_similarity_weight: f32,
31    /// Cache size for computed similarities
32    pub similarity_cache_size: usize,
33}
34
35impl Default for HierarchicalSimilarityConfig {
36    fn default() -> Self {
37        Self {
38            enable_ontology_similarity: true,
39            enable_hierarchy_traversal: true,
40            enable_contextual_similarity: true,
41            enable_adaptive_similarity: false,
42            max_hierarchy_depth: 5,
43            direct_similarity_weight: 0.6,
44            hierarchical_similarity_weight: 0.3,
45            contextual_similarity_weight: 0.1,
46            similarity_cache_size: 10000,
47        }
48    }
49}
50
51/// Concept hierarchy for ontology-based similarity
52#[derive(Debug, Clone, Serialize, Deserialize, Default)]
53pub struct ConceptHierarchy {
54    /// Parent-child relationships (child -> parent)
55    pub child_to_parent: HashMap<String, String>,
56    /// Parent-child relationships (parent -> children)
57    pub parent_to_children: HashMap<String, HashSet<String>>,
58    /// Concept levels in hierarchy (concept -> level)
59    pub concept_levels: HashMap<String, usize>,
60    /// Concept types and categories
61    pub concept_types: HashMap<String, String>,
62    /// Concept weights for similarity computation
63    pub concept_weights: HashMap<String, f32>,
64}
65
66impl ConceptHierarchy {
67    /// Add a parent-child relationship to the hierarchy
68    pub fn add_relationship(&mut self, parent: String, child: String) {
69        self.child_to_parent.insert(child.clone(), parent.clone());
70        self.parent_to_children
71            .entry(parent)
72            .or_default()
73            .insert(child);
74
75        // Recompute levels
76        self.recompute_levels();
77    }
78
79    /// Find the lowest common ancestor of two concepts
80    pub fn lowest_common_ancestor(&self, concept1: &str, concept2: &str) -> Option<String> {
81        let ancestors1 = self.get_ancestors(concept1);
82        let ancestors2 = self.get_ancestors(concept2);
83
84        // Find common ancestors
85        let common_ancestors: HashSet<&String> = ancestors1.intersection(&ancestors2).collect();
86
87        // Find the one with the highest level (closest to concepts)
88        common_ancestors
89            .into_iter()
90            .max_by_key(|ancestor| self.concept_levels.get(*ancestor).unwrap_or(&0))
91            .cloned()
92    }
93
94    /// Get all ancestors of a concept
95    pub fn get_ancestors(&self, concept: &str) -> HashSet<String> {
96        let mut ancestors = HashSet::new();
97        let mut current = concept.to_string();
98
99        while let Some(parent) = self.child_to_parent.get(&current) {
100            ancestors.insert(parent.clone());
101            current = parent.clone();
102        }
103
104        ancestors
105    }
106
107    /// Get all descendants of a concept
108    pub fn get_descendants(&self, concept: &str) -> HashSet<String> {
109        let mut descendants = HashSet::new();
110        let mut queue = VecDeque::new();
111
112        if let Some(children) = self.parent_to_children.get(concept) {
113            for child in children {
114                queue.push_back(child.clone());
115            }
116        }
117
118        while let Some(current) = queue.pop_front() {
119            if descendants.insert(current.clone()) {
120                if let Some(children) = self.parent_to_children.get(&current) {
121                    for child in children {
122                        queue.push_back(child.clone());
123                    }
124                }
125            }
126        }
127
128        descendants
129    }
130
131    /// Compute the distance between two concepts in the hierarchy
132    pub fn concept_distance(&self, concept1: &str, concept2: &str) -> f32 {
133        if concept1 == concept2 {
134            return 0.0;
135        }
136
137        if let Some(lca) = self.lowest_common_ancestor(concept1, concept2) {
138            let level1 = self.concept_levels.get(concept1).unwrap_or(&0);
139            let level2 = self.concept_levels.get(concept2).unwrap_or(&0);
140            let lca_level = self.concept_levels.get(&lca).unwrap_or(&0);
141
142            // Distance is the sum of distances to LCA
143            let distance = (level1 - lca_level) + (level2 - lca_level);
144            distance as f32
145        } else {
146            // No common ancestor, maximum distance
147            f32::INFINITY
148        }
149    }
150
151    /// Recompute the levels of all concepts in the hierarchy
152    fn recompute_levels(&mut self) {
153        self.concept_levels.clear();
154
155        // Find root concepts (those with no parents)
156        let roots: Vec<String> = self
157            .parent_to_children
158            .keys()
159            .filter(|concept| !self.child_to_parent.contains_key(*concept))
160            .cloned()
161            .collect();
162
163        // BFS to assign levels
164        let mut queue = VecDeque::new();
165        for root in roots {
166            self.concept_levels.insert(root.clone(), 0);
167            queue.push_back((root, 0));
168        }
169
170        while let Some((concept, level)) = queue.pop_front() {
171            if let Some(children) = self.parent_to_children.get(&concept) {
172                for child in children {
173                    let child_level = level + 1;
174                    // Only update if this gives a higher level (closer to root)
175                    if child_level > *self.concept_levels.get(child).unwrap_or(&0) {
176                        self.concept_levels.insert(child.clone(), child_level);
177                        queue.push_back((child.clone(), child_level));
178                    }
179                }
180            }
181        }
182    }
183}
184
185/// Context information for similarity computation
186#[derive(Debug, Clone, Serialize, Deserialize)]
187pub struct SimilarityContext {
188    /// Domain context (e.g., "medical", "technical", "general")
189    pub domain: Option<String>,
190    /// Temporal context (time period relevance)
191    pub temporal_weight: f32,
192    /// Cultural context for cross-cultural similarity
193    pub cultural_context: Option<String>,
194    /// User context (preferences, history)
195    pub user_context: HashMap<String, f32>,
196    /// Task context (what kind of search/comparison)
197    pub task_type: SimilarityTaskType,
198}
199
200impl Default for SimilarityContext {
201    fn default() -> Self {
202        Self {
203            domain: None,
204            temporal_weight: 1.0,
205            cultural_context: None,
206            user_context: HashMap::new(),
207            task_type: SimilarityTaskType::General,
208        }
209    }
210}
211
212/// Types of similarity computation tasks
213#[derive(Debug, Clone, Copy, PartialEq, Hash, Serialize, Deserialize)]
214pub enum SimilarityTaskType {
215    /// General similarity comparison
216    General,
217    /// Document classification
218    Classification,
219    /// Information retrieval
220    Retrieval,
221    /// Recommendation generation
222    Recommendation,
223    /// Semantic clustering
224    Clustering,
225}
226
227/// Result of hierarchical similarity computation
228#[derive(Debug, Clone, Serialize, Deserialize)]
229pub struct HierarchicalSimilarityResult {
230    /// Overall similarity score
231    pub overall_similarity: f32,
232    /// Direct vector similarity component
233    pub direct_similarity: f32,
234    /// Hierarchical similarity component
235    pub hierarchical_similarity: f32,
236    /// Contextual similarity component
237    pub contextual_similarity: f32,
238    /// Explanation of the similarity computation
239    pub explanation: SimilarityExplanation,
240}
241
242/// Explanation of how similarity was computed
243#[derive(Debug, Clone, Serialize, Deserialize)]
244pub struct SimilarityExplanation {
245    /// Concepts involved in the computation
246    pub concepts_involved: Vec<String>,
247    /// Hierarchy paths used
248    pub hierarchy_paths: Vec<String>,
249    /// Contextual factors considered
250    pub contextual_factors: Vec<String>,
251    /// Weight breakdown
252    pub weight_breakdown: HashMap<String, f32>,
253}
254
255/// Hierarchical similarity computer
256pub struct HierarchicalSimilarity {
257    config: HierarchicalSimilarityConfig,
258    concept_hierarchy: Arc<RwLock<ConceptHierarchy>>,
259    similarity_cache: Arc<RwLock<HashMap<String, f32>>>,
260    concept_to_resource: Arc<RwLock<HashMap<String, Vec<String>>>>,
261    adaptive_weights: Arc<RwLock<HashMap<String, f32>>>,
262}
263
264impl HierarchicalSimilarity {
265    /// Create a new hierarchical similarity computer
266    pub fn new(config: HierarchicalSimilarityConfig) -> Self {
267        Self {
268            config,
269            concept_hierarchy: Arc::new(RwLock::new(ConceptHierarchy::default())),
270            similarity_cache: Arc::new(RwLock::new(HashMap::new())),
271            concept_to_resource: Arc::new(RwLock::new(HashMap::new())),
272            adaptive_weights: Arc::new(RwLock::new(HashMap::new())),
273        }
274    }
275
276    /// Compute hierarchical similarity between two resources
277    pub fn compute_similarity(
278        &self,
279        vector_store: &VectorStore,
280        resource1: &str,
281        resource2: &str,
282        context: &SimilarityContext,
283    ) -> Result<HierarchicalSimilarityResult> {
284        // Check cache first
285        let cache_key = format!("{}:{}:{}", resource1, resource2, self.context_hash(context));
286        if let Ok(cache) = self.similarity_cache.read() {
287            if let Some(&cached_similarity) = cache.get(&cache_key) {
288                return Ok(HierarchicalSimilarityResult {
289                    overall_similarity: cached_similarity,
290                    direct_similarity: cached_similarity,
291                    hierarchical_similarity: 0.0,
292                    contextual_similarity: 0.0,
293                    explanation: SimilarityExplanation {
294                        concepts_involved: vec![],
295                        hierarchy_paths: vec!["cached".to_string()],
296                        contextual_factors: vec![],
297                        weight_breakdown: HashMap::new(),
298                    },
299                });
300            }
301        }
302
303        // Compute direct vector similarity
304        let direct_sim = vector_store.calculate_similarity(resource1, resource2)?;
305
306        // Compute hierarchical similarity if enabled
307        let hierarchical_sim = if self.config.enable_ontology_similarity {
308            self.compute_ontology_similarity(resource1, resource2, context)?
309        } else {
310            0.0
311        };
312
313        // Compute contextual similarity if enabled
314        let contextual_sim = if self.config.enable_contextual_similarity {
315            self.compute_contextual_similarity(resource1, resource2, context)?
316        } else {
317            0.0
318        };
319
320        // Apply adaptive weights if enabled
321        let (direct_weight, hierarchical_weight, contextual_weight) =
322            if self.config.enable_adaptive_similarity {
323                self.get_adaptive_weights(context)
324            } else {
325                (
326                    self.config.direct_similarity_weight,
327                    self.config.hierarchical_similarity_weight,
328                    self.config.contextual_similarity_weight,
329                )
330            };
331
332        // Compute weighted overall similarity
333        let overall_sim = direct_sim * direct_weight
334            + hierarchical_sim * hierarchical_weight
335            + contextual_sim * contextual_weight;
336
337        // Cache the result
338        if let Ok(mut cache) = self.similarity_cache.write() {
339            if cache.len() >= self.config.similarity_cache_size {
340                cache.clear(); // Simple cache eviction
341            }
342            cache.insert(cache_key, overall_sim);
343        }
344
345        let explanation = SimilarityExplanation {
346            concepts_involved: self
347                .get_concepts_for_resource(resource1)
348                .into_iter()
349                .chain(self.get_concepts_for_resource(resource2))
350                .collect(),
351            hierarchy_paths: vec![format!(
352                "hierarchy_depth_{}",
353                self.config.max_hierarchy_depth
354            )],
355            contextual_factors: vec![format!("domain_{:?}", context.domain)],
356            weight_breakdown: {
357                let mut breakdown = HashMap::new();
358                breakdown.insert("direct".to_string(), direct_weight);
359                breakdown.insert("hierarchical".to_string(), hierarchical_weight);
360                breakdown.insert("contextual".to_string(), contextual_weight);
361                breakdown
362            },
363        };
364
365        Ok(HierarchicalSimilarityResult {
366            overall_similarity: overall_sim,
367            direct_similarity: direct_sim,
368            hierarchical_similarity: hierarchical_sim,
369            contextual_similarity: contextual_sim,
370            explanation,
371        })
372    }
373
374    /// Add a concept hierarchy relationship
375    pub fn add_concept_relationship(&self, parent: &str, child: &str) -> Result<()> {
376        match self.concept_hierarchy.write() {
377            Ok(mut hierarchy) => {
378                hierarchy.add_relationship(parent.to_string(), child.to_string());
379                Ok(())
380            }
381            _ => Err(anyhow!("Failed to acquire write lock on concept hierarchy")),
382        }
383    }
384
385    /// Associate a resource with concepts
386    pub fn associate_resource_with_concepts(
387        &self,
388        resource: &str,
389        concepts: Vec<String>,
390    ) -> Result<()> {
391        match self.concept_to_resource.write() {
392            Ok(mut mapping) => {
393                mapping.insert(resource.to_string(), concepts);
394                Ok(())
395            }
396            _ => Err(anyhow!(
397                "Failed to acquire write lock on concept-resource mapping"
398            )),
399        }
400    }
401
402    /// Compute ontology-based similarity
403    fn compute_ontology_similarity(
404        &self,
405        resource1: &str,
406        resource2: &str,
407        _context: &SimilarityContext,
408    ) -> Result<f32> {
409        let concepts1 = self.get_concepts_for_resource(resource1);
410        let concepts2 = self.get_concepts_for_resource(resource2);
411
412        if concepts1.is_empty() || concepts2.is_empty() {
413            return Ok(0.0);
414        }
415
416        let hierarchy = self
417            .concept_hierarchy
418            .read()
419            .map_err(|_| anyhow!("Failed to acquire read lock on concept hierarchy"))?;
420
421        let mut max_similarity = 0.0f32;
422
423        // Find the maximum similarity between any pair of concepts
424        for concept1 in &concepts1 {
425            for concept2 in &concepts2 {
426                let distance = hierarchy.concept_distance(concept1, concept2);
427
428                // Convert distance to similarity (inverse relationship)
429                let similarity = if distance.is_infinite() {
430                    0.0
431                } else {
432                    1.0 / (1.0 + distance)
433                };
434
435                max_similarity = max_similarity.max(similarity);
436            }
437        }
438
439        Ok(max_similarity)
440    }
441
442    /// Compute contextual similarity
443    fn compute_contextual_similarity(
444        &self,
445        _resource1: &str,
446        _resource2: &str,
447        context: &SimilarityContext,
448    ) -> Result<f32> {
449        let mut contextual_score = 0.0f32;
450
451        // Domain context
452        if context.domain.is_some() {
453            contextual_score += 0.3; // Base score for having domain context
454        }
455
456        // Temporal context
457        contextual_score += context.temporal_weight * 0.2;
458
459        // Cultural context
460        if context.cultural_context.is_some() {
461            contextual_score += 0.2;
462        }
463
464        // Task type context
465        let task_boost = match context.task_type {
466            SimilarityTaskType::General => 0.1,
467            SimilarityTaskType::Classification => 0.15,
468            SimilarityTaskType::Retrieval => 0.2,
469            SimilarityTaskType::Recommendation => 0.25,
470            SimilarityTaskType::Clustering => 0.15,
471        };
472        contextual_score += task_boost;
473
474        // User context boost
475        if !context.user_context.is_empty() {
476            let user_boost =
477                context.user_context.values().sum::<f32>() / context.user_context.len() as f32;
478            contextual_score += user_boost * 0.2;
479        }
480
481        Ok(contextual_score.clamp(0.0, 1.0))
482    }
483
484    /// Get adaptive weights based on context and learning
485    fn get_adaptive_weights(&self, _context: &SimilarityContext) -> (f32, f32, f32) {
486        // For now, return default weights
487        // In a full implementation, this would use machine learning to adapt weights
488        (
489            self.config.direct_similarity_weight,
490            self.config.hierarchical_similarity_weight,
491            self.config.contextual_similarity_weight,
492        )
493    }
494
495    /// Get concepts associated with a resource
496    fn get_concepts_for_resource(&self, resource: &str) -> Vec<String> {
497        match self.concept_to_resource.read() {
498            Ok(mapping) => mapping.get(resource).cloned().unwrap_or_default(),
499            _ => Vec::new(),
500        }
501    }
502
503    /// Generate a hash for context (for caching)
504    fn context_hash(&self, context: &SimilarityContext) -> u64 {
505        use std::collections::hash_map::DefaultHasher;
506        use std::hash::{Hash, Hasher};
507
508        let mut hasher = DefaultHasher::new();
509        context.domain.hash(&mut hasher);
510        context.cultural_context.hash(&mut hasher);
511        context.task_type.hash(&mut hasher);
512        (context.temporal_weight as u64).hash(&mut hasher);
513
514        hasher.finish()
515    }
516
517    /// Build concept hierarchy from ontology data
518    pub fn build_hierarchy_from_ontology(&self, ontology_data: &[(String, String)]) -> Result<()> {
519        let mut hierarchy = self
520            .concept_hierarchy
521            .write()
522            .map_err(|_| anyhow!("Failed to acquire write lock on concept hierarchy"))?;
523
524        for (parent, child) in ontology_data {
525            hierarchy.add_relationship(parent.clone(), child.clone());
526        }
527
528        Ok(())
529    }
530
531    /// Update adaptive weights based on feedback
532    pub fn update_adaptive_weights(
533        &self,
534        context: &SimilarityContext,
535        feedback_score: f32,
536    ) -> Result<()> {
537        if !self.config.enable_adaptive_similarity {
538            return Ok(());
539        }
540
541        let context_key = format!("{:?}:{:?}", context.domain, context.task_type);
542
543        if let Ok(mut weights) = self.adaptive_weights.write() {
544            // Simple learning rule: move weights toward feedback
545            let current_weight = weights.get(&context_key).cloned().unwrap_or(0.5);
546            let new_weight = current_weight * 0.9 + feedback_score * 0.1;
547            weights.insert(context_key, new_weight);
548        }
549
550        Ok(())
551    }
552
553    /// Get statistics about the hierarchical similarity system
554    pub fn get_statistics(&self) -> Result<HierarchicalSimilarityStats> {
555        let hierarchy = self
556            .concept_hierarchy
557            .read()
558            .map_err(|_| anyhow!("Failed to acquire read lock on concept hierarchy"))?;
559        let cache = self
560            .similarity_cache
561            .read()
562            .map_err(|_| anyhow!("Failed to acquire read lock on similarity cache"))?;
563        let concept_mapping = self
564            .concept_to_resource
565            .read()
566            .map_err(|_| anyhow!("Failed to acquire read lock on concept mapping"))?;
567
568        Ok(HierarchicalSimilarityStats {
569            total_concepts: hierarchy.concept_levels.len(),
570            max_hierarchy_depth: hierarchy
571                .concept_levels
572                .values()
573                .max()
574                .cloned()
575                .unwrap_or(0),
576            cached_similarities: cache.len(),
577            mapped_resources: concept_mapping.len(),
578            average_concepts_per_resource: if concept_mapping.is_empty() {
579                0.0
580            } else {
581                concept_mapping.values().map(|v| v.len()).sum::<usize>() as f32
582                    / concept_mapping.len() as f32
583            },
584        })
585    }
586}
587
588/// Statistics about the hierarchical similarity system
589#[derive(Debug, Clone, Serialize, Deserialize)]
590pub struct HierarchicalSimilarityStats {
591    pub total_concepts: usize,
592    pub max_hierarchy_depth: usize,
593    pub cached_similarities: usize,
594    pub mapped_resources: usize,
595    pub average_concepts_per_resource: f32,
596}
597
598#[cfg(test)]
599mod tests {
600    use super::*;
601
602    #[test]
603    fn test_concept_hierarchy() {
604        let mut hierarchy = ConceptHierarchy::default();
605
606        // Build a simple hierarchy: Animal -> Mammal -> Dog
607        hierarchy.add_relationship("Animal".to_string(), "Mammal".to_string());
608        hierarchy.add_relationship("Mammal".to_string(), "Dog".to_string());
609        hierarchy.add_relationship("Animal".to_string(), "Bird".to_string());
610
611        // Test distance computation
612        let distance = hierarchy.concept_distance("Dog", "Bird");
613        assert!(distance > 0.0);
614
615        let distance_same = hierarchy.concept_distance("Dog", "Dog");
616        assert_eq!(distance_same, 0.0);
617
618        // Test LCA
619        let lca = hierarchy.lowest_common_ancestor("Dog", "Bird");
620        assert_eq!(lca, Some("Animal".to_string()));
621    }
622
623    #[test]
624    fn test_hierarchical_similarity() -> Result<()> {
625        let config = HierarchicalSimilarityConfig::default();
626        let hierarchical_sim = HierarchicalSimilarity::new(config);
627
628        // Add some test concepts
629        hierarchical_sim.add_concept_relationship("Animal", "Mammal")?;
630        hierarchical_sim.add_concept_relationship("Mammal", "Dog")?;
631
632        // Associate resources with concepts
633        hierarchical_sim.associate_resource_with_concepts("resource1", vec!["Dog".to_string()])?;
634        hierarchical_sim
635            .associate_resource_with_concepts("resource2", vec!["Mammal".to_string()])?;
636
637        // Test would require a VectorStore instance, which is complex to set up in a unit test
638        // In practice, integration tests would cover this functionality
639        Ok(())
640    }
641}