Skip to main content

oxirs_core/ai/
entity_resolution.rs

1//! Entity Resolution with Machine Learning
2//!
3//! This module provides entity resolution capabilities to identify and merge
4//! duplicate entities across different data sources.
5
6use crate::ai::AiConfig;
7use crate::model::Triple;
8use anyhow::Result;
9use serde::{Deserialize, Serialize};
10use std::collections::{HashMap, HashSet};
11
12/// Entity resolution module
13pub struct EntityResolver {
14    /// Configuration
15    config: ResolutionConfig,
16
17    /// Similarity calculator
18    similarity_calculator: Box<dyn SimilarityCalculator>,
19
20    /// Clustering algorithm
21    clustering_algorithm: Box<dyn ClusteringAlgorithm>,
22
23    /// Feature extractor
24    #[allow(dead_code)]
25    feature_extractor: Box<dyn FeatureExtractor>,
26
27    /// Blocking strategy
28    blocking_strategy: Box<dyn BlockingStrategy>,
29}
30
31/// Entity resolution configuration
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct ResolutionConfig {
34    /// Similarity threshold for entity matching
35    pub similarity_threshold: f32,
36
37    /// Clustering algorithm to use
38    pub clustering_algorithm: ClusteringType,
39
40    /// Features to use for similarity calculation
41    pub features: Vec<FeatureType>,
42
43    /// Blocking strategy
44    pub blocking_strategy: BlockingType,
45
46    /// Maximum cluster size
47    pub max_cluster_size: usize,
48
49    /// Enable machine learning similarity
50    pub enable_ml_similarity: bool,
51
52    /// Training data path (if using ML)
53    pub training_data_path: Option<String>,
54}
55
56impl Default for ResolutionConfig {
57    fn default() -> Self {
58        Self {
59            similarity_threshold: 0.8,
60            clustering_algorithm: ClusteringType::HierarchicalClustering,
61            features: vec![
62                FeatureType::StringSimilarity,
63                FeatureType::NumericSimilarity,
64                FeatureType::StructuralSimilarity,
65            ],
66            blocking_strategy: BlockingType::SortedNeighborhood,
67            max_cluster_size: 100,
68            enable_ml_similarity: true,
69            training_data_path: None,
70        }
71    }
72}
73
74/// Clustering algorithm types
75#[derive(Debug, Clone, Serialize, Deserialize)]
76pub enum ClusteringType {
77    /// Hierarchical clustering
78    HierarchicalClustering,
79
80    /// Connected components
81    ConnectedComponents,
82
83    /// Correlation clustering
84    CorrelationClustering,
85
86    /// DBSCAN
87    DBSCAN { eps: f32, min_samples: usize },
88
89    /// Markov clustering
90    MarkovClustering { inflation: f32 },
91}
92
93/// Feature types for entity similarity
94#[derive(Debug, Clone, Serialize, Deserialize)]
95pub enum FeatureType {
96    /// String similarity features
97    StringSimilarity,
98
99    /// Numeric similarity features
100    NumericSimilarity,
101
102    /// Structural similarity (graph-based)
103    StructuralSimilarity,
104
105    /// Semantic similarity (embedding-based)
106    SemanticSimilarity,
107
108    /// Temporal similarity
109    TemporalSimilarity,
110
111    /// Contextual similarity
112    ContextualSimilarity,
113}
114
115/// Blocking strategy types
116#[derive(Debug, Clone, Serialize, Deserialize)]
117pub enum BlockingType {
118    /// Standard blocking
119    StandardBlocking,
120
121    /// Sorted neighborhood method
122    SortedNeighborhood,
123
124    /// Locality-sensitive hashing
125    LSH {
126        num_hashes: usize,
127        hash_length: usize,
128    },
129
130    /// Canopy clustering
131    CanopyClustering { t1: f32, t2: f32 },
132
133    /// Multi-pass blocking
134    MultiPass(Vec<BlockingType>),
135}
136
137/// Entity cluster result
138#[derive(Debug, Clone, Serialize, Deserialize)]
139pub struct EntityCluster {
140    /// Cluster ID
141    pub id: String,
142
143    /// Entities in the cluster
144    pub entities: Vec<EntityRecord>,
145
146    /// Canonical entity (representative)
147    pub canonical_entity: EntityRecord,
148
149    /// Cluster confidence score
150    pub confidence: f32,
151
152    /// Cluster size
153    pub size: usize,
154
155    /// Merge decisions
156    pub merge_decisions: Vec<MergeDecision>,
157}
158
159/// Entity record for resolution
160#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
161pub struct EntityRecord {
162    /// Entity ID
163    pub id: String,
164
165    /// Entity URI
166    pub uri: String,
167
168    /// Attributes
169    pub attributes: HashMap<String, String>,
170
171    /// Associated triples
172    pub triples: Vec<Triple>,
173
174    /// Source information
175    pub source: String,
176
177    /// Quality score
178    pub quality_score: f32,
179}
180
181/// Merge decision
182#[derive(Debug, Clone, Serialize, Deserialize)]
183pub struct MergeDecision {
184    /// Source entity
185    pub source_entity: String,
186
187    /// Target entity
188    pub target_entity: String,
189
190    /// Similarity score
191    pub similarity: f32,
192
193    /// Decision type
194    pub decision: DecisionType,
195
196    /// Confidence in decision
197    pub confidence: f32,
198
199    /// Features used
200    pub features_used: Vec<FeatureType>,
201}
202
203/// Decision types
204#[derive(Debug, Clone, Serialize, Deserialize)]
205pub enum DecisionType {
206    Merge,
207    NoMerge,
208    Uncertain,
209}
210
211/// Similarity calculator trait
212pub trait SimilarityCalculator: Send + Sync {
213    /// Calculate similarity between two entities
214    fn calculate_similarity(&self, entity1: &EntityRecord, entity2: &EntityRecord) -> Result<f32>;
215
216    /// Get feature vector for entity
217    fn get_feature_vector(&self, entity: &EntityRecord) -> Result<Vec<f32>>;
218}
219
220/// Clustering algorithm trait
221pub trait ClusteringAlgorithm: Send + Sync {
222    /// Cluster entities based on similarity
223    fn cluster_entities(
224        &self,
225        entities: &[EntityRecord],
226        similarity_matrix: &[Vec<f32>],
227        threshold: f32,
228    ) -> Result<Vec<EntityCluster>>;
229}
230
231/// Feature extractor trait
232pub trait FeatureExtractor: Send + Sync {
233    /// Extract features from entity
234    fn extract_features(&self, entity: &EntityRecord) -> Result<HashMap<String, f32>>;
235
236    /// Get feature names
237    fn feature_names(&self) -> Vec<String>;
238}
239
240/// Blocking strategy trait
241pub trait BlockingStrategy: Send + Sync {
242    /// Generate blocks of potentially matching entities
243    fn generate_blocks(&self, entities: &[EntityRecord]) -> Result<Vec<Vec<usize>>>;
244
245    /// Get blocking key for entity
246    fn get_blocking_key(&self, entity: &EntityRecord) -> Result<String>;
247}
248
249impl EntityResolver {
250    /// Create new entity resolver
251    pub fn new(_config: &AiConfig) -> Result<Self> {
252        let resolution_config = ResolutionConfig::default();
253
254        // Create components
255        let similarity_calculator = Box::new(DefaultSimilarityCalculator::new());
256        let clustering_algorithm = Box::new(HierarchicalClusterer::new());
257        let feature_extractor = Box::new(DefaultFeatureExtractor::new());
258        let blocking_strategy = Box::new(SortedNeighborhoodBlocking::new());
259
260        Ok(Self {
261            config: resolution_config,
262            similarity_calculator,
263            clustering_algorithm,
264            feature_extractor,
265            blocking_strategy,
266        })
267    }
268
269    /// Resolve entities from triples
270    pub async fn resolve_entities(&self, triples: &[Triple]) -> Result<Vec<EntityCluster>> {
271        // Step 1: Extract entity records from triples
272        let entities = self.extract_entity_records(triples)?;
273
274        // Step 2: Apply blocking strategy to reduce comparisons
275        let blocks = self.blocking_strategy.generate_blocks(&entities)?;
276
277        let mut all_clusters = Vec::new();
278
279        // Step 3: Process each block separately
280        for block in blocks {
281            let block_entities: Vec<&EntityRecord> = block.iter().map(|&i| &entities[i]).collect();
282
283            // Step 4: Calculate similarity matrix for block
284            let similarity_matrix = self.calculate_similarity_matrix(&block_entities)?;
285
286            // Step 5: Cluster entities
287            let block_entities_owned: Vec<EntityRecord> =
288                block_entities.into_iter().cloned().collect();
289            let clusters = self.clustering_algorithm.cluster_entities(
290                &block_entities_owned,
291                &similarity_matrix,
292                self.config.similarity_threshold,
293            )?;
294
295            all_clusters.extend(clusters);
296        }
297
298        // Step 6: Post-process clusters
299        let final_clusters = self.post_process_clusters(all_clusters)?;
300
301        Ok(final_clusters)
302    }
303
304    /// Extract entity records from triples
305    fn extract_entity_records(&self, triples: &[Triple]) -> Result<Vec<EntityRecord>> {
306        let mut entity_map: HashMap<String, EntityRecord> = HashMap::new();
307        let entity_counter = std::cell::RefCell::new(0);
308
309        for triple in triples {
310            let subject_uri = triple.subject().to_string();
311            let predicate_uri = triple.predicate().to_string();
312            let object_string = triple.object().to_string();
313
314            // Process subject entity
315            let subject_entry = entity_map.entry(subject_uri.clone()).or_insert_with(|| {
316                let id = {
317                    let mut counter = entity_counter.borrow_mut();
318                    *counter += 1;
319                    *counter
320                };
321                EntityRecord {
322                    id: format!("entity_{id}"),
323                    uri: subject_uri.clone(),
324                    attributes: HashMap::new(),
325                    triples: Vec::new(),
326                    source: "unknown".to_string(),
327                    quality_score: 1.0,
328                }
329            });
330
331            subject_entry.triples.push(triple.clone());
332            subject_entry
333                .attributes
334                .insert(predicate_uri.clone(), object_string.clone());
335
336            // Process object entity if it's not a literal
337            if let crate::model::Object::NamedNode(node) = triple.object() {
338                let object_uri = node.to_string();
339                let object_entry = entity_map.entry(object_uri.clone()).or_insert_with(|| {
340                    let id = {
341                        let mut counter = entity_counter.borrow_mut();
342                        *counter += 1;
343                        *counter
344                    };
345                    EntityRecord {
346                        id: format!("entity_{id}"),
347                        uri: object_uri.clone(),
348                        attributes: HashMap::new(),
349                        triples: Vec::new(),
350                        source: "unknown".to_string(),
351                        quality_score: 1.0,
352                    }
353                });
354
355                // Add reverse relation
356                object_entry
357                    .attributes
358                    .insert(format!("{predicate_uri}^-1"), subject_uri.clone());
359            }
360        }
361
362        Ok(entity_map.into_values().collect())
363    }
364
365    /// Calculate similarity matrix for entities
366    fn calculate_similarity_matrix(&self, entities: &[&EntityRecord]) -> Result<Vec<Vec<f32>>> {
367        let n = entities.len();
368        let mut matrix = vec![vec![0.0; n]; n];
369
370        for i in 0..n {
371            for j in i..n {
372                if i == j {
373                    matrix[i][j] = 1.0;
374                } else {
375                    let similarity = self
376                        .similarity_calculator
377                        .calculate_similarity(entities[i], entities[j])?;
378                    matrix[i][j] = similarity;
379                    matrix[j][i] = similarity;
380                }
381            }
382        }
383
384        Ok(matrix)
385    }
386
387    /// Post-process clusters
388    fn post_process_clusters(&self, clusters: Vec<EntityCluster>) -> Result<Vec<EntityCluster>> {
389        let mut processed_clusters = clusters;
390
391        // Step 1: Merge overlapping clusters
392        processed_clusters = self.merge_overlapping_clusters(processed_clusters)?;
393
394        // Step 2: Split large clusters
395        processed_clusters = self.split_large_clusters(processed_clusters)?;
396
397        // Step 3: Validate cluster quality
398        processed_clusters = self.validate_cluster_quality(processed_clusters)?;
399
400        Ok(processed_clusters)
401    }
402
403    /// Merge clusters that have overlapping entities
404    fn merge_overlapping_clusters(
405        &self,
406        clusters: Vec<EntityCluster>,
407    ) -> Result<Vec<EntityCluster>> {
408        let mut merged_clusters = Vec::new();
409        let mut processed = vec![false; clusters.len()];
410
411        for (i, cluster_a) in clusters.iter().enumerate() {
412            if processed[i] {
413                continue;
414            }
415
416            let mut merged_cluster = cluster_a.clone();
417            processed[i] = true;
418
419            // Find overlapping clusters
420            for (j, cluster_b) in clusters.iter().enumerate().skip(i + 1) {
421                if processed[j] {
422                    continue;
423                }
424
425                // Check for entity overlap
426                let overlap_count = cluster_a
427                    .entities
428                    .iter()
429                    .filter(|entity| cluster_b.entities.contains(entity))
430                    .count();
431
432                let min_size = cluster_a.entities.len().min(cluster_b.entities.len());
433                let overlap_ratio = overlap_count as f64 / min_size as f64;
434
435                // Merge if overlap ratio exceeds threshold
436                if overlap_ratio > 0.3 {
437                    // Merge entities
438                    for entity in &cluster_b.entities {
439                        if !merged_cluster.entities.contains(entity) {
440                            merged_cluster.entities.push(entity.clone());
441                        }
442                    }
443
444                    // Update cluster properties
445                    merged_cluster.size = merged_cluster.entities.len();
446                    merged_cluster.confidence =
447                        (merged_cluster.confidence + cluster_b.confidence) / 2.0;
448
449                    // Record merge decision
450                    merged_cluster.merge_decisions.push(MergeDecision {
451                        source_entity: cluster_b.id.clone(),
452                        target_entity: merged_cluster.id.clone(),
453                        similarity: overlap_ratio as f32,
454                        decision: DecisionType::Merge,
455                        confidence: overlap_ratio as f32,
456                        features_used: vec![FeatureType::StructuralSimilarity],
457                    });
458
459                    processed[j] = true;
460                }
461            }
462
463            merged_clusters.push(merged_cluster);
464        }
465
466        Ok(merged_clusters)
467    }
468
469    /// Split clusters that are too large
470    fn split_large_clusters(&self, clusters: Vec<EntityCluster>) -> Result<Vec<EntityCluster>> {
471        let mut split_clusters = Vec::new();
472        let max_cluster_size = 50; // Configurable threshold
473
474        for cluster in clusters {
475            if cluster.entities.len() <= max_cluster_size {
476                split_clusters.push(cluster);
477                continue;
478            }
479
480            // Split large cluster using similarity-based grouping
481            let sub_clusters = self.split_cluster_by_similarity(&cluster, max_cluster_size)?;
482            split_clusters.extend(sub_clusters);
483        }
484
485        Ok(split_clusters)
486    }
487
488    /// Split a cluster by similarity into smaller sub-clusters
489    fn split_cluster_by_similarity(
490        &self,
491        cluster: &EntityCluster,
492        max_size: usize,
493    ) -> Result<Vec<EntityCluster>> {
494        let mut sub_clusters = Vec::new();
495        let mut remaining_entities = cluster.entities.clone();
496        let mut cluster_id_counter = 0;
497
498        while !remaining_entities.is_empty() {
499            let mut current_cluster_entities = Vec::new();
500            let seed_entity = remaining_entities.remove(0);
501            current_cluster_entities.push(seed_entity.clone());
502
503            // Add similar entities to current cluster
504            let mut i = 0;
505            while i < remaining_entities.len() && current_cluster_entities.len() < max_size {
506                let entity = &remaining_entities[i];
507
508                // Check similarity with entities in current cluster
509                let mut max_similarity = 0.0;
510                for cluster_entity in &current_cluster_entities {
511                    let similarity = self.calculate_entity_similarity(entity, cluster_entity)?;
512                    if similarity > max_similarity {
513                        max_similarity = similarity;
514                    }
515                }
516
517                // Add to cluster if similarity exceeds threshold
518                if max_similarity > 0.7 {
519                    current_cluster_entities.push(remaining_entities.remove(i));
520                } else {
521                    i += 1;
522                }
523            }
524
525            // Create sub-cluster
526            let canonical_entity = current_cluster_entities[0].clone();
527            let sub_cluster = EntityCluster {
528                id: format!("{}_split_{}", cluster.id, cluster_id_counter),
529                entities: current_cluster_entities.clone(),
530                canonical_entity,
531                confidence: cluster.confidence * 0.9, // Slightly lower confidence for split clusters
532                size: current_cluster_entities.len(),
533                merge_decisions: vec![MergeDecision {
534                    source_entity: cluster.id.clone(),
535                    target_entity: format!("{}_split_{}", cluster.id, cluster_id_counter),
536                    similarity: cluster.confidence,
537                    decision: DecisionType::NoMerge,
538                    confidence: cluster.confidence,
539                    features_used: vec![FeatureType::StructuralSimilarity],
540                }],
541            };
542
543            sub_clusters.push(sub_cluster);
544            cluster_id_counter += 1;
545        }
546
547        Ok(sub_clusters)
548    }
549
550    /// Validate cluster quality and filter out low-quality clusters
551    fn validate_cluster_quality(&self, clusters: Vec<EntityCluster>) -> Result<Vec<EntityCluster>> {
552        let mut validated_clusters = Vec::new();
553
554        for cluster in clusters {
555            // Quality metrics
556            let min_cluster_size = 2;
557            let min_confidence = 0.5;
558
559            // Check minimum size
560            if cluster.entities.len() < min_cluster_size {
561                continue;
562            }
563
564            // Check minimum confidence
565            if cluster.confidence < min_confidence {
566                continue;
567            }
568
569            // Calculate internal similarity
570            let internal_similarity = self.calculate_cluster_internal_similarity(&cluster)?;
571            if internal_similarity < 0.6 {
572                continue;
573            }
574
575            validated_clusters.push(cluster);
576        }
577
578        Ok(validated_clusters)
579    }
580
581    /// Calculate internal similarity of a cluster
582    fn calculate_cluster_internal_similarity(&self, cluster: &EntityCluster) -> Result<f64> {
583        if cluster.entities.len() < 2 {
584            return Ok(1.0);
585        }
586
587        let mut total_similarity = 0.0;
588        let mut comparison_count = 0;
589
590        for i in 0..cluster.entities.len() {
591            for j in (i + 1)..cluster.entities.len() {
592                let similarity =
593                    self.calculate_entity_similarity(&cluster.entities[i], &cluster.entities[j])?;
594                total_similarity += similarity;
595                comparison_count += 1;
596            }
597        }
598
599        if comparison_count > 0 {
600            Ok(total_similarity / comparison_count as f64)
601        } else {
602            Ok(0.0)
603        }
604    }
605
606    /// Calculate similarity between two entities (helper method)
607    fn calculate_entity_similarity(
608        &self,
609        entity1: &EntityRecord,
610        entity2: &EntityRecord,
611    ) -> Result<f64> {
612        // Simple string similarity based on labels from attributes
613        let label1 = entity1
614            .attributes
615            .get("label")
616            .unwrap_or(&entity1.uri)
617            .to_lowercase();
618        let label2 = entity2
619            .attributes
620            .get("label")
621            .unwrap_or(&entity2.uri)
622            .to_lowercase();
623
624        // Jaccard similarity on character n-grams
625        let ngrams1: std::collections::HashSet<String> = self.generate_character_ngrams(&label1, 2);
626        let ngrams2: std::collections::HashSet<String> = self.generate_character_ngrams(&label2, 2);
627
628        let intersection = ngrams1.intersection(&ngrams2).count();
629        let union = ngrams1.union(&ngrams2).count();
630
631        if union > 0 {
632            Ok(intersection as f64 / union as f64)
633        } else {
634            Ok(0.0)
635        }
636    }
637
638    /// Generate character n-grams
639    fn generate_character_ngrams(&self, text: &str, n: usize) -> std::collections::HashSet<String> {
640        let mut ngrams = std::collections::HashSet::new();
641        let chars: Vec<char> = text.chars().collect();
642
643        if chars.len() >= n {
644            for i in 0..=(chars.len() - n) {
645                let ngram: String = chars[i..i + n].iter().collect();
646                ngrams.insert(ngram);
647            }
648        }
649
650        ngrams
651    }
652}
653
/// Built-in similarity calculator based on character-set overlap
struct DefaultSimilarityCalculator;

impl DefaultSimilarityCalculator {
    fn new() -> Self {
        DefaultSimilarityCalculator
    }

    /// Jaccard index over the sets of distinct characters of both strings;
    /// 0.0 when both strings are empty.
    fn string_similarity(&self, s1: &str, s2: &str) -> f32 {
        let left: HashSet<char> = s1.chars().collect();
        let right: HashSet<char> = s2.chars().collect();

        let total = left.union(&right).count();
        if total == 0 {
            return 0.0;
        }

        let shared = left.intersection(&right).count();
        shared as f32 / total as f32
    }

    /// Mean string similarity of the values stored under keys present in
    /// both maps; 0.0 when the maps share no key.
    fn attribute_similarity(
        &self,
        attrs1: &HashMap<String, String>,
        attrs2: &HashMap<String, String>,
    ) -> f32 {
        let (sum, matched) = attrs1
            .iter()
            .filter_map(|(key, left_value)| {
                attrs2
                    .get(key)
                    .map(|right_value| self.string_similarity(left_value, right_value))
            })
            .fold((0.0_f32, 0_usize), |(sum, matched), score| {
                (sum + score, matched + 1)
            });

        if matched == 0 {
            return 0.0;
        }
        sum / matched as f32
    }
}
699
700impl SimilarityCalculator for DefaultSimilarityCalculator {
701    fn calculate_similarity(&self, entity1: &EntityRecord, entity2: &EntityRecord) -> Result<f32> {
702        // Combine multiple similarity measures
703        let uri_similarity = self.string_similarity(&entity1.uri, &entity2.uri);
704        let attr_similarity = self.attribute_similarity(&entity1.attributes, &entity2.attributes);
705
706        // Weighted combination
707        let similarity = 0.3 * uri_similarity + 0.7 * attr_similarity;
708
709        Ok(similarity)
710    }
711
712    fn get_feature_vector(&self, entity: &EntityRecord) -> Result<Vec<f32>> {
713        // Extract simple features
714        let features = vec![
715            // URI length
716            entity.uri.len() as f32,
717            // Number of attributes
718            entity.attributes.len() as f32,
719            // Number of triples
720            entity.triples.len() as f32,
721            // Quality score
722            entity.quality_score,
723        ];
724
725        Ok(features)
726    }
727}
728
/// Clusterer used for the hierarchical clustering option
struct HierarchicalClusterer;

impl HierarchicalClusterer {
    fn new() -> Self {
        HierarchicalClusterer
    }
}
737
738impl ClusteringAlgorithm for HierarchicalClusterer {
739    fn cluster_entities(
740        &self,
741        entities: &[EntityRecord],
742        similarity_matrix: &[Vec<f32>],
743        threshold: f32,
744    ) -> Result<Vec<EntityCluster>> {
745        let n = entities.len();
746        if n == 0 {
747            return Ok(Vec::new());
748        }
749
750        // Simple clustering: group entities with similarity above threshold
751        let mut clusters = Vec::new();
752        let mut visited = vec![false; n];
753
754        for i in 0..n {
755            if visited[i] {
756                continue;
757            }
758
759            let mut cluster_entities = vec![entities[i].clone()];
760            visited[i] = true;
761
762            for j in (i + 1)..n {
763                if !visited[j] && similarity_matrix[i][j] >= threshold {
764                    cluster_entities.push(entities[j].clone());
765                    visited[j] = true;
766                }
767            }
768
769            // Create cluster
770            let canonical_entity = cluster_entities[0].clone(); // Simplified
771            let cluster = EntityCluster {
772                id: format!("cluster_{}", clusters.len()),
773                entities: cluster_entities.clone(),
774                canonical_entity,
775                confidence: 0.8, // Simplified
776                size: cluster_entities.len(),
777                merge_decisions: vec![MergeDecision {
778                    source_entity: "initial".to_string(),
779                    target_entity: format!("cluster_{}", clusters.len()),
780                    similarity: 0.8,
781                    decision: DecisionType::Merge,
782                    confidence: 0.8,
783                    features_used: vec![FeatureType::StructuralSimilarity],
784                }],
785            };
786
787            clusters.push(cluster);
788        }
789
790        Ok(clusters)
791    }
792}
793
/// Built-in feature extractor reporting simple size-based features
struct DefaultFeatureExtractor;

impl DefaultFeatureExtractor {
    fn new() -> Self {
        DefaultFeatureExtractor
    }
}
802
803impl FeatureExtractor for DefaultFeatureExtractor {
804    fn extract_features(&self, entity: &EntityRecord) -> Result<HashMap<String, f32>> {
805        let mut features = HashMap::new();
806
807        // Basic features
808        features.insert("uri_length".to_string(), entity.uri.len() as f32);
809        features.insert("num_attributes".to_string(), entity.attributes.len() as f32);
810        features.insert("num_triples".to_string(), entity.triples.len() as f32);
811        features.insert("quality_score".to_string(), entity.quality_score);
812
813        // Attribute-based features
814        for (key, value) in &entity.attributes {
815            features.insert(format!("attr_{key}_length"), value.len() as f32);
816        }
817
818        Ok(features)
819    }
820
821    fn feature_names(&self) -> Vec<String> {
822        vec![
823            "uri_length".to_string(),
824            "num_attributes".to_string(),
825            "num_triples".to_string(),
826            "quality_score".to_string(),
827        ]
828    }
829}
830
/// Blocking strategy that slides a window over key-sorted entities
struct SortedNeighborhoodBlocking;

impl SortedNeighborhoodBlocking {
    fn new() -> Self {
        SortedNeighborhoodBlocking
    }
}
839
840impl BlockingStrategy for SortedNeighborhoodBlocking {
841    fn generate_blocks(&self, entities: &[EntityRecord]) -> Result<Vec<Vec<usize>>> {
842        // Sort entities by blocking key and create windows
843        let mut indexed_entities: Vec<(usize, String)> = entities
844            .iter()
845            .enumerate()
846            .map(|(i, entity)| (i, self.get_blocking_key(entity).unwrap_or_default()))
847            .collect();
848
849        indexed_entities.sort_by_key(|x| x.1.clone());
850
851        // Create sliding windows
852        let window_size = 10; // Configurable
853        let mut blocks = Vec::new();
854
855        for start in 0..entities.len() {
856            if start + window_size <= entities.len() {
857                let block: Vec<usize> = indexed_entities[start..start + window_size]
858                    .iter()
859                    .map(|(i, _)| *i)
860                    .collect();
861                blocks.push(block);
862            }
863        }
864
865        if blocks.is_empty() {
866            // Single block with all entities
867            blocks.push((0..entities.len()).collect());
868        }
869
870        Ok(blocks)
871    }
872
873    fn get_blocking_key(&self, entity: &EntityRecord) -> Result<String> {
874        // Use first few characters of URI as blocking key
875        let key = entity
876            .uri
877            .chars()
878            .take(10)
879            .collect::<String>()
880            .to_lowercase();
881        Ok(key)
882    }
883}
884
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ai::AiConfig;
    use crate::model::{Literal, NamedNode};

    /// Triple stating that `subject` has the given `name`.
    fn name_triple(subject: &str, name: &str) -> Triple {
        Triple::new(
            NamedNode::new(subject).expect("valid IRI"),
            NamedNode::new("http://example.org/name").expect("valid IRI"),
            Literal::new(name),
        )
    }

    /// Record with a single `name` attribute and no triples.
    fn named_record(id: &str, uri: &str, name: &str, source: &str) -> EntityRecord {
        EntityRecord {
            id: id.to_string(),
            uri: uri.to_string(),
            attributes: HashMap::from([("name".to_string(), name.to_string())]),
            triples: Vec::new(),
            source: source.to_string(),
            quality_score: 1.0,
        }
    }

    #[tokio::test]
    async fn test_entity_resolver_creation() {
        let resolver = EntityResolver::new(&AiConfig::default());
        assert!(resolver.is_ok());
    }

    #[tokio::test]
    async fn test_entity_resolution() {
        let config = AiConfig::default();
        let mut resolver = EntityResolver::new(&config).expect("construction should succeed");

        // A lower similarity threshold makes matches easier to hit in the test
        resolver.config.similarity_threshold = 0.3;

        let triples = vec![
            name_triple("http://example.org/person1", "John Smith"),
            name_triple("http://example.org/person2", "John Smith"),
            name_triple("http://example.org/person3", "Jane Doe"),
            name_triple("http://example.org/person4", "Jane Doe"),
        ];

        let clusters = resolver
            .resolve_entities(&triples)
            .await
            .expect("async operation should succeed");

        // Similar entities should end up in at least one cluster
        assert!(!clusters.is_empty());
    }

    #[test]
    fn test_similarity_calculation() {
        let calculator = DefaultSimilarityCalculator::new();

        let entity1 = named_record("1", "http://example.org/john", "John", "source1");
        let entity2 = named_record(
            "2",
            "http://example.org/john_smith",
            "John Smith",
            "source2",
        );

        let similarity = calculator
            .calculate_similarity(&entity1, &entity2)
            .expect("operation should succeed");
        assert!(similarity > 0.0);
        assert!(similarity <= 1.0);
    }
}
971}