Skip to main content

fastskill_core/core/
analysis.rs

//! Analysis utilities for skill similarity and duplicate detection
2
3use serde::Serialize;
4use std::collections::HashSet;
5
6/// A skill and its most similar skills, used in similarity matrix computation
7#[derive(Debug, Serialize)]
8pub struct SimilarityPair {
9    /// Skill identifier
10    pub skill_id: String,
11    /// Human-readable skill name
12    pub name: String,
13    /// List of (skill_id, similarity_score) tuples, sorted by similarity descending
14    pub similar_skills: Vec<(String, f32)>,
15}
16
17/// Finds potential duplicate skills by reusing the similarity matrix data
18///
19/// Returns pairs with similarity >= duplicate_threshold, sorted by similarity descending.
20/// Uses a HashSet to avoid reporting the same pair twice (A-B and B-A).
21pub fn find_potential_duplicates(
22    similarity_matrix: &[SimilarityPair],
23    duplicate_threshold: f32,
24) -> Vec<(String, String, f32)> {
25    let mut duplicates = Vec::new();
26    let mut seen = HashSet::new();
27
28    for pair in similarity_matrix {
29        for (similar_id, similarity) in &pair.similar_skills {
30            if *similarity >= duplicate_threshold {
31                // Create canonical ordering (alphabetically first ID, then second)
32                let (id_a, id_b) = if pair.skill_id < *similar_id {
33                    (pair.skill_id.as_str(), similar_id.as_str())
34                } else {
35                    (similar_id.as_str(), pair.skill_id.as_str())
36                };
37
38                // Use a unique key to track seen pairs
39                let key = format!("{}|{}", id_a, id_b);
40                if seen.insert(key) {
41                    duplicates.push((id_a.to_string(), id_b.to_string(), *similarity));
42                }
43            }
44        }
45    }
46
47    // Sort by similarity descending (highest first)
48    duplicates.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
49
50    // Limit to top 20 potential duplicates
51    duplicates.truncate(20);
52
53    duplicates
54}
55
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds one similarity-matrix row for `id` from `(other_id, score)` neighbours.
    fn row(id: &str, name: &str, neighbours: &[(&str, f32)]) -> SimilarityPair {
        SimilarityPair {
            skill_id: id.to_owned(),
            name: name.to_owned(),
            similar_skills: neighbours
                .iter()
                .map(|&(other, score)| (other.to_owned(), score))
                .collect(),
        }
    }

    /// A single neighbour scored below the threshold must produce no output.
    #[test]
    fn test_find_potential_duplicates_no_duplicates() {
        let rows = [row("skill-a", "Skill A", &[("skill-b", 0.5)])];

        let found = find_potential_duplicates(&rows, 0.95);

        assert!(found.is_empty(), "Should find no duplicates below threshold");
    }

    /// The A-B relation appears in both rows but must be reported exactly once.
    #[test]
    fn test_find_potential_duplicates_with_duplicates() {
        let rows = [
            row("skill-a", "Skill A", &[("skill-b", 0.98), ("skill-c", 0.50)]),
            // Mirror of the entry above; it must not yield a second result.
            row("skill-b", "Skill B", &[("skill-a", 0.98)]),
        ];

        let found = find_potential_duplicates(&rows, 0.95);

        assert_eq!(found.len(), 1, "Should find exactly one duplicate pair");

        let (first_id, second_id, score) = &found[0];
        assert_eq!(*score, 0.98, "Similarity should be 0.98");

        // Canonical ordering puts "skill-a" ahead of "skill-b".
        assert_eq!(first_id.as_str(), "skill-a");
        assert_eq!(second_id.as_str(), "skill-b");
    }
}