fastskill_core/core/
analysis.rs1use serde::Serialize;
4use std::collections::HashSet;
5
6#[derive(Debug, Serialize)]
8pub struct SimilarityPair {
9 pub skill_id: String,
11 pub name: String,
13 pub similar_skills: Vec<(String, f32)>,
15}
16
17pub fn find_potential_duplicates(
22 similarity_matrix: &[SimilarityPair],
23 duplicate_threshold: f32,
24) -> Vec<(String, String, f32)> {
25 let mut duplicates = Vec::new();
26 let mut seen = HashSet::new();
27
28 for pair in similarity_matrix {
29 for (similar_id, similarity) in &pair.similar_skills {
30 if *similarity >= duplicate_threshold {
31 let (id_a, id_b) = if pair.skill_id < *similar_id {
33 (pair.skill_id.as_str(), similar_id.as_str())
34 } else {
35 (similar_id.as_str(), pair.skill_id.as_str())
36 };
37
38 let key = format!("{}|{}", id_a, id_b);
40 if seen.insert(key) {
41 duplicates.push((id_a.to_string(), id_b.to_string(), *similarity));
42 }
43 }
44 }
45 }
46
47 duplicates.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
49
50 duplicates.truncate(20);
52
53 duplicates
54}
55
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds one similarity-matrix row from borrowed fixture data.
    fn row(skill_id: &str, name: &str, similar: &[(&str, f32)]) -> SimilarityPair {
        SimilarityPair {
            skill_id: skill_id.to_owned(),
            name: name.to_owned(),
            similar_skills: similar
                .iter()
                .map(|&(id, score)| (id.to_owned(), score))
                .collect(),
        }
    }

    #[test]
    fn test_find_potential_duplicates_no_duplicates() {
        // The only candidate scores 0.5, well under the 0.95 threshold.
        let matrix = [row("skill-a", "Skill A", &[("skill-b", 0.5)])];

        let duplicates = find_potential_duplicates(&matrix, 0.95);

        assert!(
            duplicates.is_empty(),
            "Should find no duplicates below threshold"
        );
    }

    #[test]
    fn test_find_potential_duplicates_with_duplicates() {
        // a<->b appears from both sides and must be reported once;
        // a->c stays below the threshold.
        let matrix = [
            row(
                "skill-a",
                "Skill A",
                &[("skill-b", 0.98), ("skill-c", 0.50)],
            ),
            row("skill-b", "Skill B", &[("skill-a", 0.98)]),
        ];

        let duplicates = find_potential_duplicates(&matrix, 0.95);

        assert_eq!(
            duplicates.len(),
            1,
            "Should find exactly one duplicate pair"
        );

        let (first_id, second_id, similarity) = &duplicates[0];
        assert_eq!(*similarity, 0.98, "Similarity should be 0.98");

        // Ids within a pair come back in lexicographic order.
        assert_eq!(first_id.as_str(), "skill-a");
        assert_eq!(second_id.as_str(), "skill-b");
    }
}