Skip to main content

ferritin_common/
string_utils.rs

1/// Jaro-Winkler similarity, but with case differences costing less than character differences
2///
3/// Blends case-insensitive and case-sensitive comparisons, giving more weight to the
4/// case-insensitive score. This makes case differences cost less than character differences.
5pub(crate) fn case_aware_jaro_winkler(a: &str, b: &str) -> f64 {
6    let case_insensitive = strsim::jaro_winkler(&a.to_lowercase(), &b.to_lowercase());
7    let case_sensitive = strsim::jaro_winkler(a, b);
8
9    // Blend: mostly case-insensitive with moderate influence from case-sensitive matching
10    const CASE_WEIGHT: f64 = 0.1; // 10% influence from case-sensitivity
11    (1.0 - CASE_WEIGHT) * case_insensitive + CASE_WEIGHT * case_sensitive
12}
13
14#[cfg(test)]
15mod case_aware_jaro_winkler_tests {
16    use crate::string_utils::case_aware_jaro_winkler;
17
18    fn sort_in_comparison<const N: usize>(
19        mut strings: [&'static str; N],
20        reference: &'static str,
21    ) -> [&'static str; N] {
22        strings.sort_by(|a, b| {
23            case_aware_jaro_winkler(b, reference).total_cmp(&case_aware_jaro_winkler(a, reference))
24        });
25        strings
26    }
27
28    #[test]
29    fn same_letter_different_case_is_more_similar_than_different_letters() {
30        assert_eq!(
31            sort_in_comparison(["word", "WORD", "other", "Word"], "Word"),
32            ["Word", "word", "WORD", "other"]
33        );
34    }
35
36    #[test]
37    fn exact_match_returns_highest_score() {
38        let score = case_aware_jaro_winkler("hello", "hello");
39        assert_eq!(score, 1.0, "Exact match should be 1.0");
40    }
41
42    #[test]
43    fn case_difference_costs_less_than_character_difference() {
44        let reference = "Test";
45        let case_only = case_aware_jaro_winkler("TEST", reference);
46        let char_diff = case_aware_jaro_winkler("Tast", reference);
47
48        assert!(
49            case_only > char_diff,
50            "Case-only difference ({}) should score higher than character substitution ({})",
51            case_only,
52            char_diff
53        );
54    }
55
56    #[test]
57    fn single_case_difference_vs_single_char_difference() {
58        let reference = "word";
59        let one_case = case_aware_jaro_winkler("Word", reference);
60        let one_char = case_aware_jaro_winkler("wopd", reference);
61
62        assert!(
63            one_case > one_char,
64            "Single case change ({}) should cost less than single char change ({})",
65            one_case,
66            one_char
67        );
68    }
69
70    #[test]
71    fn multiple_case_differences() {
72        // Test how multiple case differences accumulate
73        let reference = "test";
74        let one_case = case_aware_jaro_winkler("Test", reference);
75        let two_case = case_aware_jaro_winkler("TEst", reference);
76        let all_case = case_aware_jaro_winkler("TEST", reference);
77
78        println!("One case diff: {}", one_case);
79        println!("Two case diffs: {}", two_case);
80        println!("All case diffs: {}", all_case);
81
82        assert!(
83            one_case > two_case && two_case > all_case,
84            "More case differences should decrease score: {} > {} > {}",
85            one_case,
86            two_case,
87            all_case
88        );
89    }
90
91    #[test]
92    fn different_length_strings_shorter_target() {
93        // Testing min vs max length normalization
94        let reference = "Hi";
95        let score_longer = case_aware_jaro_winkler("HELLO", reference);
96        let score_case = case_aware_jaro_winkler("HI", reference);
97
98        println!("'HELLO' vs 'Hi': {}", score_longer);
99        println!("'HI' vs 'Hi': {}", score_case);
100
101        // HI has case differences on matching chars but is same length
102        // HELLO has matching prefix but is much longer
103        assert!(
104            score_case > score_longer,
105            "Same-length with case diff ({}) should beat longer string ({})",
106            score_case,
107            score_longer
108        );
109    }
110
111    #[test]
112    fn different_length_strings_longer_target() {
113        let reference = "HelloWorld";
114        let score_prefix_case = case_aware_jaro_winkler("helloworld", reference);
115        let score_partial = case_aware_jaro_winkler("Hello", reference);
116
117        println!("'helloworld' vs 'HelloWorld': {}", score_prefix_case);
118        println!("'Hello' vs 'HelloWorld': {}", score_partial);
119
120        // Full length with all case differences vs partial match
121        assert!(
122            score_prefix_case > score_partial,
123            "Full-length case diff ({}) should beat partial match ({})",
124            score_prefix_case,
125            score_partial
126        );
127    }
128
129    #[test]
130    fn empty_string_edge_case() {
131        let score_empty = case_aware_jaro_winkler("", "");
132        let score_one_empty = case_aware_jaro_winkler("test", "");
133
134        assert_eq!(score_empty, 1.0, "Empty strings should match perfectly");
135        assert_eq!(score_one_empty, 0.0, "Empty vs non-empty should score 0");
136    }
137
138    #[test]
139    fn common_prefix_with_case_differences() {
140        // Jaro-Winkler gives bonus for common prefix
141        let reference = "TestCase";
142        let prefix_case_diff = case_aware_jaro_winkler("testCase", reference);
143        let suffix_case_diff = case_aware_jaro_winkler("TestCASE", reference);
144        let no_prefix = case_aware_jaro_winkler("xestCase", reference);
145
146        println!("'testCase' vs 'TestCase': {}", prefix_case_diff);
147        println!("'TestCASE' vs 'TestCase': {}", suffix_case_diff);
148        println!("'xestCase' vs 'TestCase': {}", no_prefix);
149
150        // Case diff in prefix vs case diff in suffix vs char substitution
151        assert!(
152            prefix_case_diff > no_prefix,
153            "Case diff should beat char substitution"
154        );
155    }
156
157    #[test]
158    fn transposition_vs_case_difference() {
159        // This documents a limitation: extreme case differences (all caps of short words)
160        // can score lower than transpositions due to how the blending works
161        let reference = "test";
162        let transposed = case_aware_jaro_winkler("tset", reference);
163        let case_diff = case_aware_jaro_winkler("TEST", reference);
164
165        println!("'tset' vs 'test' (transposed): {}", transposed);
166        println!("'TEST' vs 'test' (all case diff): {}", case_diff);
167
168        // In practice, this edge case doesn't matter much for "did you mean" suggestions
169        // because real identifiers rarely have ALL characters case-swapped
170    }
171
172    #[test]
173    fn mixed_case_and_character_differences() {
174        let reference = "Example";
175        let all_case_diff = case_aware_jaro_winkler("EXAMPLE", reference);
176        let one_char = case_aware_jaro_winkler("Examplf", reference);
177        let two_chars = case_aware_jaro_winkler("Exaople", reference);
178        let few_case_diff = case_aware_jaro_winkler("ExamplE", reference);
179
180        println!("'EXAMPLE' vs 'Example' (all case):  {:.4}", all_case_diff);
181        println!("'ExamplE' vs 'Example' (few case):  {:.4}", few_case_diff);
182        println!("'Examplf' vs 'Example' (1 char):    {:.4}", one_char);
183        println!("'Exaople' vs 'Example' (2 chars):   {:.4}", two_chars);
184
185        // Few case differences should beat character differences
186        assert!(
187            few_case_diff > one_char,
188            "Few case diffs ({:.4}) should beat char diff ({:.4})",
189            few_case_diff,
190            one_char
191        );
192
193        // Edge case: Extreme case differences (ALL caps) can score similar to typos
194        // This is acceptable - in practice, "EXAMPLE" and "Example" are often different
195        // identifiers (constant vs type), while "Examplf" is clearly a typo
196        println!(
197            "Note: All-caps scoring is {:.4}, which is acceptable for did-you-mean",
198            all_case_diff
199        );
200    }
201
202    #[test]
203    fn sort_by_quality_realistic_example() {
204        // Simulate searching for "MyStruct" in documentation
205        let results = sort_in_comparison(
206            [
207                "MyStruct",   // exact match
208                "myStruct",   // camelCase variant
209                "MYSTRUCT",   // all caps
210                "my_struct",  // snake_case
211                "MyString",   // similar but different type
212                "YourStruct", // different prefix
213            ],
214            "MyStruct",
215        );
216
217        println!("Sorted results for 'MyStruct':");
218        for (i, result) in results.iter().enumerate() {
219            println!(
220                "  {}. {} (score: {:.4})",
221                i + 1,
222                result,
223                case_aware_jaro_winkler(result, "MyStruct")
224            );
225        }
226
227        // We'd expect exact match first, then case variants, then structural variants
228        assert_eq!(results[0], "MyStruct", "Exact match should be first");
229
230        // The relative ordering of myStruct, MYSTRUCT, and my_struct is interesting
231        // - myStruct has 1 case diff
232        // - MYSTRUCT has 6 case diffs
233        // - my_struct has 1 case diff + 1 underscore
234        let mystruct_pos = results.iter().position(|&s| s == "myStruct").unwrap();
235        let mystruct_caps_pos = results.iter().position(|&s| s == "MYSTRUCT").unwrap();
236
237        assert!(
238            mystruct_pos < mystruct_caps_pos,
239            "Fewer case differences should rank higher"
240        );
241    }
242
243    #[test]
244    fn potential_score_overflow() {
245        // Check if adding case_bonus can push score > 1.0
246        let reference = "test";
247        let all_case = case_aware_jaro_winkler("TEST", reference);
248
249        assert!(
250            all_case <= 1.0,
251            "Score should not exceed 1.0, got {}",
252            all_case
253        );
254    }
255
256    #[test]
257    fn length_normalization_exploration() {
258        // Exploring whether max or min length is better for normalization
259        let short = "Hi";
260        let long = "HiThere";
261
262        // Current implementation uses max(a.len(), b.len())
263        let score = case_aware_jaro_winkler("HI", short);
264
265        println!("Testing length normalization:");
266        println!("  'HI' vs 'Hi' (same length): {}", score);
267        println!(
268            "  'HITHERE' vs 'HiThere': {}",
269            case_aware_jaro_winkler("HITHERE", long)
270        );
271        println!(
272            "  'HI' vs 'HiThere': {}",
273            case_aware_jaro_winkler("HI", long)
274        );
275
276        // With max: bonus is diluted for shorter strings matching longer ones
277        // With min: bonus would be concentrated on matched portion
278        // This test documents current behavior for evaluation
279    }
280
281    #[test]
282    fn realistic_naming_convention_variations() {
283        // Test realistic scenarios where case matters but shouldn't dominate
284        let reference = "getValue";
285
286        let camel_case = case_aware_jaro_winkler("getValue", reference);
287        let pascal_case = case_aware_jaro_winkler("GetValue", reference);
288        let snake_case = case_aware_jaro_winkler("get_value", reference);
289        let typo = case_aware_jaro_winkler("getValu", reference); // missing 'e'
290        let wrong_word = case_aware_jaro_winkler("setValue", reference);
291
292        println!("\nRealistic 'getValue' comparisons:");
293        println!("  getValue (exact):     {:.4}", camel_case);
294        println!("  GetValue (Pascal):    {:.4}", pascal_case);
295        println!("  get_value (snake):    {:.4}", snake_case);
296        println!("  getValu (typo):       {:.4}", typo);
297        println!("  setValue (wrong):     {:.4}", wrong_word);
298
299        // Exact match is best
300        assert_eq!(camel_case, 1.0);
301
302        // Case variation should beat typo
303        assert!(
304            pascal_case > typo,
305            "Case variation ({}) should beat typo ({})",
306            pascal_case,
307            typo
308        );
309
310        // Case variation should beat wrong word
311        assert!(
312            pascal_case > wrong_word && snake_case > wrong_word,
313            "Case variations should beat different words"
314        );
315    }
316}