Skip to main content

simple_agents_healing/
string_utils.rs

//! String utility functions for case conversion and fuzzy matching.
//!
//! Provides helpers for normalizing field names between naming conventions
//! and for scoring how similar two strings are.
4
/// Convert a string from camelCase or PascalCase to snake_case.
///
/// An underscore is inserted before an uppercase character when it follows a
/// lowercase character (`fooBar` -> `foo_bar`) or when it ends a run of
/// uppercase characters and is itself followed by a lowercase character
/// (`HTTPResponse` -> `http_response`). Uppercase characters are lowered with
/// full Unicode case mapping, so non-ASCII capitals are converted as well.
/// Every other character is copied through unchanged.
///
/// # Examples
///
/// ```
/// use simple_agents_healing::string_utils::to_snake_case;
///
/// assert_eq!(to_snake_case("firstName"), "first_name");
/// assert_eq!(to_snake_case("userID"), "user_id");
/// assert_eq!(to_snake_case("HTTPResponse"), "http_response");
/// assert_eq!(to_snake_case("ÉcoleNormale"), "école_normale");
/// ```
pub fn to_snake_case(s: &str) -> String {
    // Reserve a few spare bytes for the underscores that get inserted.
    let mut result = String::with_capacity(s.len() + 5);
    let mut prev_is_lowercase = false;
    let mut prev_is_uppercase = false;

    // `char_indices` yields byte offsets, which lets the lookahead helper
    // slice straight to the current position instead of rescanning from the
    // start of the string for every uppercase character.
    for (offset, ch) in s.char_indices() {
        if ch.is_uppercase() {
            // A new word starts here if the previous char was lowercase
            // (camelCase boundary) or if this char closes an acronym run
            // (the "R" in "HTTPResponse").
            let starts_word =
                prev_is_lowercase || (prev_is_uppercase && next_is_lowercase(s, offset));
            if offset > 0 && starts_word {
                result.push('_');
            }
            // Unicode-aware lowering keeps this in step with `is_uppercase`,
            // which is Unicode-aware too; a single char may lower to several.
            result.extend(ch.to_lowercase());
            prev_is_uppercase = true;
            prev_is_lowercase = false;
        } else {
            result.push(ch);
            prev_is_lowercase = ch.is_lowercase();
            prev_is_uppercase = false;
        }
    }

    result
}

/// Convert a string from snake_case to camelCase.
///
/// Every underscore is dropped and the character that follows it is
/// uppercased using full Unicode case mapping. All other characters are
/// copied through unchanged, so the first word keeps its original casing.
///
/// # Examples
///
/// ```
/// use simple_agents_healing::string_utils::to_camel_case;
///
/// assert_eq!(to_camel_case("first_name"), "firstName");
/// assert_eq!(to_camel_case("user_id"), "userId");
/// assert_eq!(to_camel_case("http_response"), "httpResponse");
/// assert_eq!(to_camel_case("nom_école"), "nomÉcole");
/// ```
pub fn to_camel_case(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut capitalize_next = false;

    for ch in s.chars() {
        match ch {
            // Underscores are separators only: remember to capitalize what follows.
            '_' => capitalize_next = true,
            _ if capitalize_next => {
                // A single char may uppercase to several (e.g. 'ß' -> "SS").
                result.extend(ch.to_uppercase());
                capitalize_next = false;
            }
            _ => result.push(ch),
        }
    }

    result
}

/// Check if the character after the one starting at byte offset
/// `current_idx` is lowercase.
///
/// `current_idx` must be the byte offset of a character in `s`, as produced
/// by `str::char_indices`. Returns `false` when there is no following
/// character or when `current_idx` is not a valid character boundary.
fn next_is_lowercase(s: &str, current_idx: usize) -> bool {
    // Slicing at the current character makes the lookahead independent of
    // how far into the string we are.
    let following = s
        .get(current_idx..)
        .and_then(|rest| rest.chars().nth(1));
    matches!(following, Some(c) if c.is_lowercase())
}
79
80/// Calculate Jaro-Winkler similarity between two strings.
81///
82/// Returns a value between 0.0 (no similarity) and 1.0 (identical).
83/// The Jaro-Winkler metric gives more weight to strings with matching prefixes.
84///
85/// # Algorithm
86///
87/// 1. Calculate Jaro similarity (based on matching characters and transpositions)
88/// 2. Apply Winkler modification (boost for common prefix)
89///
90/// # Examples
91///
92/// ```
93/// use simple_agents_healing::string_utils::jaro_winkler;
94///
95/// assert!(jaro_winkler("hello", "hello") > 0.99);
96/// assert!(jaro_winkler("hello", "hallo") > 0.8);
97/// assert!(jaro_winkler("hello", "world") < 0.5);
98/// ```
99pub fn jaro_winkler(s1: &str, s2: &str) -> f64 {
100    if s1 == s2 {
101        return 1.0;
102    }
103    if s1.is_empty() || s2.is_empty() {
104        return 0.0;
105    }
106
107    // Calculate Jaro similarity first
108    let jaro = jaro_similarity(s1, s2);
109
110    // Calculate common prefix length (up to 4 characters)
111    let prefix_len = s1
112        .chars()
113        .zip(s2.chars())
114        .take(4)
115        .take_while(|(c1, c2)| c1 == c2)
116        .count();
117
118    // Apply Winkler modification
119    // jaro_winkler = jaro + (prefix_length * p * (1 - jaro))
120    // where p = 0.1 (standard scaling factor)
121    const P: f64 = 0.1;
122    jaro + (prefix_len as f64 * P * (1.0 - jaro))
123}
124
/// Compute the plain Jaro similarity of two strings.
///
/// This is the base metric that Jaro-Winkler builds on. The result lies
/// between 0.0 and 1.0; it is 0.0 when either string is empty or when no
/// characters match. Comparison is done per `char`, not per byte.
fn jaro_similarity(s1: &str, s2: &str) -> f64 {
    let left: Vec<char> = s1.chars().collect();
    let right: Vec<char> = s2.chars().collect();

    if left.is_empty() || right.is_empty() {
        return 0.0;
    }

    // Two equal characters only count as a match when their positions are
    // at most `window` apart.
    let window = (left.len().max(right.len()) / 2).saturating_sub(1);

    let mut left_used = vec![false; left.len()];
    let mut right_used = vec![false; right.len()];
    let mut match_count = 0usize;

    // Pair each character of `left` with the first unused equal character of
    // `right` inside the window.
    for (i, &ch) in left.iter().enumerate() {
        let lo = i.saturating_sub(window);
        let hi = (i + window + 1).min(right.len());

        // When `left` is much longer than `right`, `lo` can pass `hi`; the
        // range is then empty and nothing is matched.
        if let Some(j) = (lo..hi).find(|&j| !right_used[j] && right[j] == ch) {
            left_used[i] = true;
            right_used[j] = true;
            match_count += 1;
        }
    }

    if match_count == 0 {
        return 0.0;
    }

    // Walk the matched characters of both strings in order; every position
    // where they disagree is one out-of-order character.
    let left_matched = left
        .iter()
        .zip(&left_used)
        .filter(|&(_, &used)| used)
        .map(|(ch, _)| ch);
    let right_matched = right
        .iter()
        .zip(&right_used)
        .filter(|&(_, &used)| used)
        .map(|(ch, _)| ch);
    let out_of_order = left_matched
        .zip(right_matched)
        .filter(|(a, b)| a != b)
        .count();

    // Jaro = (m/|s1| + m/|s2| + (m - t/2)/m) / 3
    let m = match_count as f64;
    (m / left.len() as f64 + m / right.len() as f64 + (m - out_of_order as f64 / 2.0) / m) / 3.0
}
188
#[cfg(test)]
mod tests {
    use super::*;

    // camelCase, PascalCase and acronym runs all normalize to snake_case.
    #[test]
    fn test_to_snake_case() {
        assert_eq!(to_snake_case("firstName"), "first_name");
        assert_eq!(to_snake_case("FirstName"), "first_name");
        assert_eq!(to_snake_case("userID"), "user_id");
        assert_eq!(to_snake_case("HTTPResponse"), "http_response");
        assert_eq!(to_snake_case("XMLHttpRequest"), "xml_http_request");
        assert_eq!(to_snake_case("already_snake"), "already_snake");
        assert_eq!(to_snake_case("IOError"), "io_error");
    }

    // Underscores are removed and the following character is capitalized.
    #[test]
    fn test_to_camel_case() {
        assert_eq!(to_camel_case("first_name"), "firstName");
        assert_eq!(to_camel_case("user_id"), "userId");
        assert_eq!(to_camel_case("http_response"), "httpResponse");
        assert_eq!(to_camel_case("already_camel"), "alreadyCamel");
    }

    // snake_case -> camelCase -> snake_case returns the starting string.
    #[test]
    fn test_case_conversion_roundtrip() {
        let snake = "user_first_name";
        let camel = to_camel_case(snake);
        assert_eq!(camel, "userFirstName");
        assert_eq!(to_snake_case(&camel), snake);
    }

    #[test]
    fn test_jaro_winkler_identical() {
        assert!((jaro_winkler("hello", "hello") - 1.0).abs() < 0.001);
        assert!((jaro_winkler("test", "test") - 1.0).abs() < 0.001);
    }

    // Two empty strings are identical (1.0); one empty string matches nothing.
    #[test]
    fn test_jaro_winkler_empty() {
        assert_eq!(jaro_winkler("", ""), 1.0);
        assert_eq!(jaro_winkler("hello", ""), 0.0);
        assert_eq!(jaro_winkler("", "world"), 0.0);
    }

    #[test]
    fn test_jaro_winkler_similar() {
        // Common prefix boost
        assert!(jaro_winkler("hello", "hallo") > 0.8);
        assert!(jaro_winkler("martha", "marhta") > 0.9);

        // Test cases from literature
        assert!(jaro_winkler("dixon", "dicksonx") > 0.8);
        assert!(jaro_winkler("william", "williams") > 0.9);
    }

    #[test]
    fn test_jaro_winkler_different() {
        assert!(jaro_winkler("hello", "world") < 0.6);
        assert!(jaro_winkler("abc", "xyz") < 0.3);
    }

    #[test]
    fn test_jaro_winkler_field_matching() {
        // Realistic field name matching scenarios
        assert!(jaro_winkler("userName", "username") > 0.9);
        assert!(jaro_winkler("firstName", "first_name") > 0.7);
        assert!(jaro_winkler("userId", "user_id") > 0.8);

        // Typos
        assert!(jaro_winkler("usrName", "userName") > 0.8);
        assert!(jaro_winkler("emailAdress", "emailAddress") > 0.95);
    }

    #[test]
    fn test_jaro_similarity() {
        // 6 matches, one swapped pair: (6/6 + 6/6 + 5/6) / 3 ≈ 0.944.
        assert!((jaro_similarity("martha", "marhta") - 0.944).abs() < 0.01);
        // 4 matches, nothing out of order: (4/5 + 4/8 + 4/4) / 3 ≈ 0.767.
        // Pin the value; a bare `< 0.8` bound would also accept a score of 0.
        assert!((jaro_similarity("dixon", "dicksonx") - 0.767).abs() < 0.01);
    }
}