Skip to main content

handy/
pattern.rs

1use jaro_winkler::jaro_winkler;
2use levenshtein::levenshtein;
3use regex::Regex;
4use std::path::Path;
5
/// The margin of error for string similarity scores.
///
/// Two scores whose absolute difference is strictly smaller than this value are
/// treated as equal by [`is_close_to_upper_bound`] and by the
/// `assert_match_string!` and `assert_string_similarity!` macros.
pub const ERROR_MARGIN: f64 = 0.001;
8
/// Converts a glob pattern to a regex pattern.
///
/// The translation works character by character:
///
/// * `*` becomes `.*` (any sequence of characters).
/// * `?` becomes `.` (any single character).
/// * Each of the regex metacharacters `. + ( ) | ^ $ [ ] { } \` is prefixed
///   with a backslash so that it matches literally.
/// * Every other character is copied through unchanged.
///
/// A backslash in the glob is a literal backslash: it is emitted as `\\` and
/// does not escape the glob character that follows it. The returned pattern
/// carries no `^`/`$` anchors; callers that need a whole-string match must add
/// them.
///
/// ## Examples
///
/// ```rust,no_run
/// use handy::pattern::glob_to_regex_pattern;
///
/// assert_eq!(glob_to_regex_pattern("fish*.txt"), "fish.*\\.txt");
/// assert_eq!(glob_to_regex_pattern("fish\\(txt"), "fish\\\\\\(txt");
/// ```
#[must_use]
pub fn glob_to_regex_pattern(pattern: &str) -> String {
    // The output is at least as long as the input, so reserve that much up front.
    let mut regex_pattern = String::with_capacity(pattern.len());

    for c in pattern.chars() {
        match c {
            '*' => regex_pattern.push_str(".*"), // Match any sequence of characters
            '?' => regex_pattern.push('.'),      // Match any single character
            '.' | '+' | '(' | ')' | '|' | '^' | '$' | '[' | ']' | '{' | '}' | '\\' => {
                regex_pattern.push('\\'); // Escape regex special characters
                regex_pattern.push(c);
            }
            _ => regex_pattern.push(c), // Literal character
        }
    }

    regex_pattern
}
40
41/// Checks if a string similarity score is close to the upper bound (1.0), which (according to the [`ERROR_MARGIN`]) indicates a perfect match.
42///
43/// ## Arguments
44///
45/// * `score` - The similarity score to check, can be from [`match_string`].
46///
47/// ## Returns
48///
49/// * `bool` - True if the score is close to 1.0, false otherwise.
50#[must_use]
51pub fn is_close_to_upper_bound(score: f64) -> bool {
52    (score - 1.0).abs() < ERROR_MARGIN
53}
54
55/// Checks if a path's filename matches a glob pattern.
56///
57/// ## Examples
58///
59/// ```rust,no_run
60/// use std::path::Path;
61/// use handy::pattern::match_filename_with_glob_pattern;
62///
63/// assert!(match_filename_with_glob_pattern(Path::new("fish.txt"), "f*.txt"));
64/// ```
65///
66/// ## Panics
67///
68/// This function panics if the internal glob pattern `.*` is invalid.
69#[must_use]
70pub fn match_filename_with_glob_pattern(path: &Path, pattern: &str) -> bool {
71    let regex_pattern = glob_to_regex_pattern(pattern);
72    let re = Regex::new(&regex_pattern).unwrap_or(Regex::new(".*").unwrap());
73
74    if let Some(name) = path.file_name().map(|s| s.to_string_lossy().to_string()) {
75        if re.is_match(&name) {
76            return true;
77        }
78    }
79
80    false
81}
82
83/// Returns a similarity score between two strings using a fuzzy matching algorithm.
84///
85/// ## Examples
86///
87/// ```rust,no_run
88/// use handy::pattern::match_string;
89///
90/// let s1 = "Salvage Yard";
91/// let s2 = "yard";
92///
93/// let score = match_string(s1, s2);
94/// println!("Score: {}", score);
95/// ```
96///
97/// ## Arguments
98///
99/// * `s1` - The first string.
100/// * `s2` - The second string.
101///
102/// ## Returns
103///
104/// The similarity score between the two strings.
105#[allow(clippy::cast_precision_loss)]
106#[must_use]
107pub fn match_string(s1: &str, s2: &str) -> f64 {
108    let s1 = s1.to_lowercase();
109    let s2 = s2.to_lowercase();
110
111    if s1.is_empty() || s2.is_empty() {
112        return if s1.is_empty() == s2.is_empty() {
113            1.0
114        } else {
115            0.0
116        };
117    }
118
119    if s1.contains(&s2) || s2.contains(&s1) {
120        return 1.0;
121    }
122
123    let len1 = s1.chars().count();
124    let len2 = s2.chars().count();
125    let shorter_len = len1.min(len2);
126
127    if shorter_len == 0 {
128        return 0.0;
129    }
130
131    let distance = levenshtein(&s1, &s2) as f64;
132    let score = 1.0 - (distance / shorter_len as f64);
133
134    score.clamp(0.0, 1.0)
135}
136
137/// Returns a similarity score between two strings using a fuzzy matching algorithm that relies on Jaro-Winkler instead Levenshtein. Use this over [`match_string`].
138///
139/// ## Examples
140///
141/// ```rust,no_run
142/// use handy::pattern::string_similarity;
143///
144/// let s1 = "Salvage Yard";
145/// let s2 = "yad";
146///
147/// let score = string_similarity(s1, s2);
148/// println!("Score: {}", score);
149/// ```
150///
151/// ## Arguments
152///
153/// * `s1` - The first string.
154/// * `s2` - The second string.
155///
156/// ## Returns
157///
158/// The similarity score between the two strings, the score is a [f64] between 0.0 and 1.0.
159#[must_use]
160pub fn string_similarity<S1, S2>(s1: S1, s2: S2) -> f64
161where
162    S1: AsRef<str>,
163    S2: AsRef<str>,
164{
165    string_similarity_impl(s1.as_ref(), s2.as_ref())
166}
167
168/// Returns a similarity score between two strings using a fuzzy matching algorithm that relies on Jaro-Winkler instead of Levenshtein.
169fn string_similarity_impl(s1: &str, s2: &str) -> f64 {
170    let s1 = s1.trim().to_lowercase();
171    let s2 = s2.trim().to_lowercase();
172
173    if s1.is_empty() || s2.is_empty() {
174        return 0.0;
175    }
176
177    if s1.contains(&s2) || s2.contains(&s1) {
178        return 1.0;
179    }
180
181    jaro_winkler(&s1, &s2)
182}
183
/// Asserts that `pattern::match_string` scores two strings close to the expected value.
///
/// The assertion passes when the absolute difference between the computed
/// score and `$expected` is strictly smaller than `pattern::ERROR_MARGIN`.
///
/// ## Arguments
///
/// * `$s1` - The first string.
/// * `$s2` - The second string.
/// * `$expected` - The expected similarity score.
///
/// ## Panics
///
/// Panics with the actual and expected scores if they differ by
/// `pattern::ERROR_MARGIN` or more.
#[macro_export]
macro_rules! assert_match_string {
    // The double braces make the expansion a single block, so the macro can be
    // used wherever an expression or statement is accepted.
    ($s1:expr, $s2:expr, $expected:expr) => {{
        let actual = $crate::pattern::match_string($s1, $s2);
        // Bind once so the expected expression is not expanded a second time
        // in the failure message.
        let expected = $expected;
        assert!(
            (actual - expected).abs() < $crate::pattern::ERROR_MARGIN,
            "Left: {}\nRight: {}",
            actual,
            expected
        );
    }};
}
197
/// Asserts that `pattern::string_similarity` scores two strings close to the expected value.
///
/// The assertion passes when the absolute difference between the computed
/// score and `$expected` is strictly smaller than `pattern::ERROR_MARGIN`.
///
/// ## Arguments
///
/// * `$s1` - The first string.
/// * `$s2` - The second string.
/// * `$expected` - The expected similarity score.
///
/// ## Panics
///
/// Panics with the actual and expected scores if they differ by
/// `pattern::ERROR_MARGIN` or more.
#[macro_export]
macro_rules! assert_string_similarity {
    // The double braces make the expansion a single block, so the macro can be
    // used wherever an expression or statement is accepted.
    ($s1:expr, $s2:expr, $expected:expr) => {{
        let actual = $crate::pattern::string_similarity($s1, $s2);
        // Bind once so the expected expression is not expanded a second time
        // in the failure message.
        let expected = $expected;
        assert!(
            (actual - expected).abs() < $crate::pattern::ERROR_MARGIN,
            "Left: {}\nRight: {}",
            actual,
            expected
        );
    }};
}
211
/// Unit tests for the glob translation and string similarity helpers.
#[cfg(test)]
mod tests {
    use super::{glob_to_regex_pattern, match_filename_with_glob_pattern};
    use crate::{assert_match_string, pattern::is_close_to_upper_bound};
    use std::path::Path;

    /// Wildcards are translated and regex metacharacters (including the
    /// backslash) are escaped.
    #[test]
    fn test_glob_to_regex() {
        assert_eq!(glob_to_regex_pattern("fish*.txt"), "fish.*\\.txt");
        assert_eq!(glob_to_regex_pattern("fish?txt"), "fish.txt");
        assert_eq!(glob_to_regex_pattern("fish+txt"), "fish\\+txt");
        assert_eq!(glob_to_regex_pattern("fish\\txt"), "fish\\\\txt");
        assert_eq!(glob_to_regex_pattern("fish\\(txt"), "fish\\\\\\(txt");
    }

    /// Scores strictly within the error margin of 1.0 count as perfect.
    #[test]
    fn test_is_close_to_upper_bound() {
        assert!(is_close_to_upper_bound(1.0));
        assert!(is_close_to_upper_bound(0.9999));
    }

    /// A score that is not strictly within the margin is rejected. Asserted
    /// directly rather than through `#[should_panic]`, so the test does not
    /// depend on the text of a panic message.
    #[test]
    fn test_is_close_to_upper_bound_false() {
        assert!(!is_close_to_upper_bound(0.999));
    }

    #[test]
    fn test_match_filename_with_glob_pattern() {
        assert!(match_filename_with_glob_pattern(
            Path::new("fish.txt"),
            "f*.txt"
        ));
        assert!(!match_filename_with_glob_pattern(
            Path::new("fish.txt"),
            "f*.jpg"
        ));
    }

    #[test]
    fn test_match_string() {
        assert_match_string!("kitten", "kissing", 0.333);
        assert_match_string!("Salvage Yard", "yard", 1.0);
        assert_match_string!("raiju", "yard", 0.0);
    }

    #[test]
    fn test_string_similarity() {
        assert_string_similarity!("kitten", "kissing", 0.714);
        assert_string_similarity!("Salvage Yard", "yard", 1.0);
        assert_string_similarity!("Salvage Yard", "yad", 0.472);
        assert_string_similarity!("raiju", "yard", 0.483);
    }
}