Skip to main content

jpx_core/extensions/
fuzzy.rs

1//! Fuzzy string matching functions.
2
3use std::collections::HashSet;
4
5use serde_json::Value;
6
7use crate::functions::{Function, number_value};
8use crate::interpreter::SearchResult;
9use crate::registry::register_if_enabled;
10use crate::{Context, Runtime, arg, defn};
11
12// levenshtein(s1, s2) -> number
13defn!(LevenshteinFn, vec![arg!(string), arg!(string)], None);
14
15impl Function for LevenshteinFn {
16    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
17        self.signature.validate(args, ctx)?;
18        let s1 = args[0].as_str().unwrap();
19        let s2 = args[1].as_str().unwrap();
20        let dist = strsim::levenshtein(s1, s2);
21        Ok(number_value(dist as f64))
22    }
23}
24
25// normalized_levenshtein(s1, s2) -> number (0.0-1.0)
26defn!(
27    NormalizedLevenshteinFn,
28    vec![arg!(string), arg!(string)],
29    None
30);
31
32impl Function for NormalizedLevenshteinFn {
33    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
34        self.signature.validate(args, ctx)?;
35        let s1 = args[0].as_str().unwrap();
36        let s2 = args[1].as_str().unwrap();
37        let sim = strsim::normalized_levenshtein(s1, s2);
38        Ok(number_value(sim))
39    }
40}
41
42// damerau_levenshtein(s1, s2) -> number
43defn!(DamerauLevenshteinFn, vec![arg!(string), arg!(string)], None);
44
45impl Function for DamerauLevenshteinFn {
46    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
47        self.signature.validate(args, ctx)?;
48        let s1 = args[0].as_str().unwrap();
49        let s2 = args[1].as_str().unwrap();
50        let dist = strsim::damerau_levenshtein(s1, s2);
51        Ok(number_value(dist as f64))
52    }
53}
54
55// normalized_damerau_levenshtein(s1, s2) -> number (0.0-1.0)
56defn!(
57    NormalizedDamerauLevenshteinFn,
58    vec![arg!(string), arg!(string)],
59    None
60);
61
62impl Function for NormalizedDamerauLevenshteinFn {
63    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
64        self.signature.validate(args, ctx)?;
65        let s1 = args[0].as_str().unwrap();
66        let s2 = args[1].as_str().unwrap();
67        let sim = strsim::normalized_damerau_levenshtein(s1, s2);
68        Ok(number_value(sim))
69    }
70}
71
72// jaro(s1, s2) -> number (0.0-1.0)
73defn!(JaroFn, vec![arg!(string), arg!(string)], None);
74
75impl Function for JaroFn {
76    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
77        self.signature.validate(args, ctx)?;
78        let s1 = args[0].as_str().unwrap();
79        let s2 = args[1].as_str().unwrap();
80        let sim = strsim::jaro(s1, s2);
81        Ok(number_value(sim))
82    }
83}
84
85// jaro_winkler(s1, s2) -> number (0.0-1.0)
86defn!(JaroWinklerFn, vec![arg!(string), arg!(string)], None);
87
88impl Function for JaroWinklerFn {
89    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
90        self.signature.validate(args, ctx)?;
91        let s1 = args[0].as_str().unwrap();
92        let s2 = args[1].as_str().unwrap();
93        let sim = strsim::jaro_winkler(s1, s2);
94        Ok(number_value(sim))
95    }
96}
97
98// sorensen_dice(s1, s2) -> number (0.0-1.0)
99defn!(SorensenDiceFn, vec![arg!(string), arg!(string)], None);
100
101impl Function for SorensenDiceFn {
102    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
103        self.signature.validate(args, ctx)?;
104        let s1 = args[0].as_str().unwrap();
105        let s2 = args[1].as_str().unwrap();
106        let sim = strsim::sorensen_dice(s1, s2);
107        Ok(number_value(sim))
108    }
109}
110
111// hamming(s1, s2) -> number (returns null if strings have different lengths)
112defn!(HammingFn, vec![arg!(string), arg!(string)], None);
113
114impl Function for HammingFn {
115    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
116        self.signature.validate(args, ctx)?;
117        let s1 = args[0].as_str().unwrap();
118        let s2 = args[1].as_str().unwrap();
119        match strsim::hamming(s1, s2) {
120            Ok(dist) => Ok(number_value(dist as f64)),
121            Err(_) => Ok(Value::Null), // Different lengths
122        }
123    }
124}
125
126// osa_distance(s1, s2) -> number (Optimal String Alignment distance)
127defn!(OsaDistanceFn, vec![arg!(string), arg!(string)], None);
128
129impl Function for OsaDistanceFn {
130    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
131        self.signature.validate(args, ctx)?;
132        let s1 = args[0].as_str().unwrap();
133        let s2 = args[1].as_str().unwrap();
134        let dist = strsim::osa_distance(s1, s2);
135        Ok(number_value(dist as f64))
136    }
137}
138
139/// Register fuzzy matching functions filtered by the enabled set.
140pub fn register_filtered(runtime: &mut Runtime, enabled: &HashSet<&str>) {
141    register_if_enabled(
142        runtime,
143        "levenshtein",
144        enabled,
145        Box::new(LevenshteinFn::new()),
146    );
147    register_if_enabled(
148        runtime,
149        "normalized_levenshtein",
150        enabled,
151        Box::new(NormalizedLevenshteinFn::new()),
152    );
153    register_if_enabled(
154        runtime,
155        "damerau_levenshtein",
156        enabled,
157        Box::new(DamerauLevenshteinFn::new()),
158    );
159    register_if_enabled(
160        runtime,
161        "normalized_damerau_levenshtein",
162        enabled,
163        Box::new(NormalizedDamerauLevenshteinFn::new()),
164    );
165    register_if_enabled(runtime, "jaro", enabled, Box::new(JaroFn::new()));
166    register_if_enabled(
167        runtime,
168        "jaro_winkler",
169        enabled,
170        Box::new(JaroWinklerFn::new()),
171    );
172    register_if_enabled(
173        runtime,
174        "sorensen_dice",
175        enabled,
176        Box::new(SorensenDiceFn::new()),
177    );
178    register_if_enabled(runtime, "hamming", enabled, Box::new(HammingFn::new()));
179    register_if_enabled(
180        runtime,
181        "osa_distance",
182        enabled,
183        Box::new(OsaDistanceFn::new()),
184    );
185}
186
#[cfg(test)]
mod tests {
    use crate::Runtime;
    use serde_json::json;

    /// Builds a runtime with the standard functions and all extensions enabled.
    fn setup_runtime() -> Runtime {
        Runtime::builder()
            .with_standard()
            .with_all_extensions()
            .build()
    }

    /// Compiles `expression` on a fresh runtime, evaluates it against a `null`
    /// document and returns the result as an `f64`.
    ///
    /// Panics if compilation or evaluation fails, or if the result is not a number.
    fn eval_number(expression: &str) -> f64 {
        let runtime = setup_runtime();
        let compiled = runtime.compile(expression).unwrap();
        let outcome = compiled.search(&json!(null)).unwrap();
        outcome.as_f64().unwrap()
    }

    #[test]
    fn test_levenshtein() {
        assert_eq!(eval_number("levenshtein('kitten', 'sitting')"), 3.0);
    }

    #[test]
    fn test_levenshtein_identical() {
        assert_eq!(eval_number("levenshtein('hello', 'hello')"), 0.0);
    }

    #[test]
    fn test_normalized_levenshtein() {
        assert_eq!(eval_number("normalized_levenshtein('hello', 'hello')"), 1.0);
    }

    #[test]
    fn test_normalized_levenshtein_different() {
        let score = eval_number("normalized_levenshtein('hello', 'world')");
        assert!(score > 0.0 && score < 1.0);
    }

    #[test]
    fn test_damerau_levenshtein() {
        // Transposition: "ab" -> "ba" is 1 edit in Damerau-Levenshtein
        assert_eq!(eval_number("damerau_levenshtein('ab', 'ba')"), 1.0);
    }

    #[test]
    fn test_jaro() {
        let score = eval_number("jaro('hello', 'hallo')");
        assert!(score > 0.8);
    }

    #[test]
    fn test_jaro_identical() {
        assert_eq!(eval_number("jaro('test', 'test')"), 1.0);
    }

    #[test]
    fn test_jaro_winkler() {
        // Jaro-Winkler boosts common prefixes
        let score = eval_number("jaro_winkler('prefix_abc', 'prefix_xyz')");
        assert!(score > 0.7);
    }

    #[test]
    fn test_jaro_winkler_vs_jaro() {
        // Jaro-Winkler should be >= Jaro for strings with common prefix
        let winkler = eval_number("jaro_winkler('hello', 'hella')");
        let plain = eval_number("jaro('hello', 'hella')");
        assert!(winkler >= plain);
    }

    #[test]
    fn test_sorensen_dice() {
        let score = eval_number("sorensen_dice('night', 'nacht')");
        assert!(score > 0.0 && score < 1.0);
    }

    #[test]
    fn test_sorensen_dice_identical() {
        assert_eq!(eval_number("sorensen_dice('test', 'test')"), 1.0);
    }

    #[test]
    fn test_normalized_damerau_levenshtein() {
        assert_eq!(
            eval_number("normalized_damerau_levenshtein('hello', 'hello')"),
            1.0
        );
    }

    #[test]
    fn test_normalized_damerau_levenshtein_transposition() {
        // "ab" vs "ba" - transposition
        let score = eval_number("normalized_damerau_levenshtein('ab', 'ba')");
        assert!(score > 0.0 && score < 1.0);
    }

    #[test]
    fn test_hamming() {
        assert_eq!(eval_number("hamming('karolin', 'kathrin')"), 3.0);
    }

    #[test]
    fn test_hamming_identical() {
        assert_eq!(eval_number("hamming('hello', 'hello')"), 0.0);
    }

    #[test]
    fn test_hamming_different_lengths() {
        // Different lengths should return null, so the numeric helper does not apply.
        let runtime = setup_runtime();
        let compiled = runtime.compile("hamming('hello', 'hi')").unwrap();
        let outcome = compiled.search(&json!(null)).unwrap();
        assert!(outcome.is_null());
    }

    #[test]
    fn test_osa_distance() {
        // OSA allows transpositions
        assert_eq!(eval_number("osa_distance('ab', 'ba')"), 1.0);
    }

    #[test]
    fn test_osa_distance_identical() {
        assert_eq!(eval_number("osa_distance('hello', 'hello')"), 0.0);
    }
}