partial_date/levenshtein.rs
1//! Levenshtein distance and ratio algorithms.
2//!
3//! These are implemented from scratch — the library has zero external
4//! dependencies and this module must not introduce any.
5//!
6//! # Algorithms
7//!
8//! - [`levenshtein_distance`] — the classic edit distance (insertions,
9//! deletions, substitutions each cost 1).
10//! - [`levenshtein_ratio`] — a normalised similarity score in `[0.0, 1.0]`
11//! derived from the distance. Two identical strings score `1.0`; two
12//! completely different strings of maximum length score `0.0`.
13
14// ---------------------------------------------------------------------------
15// Distance
16// ---------------------------------------------------------------------------
17
18/// Compute the Levenshtein edit distance between two strings.
19///
20/// The distance counts the minimum number of single-character edits
21/// (insertions, deletions, or substitutions) required to transform `a` into
22/// `b`.
23///
24/// The implementation uses two rolling rows of a standard dynamic-programming
25/// matrix so memory is O(min(|a|, |b|)) rather than O(|a| · |b|).
26///
27/// Both strings are treated as sequences of bytes. All inputs in this library
28/// are ASCII (the caller lowercases before calling), so byte-level comparisons
29/// are correct and cheaper than char-level ones.
30///
31/// # Examples
32///
33/// ```
34/// use partial_date::levenshtein::levenshtein_distance;
35///
36/// assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
37/// assert_eq!(levenshtein_distance("", "abc"), 3);
38/// assert_eq!(levenshtein_distance("abc", "abc"), 0);
39/// assert_eq!(levenshtein_distance("march", "mrach"), 2);
40/// ```
41pub fn levenshtein_distance(a: &str, b: &str) -> usize {
42 // Make `a` the shorter string so the inner loop (over `a`) uses less
43 // memory and cache pressure.
44 let (a, b) = if a.len() <= b.len() {
45 (a.as_bytes(), b.as_bytes())
46 } else {
47 (b.as_bytes(), a.as_bytes())
48 };
49
50 let len_a = a.len();
51 let len_b = b.len();
52
53 // prev[j] = distance(a[0..i-1], b[0..j])
54 // curr[j] = distance(a[0..i], b[0..j])
55 let mut prev: Vec<usize> = (0..=len_b).collect();
56 let mut curr: Vec<usize> = vec![0; len_b + 1];
57
58 for i in 1..=len_a {
59 curr[0] = i;
60 for j in 1..=len_b {
61 let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
62 curr[j] = (prev[j] + 1) // deletion
63 .min(curr[j - 1] + 1) // insertion
64 .min(prev[j - 1] + cost); // substitution
65 }
66 prev.clone_from(&curr);
67 }
68
69 prev[len_b]
70}
71
72// ---------------------------------------------------------------------------
73// Ratio
74// ---------------------------------------------------------------------------
75
76/// Compute the normalised Levenshtein similarity ratio between two strings.
77///
78/// The ratio is defined as:
79///
80/// ```text
81/// ratio = 1.0 - distance(a, b) / max(len(a), len(b))
82/// ```
83///
84/// and lies in the range `[0.0, 1.0]`:
85///
86/// - `1.0` — the strings are identical (or both empty).
87/// - `0.0` — the strings are completely different (distance equals the length
88/// of the longer string).
89///
90/// # Examples
91///
92/// ```
93/// use partial_date::levenshtein::levenshtein_ratio;
94///
95/// assert!((levenshtein_ratio("march", "march") - 1.0).abs() < f32::EPSILON);
96/// assert!((levenshtein_ratio("mrach", "march") - 0.6).abs() < f32::EPSILON);
97/// assert!((levenshtein_ratio("january","jauary") - (6.0/7.0)).abs() < f32::EPSILON);
98/// ```
99pub fn levenshtein_ratio(a: &str, b: &str) -> f32 {
100 let max_len = a.len().max(b.len());
101 if max_len == 0 {
102 return 1.0; // Both strings are empty — they are identical.
103 }
104 let dist = levenshtein_distance(a, b);
105 1.0 - (dist as f32 / max_len as f32)
106}