partial_date/
levenshtein.rs

1//! Levenshtein distance and ratio algorithms.
2//!
3//! These are implemented from scratch — the library has zero external
4//! dependencies and this module must not introduce any.
5//!
6//! # Algorithms
7//!
8//! - [`levenshtein_distance`] — the classic edit distance (insertions,
9//!   deletions, substitutions each cost 1).
10//! - [`levenshtein_ratio`] — a normalised similarity score in `[0.0, 1.0]`
11//!   derived from the distance.  Two identical strings score `1.0`; two
12//!   completely different strings of maximum length score `0.0`.
13
14// ---------------------------------------------------------------------------
15// Distance
16// ---------------------------------------------------------------------------
17
18/// Compute the Levenshtein edit distance between two strings.
19///
20/// The distance counts the minimum number of single-character edits
21/// (insertions, deletions, or substitutions) required to transform `a` into
22/// `b`.
23///
24/// The implementation uses two rolling rows of a standard dynamic-programming
25/// matrix so memory is O(min(|a|, |b|)) rather than O(|a| · |b|).
26///
27/// Both strings are treated as sequences of bytes.  All inputs in this library
28/// are ASCII (the caller lowercases before calling), so byte-level comparisons
29/// are correct and cheaper than char-level ones.
30///
31/// # Examples
32///
33/// ```
34/// use partial_date::levenshtein::levenshtein_distance;
35///
36/// assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
37/// assert_eq!(levenshtein_distance("", "abc"), 3);
38/// assert_eq!(levenshtein_distance("abc", "abc"), 0);
39/// assert_eq!(levenshtein_distance("march", "mrach"), 2);
40/// ```
41pub fn levenshtein_distance(a: &str, b: &str) -> usize {
42    // Make `a` the shorter string so the inner loop (over `a`) uses less
43    // memory and cache pressure.
44    let (a, b) = if a.len() <= b.len() {
45        (a.as_bytes(), b.as_bytes())
46    } else {
47        (b.as_bytes(), a.as_bytes())
48    };
49
50    let len_a = a.len();
51    let len_b = b.len();
52
53    // prev[j] = distance(a[0..i-1], b[0..j])
54    // curr[j] = distance(a[0..i],   b[0..j])
55    let mut prev: Vec<usize> = (0..=len_b).collect();
56    let mut curr: Vec<usize> = vec![0; len_b + 1];
57
58    for i in 1..=len_a {
59        curr[0] = i;
60        for j in 1..=len_b {
61            let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
62            curr[j] = (prev[j] + 1) // deletion
63                .min(curr[j - 1] + 1) // insertion
64                .min(prev[j - 1] + cost); // substitution
65        }
66        prev.clone_from(&curr);
67    }
68
69    prev[len_b]
70}
71
72// ---------------------------------------------------------------------------
73// Ratio
74// ---------------------------------------------------------------------------
75
76/// Compute the normalised Levenshtein similarity ratio between two strings.
77///
78/// The ratio is defined as:
79///
80/// ```text
81/// ratio = 1.0 - distance(a, b) / max(len(a), len(b))
82/// ```
83///
84/// and lies in the range `[0.0, 1.0]`:
85///
86/// - `1.0` — the strings are identical (or both empty).
87/// - `0.0` — the strings are completely different (distance equals the length
88///   of the longer string).
89///
90/// # Examples
91///
92/// ```
93/// use partial_date::levenshtein::levenshtein_ratio;
94///
95/// assert!((levenshtein_ratio("march",  "march")  - 1.0).abs() < f32::EPSILON);
96/// assert!((levenshtein_ratio("mrach",  "march")  - 0.6).abs() < f32::EPSILON);
97/// assert!((levenshtein_ratio("january","jauary") - (6.0/7.0)).abs() < f32::EPSILON);
98/// ```
99pub fn levenshtein_ratio(a: &str, b: &str) -> f32 {
100    let max_len = a.len().max(b.len());
101    if max_len == 0 {
102        return 1.0; // Both strings are empty — they are identical.
103    }
104    let dist = levenshtein_distance(a, b);
105    1.0 - (dist as f32 / max_len as f32)
106}
partial_date/levenshtein.rs

partial_date/
levenshtein.rs