Skip to main content

tl_data/
validate.rs

// ThinkingLanguage — Data Validation & Fuzzy Matching
// Licensed under MIT OR Apache-2.0
//
// String validation builtins and fuzzy matching functions.
5
6use regex::Regex;
7use std::sync::LazyLock;
8
9static EMAIL_RE: LazyLock<Regex> =
10    LazyLock::new(|| Regex::new(r"^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$").unwrap());
11
12static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^https?://[^\s]+$").unwrap());
13
14static PHONE_RE: LazyLock<Regex> =
15    LazyLock::new(|| Regex::new(r"^[\+]?[(]?[0-9]{1,4}[)]?[-\s\./0-9]*$").unwrap());
16
17/// Check if a string is a valid email address.
18pub fn is_email(s: &str) -> bool {
19    EMAIL_RE.is_match(s)
20}
21
22/// Check if a string is a valid HTTP/HTTPS URL.
23pub fn is_url(s: &str) -> bool {
24    URL_RE.is_match(s)
25}
26
27/// Check if a string looks like a phone number.
28pub fn is_phone(s: &str) -> bool {
29    let trimmed = s.trim();
30    if trimmed.is_empty() {
31        return false;
32    }
33    // Must have at least 7 digits
34    let digit_count = trimmed.chars().filter(|c| c.is_ascii_digit()).count();
35    digit_count >= 7 && PHONE_RE.is_match(trimmed)
36}
37
/// Report whether `val` lies in the closed interval `[low, high]`.
///
/// Both bounds are inclusive. The result is `false` when `low > high`, and
/// also when any argument is NaN, because every comparison with NaN is false.
pub fn is_between(val: f64, low: f64, high: f64) -> bool {
    (low..=high).contains(&val)
}
42
/// Compute the Levenshtein edit distance between two strings.
///
/// The distance is the minimum number of single-character insertions,
/// deletions and substitutions needed to turn `a` into `b`. Characters are
/// compared as Unicode scalar values (`char`), not bytes.
///
/// Only two rows of the dynamic-programming table are kept, and they are
/// sized by the shorter input, so extra memory is proportional to
/// `min(len(a), len(b))`.
pub fn levenshtein(a: &str, b: &str) -> usize {
    // Identical inputs need no table at all.
    if a == b {
        return 0;
    }

    let mut long: Vec<char> = a.chars().collect();
    let mut short: Vec<char> = b.chars().collect();
    // The distance is symmetric, so put the shorter string on the row axis.
    if short.len() > long.len() {
        std::mem::swap(&mut long, &mut short);
    }
    if short.is_empty() {
        return long.len();
    }

    let width = short.len();
    // `prev[j]` is the distance between the processed prefix of `long` and
    // the first `j` characters of `short`.
    let mut prev: Vec<usize> = (0..=width).collect();
    let mut curr = vec![0usize; width + 1];

    for (row, &lc) in long.iter().enumerate() {
        curr[0] = row + 1;
        for (col, &sc) in short.iter().enumerate() {
            let substitute = prev[col] + usize::from(lc != sc);
            let delete = prev[col + 1] + 1;
            let insert = curr[col] + 1;
            curr[col + 1] = substitute.min(delete).min(insert);
        }
        std::mem::swap(&mut prev, &mut curr);
    }

    // After the final swap the last completed row lives in `prev`.
    prev[width]
}
74
/// Compute the American Soundex code for a string.
///
/// The input is trimmed first. The result is always four ASCII characters:
/// the uppercased first letter followed by three digits, padded with `0`.
/// If the trimmed input is empty or does not start with an ASCII letter, the
/// sentinel `"0000"` is returned.
///
/// Adjacent letters with the same code collapse into one digit. `H` and `W`
/// do not break such a run. Every other uncoded character does: vowels, `Y`,
/// and also non-letters such as digits or punctuation.
pub fn soundex(s: &str) -> String {
    /// Map a letter to its Soundex digit; `None` for uncoded characters.
    fn digit(c: char) -> Option<char> {
        match c.to_ascii_uppercase() {
            'B' | 'F' | 'P' | 'V' => Some('1'),
            'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => Some('2'),
            'D' | 'T' => Some('3'),
            'L' => Some('4'),
            'M' | 'N' => Some('5'),
            'R' => Some('6'),
            _ => None,
        }
    }

    const CODE_LEN: usize = 4;

    let mut rest = s.trim().chars();
    let Some(first) = rest.next().map(|c| c.to_ascii_uppercase()) else {
        return "0000".to_string();
    };
    if !first.is_ascii_alphabetic() {
        return "0000".to_string();
    }

    let mut out = String::with_capacity(CODE_LEN);
    out.push(first);

    // The first letter's own code takes part in duplicate suppression, so
    // "Pfister" yields P236 rather than P123.
    let mut previous = digit(first);
    for c in rest {
        if out.len() >= CODE_LEN {
            break;
        }
        match digit(c) {
            Some(d) => {
                if previous != Some(d) {
                    out.push(d);
                }
                previous = Some(d);
            }
            // H and W are transparent: they keep the previous code alive.
            None if matches!(c.to_ascii_uppercase(), 'H' | 'W') => {}
            // Anything else separates identical codes.
            None => previous = None,
        }
    }

    // `out` holds only ASCII, so byte length equals character count.
    while out.len() < CODE_LEN {
        out.push('0');
    }
    out
}
128
/// Unit tests for the validators and fuzzy-matching helpers in this module.
#[cfg(test)]
mod tests {
    use super::*;

    // --- is_email ---------------------------------------------------------

    #[test]
    fn test_is_email_valid() {
        assert!(is_email("user@example.com"));
        assert!(is_email("test.name+tag@domain.co.uk"));
        assert!(is_email("a@b.cc"));
    }

    #[test]
    fn test_is_email_invalid() {
        assert!(!is_email("not-an-email"));
        assert!(!is_email("@missing.com"));
        assert!(!is_email("user@.com"));
    }

    #[test]
    fn test_is_email_edge() {
        assert!(!is_email(""));
        assert!(!is_email(" "));
        assert!(is_email("user123@test-domain.org"));
    }

    // --- is_url -----------------------------------------------------------

    #[test]
    fn test_is_url_valid() {
        assert!(is_url("http://example.com"));
        assert!(is_url("https://www.example.com/path?q=1"));
    }

    #[test]
    fn test_is_url_invalid() {
        assert!(!is_url("ftp://files.example.com"));
        assert!(!is_url("not a url"));
        assert!(!is_url(""));
    }

    // --- is_phone ---------------------------------------------------------

    #[test]
    fn test_is_phone_valid() {
        assert!(is_phone("+1-555-555-5555"));
        assert!(is_phone("(555) 555-5555"));
    }

    #[test]
    fn test_is_phone_invalid() {
        assert!(!is_phone("abc"));
        assert!(!is_phone("123")); // too few digits
        assert!(!is_phone(""));
    }

    // --- is_between -------------------------------------------------------

    #[test]
    fn test_is_between() {
        assert!(is_between(5.0, 1.0, 10.0));
        assert!(is_between(1.0, 1.0, 10.0)); // inclusive low
        assert!(is_between(10.0, 1.0, 10.0)); // inclusive high
        assert!(!is_between(0.0, 1.0, 10.0));
        assert!(!is_between(11.0, 1.0, 10.0));
    }

    // --- levenshtein ------------------------------------------------------

    #[test]
    fn test_levenshtein() {
        assert_eq!(levenshtein("kitten", "sitting"), 3);
        assert_eq!(levenshtein("", "abc"), 3);
        assert_eq!(levenshtein("abc", ""), 3);
        assert_eq!(levenshtein("abc", "abc"), 0);
        assert_eq!(levenshtein("book", "back"), 2);
    }

    // --- soundex ----------------------------------------------------------

    #[test]
    fn test_soundex() {
        assert_eq!(soundex("Robert"), "R163");
        assert_eq!(soundex("Rupert"), "R163");
        assert_eq!(soundex("Ashcraft"), "A261");
        assert_eq!(soundex("Tymczak"), "T522");
        assert_eq!(soundex(""), "0000");
    }
}