Skip to main content

text_processing_rs/taggers/
electronic.rs

1//! Electronic address tagger.
2//!
3//! Converts spoken emails and URLs to written form:
4//! - "a at gmail dot com" → "a@gmail.com"
5//! - "w w w dot example dot com" → "www.example.com"
6//! - "h t t p colon slash slash..." → "http://..."
7
8/// Parse spoken electronic address to written form.
9pub fn parse(input: &str) -> Option<String> {
10    let original = input.trim();
11    let input_lower = original.to_lowercase();
12
13    // Try email pattern
14    if let Some(result) = parse_email(original, &input_lower) {
15        return Some(result);
16    }
17
18    // Try URL pattern
19    if let Some(result) = parse_url(&input_lower) {
20        return Some(result);
21    }
22
23    // Try domain pattern
24    if let Some(result) = parse_domain(&input_lower) {
25        return Some(result);
26    }
27
28    None
29}
30
31/// Parse email address (contains " at ")
32fn parse_email(original: &str, input: &str) -> Option<String> {
33    if !input.contains(" at ") {
34        return None;
35    }
36
37    let parts: Vec<&str> = input.splitn(2, " at ").collect();
38    if parts.len() != 2 {
39        return None;
40    }
41
42    // Domain part must contain " dot " to be a valid email domain
43    // This prevents "set alarm at ten" from being parsed as email
44    if !parts[1].contains(" dot ") {
45        return None;
46    }
47
48    // Get the original local part to preserve casing
49    let orig_parts: Vec<&str> = original.splitn(2, " at ").collect();
50    let orig_local = if orig_parts.len() == 2 {
51        orig_parts[0]
52    } else {
53        // Try case-insensitive split
54        let at_pos = original.to_lowercase().find(" at ")?;
55        &original[..at_pos]
56    };
57
58    let local_part = parse_email_part_with_case(orig_local, parts[0]);
59    let domain_part = parse_domain_part(parts[1]);
60
61    Some(format!("{}@{}", local_part, domain_part))
62}
63
64/// Parse email local part preserving original casing
65fn parse_email_part_with_case(original: &str, _input: &str) -> String {
66    let mut result = String::new();
67    let words: Vec<&str> = original.split_whitespace().collect();
68
69    for (i, word) in words.iter().enumerate() {
70        let word_lower = word.to_lowercase();
71        // "dot" at the start should be literal "dot", not "."
72        // e.g., "dot three at gmail dot com" → "dot 3@gmail.com"
73        if word_lower == "dot" && i == 0 {
74            result.push_str(word);
75            result.push(' ');
76        } else if word_lower == "dot" {
77            result.push('.');
78        } else if word_lower == "underscore" {
79            result.push('_');
80        } else if word_lower == "dash" || word_lower == "hyphen" {
81            result.push('-');
82        } else if let Some(digit) = word_to_digit(&word_lower) {
83            // Number word - convert to digit
84            result.push(digit);
85        } else if word.len() == 1 {
86            // Single letter - preserve original case
87            result.push_str(word);
88        } else {
89            result.push_str(&word.to_lowercase());
90        }
91    }
92
93    result
94}
95
/// Map a spoken digit word to its digit character.
///
/// Recognises "zero" through "nine" plus the colloquial "o" and "oh" for `0`.
/// The comparison is exact, so callers must lowercase first. Any other word
/// yields `None`.
fn word_to_digit(word: &str) -> Option<char> {
    const DIGIT_WORDS: [(&str, char); 12] = [
        ("zero", '0'),
        ("o", '0'),
        ("oh", '0'),
        ("one", '1'),
        ("two", '2'),
        ("three", '3'),
        ("four", '4'),
        ("five", '5'),
        ("six", '6'),
        ("seven", '7'),
        ("eight", '8'),
        ("nine", '9'),
    ];

    DIGIT_WORDS
        .iter()
        .find(|(spoken, _)| *spoken == word)
        .map(|&(_, digit)| digit)
}
112
113/// Parse URL with protocol
114fn parse_url(input: &str) -> Option<String> {
115    // Check for protocol prefix
116    let protocols = [
117        ("h t t p s colon slash slash ", "https://"),
118        ("h t t p colon slash slash ", "http://"),
119        ("https colon slash slash ", "https://"),
120        ("http colon slash slash ", "http://"),
121    ];
122
123    for (spoken, written) in &protocols {
124        if input.starts_with(spoken) {
125            let rest = &input[spoken.len()..];
126            let domain = parse_domain_part(rest);
127            return Some(format!("{}{}", written, domain));
128        }
129    }
130
131    // Check for www prefix without protocol
132    if input.starts_with("w w w dot ") {
133        let rest = &input[10..];
134        let domain = parse_domain_part(rest);
135        return Some(format!("www.{}", domain));
136    }
137
138    None
139}
140
141/// Parse standalone domain
142fn parse_domain(input: &str) -> Option<String> {
143    // Must contain " dot " to be a domain
144    if !input.contains(" dot ") {
145        return None;
146    }
147
148    let result = parse_domain_part(input);
149
150    // Must have at least one dot
151    if result.contains('.') {
152        Some(result)
153    } else {
154        None
155    }
156}
157
158/// Parse email local part (before @)
159fn parse_email_part(input: &str) -> String {
160    let words: Vec<&str> = input.split_whitespace().collect();
161    let mut result = String::new();
162
163    for (i, word) in words.iter().enumerate() {
164        match *word {
165            // "dot" at the start should be literal "dot", not "."
166            // e.g., "dot three at gmail dot com" → "dot 3@gmail.com"
167            "dot" if i == 0 => {
168                result.push_str("dot ");
169            }
170            "dot" => result.push('.'),
171            "hyphen" | "dash" => result.push('-'),
172            "underscore" => result.push('_'),
173            _ => {
174                // Check for spelled out letters/numbers
175                if let Some(c) = word_to_char(word) {
176                    result.push(c);
177                } else {
178                    // Use word as-is (for things like "gmail", "abc")
179                    result.push_str(word);
180                }
181            }
182        }
183    }
184
185    result
186}
187
188/// Parse domain part (after @ or entire URL domain)
189fn parse_domain_part(input: &str) -> String {
190    let words: Vec<&str> = input.split_whitespace().collect();
191    let mut result = String::new();
192
193    for word in words {
194        match word {
195            "dot" => result.push('.'),
196            "slash" => result.push('/'),
197            "colon" => result.push(':'),
198            "hyphen" | "dash" => result.push('-'),
199            _ => {
200                // Check for spelled out letters/numbers
201                if let Some(c) = word_to_char(word) {
202                    result.push(c);
203                } else {
204                    // Use word as-is
205                    result.push_str(word);
206                }
207            }
208        }
209    }
210
211    result
212}
213
/// Convert a single spoken token to the character it stands for.
///
/// - A one-byte token that is an ASCII letter or digit is returned unchanged,
///   so a spelled-out letter such as "o" stays the letter `o`.
/// - "zero" and "oh" map to `0`; "one" through "nine" map to `1`–`9`.
///
/// Returns `None` for anything else. Matching is exact, so callers must
/// lowercase number words first.
fn word_to_char(word: &str) -> Option<char> {
    // A one-byte `str` is necessarily a single ASCII character.
    if let [byte] = word.as_bytes() {
        if byte.is_ascii_alphanumeric() {
            return Some(char::from(*byte));
        }
    }

    // Spelled-out numbers. A bare "o" is not listed: it is always caught by
    // the single-letter check above, so such an arm could never be reached.
    match word {
        "zero" | "oh" => Some('0'),
        "one" => Some('1'),
        "two" => Some('2'),
        "three" => Some('3'),
        "four" => Some('4'),
        "five" => Some('5'),
        "six" => Some('6'),
        "seven" => Some('7'),
        "eight" => Some('8'),
        "nine" => Some('9'),
        _ => None,
    }
}
239
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `parse` turns `spoken` into exactly `written`.
    fn assert_written(spoken: &str, written: &str) {
        assert_eq!(parse(spoken).as_deref(), Some(written));
    }

    #[test]
    fn test_simple_email() {
        assert_written("a at gmail dot com", "a@gmail.com");
    }

    #[test]
    fn test_email_with_dots() {
        assert_written("a dot b c at gmail dot com", "a.bc@gmail.com");
    }

    #[test]
    fn test_email_with_numbers() {
        assert_written("a one b two at a b c dot com", "a1b2@abc.com");
    }

    #[test]
    fn test_url_with_protocol() {
        assert_written(
            "h t t p colon slash slash w w w dot example dot com",
            "http://www.example.com",
        );
    }

    #[test]
    fn test_www_domain() {
        assert_written("w w w dot example dot com", "www.example.com");
    }

    #[test]
    fn test_simple_domain() {
        assert_written("nvidia dot com", "nvidia.com");
    }
}