text_processing_rs/taggers/
electronic.rs1pub fn parse(input: &str) -> Option<String> {
10 let original = input.trim();
11 let input_lower = original.to_lowercase();
12
13 if let Some(result) = parse_email(original, &input_lower) {
15 return Some(result);
16 }
17
18 if let Some(result) = parse_url(&input_lower) {
20 return Some(result);
21 }
22
23 if let Some(result) = parse_domain(&input_lower) {
25 return Some(result);
26 }
27
28 None
29}
30
31fn parse_email(original: &str, input: &str) -> Option<String> {
33 if !input.contains(" at ") {
34 return None;
35 }
36
37 let parts: Vec<&str> = input.splitn(2, " at ").collect();
38 if parts.len() != 2 {
39 return None;
40 }
41
42 if !parts[1].contains(" dot ") {
45 return None;
46 }
47
48 let orig_parts: Vec<&str> = original.splitn(2, " at ").collect();
50 let orig_local = if orig_parts.len() == 2 {
51 orig_parts[0]
52 } else {
53 let at_pos = original.to_lowercase().find(" at ")?;
55 &original[..at_pos]
56 };
57
58 let local_part = parse_email_part_with_case(orig_local, parts[0]);
59 let domain_part = parse_domain_part(parts[1]);
60
61 Some(format!("{}@{}", local_part, domain_part))
62}
63
64fn parse_email_part_with_case(original: &str, _input: &str) -> String {
66 let mut result = String::new();
67 let words: Vec<&str> = original.split_whitespace().collect();
68
69 for (i, word) in words.iter().enumerate() {
70 let word_lower = word.to_lowercase();
71 if word_lower == "dot" && i == 0 {
74 result.push_str(word);
75 result.push(' ');
76 } else if word_lower == "dot" {
77 result.push('.');
78 } else if word_lower == "underscore" {
79 result.push('_');
80 } else if word_lower == "dash" || word_lower == "hyphen" {
81 result.push('-');
82 } else if let Some(digit) = word_to_digit(&word_lower) {
83 result.push(digit);
85 } else if word.len() == 1 {
86 result.push_str(word);
88 } else {
89 result.push_str(&word.to_lowercase());
90 }
91 }
92
93 result
94}
95
/// Maps a spoken digit word to its digit character.
///
/// Recognises `zero` through `nine`, plus `o` and `oh` as spoken forms of
/// `0`. Matching is exact (case-sensitive); any other word yields `None`.
fn word_to_digit(word: &str) -> Option<char> {
    const DIGIT_WORDS: [(&str, char); 12] = [
        ("zero", '0'),
        ("o", '0'),
        ("oh", '0'),
        ("one", '1'),
        ("two", '2'),
        ("three", '3'),
        ("four", '4'),
        ("five", '5'),
        ("six", '6'),
        ("seven", '7'),
        ("eight", '8'),
        ("nine", '9'),
    ];

    DIGIT_WORDS
        .iter()
        .find(|(spoken, _)| *spoken == word)
        .map(|&(_, digit)| digit)
}
112
113fn parse_url(input: &str) -> Option<String> {
115 let protocols = [
117 ("h t t p s colon slash slash ", "https://"),
118 ("h t t p colon slash slash ", "http://"),
119 ("https colon slash slash ", "https://"),
120 ("http colon slash slash ", "http://"),
121 ];
122
123 for (spoken, written) in &protocols {
124 if input.starts_with(spoken) {
125 let rest = &input[spoken.len()..];
126 let domain = parse_domain_part(rest);
127 return Some(format!("{}{}", written, domain));
128 }
129 }
130
131 if input.starts_with("w w w dot ") {
133 let rest = &input[10..];
134 let domain = parse_domain_part(rest);
135 return Some(format!("www.{}", domain));
136 }
137
138 None
139}
140
141fn parse_domain(input: &str) -> Option<String> {
143 if !input.contains(" dot ") {
145 return None;
146 }
147
148 let result = parse_domain_part(input);
149
150 if result.contains('.') {
152 Some(result)
153 } else {
154 None
155 }
156}
157
158fn parse_email_part(input: &str) -> String {
160 let words: Vec<&str> = input.split_whitespace().collect();
161 let mut result = String::new();
162
163 for (i, word) in words.iter().enumerate() {
164 match *word {
165 "dot" if i == 0 => {
168 result.push_str("dot ");
169 }
170 "dot" => result.push('.'),
171 "hyphen" | "dash" => result.push('-'),
172 "underscore" => result.push('_'),
173 _ => {
174 if let Some(c) = word_to_char(word) {
176 result.push(c);
177 } else {
178 result.push_str(word);
180 }
181 }
182 }
183 }
184
185 result
186}
187
188fn parse_domain_part(input: &str) -> String {
190 let words: Vec<&str> = input.split_whitespace().collect();
191 let mut result = String::new();
192
193 for word in words {
194 match word {
195 "dot" => result.push('.'),
196 "slash" => result.push('/'),
197 "colon" => result.push(':'),
198 "hyphen" | "dash" => result.push('-'),
199 _ => {
200 if let Some(c) = word_to_char(word) {
202 result.push(c);
203 } else {
204 result.push_str(word);
206 }
207 }
208 }
209 }
210
211 result
212}
213
/// Maps a single spoken token to the character it stands for.
///
/// * A one-byte ASCII letter or digit is returned as-is, so a spelled-out
///   letter `o` stays the letter `o`.
/// * `zero` through `nine`, plus `oh` as a spoken form of `0`, map to the
///   matching digit.
/// * Anything else yields `None`.
fn word_to_char(word: &str) -> Option<char> {
    if let [byte] = word.as_bytes() {
        if byte.is_ascii_alphanumeric() {
            return Some(char::from(*byte));
        }
    }

    // There is deliberately no `"o"` arm here: a lone "o" is a one-byte ASCII
    // letter and has already been returned above, so such an arm could never
    // be reached.
    match word {
        "zero" | "oh" => Some('0'),
        "one" => Some('1'),
        "two" => Some('2'),
        "three" => Some('3'),
        "four" => Some('4'),
        "five" => Some('5'),
        "six" => Some('6'),
        "seven" => Some('7'),
        "eight" => Some('8'),
        "nine" => Some('9'),
        _ => None,
    }
}
239
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `parse` rewrites `spoken` to exactly `written`.
    fn assert_written(spoken: &str, written: &str) {
        assert_eq!(parse(spoken), Some(written.to_string()));
    }

    #[test]
    fn test_simple_email() {
        assert_written("a at gmail dot com", "a@gmail.com");
    }

    #[test]
    fn test_email_with_dots() {
        assert_written("a dot b c at gmail dot com", "a.bc@gmail.com");
    }

    #[test]
    fn test_email_with_numbers() {
        assert_written("a one b two at a b c dot com", "a1b2@abc.com");
    }

    #[test]
    fn test_url_with_protocol() {
        assert_written(
            "h t t p colon slash slash w w w dot example dot com",
            "http://www.example.com",
        );
    }

    #[test]
    fn test_www_domain() {
        assert_written("w w w dot example dot com", "www.example.com");
    }

    #[test]
    fn test_simple_domain() {
        assert_written("nvidia dot com", "nvidia.com");
    }
}