text_processing_rs/taggers/
telephone.rs1use super::cardinal::words_to_number;
9
10pub fn parse(input: &str) -> Option<String> {
12 let input_lower = input.to_lowercase();
13 let input_trimmed = input_lower.trim();
14
15 if input_trimmed.contains(',') {
17 return None;
18 }
19
20 if input_trimmed.contains(" dot ") {
22 return parse_ip_address(input_trimmed);
23 }
24
25 if input_trimmed.contains("ssn") {
27 return parse_ssn_in_context(input, input_trimmed);
28 }
29
30 if let Some(result) = parse_alphanumeric_code(input) {
32 return Some(result);
33 }
34
35 if !has_digit_content(input_trimmed) {
37 return None;
38 }
39
40 if has_scale_words(input_trimmed) {
42 return None;
43 }
44
45 parse_phone_number(input_trimmed)
47}
48
49fn parse_ip_address(input: &str) -> Option<String> {
51 let parts: Vec<&str> = input.split(" dot ").collect();
52 if parts.len() < 2 {
53 return None;
54 }
55
56 let mut octets = Vec::new();
57 for part in parts {
58 let octet = parse_ip_octet(part)?;
59 octets.push(octet);
60 }
61
62 Some(octets.join("."))
63}
64
65fn parse_ip_octet(input: &str) -> Option<String> {
67 let words: Vec<&str> = input.split_whitespace().collect();
68 if words.is_empty() {
69 return None;
70 }
71
72 let mut result = String::new();
78 let mut i = 0;
79
80 while i < words.len() {
81 let word = words[i];
82
83 if word == "double" && i + 1 < words.len() {
85 let next = words[i + 1];
86 if let Some(d) = word_to_digit(next) {
87 result.push(d);
88 result.push(d);
89 i += 2;
90 continue;
91 } else if let Some(num) = words_to_number(next) {
92 let s = (num as i64).to_string();
93 result.push_str(&s);
94 result.push_str(&s);
95 i += 2;
96 continue;
97 }
98 }
99
100 if let Some(d) = word_to_digit(word) {
102 result.push(d);
103 i += 1;
104 continue;
105 }
106
107 if i + 1 < words.len() {
109 let combined = format!("{} {}", word, words[i + 1]);
110 if let Some(num) = words_to_number(&combined) {
111 result.push_str(&(num as i64).to_string());
112 i += 2;
113 continue;
114 }
115 }
116
117 if let Some(num) = words_to_number(word) {
119 result.push_str(&(num as i64).to_string());
120 i += 1;
121 continue;
122 }
123
124 i += 1;
125 }
126
127 if result.is_empty() {
128 None
129 } else {
130 Some(result)
131 }
132}
133
134fn parse_ssn_in_context(original_input: &str, input: &str) -> Option<String> {
137 let ssn_idx = input.find("ssn")?;
139 let prefix = &input[..ssn_idx];
140 let after_ssn = &input[ssn_idx + 3..].trim_start();
141
142 let orig_ssn_idx = original_input.to_lowercase().find("ssn")?;
144 let orig_ssn = &original_input[orig_ssn_idx..orig_ssn_idx + 3];
145
146 let digits_part = if after_ssn.starts_with("is ") {
148 &after_ssn[3..]
149 } else {
150 after_ssn
151 };
152
153 let digits = parse_digit_sequence_with_double(digits_part)?;
154
155 if digits.len() >= 9 {
157 let formatted = format!("{}-{}-{}", &digits[0..3], &digits[3..5], &digits[5..9]);
158 if prefix.is_empty() {
159 Some(format!("{} is {}", orig_ssn, formatted))
160 } else {
161 Some(format!("{}{} is {}", prefix.trim(), orig_ssn, formatted))
162 }
163 } else {
164 None
165 }
166}
167
168fn parse_alphanumeric_code(input: &str) -> Option<String> {
170 let words: Vec<&str> = input.split_whitespace().collect();
171 if words.len() < 2 {
172 return None;
173 }
174
175 let has_letters = words.iter().any(|w| is_single_letter(&w.to_lowercase()));
177 let has_numbers = words.iter().any(|w| {
178 let wl = w.to_lowercase();
179 word_to_digit(&wl).is_some() || is_tens_word(&wl) || is_number_word(&wl)
180 });
181
182 if !has_letters || !has_numbers {
183 return None;
184 }
185
186 let first_word_lower = words[0].to_lowercase();
190 let starts_with_digit = word_to_digit(&first_word_lower).is_some();
191 let is_compact_code = starts_with_digit
192 && words.iter().all(|w| {
193 let wl = w.to_lowercase();
194 is_single_letter(&wl)
195 || word_to_digit(&wl).is_some()
196 || is_tens_word(&wl)
197 || is_number_word(&wl)
198 });
199
200 let mut result = String::new();
202 let mut i = 0;
203 let mut letter_run = String::new();
204 let mut prev_was_number = false; while i < words.len() {
207 let word = words[i];
208 let word_lower = word.to_lowercase();
209
210 if is_single_letter(&word_lower) {
212 letter_run.push_str(&word_lower);
213 i += 1;
215 continue;
216 }
217
218 if !letter_run.is_empty() {
220 if !prev_was_number && !is_compact_code && !result.is_empty() && !result.ends_with(' ')
223 {
224 result.push(' ');
225 }
226 if should_uppercase_abbrev(&letter_run) {
228 result.push_str(&letter_run.to_uppercase());
229 } else {
230 result.push_str(&letter_run);
231 }
232 if !prev_was_number && !is_compact_code && !should_join_letters_to_number(&letter_run) {
235 result.push(' ');
236 }
237 letter_run.clear();
238 prev_was_number = false;
239 }
240
241 if i + 1 < words.len() && is_tens_word(&word_lower) {
243 let next_word = words[i + 1].to_lowercase();
244 if is_tens_word(&next_word) {
245 if let (Some(tens1), Some(tens2)) =
247 (words_to_number(&word_lower), words_to_number(&next_word))
248 {
249 let combined = (tens1 / 10) * 1000 + tens2;
250 result.push_str(&combined.to_string());
251 i += 2;
252 prev_was_number = true;
253 continue;
254 }
255 }
256 }
257
258 if i + 1 < words.len() && is_teen_word(&word_lower) {
260 let next_word = words[i + 1].to_lowercase();
261 if is_tens_word(&next_word) {
262 if let (Some(teen), Some(tens)) =
263 (words_to_number(&word_lower), words_to_number(&next_word))
264 {
265 let combined = teen * 100 + tens;
266 result.push_str(&combined.to_string());
267 i += 2;
268 prev_was_number = true;
269 continue;
270 }
271 }
272 }
273
274 if i + 1 < words.len() && is_tens_word(&word_lower) {
276 let next_word = words[i + 1].to_lowercase();
277 let compound = format!("{} {}", word_lower, next_word);
278 if let Some(num) = words_to_number(&compound) {
279 if num > words_to_number(&word_lower).unwrap_or(0) {
281 result.push_str(&num.to_string());
282 i += 2;
283 prev_was_number = true;
284 continue;
285 }
286 }
287 }
288
289 if let Some(d) = word_to_digit(&word_lower) {
291 result.push(d);
292 i += 1;
293 prev_was_number = true;
294 continue;
295 }
296
297 if let Some(num) = words_to_number(&word_lower) {
299 if num >= 10 && num <= 99 {
300 result.push_str(&num.to_string());
301 i += 1;
302 prev_was_number = true;
303 continue;
304 }
305 }
306
307 if !result.is_empty() && !result.ends_with(' ') {
309 result.push(' ');
310 }
311 result.push_str(word);
312 i += 1;
313 prev_was_number = false;
314 }
315
316 if !letter_run.is_empty() {
318 if should_uppercase_abbrev(&letter_run) {
319 result.push_str(&letter_run.to_uppercase());
320 } else {
321 result.push_str(&letter_run);
322 }
323 }
324
325 if result.is_empty() || result == input {
326 None
327 } else {
328 Some(result)
329 }
330}
331
/// Returns `true` when `word` is exactly one ASCII letter other than `o`/`O`.
///
/// `o` is excluded because `word_to_digit` reads it as the digit zero.
fn is_single_letter(word: &str) -> bool {
    match word.as_bytes() {
        // A one-byte string is necessarily a single ASCII character.
        [byte] => byte.is_ascii_alphabetic() && !byte.eq_ignore_ascii_case(&b'o'),
        _ => false,
    }
}
340
341fn is_number_word(word: &str) -> bool {
342 is_tens_word(word) || is_teen_word(word) || word_to_digit(word).is_some()
343}
344
/// Returns `true` for the lower-case words `ten` through `nineteen`.
fn is_teen_word(word: &str) -> bool {
    const TEENS: [&str; 10] = [
        "ten",
        "eleven",
        "twelve",
        "thirteen",
        "fourteen",
        "fifteen",
        "sixteen",
        "seventeen",
        "eighteen",
        "nineteen",
    ];
    TEENS.iter().any(|&teen| teen == word)
}
360
/// Returns `true` when the lower-case letter run `s` is one of the
/// abbreviations that are written in upper case: `rtx`, `gtx`, `rx`, `amd`,
/// `cpu`, `gpu`, `usb`, `hdmi`.
fn should_uppercase_abbrev(s: &str) -> bool {
    const UPPERCASED: [&str; 8] = ["rtx", "gtx", "rx", "amd", "cpu", "gpu", "usb", "hdmi"];
    UPPERCASED.iter().any(|&abbrev| abbrev == s)
}
368
/// Returns `true` when the letter run `s` is written directly against the
/// number that follows it, with no separating space. Only `"x"` qualifies.
fn should_join_letters_to_number(s: &str) -> bool {
    matches!(s, "x")
}
374
375fn parse_phone_number(input: &str) -> Option<String> {
377 let has_plus = input.starts_with("plus ");
378
379 let (prefix, rest) = extract_phone_prefix(input);
381 let digits = parse_digit_sequence_with_double(rest)?;
382
383 if !has_plus && digits.len() < 3 {
385 return None;
386 }
387
388 let formatted = format_phone_number(&digits);
390
391 if prefix.is_empty() {
392 Some(formatted)
393 } else {
394 Some(format!("{} {}", prefix, formatted))
395 }
396}
397
/// Returns `true` for the lower-case tens words `twenty` through `ninety`.
fn is_tens_word(word: &str) -> bool {
    const TENS: [&str; 8] = [
        "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
    ];
    TENS.iter().any(|&tens| tens == word)
}
405
406fn extract_phone_prefix(input: &str) -> (String, &str) {
408 if !input.starts_with("plus ") {
409 return (String::new(), input);
410 }
411
412 let rest = &input[5..];
413 let words: Vec<&str> = rest.split_whitespace().collect();
414
415 let mut code = String::new();
418 let mut consumed_words = 0;
419
420 if words.len() >= 2 && is_tens_word(words[0]) {
423 let compound = format!("{} {}", words[0], words[1]);
424 if let Some(num) = words_to_number(&compound) {
425 if num >= 10 && num <= 999 {
426 code = (num as i64).to_string();
427 consumed_words = 2;
428 }
429 }
430 }
431
432 if code.is_empty() {
434 for (i, word) in words.iter().enumerate() {
435 if let Some(d) = word_to_digit(word) {
436 code.push(d);
437 consumed_words = i + 1;
438 if code.len() >= 2 {
441 break;
442 }
443 } else if is_tens_word(word) {
444 if let Some(num) = words_to_number(word) {
446 if code.is_empty() && num >= 10 && num <= 99 {
447 code = (num as i64).to_string();
448 consumed_words = i + 1;
449 break;
450 }
451 }
452 break;
453 } else {
454 break;
455 }
456 }
457 }
458
459 if code.is_empty() {
460 return (String::new(), input);
461 }
462
463 let remaining = words[consumed_words..].join(" ");
464 let remaining_start = if remaining.is_empty() {
466 input.len()
467 } else {
468 input.find(&remaining).unwrap_or(input.len())
469 };
470
471 (format!("+{}", code), &input[remaining_start..])
472}
473
474fn parse_digit_sequence_with_double(input: &str) -> Option<String> {
476 let words: Vec<&str> = input.split_whitespace().collect();
477 let mut result = String::new();
478 let mut i = 0;
479
480 while i < words.len() {
481 let word = words[i];
482
483 if word == "double" && i + 1 < words.len() {
485 if let Some(d) = word_to_digit(words[i + 1]) {
486 result.push(d);
487 result.push(d);
488 i += 2;
489 continue;
490 } else if let Some(num) = words_to_number(words[i + 1]) {
491 let s = (num as i64).to_string();
492 result.push_str(&s);
493 result.push_str(&s);
494 i += 2;
495 continue;
496 }
497 }
498
499 if word == "triple" && i + 1 < words.len() {
501 if let Some(d) = word_to_digit(words[i + 1]) {
502 result.push(d);
503 result.push(d);
504 result.push(d);
505 i += 2;
506 continue;
507 }
508 }
509
510 if let Some(d) = word_to_digit(word) {
512 result.push(d);
513 i += 1;
514 continue;
515 }
516
517 if let Some(num) = words_to_number(word) {
519 if i + 1 < words.len() {
521 let combined = format!("{} {}", word, words[i + 1]);
522 if let Some(compound) = words_to_number(&combined) {
523 if compound != num {
524 result.push_str(&(compound as i64).to_string());
525 i += 2;
526 continue;
527 }
528 }
529 }
530 result.push_str(&(num as i64).to_string());
531 i += 1;
532 continue;
533 }
534
535 i += 1;
537 }
538
539 if result.is_empty() {
540 None
541 } else {
542 Some(result)
543 }
544}
545
/// Returns `true` when any whitespace-separated word of `input` is a digit
/// word (`zero`–`nine`, `oh`, `o`), a repetition marker (`double`, `triple`),
/// a teen word (`ten`–`nineteen`) or a tens word (`twenty`–`ninety`).
///
/// Matching is exact and case-sensitive.
fn has_digit_content(input: &str) -> bool {
    input.split_whitespace().any(|token| {
        matches!(
            token,
            "zero"
                | "one"
                | "two"
                | "three"
                | "four"
                | "five"
                | "six"
                | "seven"
                | "eight"
                | "nine"
                | "oh"
                | "o"
                | "double"
                | "triple"
                | "ten"
                | "eleven"
                | "twelve"
                | "thirteen"
                | "fourteen"
                | "fifteen"
                | "sixteen"
                | "seventeen"
                | "eighteen"
                | "nineteen"
                | "twenty"
                | "thirty"
                | "forty"
                | "fifty"
                | "sixty"
                | "seventy"
                | "eighty"
                | "ninety"
        )
    })
}
590
/// Returns `true` when any whitespace-separated word of `input` is a scale
/// word: `hundred`, `thousand`, `million`, `billion`, `trillion`,
/// `quadrillion`, `quintillion`, `sextillion`, `crore` or `lakh`.
///
/// Matching is exact and case-sensitive.
fn has_scale_words(input: &str) -> bool {
    input.split_whitespace().any(|token| {
        matches!(
            token,
            "hundred"
                | "thousand"
                | "million"
                | "billion"
                | "trillion"
                | "quadrillion"
                | "quintillion"
                | "sextillion"
                | "crore"
                | "lakh"
        )
    })
}
613
/// Maps a lower-case digit word to its digit character.
///
/// `zero`–`nine` map to `'0'`–`'9'`; `o` and `oh` are also read as `'0'`.
/// Any other word yields `None`.
fn word_to_digit(word: &str) -> Option<char> {
    // Index in this table is the digit's value.
    const DIGIT_NAMES: [&str; 10] = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
    ];

    if word == "o" || word == "oh" {
        return Some('0');
    }

    DIGIT_NAMES
        .iter()
        .position(|&name| name == word)
        .map(|value| char::from(b'0' + value as u8))
}
630
/// Inserts separators into a string of digits according to its length.
///
/// | length | layout            |
/// |--------|-------------------|
/// | 11     | `D DDD-DDD-DDDD`  |
/// | 10     | `DDD-DDD-DDDD`    |
/// | 0–3    | unchanged         |
/// | other  | `DDD-` + the rest |
///
/// Length is measured in bytes and the string is sliced at fixed byte
/// offsets.
fn format_phone_number(digits: &str) -> String {
    match digits.len() {
        11 => format!(
            "{} {}-{}-{}",
            &digits[..1],
            &digits[1..4],
            &digits[4..7],
            &digits[7..]
        ),
        10 => format!("{}-{}-{}", &digits[..3], &digits[3..6], &digits[6..]),
        0..=3 => digits.to_string(),
        // Covers seven-digit numbers as well as every other length above 3.
        _ => {
            let (head, tail) = digits.split_at(3);
            format!("{}-{}", head, tail)
        }
    }
}
668
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `parse` rewrites `spoken` to exactly `written`.
    fn check(spoken: &str, written: &str) {
        assert_eq!(parse(spoken), Some(written.to_string()));
    }

    #[test]
    fn test_basic_phone() {
        check(
            "one two three one two three five six seven eight",
            "123-123-5678",
        );
    }

    #[test]
    fn test_with_country_code() {
        check(
            "plus nine one one two three one two three five six seven eight",
            "+91 123-123-5678",
        );
    }

    #[test]
    fn test_double_pattern() {
        check(
            "double oh three one two three five six seven eight",
            "003-123-5678",
        );
    }

    #[test]
    fn test_three_digits() {
        check("seven nine nine", "799");
    }

    #[test]
    fn test_ip_address() {
        check(
            "one two three dot one two three dot o dot four o",
            "123.123.0.40",
        );
    }
}