Skip to main content

text_processing_rs/taggers/
telephone.rs

1//! Telephone number tagger.
2//!
3//! Converts spoken phone numbers, IP addresses, and serial numbers to written form:
4//! - "one two three one two three five six seven eight" → "123-123-5678"
5//! - "plus forty four one two three..." → "+44 123-123-5678"
6//! - "one two three dot one two three dot o dot four o" → "123.123.0.40"
7
8use super::cardinal::words_to_number;
9
10/// Parse spoken telephone/serial number to written form.
11pub fn parse(input: &str) -> Option<String> {
12    let input_lower = input.to_lowercase();
13    let input_trimmed = input_lower.trim();
14
15    // Reject input with punctuation (commas, etc.)
16    if input_trimmed.contains(',') {
17        return None;
18    }
19
20    // Try IP address pattern first (contains "dot")
21    if input_trimmed.contains(" dot ") {
22        return parse_ip_address(input_trimmed);
23    }
24
25    // Try SSN pattern (contains "ssn")
26    if input_trimmed.contains("ssn") {
27        return parse_ssn_in_context(input, input_trimmed);
28    }
29
30    // Try alphanumeric product/serial code patterns
31    if let Some(result) = parse_alphanumeric_code(input) {
32        return Some(result);
33    }
34
35    // Must have digit content
36    if !has_digit_content(input_trimmed) {
37        return None;
38    }
39
40    // Don't match if input has scale words (billion, million, etc.)
41    if has_scale_words(input_trimmed) {
42        return None;
43    }
44
45    // Try phone number pattern
46    parse_phone_number(input_trimmed)
47}
48
49/// Parse IP address pattern: "one two three dot one two three dot o dot four o"
50fn parse_ip_address(input: &str) -> Option<String> {
51    let parts: Vec<&str> = input.split(" dot ").collect();
52    if parts.len() < 2 {
53        return None;
54    }
55
56    let mut octets = Vec::new();
57    for part in parts {
58        let octet = parse_ip_octet(part)?;
59        octets.push(octet);
60    }
61
62    Some(octets.join("."))
63}
64
65/// Parse a single IP octet
66fn parse_ip_octet(input: &str) -> Option<String> {
67    let words: Vec<&str> = input.split_whitespace().collect();
68    if words.is_empty() {
69        return None;
70    }
71
72    // Try parsing as compound number sequence
73    // e.g., "one twenty three" = 1 + 23 = "123"
74    // e.g., "forty five" = "45"
75    // e.g., "double five" = "55"
76
77    let mut result = String::new();
78    let mut i = 0;
79
80    while i < words.len() {
81        let word = words[i];
82
83        // Handle "double X"
84        if word == "double" && i + 1 < words.len() {
85            let next = words[i + 1];
86            if let Some(d) = word_to_digit(next) {
87                result.push(d);
88                result.push(d);
89                i += 2;
90                continue;
91            } else if let Some(num) = words_to_number(next) {
92                let s = (num as i64).to_string();
93                result.push_str(&s);
94                result.push_str(&s);
95                i += 2;
96                continue;
97            }
98        }
99
100        // Try single digit
101        if let Some(d) = word_to_digit(word) {
102            result.push(d);
103            i += 1;
104            continue;
105        }
106
107        // Try compound number (e.g., "twenty three", "forty five")
108        if i + 1 < words.len() {
109            let combined = format!("{} {}", word, words[i + 1]);
110            if let Some(num) = words_to_number(&combined) {
111                result.push_str(&(num as i64).to_string());
112                i += 2;
113                continue;
114            }
115        }
116
117        // Try single number word (e.g., "forty")
118        if let Some(num) = words_to_number(word) {
119            result.push_str(&(num as i64).to_string());
120            i += 1;
121            continue;
122        }
123
124        i += 1;
125    }
126
127    if result.is_empty() {
128        None
129    } else {
130        Some(result)
131    }
132}
133
134/// Parse SSN in context: "ssn is seven double nine one two three double one three"
135/// Preserves original casing of "SSN" from input
136fn parse_ssn_in_context(original_input: &str, input: &str) -> Option<String> {
137    // Find where SSN digits start
138    let ssn_idx = input.find("ssn")?;
139    let prefix = &input[..ssn_idx];
140    let after_ssn = &input[ssn_idx + 3..].trim_start();
141
142    // Get original SSN casing from the original input
143    let orig_ssn_idx = original_input.to_lowercase().find("ssn")?;
144    let orig_ssn = &original_input[orig_ssn_idx..orig_ssn_idx + 3];
145
146    // Skip "is" if present
147    let digits_part = if after_ssn.starts_with("is ") {
148        &after_ssn[3..]
149    } else {
150        after_ssn
151    };
152
153    let digits = parse_digit_sequence_with_double(digits_part)?;
154
155    // SSN format: XXX-XX-XXXX
156    if digits.len() >= 9 {
157        let formatted = format!("{}-{}-{}", &digits[0..3], &digits[3..5], &digits[5..9]);
158        if prefix.is_empty() {
159            Some(format!("{} is {}", orig_ssn, formatted))
160        } else {
161            Some(format!("{}{} is {}", prefix.trim(), orig_ssn, formatted))
162        }
163    } else {
164        None
165    }
166}
167
168/// Parse alphanumeric product/serial codes like "x eighty six" → "x86"
169fn parse_alphanumeric_code(input: &str) -> Option<String> {
170    let words: Vec<&str> = input.split_whitespace().collect();
171    if words.len() < 2 {
172        return None;
173    }
174
175    // Check if this looks like an alphanumeric pattern (mix of letters and number words)
176    let has_letters = words.iter().any(|w| is_single_letter(&w.to_lowercase()));
177    let has_numbers = words.iter().any(|w| {
178        let wl = w.to_lowercase();
179        word_to_digit(&wl).is_some() || is_tens_word(&wl) || is_number_word(&wl)
180    });
181
182    if !has_letters || !has_numbers {
183        return None;
184    }
185
186    // Check for compact serial code pattern: starts with digit word, has interspersed letters
187    // e.g., "five w k r a three one" → "5wkra31"
188    // vs regular patterns that need spacing like "a thirty six" or "r t x forty fifty t i"
189    let first_word_lower = words[0].to_lowercase();
190    let starts_with_digit = word_to_digit(&first_word_lower).is_some();
191    let is_compact_code = starts_with_digit
192        && words.iter().all(|w| {
193            let wl = w.to_lowercase();
194            is_single_letter(&wl)
195                || word_to_digit(&wl).is_some()
196                || is_tens_word(&wl)
197                || is_number_word(&wl)
198        });
199
200    // Build result by parsing each component
201    let mut result = String::new();
202    let mut i = 0;
203    let mut letter_run = String::new();
204    let mut prev_was_number = false; // Track if we just output a number
205
206    while i < words.len() {
207        let word = words[i];
208        let word_lower = word.to_lowercase();
209
210        // Single letter - accumulate in letter_run
211        if is_single_letter(&word_lower) {
212            letter_run.push_str(&word_lower);
213            // Don't reset prev_was_number here - letters may be suffix to previous number
214            i += 1;
215            continue;
216        }
217
218        // Flush letter run before number
219        if !letter_run.is_empty() {
220            // If previous output was a number, letters join directly (1080p, 4050ti)
221            // Otherwise add space before if result is not empty (unless compact code)
222            if !prev_was_number && !is_compact_code && !result.is_empty() && !result.ends_with(' ')
223            {
224                result.push(' ');
225            }
226            // Check if this should be uppercased (common abbreviations)
227            if should_uppercase_abbrev(&letter_run) {
228                result.push_str(&letter_run.to_uppercase());
229            } else {
230                result.push_str(&letter_run);
231            }
232            // Add space after letter run before number (unless compact code or known no-space patterns)
233            // Also don't add space if letters came right after number (they're a suffix like "p" in 1080p)
234            if !prev_was_number && !is_compact_code && !should_join_letters_to_number(&letter_run) {
235                result.push(' ');
236            }
237            letter_run.clear();
238            prev_was_number = false;
239        }
240
241        // Check for "X0 Y0" pattern (e.g., "forty fifty" = 4050, "ten eighty" = 1080)
242        if i + 1 < words.len() && is_tens_word(&word_lower) {
243            let next_word = words[i + 1].to_lowercase();
244            if is_tens_word(&next_word) {
245                // "forty fifty" → 4050
246                if let (Some(tens1), Some(tens2)) =
247                    (words_to_number(&word_lower), words_to_number(&next_word))
248                {
249                    let combined = (tens1 / 10) * 1000 + tens2;
250                    result.push_str(&combined.to_string());
251                    i += 2;
252                    prev_was_number = true;
253                    continue;
254                }
255            }
256        }
257
258        // Check for "ten eighty" = 1080 pattern (teens + tens)
259        if i + 1 < words.len() && is_teen_word(&word_lower) {
260            let next_word = words[i + 1].to_lowercase();
261            if is_tens_word(&next_word) {
262                if let (Some(teen), Some(tens)) =
263                    (words_to_number(&word_lower), words_to_number(&next_word))
264                {
265                    let combined = teen * 100 + tens;
266                    result.push_str(&combined.to_string());
267                    i += 2;
268                    prev_was_number = true;
269                    continue;
270                }
271            }
272        }
273
274        // Try compound number ("eighty six" = 86)
275        if i + 1 < words.len() && is_tens_word(&word_lower) {
276            let next_word = words[i + 1].to_lowercase();
277            let compound = format!("{} {}", word_lower, next_word);
278            if let Some(num) = words_to_number(&compound) {
279                // Check it's actually a compound (tens + units) not just tens + something else
280                if num > words_to_number(&word_lower).unwrap_or(0) {
281                    result.push_str(&num.to_string());
282                    i += 2;
283                    prev_was_number = true;
284                    continue;
285                }
286            }
287        }
288
289        // Single digit word
290        if let Some(d) = word_to_digit(&word_lower) {
291            result.push(d);
292            i += 1;
293            prev_was_number = true;
294            continue;
295        }
296
297        // Single number word (tens or teens)
298        if let Some(num) = words_to_number(&word_lower) {
299            if num >= 10 && num <= 99 {
300                result.push_str(&num.to_string());
301                i += 1;
302                prev_was_number = true;
303                continue;
304            }
305        }
306
307        // Unknown word - keep as-is with space if needed
308        if !result.is_empty() && !result.ends_with(' ') {
309            result.push(' ');
310        }
311        result.push_str(word);
312        i += 1;
313        prev_was_number = false;
314    }
315
316    // Flush remaining letters
317    if !letter_run.is_empty() {
318        if should_uppercase_abbrev(&letter_run) {
319            result.push_str(&letter_run.to_uppercase());
320        } else {
321            result.push_str(&letter_run);
322        }
323    }
324
325    if result.is_empty() || result == input {
326        None
327    } else {
328        Some(result)
329    }
330}
331
/// Return `true` when `word` is exactly one ASCII letter other than 'o'/'O'.
///
/// 'o' is excluded because it is read as zero in phone/serial contexts.
fn is_single_letter(word: &str) -> bool {
    // A one-byte `str` is necessarily a single ASCII character.
    match word.as_bytes() {
        [byte] => byte.is_ascii_alphabetic() && !byte.eq_ignore_ascii_case(&b'o'),
        _ => false,
    }
}
340
341fn is_number_word(word: &str) -> bool {
342    is_tens_word(word) || is_teen_word(word) || word_to_digit(word).is_some()
343}
344
/// Return `true` when `word` is one of "ten" through "nineteen" (lowercase, exact match).
fn is_teen_word(word: &str) -> bool {
    const TEENS: [&str; 10] = [
        "ten",
        "eleven",
        "twelve",
        "thirteen",
        "fourteen",
        "fifteen",
        "sixteen",
        "seventeen",
        "eighteen",
        "nineteen",
    ];
    TEENS.iter().any(|&teen| teen == word)
}
360
/// Return `true` when the lowercase letter run `s` is a known product-name
/// abbreviation that is conventionally written in capitals.
fn should_uppercase_abbrev(s: &str) -> bool {
    const UPPERCASE_ABBREVS: [&str; 8] =
        ["rtx", "gtx", "rx", "amd", "cpu", "gpu", "usb", "hdmi"];
    UPPERCASE_ABBREVS.iter().any(|&abbrev| abbrev == s)
}
368
/// Return `true` when the letter run attaches directly to the number that
/// follows it. Only a lone "x" does (x86, x386); every other run is
/// separated from the number by a space.
fn should_join_letters_to_number(s: &str) -> bool {
    matches!(s, "x")
}
374
375/// Parse phone number
376fn parse_phone_number(input: &str) -> Option<String> {
377    let has_plus = input.starts_with("plus ");
378
379    // Parse prefix and digits
380    let (prefix, rest) = extract_phone_prefix(input);
381    let digits = parse_digit_sequence_with_double(rest)?;
382
383    // Must have at least 7 digits for phone number (or 3 for short codes)
384    if !has_plus && digits.len() < 3 {
385        return None;
386    }
387
388    // Format the number
389    let formatted = format_phone_number(&digits);
390
391    if prefix.is_empty() {
392        Some(formatted)
393    } else {
394        Some(format!("{} {}", prefix, formatted))
395    }
396}
397
/// Return `true` when `word` is a multiple-of-ten word from "twenty" to
/// "ninety" (lowercase, exact match). "ten" is not included.
fn is_tens_word(word: &str) -> bool {
    const TENS: [&str; 8] = [
        "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
    ];
    TENS.iter().any(|&tens| tens == word)
}
405
406/// Extract phone prefix (country code with +)
407fn extract_phone_prefix(input: &str) -> (String, &str) {
408    if !input.starts_with("plus ") {
409        return (String::new(), input);
410    }
411
412    let rest = &input[5..];
413    let words: Vec<&str> = rest.split_whitespace().collect();
414
415    // Try to parse country code (could be "forty four" = 44, or "nine one" = 91)
416    // Country codes are 1-3 digits
417    let mut code = String::new();
418    let mut consumed_words = 0;
419
420    // First, try compound number ONLY for tens+units patterns (e.g., "forty four" = 44)
421    // NOT for digit sequences like "nine one" which should stay as "91" (individual digits)
422    if words.len() >= 2 && is_tens_word(words[0]) {
423        let compound = format!("{} {}", words[0], words[1]);
424        if let Some(num) = words_to_number(&compound) {
425            if num >= 10 && num <= 999 {
426                code = (num as i64).to_string();
427                consumed_words = 2;
428            }
429        }
430    }
431
432    // If no compound match, try single tens word or individual digits
433    if code.is_empty() {
434        for (i, word) in words.iter().enumerate() {
435            if let Some(d) = word_to_digit(word) {
436                code.push(d);
437                consumed_words = i + 1;
438                // For individual digit words, limit to 2 digits (common country codes)
439                // 3-digit codes like 351 are usually spoken as compound ("three fifty one")
440                if code.len() >= 2 {
441                    break;
442                }
443            } else if is_tens_word(word) {
444                // Single tens word like "forty" = 40
445                if let Some(num) = words_to_number(word) {
446                    if code.is_empty() && num >= 10 && num <= 99 {
447                        code = (num as i64).to_string();
448                        consumed_words = i + 1;
449                        break;
450                    }
451                }
452                break;
453            } else {
454                break;
455            }
456        }
457    }
458
459    if code.is_empty() {
460        return (String::new(), input);
461    }
462
463    let remaining = words[consumed_words..].join(" ");
464    // Find position in original string
465    let remaining_start = if remaining.is_empty() {
466        input.len()
467    } else {
468        input.find(&remaining).unwrap_or(input.len())
469    };
470
471    (format!("+{}", code), &input[remaining_start..])
472}
473
474/// Parse digit sequence handling "double X" patterns
475fn parse_digit_sequence_with_double(input: &str) -> Option<String> {
476    let words: Vec<&str> = input.split_whitespace().collect();
477    let mut result = String::new();
478    let mut i = 0;
479
480    while i < words.len() {
481        let word = words[i];
482
483        // Handle "double X"
484        if word == "double" && i + 1 < words.len() {
485            if let Some(d) = word_to_digit(words[i + 1]) {
486                result.push(d);
487                result.push(d);
488                i += 2;
489                continue;
490            } else if let Some(num) = words_to_number(words[i + 1]) {
491                let s = (num as i64).to_string();
492                result.push_str(&s);
493                result.push_str(&s);
494                i += 2;
495                continue;
496            }
497        }
498
499        // Handle "triple X"
500        if word == "triple" && i + 1 < words.len() {
501            if let Some(d) = word_to_digit(words[i + 1]) {
502                result.push(d);
503                result.push(d);
504                result.push(d);
505                i += 2;
506                continue;
507            }
508        }
509
510        // Handle single digit
511        if let Some(d) = word_to_digit(word) {
512            result.push(d);
513            i += 1;
514            continue;
515        }
516
517        // Handle compound numbers (twenty three = 23)
518        if let Some(num) = words_to_number(word) {
519            // Check if next word is a units digit
520            if i + 1 < words.len() {
521                let combined = format!("{} {}", word, words[i + 1]);
522                if let Some(compound) = words_to_number(&combined) {
523                    if compound != num {
524                        result.push_str(&(compound as i64).to_string());
525                        i += 2;
526                        continue;
527                    }
528                }
529            }
530            result.push_str(&(num as i64).to_string());
531            i += 1;
532            continue;
533        }
534
535        // Skip unknown words
536        i += 1;
537    }
538
539    if result.is_empty() {
540        None
541    } else {
542        Some(result)
543    }
544}
545
/// Return `true` when any whitespace-separated word of `input` is a digit
/// word ("zero".."nine", "oh", "o"), a repetition marker ("double",
/// "triple"), a teen word, or a tens word. Matching is exact and
/// case-sensitive.
fn has_digit_content(input: &str) -> bool {
    input.split_whitespace().any(|word| {
        matches!(
            word,
            "zero"
                | "one"
                | "two"
                | "three"
                | "four"
                | "five"
                | "six"
                | "seven"
                | "eight"
                | "nine"
                | "oh"
                | "o"
                | "double"
                | "triple"
                | "ten"
                | "eleven"
                | "twelve"
                | "thirteen"
                | "fourteen"
                | "fifteen"
                | "sixteen"
                | "seventeen"
                | "eighteen"
                | "nineteen"
                | "twenty"
                | "thirty"
                | "forty"
                | "fifty"
                | "sixty"
                | "seventy"
                | "eighty"
                | "ninety"
        )
    })
}
590
/// Return `true` when any whitespace-separated word of `input` is a scale
/// word ("hundred", "thousand", "million", ...), which marks the input as a
/// cardinal number rather than a phone number. Matching is exact and
/// case-sensitive.
fn has_scale_words(input: &str) -> bool {
    input.split_whitespace().any(|word| {
        matches!(
            word,
            "hundred"
                | "thousand"
                | "million"
                | "billion"
                | "trillion"
                | "quadrillion"
                | "quintillion"
                | "sextillion"
                | "crore"
                | "lakh"
        )
    })
}
613
/// Map a spoken digit word to its digit character.
///
/// "zero", "o" and "oh" all map to '0'; "one".."nine" map to '1'..'9'.
/// Matching is exact and case-sensitive; any other word yields `None`.
fn word_to_digit(word: &str) -> Option<char> {
    // Index in this table equals the digit's value.
    const DIGIT_NAMES: [&str; 10] = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
    ];

    // Spoken aliases for zero.
    if word == "o" || word == "oh" {
        return Some('0');
    }

    DIGIT_NAMES
        .iter()
        .zip("0123456789".chars())
        .find(|(name, _)| **name == word)
        .map(|(_, digit)| digit)
}
630
/// Group a digit string the way phone numbers are conventionally written.
///
/// - 11 digits: `X XXX-XXX-XXXX` (single-digit prefix + 10-digit number)
/// - 10 digits: `XXX-XXX-XXXX`
/// - any other length above 3: `XXX-rest` (so 7 digits give `XXX-XXXX`)
/// - 3 digits or fewer: returned unchanged
///
/// Slicing is by byte offset, so `digits` is expected to be ASCII.
fn format_phone_number(digits: &str) -> String {
    match digits.len() {
        11 => format!(
            "{} {}-{}-{}",
            &digits[..1],
            &digits[1..4],
            &digits[4..7],
            &digits[7..]
        ),
        10 => format!("{}-{}-{}", &digits[..3], &digits[3..6], &digits[6..]),
        len if len > 3 => {
            let (head, tail) = digits.split_at(3);
            format!("{}-{}", head, tail)
        }
        _ => digits.to_string(),
    }
}
668
#[cfg(test)]
mod tests {
    use super::parse;

    /// Assert that `spoken` is converted to exactly `written`.
    fn assert_tagged(spoken: &str, written: &str) {
        assert_eq!(parse(spoken).as_deref(), Some(written));
    }

    /// Ten plain digits are grouped as XXX-XXX-XXXX.
    #[test]
    fn test_basic_phone() {
        assert_tagged(
            "one two three one two three five six seven eight",
            "123-123-5678",
        );
    }

    /// "plus" followed by two digit words becomes a "+NN" country code.
    #[test]
    fn test_with_country_code() {
        assert_tagged(
            "plus nine one one two three one two three five six seven eight",
            "+91 123-123-5678",
        );
    }

    /// "double oh" expands to two zeros.
    #[test]
    fn test_double_pattern() {
        assert_tagged(
            "double oh three one two three five six seven eight",
            "003-123-5678",
        );
    }

    /// Three-digit short codes are returned without separators.
    #[test]
    fn test_three_digits() {
        assert_tagged("seven nine nine", "799");
    }

    /// Segments separated by "dot" are joined with '.'.
    #[test]
    fn test_ip_address() {
        assert_tagged(
            "one two three dot one two three dot o dot four o",
            "123.123.0.40",
        );
    }
}