Skip to main content

text_processing_rs/taggers/
time.rs

1//! Time tagger.
2//!
3//! Converts spoken time expressions to written form:
4//! - "two thirty" → "02:30"
5//! - "two thirty pm" → "02:30 p.m."
6//! - "quarter past one" → "01:15"
7//! - "half past three" → "03:30"
8
9use super::cardinal::words_to_number;
10
11/// Parse spoken time expression to written form.
12pub fn parse(input: &str) -> Option<String> {
13    let original = input.trim();
14    let input_lower = original.to_lowercase();
15
16    // Extract period (am/pm) and timezone if present, preserving original casing
17    let (time_part, period, timezone) = extract_period_and_tz(original, &input_lower);
18
19    // Try special patterns first
20    if let Some(result) = parse_quarter_half(&time_part, &period, &timezone) {
21        return Some(result);
22    }
23
24    if let Some(result) = parse_oclock(&time_part, &period, &timezone) {
25        return Some(result);
26    }
27
28    if let Some(result) = parse_to_pattern(&time_part, &period, &timezone) {
29        return Some(result);
30    }
31
32    if let Some(result) = parse_standard_time(&time_part, &period, &timezone) {
33        return Some(result);
34    }
35
36    None
37}
38
/// Split a spoken time expression into `(time_part, period, timezone)`.
///
/// Suffixes are detected on `input_lower`; `original` is consulted only to recover how the
/// timezone and the am/pm marker were cased.
///
/// * `time_part` - what is left of `input_lower` once the suffixes are removed (trimmed
///   whenever something was removed).
/// * `period` - `"a.m."` / `"p.m."`, or `"A.M."` / `"P.M."` when the marker was written in
///   capitals; empty when there is no marker.
/// * `timezone` - the matched timezone with its spaces removed and its original casing
///   kept; empty when there is none.
///
/// A timezone is only accepted as a whole word, so "three west" is not read as
/// "three w" + "est".
fn extract_period_and_tz(original: &str, input_lower: &str) -> (String, String, String) {
    const TZ_PATTERNS: [&str; 10] = [
        "g m t", "gmt", "e s t", "est", "p s t", "pst", "c s t", "cst", "m s t", "mst",
    ];
    // Each pattern carries its leading space so it can only match a separate word.
    const PERIOD_PATTERNS: [&str; 7] = [
        " a m",
        " am",
        " p m",
        " pm",
        " in the morning",
        " in the afternoon",
        " in the evening",
    ];

    /// The `len` bytes of `text` that end `skip` bytes before its end, or `None` when that
    /// range does not exist or does not fall on character boundaries.
    fn tail_slice(text: &str, skip: usize, len: usize) -> Option<&str> {
        let end = text.len().checked_sub(skip)?;
        let start = end.checked_sub(len)?;
        text.get(start..end)
    }

    let mut time_part = input_lower;
    let mut period = String::new();
    let mut timezone = String::new();

    // Number of bytes already removed from the END of `input_lower`. The removed tail is
    // made of ASCII letters and whitespace, which occupy the same number of bytes in
    // `original`, so the same distance from the end addresses the same text there even
    // when lowercasing changed the byte length of earlier characters.
    let mut consumed = 0;

    for &tz in TZ_PATTERNS.iter() {
        let head = match time_part.strip_suffix(tz) {
            // Require a word boundary in front of the timezone.
            Some(head) if head.is_empty() || head.ends_with(char::is_whitespace) => head,
            _ => continue,
        };
        // Fall back to the lowercase pattern if `original` cannot be sliced safely.
        timezone = tail_slice(original, consumed, tz.len())
            .unwrap_or(tz)
            .replace(' ', "");
        consumed += time_part.len() - head.trim_end().len();
        time_part = head.trim();
        break;
    }

    for &pattern in PERIOD_PATTERNS.iter() {
        if let Some(head) = time_part.strip_suffix(pattern) {
            // The pattern's own length is used; hand-counted lengths drift out of sync.
            let spoken = tail_slice(original, consumed, pattern.len()).unwrap_or(pattern);
            period = format_period_with_case(spoken, pattern);
            time_part = head.trim();
            break;
        }
    }

    (time_part.to_string(), period, timezone)
}

/// Render the am/pm marker, echoing capitals when the speaker's text used them.
///
/// * `orig_suffix` - the marker as it appears in the original input (e.g. `" A M"`).
/// * `pattern` - the lowercase pattern that matched (e.g. `" a m"`).
///
/// "in the morning" always becomes `"a.m."`; "in the afternoon" and "in the evening"
/// always become `"p.m."`. Otherwise the result is upper-case only when every letter of
/// `orig_suffix` is upper-case.
fn format_period_with_case(orig_suffix: &str, pattern: &str) -> String {
    if pattern.contains("in the") {
        let written = if pattern.contains("morning") {
            "a.m."
        } else {
            "p.m."
        };
        return written.to_string();
    }

    let is_uppercase = orig_suffix
        .trim()
        .chars()
        .filter(|c| c.is_alphabetic())
        .all(|c| c.is_uppercase());

    let written = if is_uppercase {
        if orig_suffix.to_uppercase().contains('A') {
            "A.M."
        } else {
            "P.M."
        }
    } else if pattern.contains('a') {
        "a.m."
    } else {
        "p.m."
    };
    written.to_string()
}
141
/// Build the written time: zero-padded `HH:MM`, followed by the period and the timezone
/// when they are non-empty, each separated by a single space.
fn format_time(hour: i64, minute: i64, period: &str, timezone: &str) -> String {
    let clock = format!("{:02}:{:02}", hour, minute);

    // Collect the pieces that are actually present and join them once.
    let mut pieces = vec![clock.as_str()];
    pieces.extend(
        [period, timezone]
            .iter()
            .copied()
            .filter(|piece| !piece.is_empty()),
    );
    pieces.join(" ")
}
158
159/// Parse "quarter past X" and "half past X" patterns
160fn parse_quarter_half(input: &str, period: &str, timezone: &str) -> Option<String> {
161    if input.starts_with("quarter past ") {
162        let hour_part = input.trim_start_matches("quarter past ");
163        let hour = words_to_number(hour_part)? as i64;
164        return Some(format_time(hour, 15, period, timezone));
165    }
166
167    if input.starts_with("half past ") {
168        let hour_part = input.trim_start_matches("half past ");
169        let hour = words_to_number(hour_part)? as i64;
170        return Some(format_time(hour, 30, period, timezone));
171    }
172
173    None
174}
175
176/// Parse "X o'clock" pattern
177fn parse_oclock(input: &str, period: &str, timezone: &str) -> Option<String> {
178    if input.ends_with(" o'clock") || input.ends_with(" oclock") {
179        let hour_part = input
180            .trim_end_matches(" o'clock")
181            .trim_end_matches(" oclock");
182        let hour = words_to_number(hour_part)? as i64;
183        return Some(format_time(hour, 0, period, timezone));
184    }
185
186    None
187}
188
189/// Parse "X to Y" pattern (e.g., "quarter to one" = 12:45)
190fn parse_to_pattern(input: &str, period: &str, timezone: &str) -> Option<String> {
191    if input.starts_with("quarter to ") {
192        let hour_part = input.trim_start_matches("quarter to ");
193        let hour = words_to_number(hour_part)? as i64;
194        let prev_hour = if hour == 1 { 12 } else { hour - 1 };
195        return Some(format_time(prev_hour, 45, period, timezone));
196    }
197
198    // "X min to Y" or "X minutes to Y"
199    if input.contains(" to ") {
200        let parts: Vec<&str> = input.split(" to ").collect();
201        if parts.len() == 2 {
202            let min_part = parts[0]
203                .trim_end_matches(" min")
204                .trim_end_matches(" mins")
205                .trim_end_matches(" minute")
206                .trim_end_matches(" minutes");
207            let minutes_before = words_to_number(min_part)? as i64;
208            let hour = words_to_number(parts[1])? as i64;
209            let prev_hour = if hour == 1 { 12 } else { hour - 1 };
210            let minute = 60 - minutes_before;
211            return Some(format_time(prev_hour, minute, period, timezone));
212        }
213    }
214
215    None
216}
217
218/// Parse standard "hour minute" time
219fn parse_standard_time(input: &str, period: &str, timezone: &str) -> Option<String> {
220    let words: Vec<&str> = input.split_whitespace().collect();
221
222    if words.is_empty() {
223        return None;
224    }
225
226    // Single word - only treat as time if there's a period (am/pm) or timezone
227    // Otherwise "one" would be parsed as "01:00" instead of cardinal "1"
228    if words.len() == 1 {
229        if period.is_empty() && timezone.is_empty() {
230            return None;
231        }
232        let hour = words_to_number(words[0])? as i64;
233        if hour >= 1 && hour <= 24 {
234            return Some(format_time(hour, 0, period, timezone));
235        }
236        return None;
237    }
238
239    // For multi-word time without am/pm, require the first word to be a simple hour (1-12)
240    // This prevents "twenty one" from being parsed as "20:01"
241    // Only single-word hour numbers are valid (e.g., "two", "twelve", not "twenty")
242    let hour_word = words[0];
243    let hour = parse_simple_hour(hour_word)?;
244
245    // Without am/pm, only allow 1-12 as hours (clock hours)
246    if period.is_empty() && timezone.is_empty() && (hour < 1 || hour > 12) {
247        return None;
248    }
249
250    // Remaining words are the minute
251    let minute_words = words[1..].join(" ");
252    let minute = parse_minute(&minute_words)?;
253
254    // Without am/pm, avoid matching patterns that look like historical years
255    // e.g., "eleven fifty five" should be year 1155, not time 11:55
256    // This applies when hour is 10-19 and minute forms a two-digit number
257    if period.is_empty() && timezone.is_empty() {
258        if hour >= 10 && hour <= 19 && minute >= 10 && minute <= 99 {
259            return None;
260        }
261    }
262
263    if minute >= 0 && minute < 60 {
264        Some(format_time(hour, minute, period, timezone))
265    } else {
266        None
267    }
268}
269
/// Map a single lowercase hour word ("one" through "twelve") to its number.
///
/// Any other word, including capitalised or compound forms, yields `None`.
fn parse_simple_hour(word: &str) -> Option<i64> {
    // Position in this table is the hour minus one.
    const HOUR_WORDS: [&str; 12] = [
        "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven",
        "twelve",
    ];

    HOUR_WORDS
        .iter()
        .position(|&candidate| candidate == word)
        .map(|index| index as i64 + 1)
}
288
289/// Parse minute portion, handling "oh five" = 05, "thirty" = 30
290/// Only accepts patterns that look like valid minutes
291fn parse_minute(input: &str) -> Option<i64> {
292    let words: Vec<&str> = input.split_whitespace().collect();
293
294    if words.is_empty() {
295        return None;
296    }
297
298    // Handle "o X" or "oh X" pattern for single digit minutes
299    if words.len() == 2 && (words[0] == "o" || words[0] == "oh") {
300        let digit_word = words[1];
301        let minute = words_to_number(digit_word).map(|n| n as i64)?;
302        if minute >= 0 && minute <= 9 {
303            return Some(minute);
304        }
305        return None;
306    }
307
308    // Single word: must be a valid minute word (not a sequence of digits)
309    if words.len() == 1 {
310        let minute = words_to_number(words[0]).map(|n| n as i64)?;
311        if minute >= 0 && minute <= 59 {
312            return Some(minute);
313        }
314        return None;
315    }
316
317    // Two words: must be tens + units compound (e.g., "forty five")
318    // Reject patterns like "nine nine" which are digit sequences
319    if words.len() == 2 {
320        // First word must be a tens word (twenty, thirty, etc.)
321        let is_tens = matches!(words[0], "twenty" | "thirty" | "forty" | "fifty");
322        if !is_tens {
323            return None;
324        }
325        // Second word must be a units word
326        let is_units = matches!(
327            words[1],
328            "one" | "two" | "three" | "four" | "five" | "six" | "seven" | "eight" | "nine"
329        );
330        if !is_units {
331            return None;
332        }
333        let minute = words_to_number(input).map(|n| n as i64)?;
334        if minute >= 0 && minute <= 59 {
335            return Some(minute);
336        }
337    }
338
339    None
340}
341
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `spoken` is tagged as the written time `written`.
    fn assert_time(spoken: &str, written: &str) {
        assert_eq!(
            parse(spoken),
            Some(written.to_string()),
            "input: {:?}",
            spoken
        );
    }

    /// Assert that `spoken` is not recognised as a time.
    fn assert_rejected(spoken: &str) {
        assert_eq!(parse(spoken), None, "input: {:?}", spoken);
    }

    #[test]
    fn test_standard_time() {
        assert_time("two thirty", "02:30");
        assert_time("eight fifty one", "08:51");
        // Without am/pm, "eleven forty five" is rejected so it cannot collide with
        // year readings such as "eleven fifty five" → 1155.
        assert_rejected("eleven forty five");
        // With am/pm the same words are a time.
        assert_time("eleven forty five a m", "11:45 a.m.");
    }

    #[test]
    fn test_with_period() {
        assert_time("two p m", "02:00 p.m.");
        assert_time("eleven fifty five p m", "11:55 p.m.");
        assert_time("seven a m", "07:00 a.m.");
    }

    #[test]
    fn test_quarter_half() {
        assert_time("quarter past one", "01:15");
        assert_time("half past three", "03:30");
        assert_time("half past twelve", "12:30");
    }

    #[test]
    fn test_quarter_to() {
        assert_time("quarter to one", "12:45");
        assert_time("quarter to twelve", "11:45");
    }

    #[test]
    fn test_oclock() {
        assert_time("three o'clock", "03:00");
    }

    #[test]
    fn test_oh_minutes() {
        assert_time("eight o six", "08:06");
        assert_time("twelve oh five", "12:05");
    }

    #[test]
    fn test_with_timezone() {
        assert_time("eight oclock g m t", "08:00 gmt");
        assert_time("seven a m e s t", "07:00 a.m. est");
    }

    #[test]
    fn test_rejects_phone_like_input() {
        // Digit sequences are phone numbers, not times.
        assert_rejected("one two three one two three five six seven eight");
        assert_rejected("seven nine nine");
    }
}