Skip to main content

text_processing_rs/taggers/
punctuation.rs

1//! Punctuation tagger.
2//!
3//! Converts spoken punctuation words to their written symbols:
4//! - "period" → "."
5//! - "comma" → ","
6//! - "question mark" → "?"
7//! - "exclamation point" → "!"
8
9use lazy_static::lazy_static;
10
11lazy_static! {
12    /// Spoken punctuation → written symbol mappings.
13    /// Ordered longest-first so multi-word patterns match before single-word ones.
14    static ref PUNCTUATION: Vec<(&'static str, &'static str)> = vec![
15        // Multi-word patterns first
16        ("exclamation point", "!"),
17        ("exclamation mark", "!"),
18        ("question mark", "?"),
19        ("open parenthesis", "("),
20        ("close parenthesis", ")"),
21        ("left parenthesis", "("),
22        ("right parenthesis", ")"),
23        ("open bracket", "["),
24        ("close bracket", "]"),
25        ("left bracket", "["),
26        ("right bracket", "]"),
27        ("open brace", "{"),
28        ("close brace", "}"),
29        ("left brace", "{"),
30        ("right brace", "}"),
31        ("double quote", "\""),
32        ("single quote", "'"),
33        ("forward slash", "/"),
34        ("back slash", "\\"),
35
36        // Single-word patterns
37        ("period", "."),
38        ("dot", "."),
39        ("comma", ","),
40        ("colon", ":"),
41        ("semicolon", ";"),
42        ("hyphen", "-"),
43        ("dash", "-"),
44        ("ellipsis", "..."),
45        ("ampersand", "&"),
46        ("asterisk", "*"),
47        ("at sign", "@"),
48        ("hash", "#"),
49        ("percent", "%"),
50        ("plus", "+"),
51        ("equals", "="),
52        ("tilde", "~"),
53        ("underscore", "_"),
54        ("pipe", "|"),
55        ("slash", "/"),
56    ];
57}
58
59/// Try to parse spoken punctuation into its written symbol.
60///
61/// Returns `Some(symbol)` if the entire input matches a known punctuation word.
62/// Only matches exact full input — does not replace within sentences.
63pub fn parse(input: &str) -> Option<String> {
64    let input_lower = input.to_lowercase();
65    let input_trimmed = input_lower.trim();
66
67    for (pattern, symbol) in PUNCTUATION.iter() {
68        if input_trimmed == *pattern {
69            return Some(symbol.to_string());
70        }
71    }
72
73    None
74}
75
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that each spoken phrase parses to the paired written symbol.
    fn assert_all_parse(cases: &[(&str, &str)]) {
        for &(spoken, written) in cases {
            assert_eq!(
                parse(spoken),
                Some(written.to_string()),
                "unexpected result for {:?}",
                spoken
            );
        }
    }

    #[test]
    fn test_basic_punctuation() {
        assert_all_parse(&[
            ("period", "."),
            ("comma", ","),
            ("colon", ":"),
            ("semicolon", ";"),
        ]);
    }

    #[test]
    fn test_multi_word() {
        assert_all_parse(&[
            ("question mark", "?"),
            ("exclamation point", "!"),
            ("exclamation mark", "!"),
            ("open parenthesis", "("),
            ("close parenthesis", ")"),
            ("double quote", "\""),
            ("forward slash", "/"),
        ]);
    }

    #[test]
    fn test_case_insensitive() {
        // Matching ignores the casing of the spoken input.
        assert_all_parse(&[
            ("Period", "."),
            ("COMMA", ","),
            ("Question Mark", "?"),
        ]);
    }

    #[test]
    fn test_symbols() {
        assert_all_parse(&[
            ("hyphen", "-"),
            ("dash", "-"),
            ("ampersand", "&"),
            ("asterisk", "*"),
            ("hash", "#"),
            ("percent", "%"),
            ("at sign", "@"),
            ("ellipsis", "..."),
        ]);
    }

    #[test]
    fn test_no_match() {
        // Unknown words, phrases embedded in a sentence, and empty input
        // must all be rejected.
        for &input in &["hello", "the period was great", ""] {
            assert_eq!(parse(input), None, "expected no match for {:?}", input);
        }
    }
}
124}