Skip to main content

text_processing_rs/
custom_rules.rs

1//! Custom user-defined normalization rules.
2//!
3//! Allows callers to register spoken→written mappings at runtime.
4//! These rules are checked with the highest priority in sentence mode,
5//! before any built-in taggers.
6//!
7//! Example: ("linux", "Linux"), ("gee pee tee", "GPT")
8
9use std::sync::RwLock;
10
11use lazy_static::lazy_static;
12
13lazy_static! {
14    /// Global custom rules store. Entries are (lowercase_spoken, written).
15    static ref CUSTOM_RULES: RwLock<Vec<(String, String)>> = RwLock::new(Vec::new());
16}
17
18/// Add a custom spoken→written mapping.
19///
20/// The spoken form is stored lowercased for case-insensitive matching.
21/// If the same spoken form already exists, it is replaced.
22pub fn add_rule(spoken: &str, written: &str) {
23    let spoken_lower = spoken.to_lowercase();
24    let mut rules = CUSTOM_RULES.write().unwrap();
25    // Replace if exists
26    if let Some(entry) = rules.iter_mut().find(|(s, _)| *s == spoken_lower) {
27        entry.1 = written.to_string();
28    } else {
29        rules.push((spoken_lower, written.to_string()));
30    }
31}
32
33/// Remove a custom rule by its spoken form.
34///
35/// Returns true if the rule was found and removed.
36pub fn remove_rule(spoken: &str) -> bool {
37    let spoken_lower = spoken.to_lowercase();
38    let mut rules = CUSTOM_RULES.write().unwrap();
39    let len_before = rules.len();
40    rules.retain(|(s, _)| *s != spoken_lower);
41    rules.len() < len_before
42}
43
44/// Clear all custom rules.
45pub fn clear_rules() {
46    let mut rules = CUSTOM_RULES.write().unwrap();
47    rules.clear();
48}
49
50/// Try to match input against custom rules (exact match, case-insensitive).
51///
52/// Returns `Some(written_form)` if a rule matches, `None` otherwise.
53pub fn parse(input: &str) -> Option<String> {
54    let input_lower = input.to_lowercase();
55    let input_trimmed = input_lower.trim();
56
57    let rules = CUSTOM_RULES.read().unwrap();
58    for (spoken, written) in rules.iter() {
59        if input_trimmed == spoken {
60            return Some(written.clone());
61        }
62    }
63
64    None
65}
66
67/// Get the number of custom rules currently registered.
68pub fn rule_count() -> usize {
69    let rules = CUSTOM_RULES.read().unwrap();
70    rules.len()
71}
72
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `parse(input)` yields exactly `expected`.
    #[track_caller]
    fn assert_parses(input: &str, expected: Option<&str>) {
        assert_eq!(parse(input), expected.map(String::from));
    }

    /// The whole API is exercised in one test: the rules live in shared
    /// global state, so separate tests running in parallel would race.
    #[test]
    fn test_custom_rules() {
        clear_rules();

        // Registering a rule makes it visible to `parse`, case-insensitively.
        add_rule("gee pee tee", "GPT");
        assert_parses("gee pee tee", Some("GPT"));
        assert_parses("Gee Pee Tee", Some("GPT"));
        assert_parses("unknown", None);

        // Re-registering the same spoken form overwrites instead of duplicating.
        add_rule("gee pee tee", "GPT-4");
        assert_parses("gee pee tee", Some("GPT-4"));
        assert_eq!(rule_count(), 1);

        // Removal succeeds once, then reports that nothing is left to remove.
        assert!(remove_rule("gee pee tee"));
        assert_parses("gee pee tee", None);
        assert!(!remove_rule("gee pee tee"));

        // Several rules coexist, and `clear_rules` wipes all of them.
        add_rule("alpha", "A");
        add_rule("bravo", "B");
        assert_eq!(rule_count(), 2);
        assert_parses("alpha", Some("A"));
        assert_parses("bravo", Some("B"));
        clear_rules();
        assert_eq!(rule_count(), 0);
        assert_parses("alpha", None);
    }
}