//! # text-processing-rs
//!
//! Inverse Text Normalization (ITN) — convert spoken-form ASR output to written form.
//!
//! Converts spoken-form text to written form:
//! - "two hundred thirty two" → "232"
//! - "five dollars and fifty cents" → "$5.50"
//! - "january fifth twenty twenty five" → "January 5, 2025"
//!
//! ## Usage
//!
//! ```
//! use text_processing_rs::normalize;
//!
//! let result = normalize("two hundred");
//! assert_eq!(result, "200");
//! ```
19pub mod custom_rules;
20pub mod taggers;
21
22#[cfg(feature = "ffi")]
23pub mod ffi;
24
25use taggers::{
26    cardinal, date, decimal, electronic, measure, money, ordinal, punctuation, telephone, time,
27    whitelist, word,
28};
29
30/// Normalize spoken-form text to written form.
31///
32/// Tries taggers in order of specificity (most specific first).
33/// Returns original text if no tagger matches.
34pub fn normalize(input: &str) -> String {
35    let input = input.trim();
36
37    // Apply custom user rules first (highest priority)
38    if let Some(result) = custom_rules::parse(input) {
39        return result;
40    }
41
42    // Apply whitelist replacements (abbreviations, special terms)
43    if let Some(result) = whitelist::parse(input) {
44        return result;
45    }
46
47    // Try punctuation ("period" → ".", "comma" → ",")
48    if let Some(result) = punctuation::parse(input) {
49        return result;
50    }
51
52    // Try word patterns (spelled letters + numbers, numbers with punctuation)
53    if let Some(result) = word::parse(input) {
54        return result;
55    }
56
57    // Try time expressions (before telephone to avoid "two thirty" → alphanumeric)
58    if let Some(result) = time::parse(input) {
59        return result;
60    }
61
62    // Try date expressions (before telephone to avoid "nineteen ninety four" → alphanumeric)
63    if let Some(result) = date::parse(input) {
64        return result;
65    }
66
67    // Try money (contains number + currency) - before telephone
68    if let Some(result) = money::parse(input) {
69        return result;
70    }
71
72    // Try measurements (contains number + unit) - before telephone
73    if let Some(result) = measure::parse(input) {
74        return result;
75    }
76
77    // Try decimal numbers (before telephone to catch "sixty point two")
78    if let Some(result) = decimal::parse(input) {
79        return result;
80    }
81
82    // Try telephone/IP numbers (before electronic to catch IP addresses)
83    if let Some(result) = telephone::parse(input) {
84        return result;
85    }
86
87    // Try electronic addresses (emails, URLs)
88    if let Some(result) = electronic::parse(input) {
89        return result;
90    }
91
92    // Try decimal numbers
93    if let Some(result) = decimal::parse(input) {
94        return result;
95    }
96
97    // Try ordinal numbers
98    if let Some(result) = ordinal::parse(input) {
99        return result;
100    }
101
102    // Try cardinal number
103    if let Some(num) = cardinal::parse(input) {
104        return num;
105    }
106
107    // No match - return original
108    input.to_string()
109}
110
111/// Normalize with language selection (future use).
112pub fn normalize_with_lang(input: &str, _lang: &str) -> String {
113    // TODO: Language-specific taggers
114    normalize(input)
115}
116
/// Default maximum token span to consider when scanning a sentence.
/// Used by [`normalize_sentence`]; override via
/// [`normalize_sentence_with_max_span`].
const DEFAULT_MAX_SPAN_TOKENS: usize = 16;
119
120/// Try to parse a span of text using sentence-safe taggers.
121///
122/// Returns `(replacement, priority_score)` if a tagger matches.
123/// Taggers are ordered by precision: high-confidence patterns first,
124/// broad patterns (cardinal) last and limited to short spans.
125///
126/// Excluded in sentence mode: `word` and `telephone` (over-fire on natural language).
127fn parse_span(span: &str) -> Option<(String, u8)> {
128    let token_count = span.split_whitespace().count();
129    if token_count == 0 {
130        return None;
131    }
132
133    if let Some(result) = custom_rules::parse(span) {
134        return Some((result, 110));
135    }
136    if let Some(result) = whitelist::parse(span) {
137        return Some((result, 100));
138    }
139    if let Some(result) = punctuation::parse(span) {
140        return Some((result, 98));
141    }
142    if let Some(result) = money::parse(span) {
143        return Some((result, 95));
144    }
145    if let Some(result) = measure::parse(span) {
146        return Some((result, 90));
147    }
148    if let Some(result) = date::parse(span) {
149        return Some((result, 88));
150    }
151    if let Some(result) = time::parse(span) {
152        return Some((result, 85));
153    }
154    if let Some(result) = electronic::parse(span) {
155        return Some((result, 82));
156    }
157    if let Some(result) = decimal::parse(span) {
158        return Some((result, 80));
159    }
160    if let Some(result) = ordinal::parse(span) {
161        return Some((result, 75));
162    }
163
164    // Cardinal only for short spans to avoid over-matching on natural language.
165    if token_count <= 4 {
166        if let Some(result) = cardinal::parse(span) {
167            return Some((result, 70));
168        }
169    }
170
171    None
172}
173
174/// Normalize a full sentence, replacing spoken-form spans with written form.
175///
176/// Unlike [`normalize`] which expects the entire input to be a single expression,
177/// this function scans for normalizable spans within a larger sentence.
178/// Uses a default max span of 16 tokens.
179///
180/// ```
181/// use text_processing_rs::normalize_sentence;
182///
183/// assert_eq!(normalize_sentence("I have twenty one apples"), "I have 21 apples");
184/// assert_eq!(normalize_sentence("hello world"), "hello world");
185/// ```
186pub fn normalize_sentence(input: &str) -> String {
187    normalize_sentence_with_max_span(input, DEFAULT_MAX_SPAN_TOKENS)
188}
189
190/// Normalize a full sentence with a configurable max span size.
191///
192/// `max_span_tokens` controls the maximum number of consecutive tokens
193/// that will be considered as a single normalizable expression.
194/// Smaller values are faster but may miss multi-word expressions.
195/// Larger values catch more patterns but do more work per token.
196///
197/// ```
198/// use text_processing_rs::normalize_sentence_with_max_span;
199///
200/// // Short span: only catches small expressions
201/// assert_eq!(normalize_sentence_with_max_span("I have twenty one apples", 4), "I have 21 apples");
202/// ```
203pub fn normalize_sentence_with_max_span(input: &str, max_span_tokens: usize) -> String {
204    let trimmed = input.trim();
205    if trimmed.is_empty() {
206        return trimmed.to_string();
207    }
208
209    let max_span = if max_span_tokens == 0 {
210        1
211    } else {
212        max_span_tokens
213    };
214    let tokens: Vec<&str> = trimmed.split_whitespace().collect();
215    let mut out: Vec<String> = Vec::with_capacity(tokens.len());
216    let mut i = 0usize;
217
218    while i < tokens.len() {
219        let max_end = usize::min(tokens.len(), i + max_span);
220        let mut best: Option<(usize, String, u8)> = None;
221
222        // Longest-span-first search keeps replacements stable and non-overlapping.
223        for end in (i + 1..=max_end).rev() {
224            let span = tokens[i..end].join(" ");
225            let Some((candidate, score)) = parse_span(&span) else {
226                continue;
227            };
228
229            // Reject no-op results (tagger returned same text).
230            let candidate_trimmed = candidate.trim();
231            if candidate_trimmed.is_empty() || candidate_trimmed == span {
232                continue;
233            }
234
235            let candidate_len = end - i;
236            match &best {
237                None => {
238                    best = Some((end, candidate, score));
239                }
240                Some((best_end, _, best_score)) => {
241                    let best_len = *best_end - i;
242                    if candidate_len > best_len
243                        || (candidate_len == best_len && score > *best_score)
244                    {
245                        best = Some((end, candidate, score));
246                    }
247                }
248            }
249        }
250
251        if let Some((end, replacement, _)) = best {
252            out.push(replacement);
253            i = end;
254        } else {
255            out.push(tokens[i].to_string());
256            i += 1;
257        }
258    }
259
260    out.join(" ")
261}
262
#[cfg(test)]
mod tests {
    use super::*;

    // --- single-expression `normalize` ---

    #[test]
    fn cardinal_numbers() {
        assert_eq!(normalize("one"), "1");
        assert_eq!(normalize("twenty one"), "21");
        assert_eq!(normalize("one hundred"), "100");
    }

    #[test]
    fn money_expressions() {
        assert_eq!(normalize("five dollars"), "$5");
    }

    #[test]
    fn punctuation_words() {
        assert_eq!(normalize("period"), ".");
        assert_eq!(normalize("comma"), ",");
        assert_eq!(normalize("question mark"), "?");
        assert_eq!(normalize("exclamation point"), "!");
    }

    #[test]
    fn unmatched_input_passes_through() {
        assert_eq!(normalize("hello world"), "hello world");
    }

    // --- sentence scanning ---

    #[test]
    fn sentence_with_cardinal() {
        assert_eq!(
            normalize_sentence("I have twenty one apples"),
            "I have 21 apples"
        );
    }

    #[test]
    fn sentence_with_money() {
        assert_eq!(
            normalize_sentence("five dollars and fifty cents for the coffee"),
            "$5.50 for the coffee"
        );
    }

    #[test]
    fn sentence_with_mixed_expressions() {
        assert_eq!(
            normalize_sentence("I paid five dollars for twenty three items"),
            "I paid $5 for 23 items"
        );
    }

    #[test]
    fn sentence_with_punctuation_words() {
        assert_eq!(normalize_sentence("hello period"), "hello .");
        assert_eq!(normalize_sentence("yes comma I agree"), "yes , I agree");
        assert_eq!(normalize_sentence("really question mark"), "really ?");
    }

    #[test]
    fn sentence_passthrough() {
        assert_eq!(normalize_sentence("hello world"), "hello world");
        assert_eq!(
            normalize_sentence("the quick brown fox"),
            "the quick brown fox"
        );
    }

    #[test]
    fn sentence_edge_cases() {
        assert_eq!(normalize_sentence(""), "");
        assert_eq!(normalize_sentence("   "), "");
        assert_eq!(normalize_sentence("forty two"), "42");
    }
}
342}