text_processing_rs/
lib.rs1pub mod custom_rules;
20pub mod taggers;
21
22#[cfg(feature = "ffi")]
23pub mod ffi;
24
25use taggers::{
26 cardinal, date, decimal, electronic, measure, money, ordinal, punctuation, telephone, time,
27 whitelist, word,
28};
29
30pub fn normalize(input: &str) -> String {
35 let input = input.trim();
36
37 if let Some(result) = custom_rules::parse(input) {
39 return result;
40 }
41
42 if let Some(result) = whitelist::parse(input) {
44 return result;
45 }
46
47 if let Some(result) = punctuation::parse(input) {
49 return result;
50 }
51
52 if let Some(result) = word::parse(input) {
54 return result;
55 }
56
57 if let Some(result) = time::parse(input) {
59 return result;
60 }
61
62 if let Some(result) = date::parse(input) {
64 return result;
65 }
66
67 if let Some(result) = money::parse(input) {
69 return result;
70 }
71
72 if let Some(result) = measure::parse(input) {
74 return result;
75 }
76
77 if let Some(result) = decimal::parse(input) {
79 return result;
80 }
81
82 if let Some(result) = telephone::parse(input) {
84 return result;
85 }
86
87 if let Some(result) = electronic::parse(input) {
89 return result;
90 }
91
92 if let Some(result) = decimal::parse(input) {
94 return result;
95 }
96
97 if let Some(result) = ordinal::parse(input) {
99 return result;
100 }
101
102 if let Some(num) = cardinal::parse(input) {
104 return num;
105 }
106
107 input.to_string()
109}
110
111pub fn normalize_with_lang(input: &str, _lang: &str) -> String {
113 normalize(input)
115}
116
/// Largest number of consecutive whitespace-separated tokens that
/// `normalize_sentence` will try to parse as a single span.
const DEFAULT_MAX_SPAN_TOKENS: usize = 16;
119
120fn parse_span(span: &str) -> Option<(String, u8)> {
128 let token_count = span.split_whitespace().count();
129 if token_count == 0 {
130 return None;
131 }
132
133 if let Some(result) = custom_rules::parse(span) {
134 return Some((result, 110));
135 }
136 if let Some(result) = whitelist::parse(span) {
137 return Some((result, 100));
138 }
139 if let Some(result) = punctuation::parse(span) {
140 return Some((result, 98));
141 }
142 if let Some(result) = money::parse(span) {
143 return Some((result, 95));
144 }
145 if let Some(result) = measure::parse(span) {
146 return Some((result, 90));
147 }
148 if let Some(result) = date::parse(span) {
149 return Some((result, 88));
150 }
151 if let Some(result) = time::parse(span) {
152 return Some((result, 85));
153 }
154 if let Some(result) = electronic::parse(span) {
155 return Some((result, 82));
156 }
157 if let Some(result) = decimal::parse(span) {
158 return Some((result, 80));
159 }
160 if let Some(result) = ordinal::parse(span) {
161 return Some((result, 75));
162 }
163
164 if token_count <= 4 {
166 if let Some(result) = cardinal::parse(span) {
167 return Some((result, 70));
168 }
169 }
170
171 None
172}
173
174pub fn normalize_sentence(input: &str) -> String {
187 normalize_sentence_with_max_span(input, DEFAULT_MAX_SPAN_TOKENS)
188}
189
190pub fn normalize_sentence_with_max_span(input: &str, max_span_tokens: usize) -> String {
204 let trimmed = input.trim();
205 if trimmed.is_empty() {
206 return trimmed.to_string();
207 }
208
209 let max_span = if max_span_tokens == 0 {
210 1
211 } else {
212 max_span_tokens
213 };
214 let tokens: Vec<&str> = trimmed.split_whitespace().collect();
215 let mut out: Vec<String> = Vec::with_capacity(tokens.len());
216 let mut i = 0usize;
217
218 while i < tokens.len() {
219 let max_end = usize::min(tokens.len(), i + max_span);
220 let mut best: Option<(usize, String, u8)> = None;
221
222 for end in (i + 1..=max_end).rev() {
224 let span = tokens[i..end].join(" ");
225 let Some((candidate, score)) = parse_span(&span) else {
226 continue;
227 };
228
229 let candidate_trimmed = candidate.trim();
231 if candidate_trimmed.is_empty() || candidate_trimmed == span {
232 continue;
233 }
234
235 let candidate_len = end - i;
236 match &best {
237 None => {
238 best = Some((end, candidate, score));
239 }
240 Some((best_end, _, best_score)) => {
241 let best_len = *best_end - i;
242 if candidate_len > best_len
243 || (candidate_len == best_len && score > *best_score)
244 {
245 best = Some((end, candidate, score));
246 }
247 }
248 }
249 }
250
251 if let Some((end, replacement, _)) = best {
252 out.push(replacement);
253 i = end;
254 } else {
255 out.push(tokens[i].to_string());
256 i += 1;
257 }
258 }
259
260 out.join(" ")
261}
262
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `normalize` maps each `(input, expected)` pair correctly.
    fn check_normalize(cases: &[(&str, &str)]) {
        for &(spoken, written) in cases {
            assert_eq!(normalize(spoken), written);
        }
    }

    /// Asserts that `normalize_sentence` maps each `(input, expected)` pair
    /// correctly.
    fn check_sentence(cases: &[(&str, &str)]) {
        for &(spoken, written) in cases {
            assert_eq!(normalize_sentence(spoken), written);
        }
    }

    #[test]
    fn test_basic_cardinal() {
        check_normalize(&[("one", "1"), ("twenty one", "21"), ("one hundred", "100")]);
    }

    #[test]
    fn test_basic_money() {
        check_normalize(&[("five dollars", "$5")]);
    }

    #[test]
    fn test_passthrough() {
        check_normalize(&[("hello world", "hello world")]);
    }

    #[test]
    fn test_sentence_cardinal() {
        check_sentence(&[("I have twenty one apples", "I have 21 apples")]);
    }

    #[test]
    fn test_sentence_money() {
        check_sentence(&[(
            "five dollars and fifty cents for the coffee",
            "$5.50 for the coffee",
        )]);
    }

    #[test]
    fn test_sentence_passthrough() {
        check_sentence(&[
            ("hello world", "hello world"),
            ("the quick brown fox", "the quick brown fox"),
        ]);
    }

    #[test]
    fn test_sentence_mixed() {
        check_sentence(&[(
            "I paid five dollars for twenty three items",
            "I paid $5 for 23 items",
        )]);
    }

    #[test]
    fn test_sentence_empty() {
        check_sentence(&[("", ""), (" ", "")]);
    }

    #[test]
    fn test_sentence_single_number() {
        check_sentence(&[("forty two", "42")]);
    }

    #[test]
    fn test_punctuation() {
        check_normalize(&[
            ("period", "."),
            ("comma", ","),
            ("question mark", "?"),
            ("exclamation point", "!"),
        ]);
    }

    #[test]
    fn test_sentence_punctuation() {
        check_sentence(&[
            ("hello period", "hello ."),
            ("yes comma I agree", "yes , I agree"),
            ("really question mark", "really ?"),
        ]);
    }
}