tokenizers/normalizers/
precompiled.rs1use crate::tokenizer::{NormalizedString, Normalizer, Result};
2pub use spm_precompiled::Precompiled;
3use std::cmp::Ordering;
4use unicode_segmentation::UnicodeSegmentation;
5
6fn replace(transformations: &mut Vec<(char, isize)>, old_part: &str, new_part: &str) {
7 let old_count = old_part.chars().count() as isize;
8 let new_count = new_part.chars().count() as isize;
9 let diff = new_count - old_count;
10
11 transformations.extend(new_part.chars().map(|c| (c, 0)));
13
14 match diff.cmp(&0) {
15 Ordering::Greater => {
17 transformations
18 .iter_mut()
19 .rev()
20 .take(diff as usize)
21 .for_each(|(_, cs)| *cs = 1);
22 }
23 Ordering::Less => {
25 if let Some((_, cs)) = transformations.last_mut() {
26 *cs += diff;
27 }
28 }
29 _ => {}
30 }
31}
32
33impl Normalizer for Precompiled {
34 fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
35 let mut transformations = Vec::with_capacity(normalized.get().len());
36 let mut modified = false;
46 normalized.get().graphemes(true).for_each(|grapheme| {
47 if grapheme.len() < 6 {
48 if let Some(norm) = self.transform(grapheme) {
49 modified = true;
50 replace(&mut transformations, grapheme, norm);
51 return;
52 }
53 }
54 for (char_index, c) in grapheme.char_indices() {
55 let part = &grapheme[char_index..char_index + c.len_utf8()];
56 if let Some(norm) = self.transform(part) {
57 modified = true;
58 replace(&mut transformations, part, norm);
59 } else {
60 transformations.push((c, 0));
61 }
62 }
63 });
64 if modified {
65 normalized.transform(transformations, 0);
66 }
67 Ok(())
68 }
69}
70
71#[cfg(test)]
72mod tests {
73 use super::*;
74
75 #[test]
76 fn expansion_followed_by_removal() {
77 let mut transformations = vec![];
79
80 let mut n = NormalizedString::from("™\x1eg");
81 replace(&mut transformations, "™", "TM");
82 replace(&mut transformations, "\x1e", "");
83 transformations.push(('g', 0));
84
85 n.transform(transformations, 0);
86
87 assert_eq!(n.get(), "TMg");
88 }
89}