tokenizers/normalizers/
unicode.rs

1use crate::tokenizer::{NormalizedString, Normalizer, Result};
2use crate::utils::macro_rules_attribute;
3
4#[derive(Default, Copy, Clone, Debug)]
5#[macro_rules_attribute(impl_serde_type!)]
6pub struct NFD;
7impl Normalizer for NFD {
8    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
9        normalized.nfd();
10        Ok(())
11    }
12}
13
14#[derive(Default, Copy, Clone, Debug)]
15#[macro_rules_attribute(impl_serde_type!)]
16pub struct NFKD;
17impl Normalizer for NFKD {
18    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
19        normalized.nfkd();
20        Ok(())
21    }
22}
23
24#[derive(Default, Copy, Clone, Debug)]
25#[macro_rules_attribute(impl_serde_type!)]
26pub struct NFC;
27impl Normalizer for NFC {
28    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
29        normalized.nfc();
30        Ok(())
31    }
32}
33
34#[derive(Default, Copy, Clone, Debug)]
35#[macro_rules_attribute(impl_serde_type!)]
36pub struct NFKC;
37impl Normalizer for NFKC {
38    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
39        normalized.nfkc();
40        Ok(())
41    }
42}
43
44fn do_nmt(normalized: &mut NormalizedString) {
45    // Ascii Control characters
46    normalized
47        .filter(|c| {
48            !matches!(
49                c as u32,
50                0x0001..=0x0008 |
51                0x000B |
52                0x000E..=0x001F |
53                0x007F |
54                0x008F |
55                0x009F
56            )
57        })
58        // Other code points considered as whitespace.
59        .map(|c| match c as u32 {
60            0x0009 => ' ',
61            0x000A => ' ',
62            0x000C => ' ',
63            0x000D => ' ',
64            0x1680 => ' ',
65            0x200B..=0x200F => ' ',
66            0x2028 => ' ',
67            0x2029 => ' ',
68            0x2581 => ' ',
69            0xFEFF => ' ',
70            0xFFFD => ' ',
71            _ => c,
72        });
73}
74
75#[derive(Default, Copy, Clone, Debug)]
76#[macro_rules_attribute(impl_serde_type!)]
77pub struct Nmt;
78impl Normalizer for Nmt {
79    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
80        do_nmt(normalized);
81        Ok(())
82    }
83}
84
#[cfg(test)]
mod tests {
    use super::*;

    /// `NFKC` must turn the single ligature U+FB01 into the two letters "fi" and keep the
    /// alignment information between the original and normalized strings consistent.
    #[test]
    fn test_nfkc() {
        // U+FB01 (LATIN SMALL LIGATURE FI) is three bytes in UTF-8; "fi" is two bytes.
        let source = "\u{fb01}".to_string();
        let expected_text = "fi".to_string();

        let mut actual = NormalizedString::from(source.clone());
        NFKC.normalize(&mut actual).unwrap();

        // Two alignment entries, both (0, 3) -- presumably one per normalized byte, each
        // pointing at the full 3-byte span of the original ligature.
        let expected = NormalizedString::new(source, expected_text, vec![(0, 3), (0, 3)], 0);
        assert_eq!(actual, expected);

        // Three entries, all (0, 2) -- presumably one per original byte, each pointing at the
        // full 2-byte normalized output.
        assert_eq!(
            actual.alignments_original(),
            vec![(0, 2), (0, 2), (0, 2)]
        );
    }
}