tokenizers/normalizers/
unicode.rs1use crate::tokenizer::{NormalizedString, Normalizer, Result};
2use crate::utils::macro_rules_attribute;
3
4#[derive(Default, Copy, Clone, Debug)]
5#[macro_rules_attribute(impl_serde_type!)]
6pub struct NFD;
7impl Normalizer for NFD {
8 fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
9 normalized.nfd();
10 Ok(())
11 }
12}
13
14#[derive(Default, Copy, Clone, Debug)]
15#[macro_rules_attribute(impl_serde_type!)]
16pub struct NFKD;
17impl Normalizer for NFKD {
18 fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
19 normalized.nfkd();
20 Ok(())
21 }
22}
23
24#[derive(Default, Copy, Clone, Debug)]
25#[macro_rules_attribute(impl_serde_type!)]
26pub struct NFC;
27impl Normalizer for NFC {
28 fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
29 normalized.nfc();
30 Ok(())
31 }
32}
33
34#[derive(Default, Copy, Clone, Debug)]
35#[macro_rules_attribute(impl_serde_type!)]
36pub struct NFKC;
37impl Normalizer for NFKC {
38 fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
39 normalized.nfkc();
40 Ok(())
41 }
42}
43
44fn do_nmt(normalized: &mut NormalizedString) {
45 normalized
47 .filter(|c| {
48 !matches!(
49 c as u32,
50 0x0001..=0x0008 |
51 0x000B |
52 0x000E..=0x001F |
53 0x007F |
54 0x008F |
55 0x009F
56 )
57 })
58 .map(|c| match c as u32 {
60 0x0009 => ' ',
61 0x000A => ' ',
62 0x000C => ' ',
63 0x000D => ' ',
64 0x1680 => ' ',
65 0x200B..=0x200F => ' ',
66 0x2028 => ' ',
67 0x2029 => ' ',
68 0x2581 => ' ',
69 0xFEFF => ' ',
70 0xFFFD => ' ',
71 _ => c,
72 });
73}
74
75#[derive(Default, Copy, Clone, Debug)]
76#[macro_rules_attribute(impl_serde_type!)]
77pub struct Nmt;
78impl Normalizer for Nmt {
79 fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
80 do_nmt(normalized);
81 Ok(())
82 }
83}
84
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that `NFKC` turns the single code point U+FB01 into the two
    /// letters "fi", and that the alignments match in both directions.
    #[test]
    fn test_nfkc() {
        // U+FB01 is one code point encoded as three UTF-8 bytes.
        let ligature = String::from("\u{fb01}");
        let expanded = String::from("fi");

        let mut subject = NormalizedString::from(ligature.clone());
        NFKC.normalize(&mut subject).unwrap();

        // Expected state: both normalized bytes carry the alignment (0, 3).
        let expected = NormalizedString::new(ligature, expanded, vec![(0, 3), (0, 3)], 0);
        assert_eq!(subject, expected);

        // Reverse direction: each of the three original bytes reports (0, 2).
        assert_eq!(
            subject.alignments_original(),
            vec![(0, 2), (0, 2), (0, 2)]
        );
    }
}