tokenizers/normalizers/
strip.rs1use crate::tokenizer::{NormalizedString, Normalizer, Result};
2use crate::utils::macro_rules_attribute;
3use serde::{Deserialize, Serialize};
4use unicode_normalization_alignments::char::is_combining_mark;
5
6#[derive(Copy, Clone, Debug, Deserialize, Serialize)]
7#[serde(tag = "type")]
8#[non_exhaustive]
9pub struct Strip {
10 pub strip_left: bool,
11 pub strip_right: bool,
12}
13
14impl Strip {
15 pub fn new(strip_left: bool, strip_right: bool) -> Self {
16 Self {
17 strip_left,
18 strip_right,
19 }
20 }
21}
22
23impl Normalizer for Strip {
24 fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
26 if self.strip_left && self.strip_right {
27 normalized.strip();
29 } else {
30 if self.strip_left {
31 normalized.lstrip();
32 }
33
34 if self.strip_right {
35 normalized.rstrip();
36 }
37 }
38
39 Ok(())
40 }
41}
42
43#[derive(Copy, Clone, Debug)]
47#[macro_rules_attribute(impl_serde_type!)]
48pub struct StripAccents;
49
50impl Normalizer for StripAccents {
51 fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
53 normalized.filter(|c| !is_combining_mark(c));
54 Ok(())
55 }
56}
57
58#[cfg(test)]
59mod tests {
60 use super::*;
61 use crate::normalizer::NormalizedString;
62 use crate::normalizers::Lowercase;
63 use crate::normalizers::NFKD;
64 use unicode_normalization_alignments::UnicodeNormalization;
65
66 #[test]
67 fn test_strip_accents() {
68 let original: String = "Me llamó".nfkd().map(|(c, _)| c).collect();
70 let normalized = "Me llamo";
71 assert_ne!(original, normalized);
72 let mut n = NormalizedString::from(original);
73 StripAccents.normalize(&mut n).unwrap();
74 assert_eq!(&n.get(), &normalized);
75
76 let original = "Me llamo";
78 let normalized = "Me llamo";
79 assert_eq!(original, normalized);
80 let mut n = NormalizedString::from(original);
81 StripAccents.normalize(&mut n).unwrap();
82 assert_eq!(&n.get(), &normalized);
83
84 let original: String = "这很简单".nfkd().map(|(c, _)| c).collect();
86 let normalized = "这很简单";
87 assert_eq!(original, normalized);
88 let mut n = NormalizedString::from(original);
89 StripAccents.normalize(&mut n).unwrap();
90 assert_eq!(&n.get(), &normalized);
91 }
92
93 #[test]
94 fn test_vietnamese_bug() {
95 let original: String = "ậ…".to_string();
96 let normalized = "a...".to_string();
97 assert_ne!(original, normalized);
98 let mut n = NormalizedString::from(original);
99 NFKD.normalize(&mut n).unwrap();
100 StripAccents.normalize(&mut n).unwrap();
101 assert_eq!(&n.get(), &normalized);
102 Lowercase.normalize(&mut n).unwrap();
103 assert_eq!(&n.get(), &normalized);
104
105 let original: String = "Cụ thể, bạn sẽ tham gia một nhóm các giám đốc điều hành tổ chức, các nhà lãnh đạo doanh nghiệp, các học giả, chuyên gia phát triển và tình nguyện viên riêng biệt trong lĩnh vực phi lợi nhuận…".to_string();
106 let normalized = "cu the, ban se tham gia mot nhom cac giam đoc đieu hanh to chuc, cac nha lanh đao doanh nghiep, cac hoc gia, chuyen gia phat trien va tinh nguyen vien rieng biet trong linh vuc phi loi nhuan...".to_string();
107 let mut n = NormalizedString::from(original);
108 NFKD.normalize(&mut n).unwrap();
109 StripAccents.normalize(&mut n).unwrap();
110 Lowercase.normalize(&mut n).unwrap();
111 assert_eq!(&n.get(), &normalized);
112 }
113
114 #[test]
115 fn test_thai_bug() {
116 let original = "ำน\u{e49}ำ3ลำ".to_string();
117 let normalized = "านา3ลา".to_string();
118 assert_ne!(original, normalized);
119 let mut n = NormalizedString::from(original);
120 NFKD.normalize(&mut n).unwrap();
121 StripAccents.normalize(&mut n).unwrap();
122 Lowercase.normalize(&mut n).unwrap();
123 assert_eq!(&n.get(), &normalized);
124 }
125
126 #[test]
127 fn test_strip_accents_multiple() {
128 let original = "e\u{304}\u{304}\u{304}o";
129 let normalized = "eo";
130 assert_ne!(original, normalized);
131 let mut n = NormalizedString::from(original);
132 StripAccents.normalize(&mut n).unwrap();
133 assert_eq!(&n.get(), &normalized);
134 assert_eq!(
135 n,
136 NormalizedString::new(
137 original.to_string(),
138 normalized.to_string(),
139 vec![(0, 1), (7, 8)],
140 0
141 )
142 );
143 assert_eq!(
144 n.alignments_original(),
145 vec![
146 (0, 1),
147 (1, 1),
148 (1, 1),
149 (1, 1),
150 (1, 1),
151 (1, 1),
152 (1, 1),
153 (1, 2)
154 ]
155 );
156 }
157}