1use super::{CharNormalizer, CharOrStr};
2use crate::{Script, Token};
3
/// Arabic specialized [`CharNormalizer`].
///
/// Removes the tatweel/kashida character (U+0640) and folds common letter
/// variants to a canonical form: alef variants (أ, إ, آ, ٱ) to bare alef (ا),
/// alef maksura (ى) to yeh (ي), and teh marbuta (ة) to heh (ه).
pub struct ArabicNormalizer;
15impl CharNormalizer for ArabicNormalizer {
17 fn normalize_char(&self, c: char) -> Option<CharOrStr> {
19 normalize_arabic_char(c)
20 }
21
22 fn should_normalize(&self, token: &Token) -> bool {
23 token.script == Script::Arabic && token.lemma.chars().any(is_shoud_normalize)
24 }
25}
26
27fn normalize_arabic_char(c: char) -> Option<CharOrStr> {
28 match c {
29 'ـ' => None,
30 'أ' | 'إ' | 'آ' | 'ٱ' => Some('ا'.into()), 'ى' => Some('ي'.into()),
32 'ة' => Some('ه'.into()),
33 _ => Some(c.into()),
34 }
35}
36
/// Returns `true` when `c` is one of the characters that
/// [`normalize_arabic_char`] would remove or rewrite.
// NOTE(review): "shoud" is a typo for "should", but the name is kept to
// avoid breaking the caller in `should_normalize`.
fn is_shoud_normalize(c: char) -> bool {
    "ـأإآٱىة".contains(c)
}
40
#[cfg(test)]
mod test {
    use std::borrow::Cow::Owned;

    use crate::normalizer::test::test_normalizer;
    use crate::normalizer::{Normalizer, NormalizerOption};
    use crate::token::TokenKind;

    // Raw input tokens, before any normalization.
    // NOTE(review): the (a, b) pairs in char_map look like
    // (original byte length, normalized byte length) per character —
    // tatweel (2 UTF-8 bytes) maps to 0 once removed. Confirm against the
    // test_normalizer harness.
    fn tokens() -> Vec<Token<'static>> {
        vec![
            // Word padded with six tatweel characters; no char_map provided.
            Token {
                lemma: Owned("الحمــــــد".to_string()),
                char_end: 10,
                byte_end: 10,
                script: Script::Arabic,
                ..Default::default()
            },
            // Word padded with tatweel, with a pre-existing identity char_map.
            Token {
                lemma: Owned("رحــــــيم".to_string()),
                char_end: 10,
                byte_end: 10,
                script: Script::Arabic,
                char_map: Some(vec![
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                ]),
                ..Default::default()
            },
            // Starts with alef wasla (ٱ), which should fold to bare alef.
            Token {
                lemma: Owned("ٱلحمد".to_string()),
                char_end: 5,
                byte_end: 10,
                script: Script::Arabic,
                ..Default::default()
            },
            // Ends with alef maksura (ى), which should fold to yeh.
            Token {
                lemma: Owned("يومى".to_string()),
                char_end: 4,
                byte_end: 8,
                script: Script::Arabic,
                ..Default::default()
            },
            // Ends with teh marbuta (ة), which should fold to heh.
            Token {
                lemma: Owned("النهاردة".to_string()),
                char_end: 8,
                byte_end: 16,
                script: Script::Arabic,
                ..Default::default()
            },
        ]
    }

    // Expected output of this normalizer alone (tatweel removed, letters
    // folded, char_map tracking removed characters as (2, 0) entries).
    fn normalizer_result() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Owned("الحمد".to_string()),
                char_end: 10,
                byte_end: 10,
                script: Script::Arabic,
                char_map: Some(vec![
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 2),
                ]),
                ..Default::default()
            },
            Token {
                lemma: Owned("رحيم".to_string()),
                char_end: 10,
                byte_end: 10,
                script: Script::Arabic,
                char_map: Some(vec![
                    (2, 2),
                    (2, 2),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 2),
                    (2, 2),
                ]),
                ..Default::default()
            },
            Token {
                lemma: Owned("الحمد".to_string()),
                char_end: 5,
                byte_end: 10,
                script: Script::Arabic,
                char_map: Some(vec![(2, 2), (2, 2), (2, 2), (2, 2), (2, 2)]),
                ..Default::default()
            },
            Token {
                lemma: Owned("يومي".to_string()),
                char_end: 4,
                byte_end: 8,
                char_map: Some(vec![(2, 2), (2, 2), (2, 2), (2, 2)]),
                script: Script::Arabic,
                ..Default::default()
            },
            Token {
                lemma: Owned("النهارده".to_string()),
                char_end: 8,
                byte_end: 16,
                char_map: Some(vec![
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                ]),
                script: Script::Arabic,
                ..Default::default()
            },
        ]
    }

    // Expected output after the full normalizer pipeline: same lemmas as
    // normalizer_result(), with kind classified as TokenKind::Word.
    fn normalized_tokens() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Owned("الحمد".to_string()),
                char_end: 10,
                byte_end: 10,
                char_map: Some(vec![
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 2),
                ]),
                script: Script::Arabic,
                kind: TokenKind::Word,
                ..Default::default()
            },
            Token {
                lemma: Owned("رحيم".to_string()),
                char_end: 10,
                byte_end: 10,
                script: Script::Arabic,
                char_map: Some(vec![
                    (2, 2),
                    (2, 2),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 0),
                    (2, 2),
                    (2, 2),
                ]),
                kind: TokenKind::Word,
                ..Default::default()
            },
            Token {
                lemma: Owned("الحمد".to_string()),
                char_end: 5,
                byte_end: 10,
                script: Script::Arabic,
                char_map: Some(vec![(2, 2), (2, 2), (2, 2), (2, 2), (2, 2)]),
                kind: TokenKind::Word,
                ..Default::default()
            },
            Token {
                lemma: Owned("يومي".to_string()),
                char_end: 4,
                byte_end: 8,
                char_map: Some(vec![(2, 2), (2, 2), (2, 2), (2, 2)]),
                script: Script::Arabic,
                kind: TokenKind::Word,
                ..Default::default()
            },
            Token {
                lemma: Owned("النهارده".to_string()),
                char_end: 8,
                byte_end: 16,
                char_map: Some(vec![
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                    (2, 2),
                ]),
                script: Script::Arabic,
                kind: TokenKind::Word,
                ..Default::default()
            },
        ]
    }

    test_normalizer!(ArabicNormalizer, tokens(), normalizer_result(), normalized_tokens());
}