1use std::path::Path;
2
3use crate::{
4 common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
5 error::Error,
6};
7
/// A single row of `ArabicShaping.txt`.
///
/// A row has four `;`-separated fields, which map onto the four fields of
/// this struct in order.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct ArabicShaping {
    /// The codepoint this row describes (first field, hexadecimal).
    pub codepoint: Codepoint,
    /// The name given in the second field of the row.
    pub schematic_name: String,
    /// The joining type given in the third field of the row.
    pub joining_type: JoiningType,
    /// The joining group given in the fourth field of the row.
    pub joining_group: JoiningGroup,
}
27
/// The joining group of a codepoint (fourth field of an `ArabicShaping.txt`
/// row).
///
/// Each variant corresponds to one underscore-separated name, e.g.
/// `FarsiYeh` <-> `"Farsi_Yeh"`; see `as_str` and the `FromStr` impl for the
/// exact spelling. The default value is `NoJoiningGroup`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum JoiningGroup {
    AfricanFeh,
    AfricanNoon,
    AfricanQaf,
    Ain,
    Alaph,
    Alef,
    Beh,
    Beth,
    BurushaskiYehBarree,
    Dal,
    DalathRish,
    E,
    FarsiYeh,
    Fe,
    Feh,
    FinalSemkath,
    Gaf,
    Gamal,
    Hah,
    HanifiRohingyaKinnaYa,
    HanifiRohingyaPa,
    He,
    Heh,
    HehGoal,
    Heth,
    Kaf,
    Kaph,
    KashmiriYeh,
    Khaph,
    KnottedHeh,
    Lam,
    Lamadh,
    MalayalamBha,
    MalayalamJa,
    MalayalamLla,
    MalayalamLlla,
    MalayalamNga,
    MalayalamNna,
    MalayalamNnna,
    MalayalamNya,
    MalayalamRa,
    MalayalamSsa,
    MalayalamTta,
    ManichaeanAleph,
    ManichaeanAyin,
    ManichaeanBeth,
    ManichaeanDaleth,
    ManichaeanDhamedh,
    ManichaeanFive,
    ManichaeanGimel,
    ManichaeanHeth,
    ManichaeanHundred,
    ManichaeanKaph,
    ManichaeanLamedh,
    ManichaeanMem,
    ManichaeanNun,
    ManichaeanOne,
    ManichaeanPe,
    ManichaeanQoph,
    ManichaeanResh,
    ManichaeanSadhe,
    ManichaeanSamekh,
    ManichaeanTaw,
    ManichaeanTen,
    ManichaeanTeth,
    ManichaeanThamedh,
    ManichaeanTwenty,
    ManichaeanWaw,
    ManichaeanYodh,
    ManichaeanZayin,
    Meem,
    Mim,
    /// The value used when a codepoint has no joining group; also the
    /// `Default` for this type.
    NoJoiningGroup,
    Noon,
    Nun,
    Nya,
    Pe,
    Qaf,
    Qaph,
    Reh,
    ReversedPe,
    RohingyaYeh,
    Sad,
    Sadhe,
    Seen,
    Semkath,
    Shin,
    StraightWaw,
    SwashKaf,
    SyriacWaw,
    Tah,
    Taw,
    TehMarbuta,
    TehMarbutaGoal,
    Teth,
    ThinYeh,
    VerticalTail,
    Waw,
    Yeh,
    YehBarree,
    YehWithTail,
    Yudh,
    YudhHe,
    Zain,
    Zhain,
}
141
/// The joining type of a codepoint (third field of an `ArabicShaping.txt`
/// row).
///
/// Each variant is written as a single letter in the data file. The default
/// value is `NonJoining`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum JoiningType {
    /// Written as `R`.
    RightJoining,
    /// Written as `L`.
    LeftJoining,
    /// Written as `D`.
    DualJoining,
    /// Written as `C`.
    JoinCausing,
    /// Written as `U`.
    NonJoining,
    /// Written as `T`.
    Transparent,
}
152
153impl JoiningGroup {
154 pub fn as_str(&self) -> &str {
155 match self {
156 JoiningGroup::AfricanFeh => "African_Feh",
157 JoiningGroup::AfricanNoon => "African_Noon",
158 JoiningGroup::AfricanQaf => "African_Qaf",
159 JoiningGroup::Ain => "Ain",
160 JoiningGroup::Alaph => "Alaph",
161 JoiningGroup::Alef => "Alef",
162 JoiningGroup::Beh => "Beh",
163 JoiningGroup::Beth => "Beth",
164 JoiningGroup::BurushaskiYehBarree => "Burushaski_Yeh_Barree",
165 JoiningGroup::Dal => "Dal",
166 JoiningGroup::DalathRish => "Dalath_Rish",
167 JoiningGroup::E => "E",
168 JoiningGroup::FarsiYeh => "Farsi_Yeh",
169 JoiningGroup::Fe => "Fe",
170 JoiningGroup::Feh => "Feh",
171 JoiningGroup::FinalSemkath => "Final_Semkath",
172 JoiningGroup::Gaf => "Gaf",
173 JoiningGroup::Gamal => "Gamal",
174 JoiningGroup::Hah => "Hah",
175 JoiningGroup::HanifiRohingyaKinnaYa => "Hanifi_Rohingya_Kinna_Ya",
176 JoiningGroup::HanifiRohingyaPa => "Hanifi_Rohingya_Pa",
177 JoiningGroup::He => "He",
178 JoiningGroup::Heh => "Heh",
179 JoiningGroup::HehGoal => "Heh_Goal",
180 JoiningGroup::Heth => "Heth",
181 JoiningGroup::Kaf => "Kaf",
182 JoiningGroup::Kaph => "Kaph",
183 JoiningGroup::KashmiriYeh => "Kashmiri_Yeh",
184 JoiningGroup::Khaph => "Khaph",
185 JoiningGroup::KnottedHeh => "Knotted_Heh",
186 JoiningGroup::Lam => "Lam",
187 JoiningGroup::Lamadh => "Lamadh",
188 JoiningGroup::MalayalamBha => "Malayalam_Bha",
189 JoiningGroup::MalayalamJa => "Malayalam_Ja",
190 JoiningGroup::MalayalamLla => "Malayalam_Lla",
191 JoiningGroup::MalayalamLlla => "Malayalam_Llla",
192 JoiningGroup::MalayalamNga => "Malayalam_Nga",
193 JoiningGroup::MalayalamNna => "Malayalam_Nna",
194 JoiningGroup::MalayalamNnna => "Malayalam_Nnna",
195 JoiningGroup::MalayalamNya => "Malayalam_Nya",
196 JoiningGroup::MalayalamRa => "Malayalam_Ra",
197 JoiningGroup::MalayalamSsa => "Malayalam_Ssa",
198 JoiningGroup::MalayalamTta => "Malayalam_Tta",
199 JoiningGroup::ManichaeanAleph => "Manichaean_Aleph",
200 JoiningGroup::ManichaeanAyin => "Manichaean_Ayin",
201 JoiningGroup::ManichaeanBeth => "Manichaean_Beth",
202 JoiningGroup::ManichaeanDaleth => "Manichaean_Daleth",
203 JoiningGroup::ManichaeanDhamedh => "Manichaean_Dhamedh",
204 JoiningGroup::ManichaeanFive => "Manichaean_Five",
205 JoiningGroup::ManichaeanGimel => "Manichaean_Gimel",
206 JoiningGroup::ManichaeanHeth => "Manichaean_Heth",
207 JoiningGroup::ManichaeanHundred => "Manichaean_Hundred",
208 JoiningGroup::ManichaeanKaph => "Manichaean_Kaph",
209 JoiningGroup::ManichaeanLamedh => "Manichaean_Lamedh",
210 JoiningGroup::ManichaeanMem => "Manichaean_Mem",
211 JoiningGroup::ManichaeanNun => "Manichaean_Nun",
212 JoiningGroup::ManichaeanOne => "Manichaean_One",
213 JoiningGroup::ManichaeanPe => "Manichaean_Pe",
214 JoiningGroup::ManichaeanQoph => "Manichaean_Qoph",
215 JoiningGroup::ManichaeanResh => "Manichaean_Resh",
216 JoiningGroup::ManichaeanSadhe => "Manichaean_Sadhe",
217 JoiningGroup::ManichaeanSamekh => "Manichaean_Samekh",
218 JoiningGroup::ManichaeanTaw => "Manichaean_Taw",
219 JoiningGroup::ManichaeanTen => "Manichaean_Ten",
220 JoiningGroup::ManichaeanTeth => "Manichaean_Teth",
221 JoiningGroup::ManichaeanThamedh => "Manichaean_Thamedh",
222 JoiningGroup::ManichaeanTwenty => "Manichaean_Twenty",
223 JoiningGroup::ManichaeanWaw => "Manichaean_Waw",
224 JoiningGroup::ManichaeanYodh => "Manichaean_Yodh",
225 JoiningGroup::ManichaeanZayin => "Manichaean_Zayin",
226 JoiningGroup::Meem => "Meem",
227 JoiningGroup::Mim => "Mim",
228 JoiningGroup::NoJoiningGroup => "No_Joining_Group",
229 JoiningGroup::Noon => "Noon",
230 JoiningGroup::Nun => "Nun",
231 JoiningGroup::Nya => "Nya",
232 JoiningGroup::Pe => "Pe",
233 JoiningGroup::Qaf => "Qaf",
234 JoiningGroup::Qaph => "Qaph",
235 JoiningGroup::Reh => "Reh",
236 JoiningGroup::ReversedPe => "Reversed_Pe",
237 JoiningGroup::RohingyaYeh => "Rohingya_Yeh",
238 JoiningGroup::Sad => "Sad",
239 JoiningGroup::Sadhe => "Sadhe",
240 JoiningGroup::Seen => "Seen",
241 JoiningGroup::Semkath => "Semkath",
242 JoiningGroup::Shin => "Shin",
243 JoiningGroup::StraightWaw => "Straight_Waw",
244 JoiningGroup::SwashKaf => "Swash_Kaf",
245 JoiningGroup::SyriacWaw => "Syriac_Waw",
246 JoiningGroup::Tah => "Tah",
247 JoiningGroup::Taw => "Taw",
248 JoiningGroup::TehMarbuta => "Teh_Marbuta",
249 JoiningGroup::TehMarbutaGoal => "Teh_Marbuta_Goal",
250 JoiningGroup::Teth => "Teth",
251 JoiningGroup::ThinYeh => "Thin_Yeh",
252 JoiningGroup::VerticalTail => "Vertical_Tail",
253 JoiningGroup::Waw => "Waw",
254 JoiningGroup::Yeh => "Yeh",
255 JoiningGroup::YehBarree => "Yeh_Barree",
256 JoiningGroup::YehWithTail => "Yeh_With_Tail",
257 JoiningGroup::Yudh => "Yudh",
258 JoiningGroup::YudhHe => "Yudh_He",
259 JoiningGroup::Zain => "Zain",
260 JoiningGroup::Zhain => "Zhain",
261 }
262 }
263}
264
265impl std::str::FromStr for JoiningGroup {
266 type Err = Error;
267
268 fn from_str(s: &str) -> Result<JoiningGroup, Error> {
269 match s {
270 "African_Feh" => Ok(JoiningGroup::AfricanFeh),
271 "African_Noon" => Ok(JoiningGroup::AfricanNoon),
272 "African_Qaf" => Ok(JoiningGroup::AfricanQaf),
273 "Ain" => Ok(JoiningGroup::Ain),
274 "Alaph" => Ok(JoiningGroup::Alaph),
275 "Alef" => Ok(JoiningGroup::Alef),
276 "Beh" => Ok(JoiningGroup::Beh),
277 "Beth" => Ok(JoiningGroup::Beth),
278 "Burushaski_Yeh_Barree" => Ok(JoiningGroup::BurushaskiYehBarree),
279 "Dal" => Ok(JoiningGroup::Dal),
280 "Dalath_Rish" => Ok(JoiningGroup::DalathRish),
281 "E" => Ok(JoiningGroup::E),
282 "Farsi_Yeh" => Ok(JoiningGroup::FarsiYeh),
283 "Fe" => Ok(JoiningGroup::Fe),
284 "Feh" => Ok(JoiningGroup::Feh),
285 "Final_Semkath" => Ok(JoiningGroup::FinalSemkath),
286 "Gaf" => Ok(JoiningGroup::Gaf),
287 "Gamal" => Ok(JoiningGroup::Gamal),
288 "Hah" => Ok(JoiningGroup::Hah),
289 "Hanifi_Rohingya_Kinna_Ya" => {
290 Ok(JoiningGroup::HanifiRohingyaKinnaYa)
291 }
292 "Hanifi_Rohingya_Pa" => Ok(JoiningGroup::HanifiRohingyaPa),
293 "He" => Ok(JoiningGroup::He),
294 "Heh" => Ok(JoiningGroup::Heh),
295 "Heh_Goal" => Ok(JoiningGroup::HehGoal),
296 "Heth" => Ok(JoiningGroup::Heth),
297 "Kaf" => Ok(JoiningGroup::Kaf),
298 "Kaph" => Ok(JoiningGroup::Kaph),
299 "Kashmiri_Yeh" => Ok(JoiningGroup::KashmiriYeh),
300 "Khaph" => Ok(JoiningGroup::Khaph),
301 "Knotted_Heh" => Ok(JoiningGroup::KnottedHeh),
302 "Lam" => Ok(JoiningGroup::Lam),
303 "Lamadh" => Ok(JoiningGroup::Lamadh),
304 "Malayalam_Bha" => Ok(JoiningGroup::MalayalamBha),
305 "Malayalam_Ja" => Ok(JoiningGroup::MalayalamJa),
306 "Malayalam_Lla" => Ok(JoiningGroup::MalayalamLla),
307 "Malayalam_Llla" => Ok(JoiningGroup::MalayalamLlla),
308 "Malayalam_Nga" => Ok(JoiningGroup::MalayalamNga),
309 "Malayalam_Nna" => Ok(JoiningGroup::MalayalamNna),
310 "Malayalam_Nnna" => Ok(JoiningGroup::MalayalamNnna),
311 "Malayalam_Nya" => Ok(JoiningGroup::MalayalamNya),
312 "Malayalam_Ra" => Ok(JoiningGroup::MalayalamRa),
313 "Malayalam_Ssa" => Ok(JoiningGroup::MalayalamSsa),
314 "Malayalam_Tta" => Ok(JoiningGroup::MalayalamTta),
315 "Manichaean_Aleph" => Ok(JoiningGroup::ManichaeanAleph),
316 "Manichaean_Ayin" => Ok(JoiningGroup::ManichaeanAyin),
317 "Manichaean_Beth" => Ok(JoiningGroup::ManichaeanBeth),
318 "Manichaean_Daleth" => Ok(JoiningGroup::ManichaeanDaleth),
319 "Manichaean_Dhamedh" => Ok(JoiningGroup::ManichaeanDhamedh),
320 "Manichaean_Five" => Ok(JoiningGroup::ManichaeanFive),
321 "Manichaean_Gimel" => Ok(JoiningGroup::ManichaeanGimel),
322 "Manichaean_Heth" => Ok(JoiningGroup::ManichaeanHeth),
323 "Manichaean_Hundred" => Ok(JoiningGroup::ManichaeanHundred),
324 "Manichaean_Kaph" => Ok(JoiningGroup::ManichaeanKaph),
325 "Manichaean_Lamedh" => Ok(JoiningGroup::ManichaeanLamedh),
326 "Manichaean_Mem" => Ok(JoiningGroup::ManichaeanMem),
327 "Manichaean_Nun" => Ok(JoiningGroup::ManichaeanNun),
328 "Manichaean_One" => Ok(JoiningGroup::ManichaeanOne),
329 "Manichaean_Pe" => Ok(JoiningGroup::ManichaeanPe),
330 "Manichaean_Qoph" => Ok(JoiningGroup::ManichaeanQoph),
331 "Manichaean_Resh" => Ok(JoiningGroup::ManichaeanResh),
332 "Manichaean_Sadhe" => Ok(JoiningGroup::ManichaeanSadhe),
333 "Manichaean_Samekh" => Ok(JoiningGroup::ManichaeanSamekh),
334 "Manichaean_Taw" => Ok(JoiningGroup::ManichaeanTaw),
335 "Manichaean_Ten" => Ok(JoiningGroup::ManichaeanTen),
336 "Manichaean_Teth" => Ok(JoiningGroup::ManichaeanTeth),
337 "Manichaean_Thamedh" => Ok(JoiningGroup::ManichaeanThamedh),
338 "Manichaean_Twenty" => Ok(JoiningGroup::ManichaeanTwenty),
339 "Manichaean_Waw" => Ok(JoiningGroup::ManichaeanWaw),
340 "Manichaean_Yodh" => Ok(JoiningGroup::ManichaeanYodh),
341 "Manichaean_Zayin" => Ok(JoiningGroup::ManichaeanZayin),
342 "Meem" => Ok(JoiningGroup::Meem),
343 "Mim" => Ok(JoiningGroup::Mim),
344 "No_Joining_Group" => Ok(JoiningGroup::NoJoiningGroup),
345 "Noon" => Ok(JoiningGroup::Noon),
346 "Nun" => Ok(JoiningGroup::Nun),
347 "Nya" => Ok(JoiningGroup::Nya),
348 "Pe" => Ok(JoiningGroup::Pe),
349 "Qaf" => Ok(JoiningGroup::Qaf),
350 "Qaph" => Ok(JoiningGroup::Qaph),
351 "Reh" => Ok(JoiningGroup::Reh),
352 "Reversed_Pe" => Ok(JoiningGroup::ReversedPe),
353 "Rohingya_Yeh" => Ok(JoiningGroup::RohingyaYeh),
354 "Sad" => Ok(JoiningGroup::Sad),
355 "Sadhe" => Ok(JoiningGroup::Sadhe),
356 "Seen" => Ok(JoiningGroup::Seen),
357 "Semkath" => Ok(JoiningGroup::Semkath),
358 "Shin" => Ok(JoiningGroup::Shin),
359 "Straight_Waw" => Ok(JoiningGroup::StraightWaw),
360 "Swash_Kaf" => Ok(JoiningGroup::SwashKaf),
361 "Syriac_Waw" => Ok(JoiningGroup::SyriacWaw),
362 "Tah" => Ok(JoiningGroup::Tah),
363 "Taw" => Ok(JoiningGroup::Taw),
364 "Teh_Marbuta" => Ok(JoiningGroup::TehMarbuta),
365 "Teh_Marbuta_Goal" => Ok(JoiningGroup::TehMarbutaGoal),
366 "Teth" => Ok(JoiningGroup::Teth),
367 "Thin_Yeh" => Ok(JoiningGroup::ThinYeh),
368 "Vertical_Tail" => Ok(JoiningGroup::VerticalTail),
369 "Waw" => Ok(JoiningGroup::Waw),
370 "Yeh" => Ok(JoiningGroup::Yeh),
371 "Yeh_Barree" => Ok(JoiningGroup::YehBarree),
372 "Yeh_With_Tail" => Ok(JoiningGroup::YehWithTail),
373 "Yudh" => Ok(JoiningGroup::Yudh),
374 "Yudh_He" => Ok(JoiningGroup::YudhHe),
375 "Zain" => Ok(JoiningGroup::Zain),
376 "Zhain" => Ok(JoiningGroup::Zhain),
377 _ => err!("unrecognized joining group: '{}'", s),
378 }
379 }
380}
381
382impl Default for JoiningGroup {
383 fn default() -> JoiningGroup {
384 JoiningGroup::NoJoiningGroup
385 }
386}
387
388impl JoiningType {
389 pub fn as_str(&self) -> &str {
390 match self {
391 JoiningType::RightJoining => "R",
392 JoiningType::LeftJoining => "L",
393 JoiningType::DualJoining => "D",
394 JoiningType::JoinCausing => "C",
395 JoiningType::NonJoining => "U",
396 JoiningType::Transparent => "T",
397 }
398 }
399}
400
401impl Default for JoiningType {
402 fn default() -> JoiningType {
403 JoiningType::NonJoining
404 }
405}
406
407impl std::str::FromStr for JoiningType {
408 type Err = Error;
409
410 fn from_str(s: &str) -> Result<JoiningType, Error> {
411 match s {
412 "R" => Ok(JoiningType::RightJoining),
413 "L" => Ok(JoiningType::LeftJoining),
414 "D" => Ok(JoiningType::DualJoining),
415 "C" => Ok(JoiningType::JoinCausing),
416 "U" => Ok(JoiningType::NonJoining),
417 "T" => Ok(JoiningType::Transparent),
418 _ => err!(
419 "unrecognized joining type: '{}' \
420 (must be one of R, L, D, C, U or T)",
421 s
422 ),
423 }
424 }
425}
426
impl UcdFile for ArabicShaping {
    /// Returns the path of the data file this type is parsed from:
    /// `ArabicShaping.txt`.
    // NOTE(review): presumably resolved relative to the UCD directory by the
    // `UcdFile` machinery -- confirm against the trait definition.
    fn relative_file_path() -> &'static Path {
        Path::new("ArabicShaping.txt")
    }
}
432
impl UcdFileByCodepoint for ArabicShaping {
    /// Returns the codepoint iterator for this row, built from its single
    /// `codepoint` field.
    fn codepoints(&self) -> CodepointIter {
        self.codepoint.into_iter()
    }
}
438
439impl std::str::FromStr for ArabicShaping {
440 type Err = Error;
441
442 fn from_str(line: &str) -> Result<ArabicShaping, Error> {
443 let re_parts = regex!(
444 r"(?x)
445 ^
446 \s*(?P<codepoint>[A-F0-9]+)\s*;
447 \s*(?P<name>[^;]+)\s*;
448 \s*(?P<joining_type>[^;]+)\s*;
449 \s*(?P<joining_group>[^;]+)
450 $
451 ",
452 );
453 let caps = match re_parts.captures(line.trim()) {
454 Some(caps) => caps,
455 None => return err!("invalid ArabicShaping line"),
456 };
457
458 Ok(ArabicShaping {
459 codepoint: caps["codepoint"].parse()?,
460 schematic_name: caps["name"].to_string(),
461 joining_type: caps["joining_type"].parse()?,
462 joining_group: formal_name(&caps["joining_group"]).parse()?,
463 })
464 }
465}
466
/// Normalizes a joining group name to the spelling matched by
/// `JoiningGroup`'s `FromStr` implementation.
///
/// The input is split on whitespace and underscores. Each word is
/// ASCII-lowercased, its first character is uppercased, and the words are
/// joined with single underscores. For example, `FARSI YEH` becomes
/// `Farsi_Yeh`, and `No_Joining_Group` is returned unchanged.
///
/// Runs of separators, as well as leading or trailing separators, are
/// treated as a single boundary, so `FARSI  YEH` also yields `Farsi_Yeh`
/// rather than a name with doubled or stray underscores.
fn formal_name(s: &str) -> String {
    s.split(|c: char| c.is_whitespace() || c == '_')
        // Adjacent, leading or trailing separators yield empty pieces; they
        // carry no name component and would otherwise become extra `_`.
        .filter(|component| !component.is_empty())
        .map(|component| {
            let lower = component.to_ascii_lowercase();
            let mut rest = lower.chars();
            match rest.next() {
                None => String::new(),
                Some(first) => {
                    first.to_uppercase().collect::<String>() + rest.as_str()
                }
            }
        })
        .collect::<Vec<_>>()
        .join("_")
}
488
// Unit tests: each case parses one literal `ArabicShaping.txt` line and
// compares the result against a hand-built `ArabicShaping` value.
#[cfg(test)]
mod tests {
    use crate::common::Codepoint;

    use super::{ArabicShaping, JoiningType};
    use crate::arabic_shaping::JoiningGroup;

    /// Builds a `Codepoint` from a scalar value, panicking if it is rejected.
    fn codepoint(n: u32) -> Codepoint {
        Codepoint::from_u32(n).unwrap()
    }

    /// Shorthand for turning a string literal into an owned `String`.
    fn s(string: &str) -> String {
        string.to_string()
    }

    /// Joining group already written in its underscore-separated form.
    #[test]
    fn parse1() {
        let line = "0600; ARABIC NUMBER SIGN; U; No_Joining_Group\n";
        let data: ArabicShaping = line.parse().unwrap();
        assert_eq!(
            data,
            ArabicShaping {
                codepoint: codepoint(0x0600),
                schematic_name: s("ARABIC NUMBER SIGN"),
                joining_type: JoiningType::NonJoining,
                joining_group: JoiningGroup::NoJoiningGroup,
            }
        );
    }

    /// Joining group written in upper case with a space separator.
    #[test]
    fn parse2() {
        let line = "063D; FARSI YEH WITH INVERTED V ABOVE; D; FARSI YEH\n";
        let data: ArabicShaping = line.parse().unwrap();
        assert_eq!(
            data,
            ArabicShaping {
                codepoint: codepoint(0x063D),
                schematic_name: s("FARSI YEH WITH INVERTED V ABOVE"),
                joining_type: JoiningType::DualJoining,
                joining_group: JoiningGroup::FarsiYeh,
            }
        );
    }

    /// Five-digit codepoint and a four-word joining group.
    #[test]
    fn parse3() {
        let line =
            "10D23; HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE; D; HANIFI ROHINGYA KINNA YA\n";
        let data: ArabicShaping = line.parse().unwrap();
        assert_eq!(
            data,
            ArabicShaping {
                codepoint: codepoint(0x10D23),
                schematic_name: s(
                    "HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE"
                ),
                joining_type: JoiningType::DualJoining,
                joining_group: JoiningGroup::HanifiRohingyaKinnaYa,
            }
        );
    }
}