ucd_parse/
arabic_shaping.rs

1use std::path::Path;
2
3use crate::{
4    common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
5    error::Error,
6};
7
8/// Represents a single row in the `ArabicShaping.txt` file.
9///
10/// The field names were taken from the header of ArabicShaping.txt.
11#[derive(Clone, Debug, Default, Eq, PartialEq)]
12pub struct ArabicShaping {
13    /// The codepoint corresponding to this row.
14    pub codepoint: Codepoint,
15    /// A short schematic name for the codepoint.
16    ///
17    /// The schematic name is descriptive of the shape, based as consistently as
18    /// possible on a name for the skeleton and then the diacritic marks applied
19    /// to the skeleton, if any.  Note that this schematic name is considered a
20    /// comment, and does not constitute a formal property value.
21    pub schematic_name: String,
22    /// The "joining type" of this codepoint.
23    pub joining_type: JoiningType,
24    /// The "joining group" of this codepoint.
25    pub joining_group: String,
26}
27
/// The `Joining_Type` field of a row in `ArabicShaping.txt`.
///
/// Each variant corresponds to one single-letter code; the mapping between
/// variants and letters is implemented by `JoiningType::as_str` and by the
/// `FromStr` impl for this type.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum JoiningType {
    /// Written as `R` in the data file.
    RightJoining,
    /// Written as `L` in the data file.
    LeftJoining,
    /// Written as `D` in the data file.
    DualJoining,
    /// Written as `C` in the data file.
    JoinCausing,
    /// Written as `U` in the data file. This is also the `Default` value.
    NonJoining,
    /// Written as `T` in the data file.
    Transparent,
}
38
39impl JoiningType {
40    pub fn as_str(&self) -> &str {
41        match self {
42            JoiningType::RightJoining => "R",
43            JoiningType::LeftJoining => "L",
44            JoiningType::DualJoining => "D",
45            JoiningType::JoinCausing => "C",
46            JoiningType::NonJoining => "U",
47            JoiningType::Transparent => "T",
48        }
49    }
50}
51
52impl Default for JoiningType {
53    fn default() -> JoiningType {
54        JoiningType::NonJoining
55    }
56}
57
58impl std::str::FromStr for JoiningType {
59    type Err = Error;
60
61    fn from_str(s: &str) -> Result<JoiningType, Error> {
62        match s {
63            "R" => Ok(JoiningType::RightJoining),
64            "L" => Ok(JoiningType::LeftJoining),
65            "D" => Ok(JoiningType::DualJoining),
66            "C" => Ok(JoiningType::JoinCausing),
67            "U" => Ok(JoiningType::NonJoining),
68            "T" => Ok(JoiningType::Transparent),
69            _ => err!(
70                "unrecognized joining type: '{}' \
71                 (must be one of R, L, D, C, U or T)",
72                s
73            ),
74        }
75    }
76}
77
78impl UcdFile for ArabicShaping {
79    fn relative_file_path() -> &'static Path {
80        Path::new("ArabicShaping.txt")
81    }
82}
83
84impl UcdFileByCodepoint for ArabicShaping {
85    fn codepoints(&self) -> CodepointIter {
86        self.codepoint.into_iter()
87    }
88}
89
90impl std::str::FromStr for ArabicShaping {
91    type Err = Error;
92
93    fn from_str(line: &str) -> Result<ArabicShaping, Error> {
94        let re_parts = regex!(
95            r"(?x)
96                ^
97                \s*(?P<codepoint>[A-F0-9]+)\s*;
98                \s*(?P<name>[^;]+)\s*;
99                \s*(?P<joining_type>[^;]+)\s*;
100                \s*(?P<joining_group>[^;]+)
101                $
102                ",
103        );
104        let caps = match re_parts.captures(line.trim()) {
105            Some(caps) => caps,
106            None => return err!("invalid ArabicShaping line"),
107        };
108
109        Ok(ArabicShaping {
110            codepoint: caps["codepoint"].parse()?,
111            schematic_name: caps["name"].to_string(),
112            joining_type: caps["joining_type"].parse()?,
113            joining_group: caps["joining_group"].to_string(),
114        })
115    }
116}
117
#[cfg(test)]
mod tests {
    use crate::common::Codepoint;

    use super::{ArabicShaping, JoiningType};

    /// Parses `line` as an `ArabicShaping` row, panicking if parsing fails.
    fn parse(line: &str) -> ArabicShaping {
        line.parse().unwrap()
    }

    /// Builds the row a test expects from plain values.
    fn row(
        cp: u32,
        name: &str,
        joining_type: JoiningType,
        group: &str,
    ) -> ArabicShaping {
        ArabicShaping {
            codepoint: Codepoint::from_u32(cp).unwrap(),
            schematic_name: name.to_string(),
            joining_type,
            joining_group: group.to_string(),
        }
    }

    // A non-joining codepoint with the placeholder joining group.
    #[test]
    fn parse1() {
        let actual = parse("0600; ARABIC NUMBER SIGN; U; No_Joining_Group\n");
        let expected = row(
            0x0600,
            "ARABIC NUMBER SIGN",
            JoiningType::NonJoining,
            "No_Joining_Group",
        );
        assert_eq!(actual, expected);
    }

    // A dual-joining codepoint whose joining group contains a space.
    #[test]
    fn parse2() {
        let actual =
            parse("063D; FARSI YEH WITH INVERTED V ABOVE; D; FARSI YEH\n");
        let expected = row(
            0x063D,
            "FARSI YEH WITH INVERTED V ABOVE",
            JoiningType::DualJoining,
            "FARSI YEH",
        );
        assert_eq!(actual, expected);
    }

    // A five-digit codepoint with a long name and joining group.
    #[test]
    fn parse3() {
        let actual = parse(
            "10D23; HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE; D; HANIFI ROHINGYA KINNA YA\n",
        );
        let expected = row(
            0x10D23,
            "HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE",
            JoiningType::DualJoining,
            "HANIFI ROHINGYA KINNA YA",
        );
        assert_eq!(actual, expected);
    }
}