precis_tools/
ucd_parsers.rs

1use crate::Error;
2use std::path::Path;
3use std::str::FromStr;
4use ucd_parse::UcdFile;
5
6/// A single row in the
7/// [`HangulSyllableType`](http://www.unicode.org/reports/tr44/#HangulSyllableType.txt)
8/// file.
9#[derive(Clone, Debug, Default, Eq, PartialEq)]
10pub struct HangulSyllableType {
11    /// A single row in the `PropList.txt` file.
12    pub prop: ucd_parse::Property,
13}
14
15impl ucd_parse::UcdFile for HangulSyllableType {
16    fn relative_file_path() -> &'static Path {
17        Path::new("HangulSyllableType.txt")
18    }
19}
20
21impl ucd_parse::UcdFileByCodepoint for HangulSyllableType {
22    fn codepoints(&self) -> ucd_parse::CodepointIter {
23        self.prop.codepoints.into_iter()
24    }
25}
26
27impl FromStr for HangulSyllableType {
28    type Err = ucd_parse::Error;
29
30    fn from_str(line: &str) -> Result<HangulSyllableType, ucd_parse::Error> {
31        let prop = ucd_parse::Property::from_str(line)?;
32        Ok(HangulSyllableType { prop })
33    }
34}
35
36/// A single row in the `DerivedJoiningType` file.
37#[derive(Clone, Debug, Default, Eq, PartialEq)]
38pub struct DerivedJoiningType {
39    /// A single row in the `PropList.txt` file.
40    pub prop: ucd_parse::Property,
41}
42
43impl ucd_parse::UcdFile for DerivedJoiningType {
44    fn relative_file_path() -> &'static Path {
45        Path::new("extracted/DerivedJoiningType.txt")
46    }
47}
48
49impl ucd_parse::UcdFileByCodepoint for DerivedJoiningType {
50    fn codepoints(&self) -> ucd_parse::CodepointIter {
51        self.prop.codepoints.into_iter()
52    }
53}
54
55impl FromStr for DerivedJoiningType {
56    type Err = ucd_parse::Error;
57
58    fn from_str(line: &str) -> Result<DerivedJoiningType, ucd_parse::Error> {
59        let prop = ucd_parse::Property::from_str(line)?;
60        Ok(DerivedJoiningType { prop })
61    }
62}
63
64/// Extension of the `UnicodeData` `struct` provided by the
65/// [`ucd_parse`](https://docs.rs/ucd-parse) crate. Unlike the
66/// original one, this `struct` does not represent a single line in the
67/// [`UnicodeData`](https://www.unicode.org/reports/tr44/#UnicodeData.txt)
68/// file, but it could be the result of a whole parsing of several files
69/// to contain range of Unicode code points. Note that this file, unlike
70/// others in the Unicode data files, represents ranges split in different
71/// lines in order not to break parsers compatibility.
72#[derive(Clone, Debug, Default, Eq, PartialEq)]
73pub struct UnicodeData {
74    /// The code points corresponding to this row.
75    pub codepoints: ucd_parse::Codepoints,
76    /// The name of this code point.
77    pub name: String,
78    /// The "general category" of this code point.
79    pub general_category: String,
80    /// The class of this code point used in the Canonical Ordering Algorithm.
81    ///
82    /// Note that some classes map to a particular symbol. See
83    /// [`UAX44`, Table 15](https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
84    pub canonical_combining_class: u8,
85    /// The bidirectional class of this code point.
86    ///
87    /// Possible values are listed in
88    /// [`UAX44`, Table 13](https://www.unicode.org/reports/tr44/#Bidi_Class_Values).
89    pub bidi_class: String,
90    /// The decomposition mapping for this code point. This includes its
91    /// formatting tag (if present).
92    pub decomposition: ucd_parse::UnicodeDataDecomposition,
93    /// A decimal numeric representation of this code point, if it has the
94    /// property `Numeric_Type=Decimal`.
95    pub numeric_type_decimal: Option<u8>,
96    /// A decimal numeric representation of this code point, if it has the
97    /// property `Numeric_Type=Digit`. Note that while this field is still
98    /// populated for existing code points, no new code points will have this
99    /// field populated.
100    pub numeric_type_digit: Option<u8>,
101    /// A decimal or rational numeric representation of this code point, if it
102    /// has the property `Numeric_Type=Numeric`.
103    pub numeric_type_numeric: Option<ucd_parse::UnicodeDataNumeric>,
104    /// A Boolean indicating whether this code point is "mirrored" in
105    /// bidirectional text.
106    pub bidi_mirrored: bool,
107    /// The "old" Unicode 1.0 or ISO 6429 name of this code point. Note that
108    /// this field is empty unless it is significantly different from
109    /// the `name` field.
110    pub unicode1_name: String,
111    /// The ISO 10464 comment field. This field no longer contains any non-NULL
112    /// values.
113    pub iso_comment: String,
114    /// This code point's simple uppercase mapping, if it exists.
115    pub simple_uppercase_mapping: Option<ucd_parse::Codepoint>,
116    /// This code point's simple lowercase mapping, if it exists.
117    pub simple_lowercase_mapping: Option<ucd_parse::Codepoint>,
118    /// This code point's simple title case mapping, if it exists.
119    pub simple_titlecase_mapping: Option<ucd_parse::Codepoint>,
120}
121
122impl UnicodeData {
123    /// Parse a particular `UCD` file into a sequence of rows.
124    pub fn parse(ucd_dir: &Path) -> Result<Vec<UnicodeData>, Error> {
125        let mut xs = vec![];
126
127        let raws: Vec<ucd_parse::UnicodeData> = ucd_parse::parse(ucd_dir)?;
128        let mut range: Option<ucd_parse::CodepointRange> = None;
129        for udata in raws.iter() {
130            match range.as_mut() {
131                Some(r) => {
132                    if !udata.is_range_end() {
133                        return err!("Expected end range after codepoint {:#06x}. Current codepoint{:#06x}. File: {}",
134							r.start.value(), udata.codepoint.value(), ucd_parse::UnicodeData::file_path(ucd_dir).to_str().unwrap());
135                    }
136                    r.end = udata.codepoint;
137                    if r.start.value() > r.end.value() {
138                        return err!(
139                            "Start range {:#06x} is minor than end range {:#06x}. File: {}",
140                            r.start.value(),
141                            r.end.value(),
142                            ucd_parse::UnicodeData::file_path(ucd_dir).to_str().unwrap()
143                        );
144                    }
145                }
146                None => {
147                    if udata.is_range_end() {
148                        return err!(
149                            "Found end range without starting. Current codepoint {:#06x}. File: {}",
150                            udata.codepoint.value(),
151                            ucd_parse::UnicodeData::file_path(ucd_dir).to_str().unwrap()
152                        );
153                    }
154                }
155            }
156
157            if udata.is_range_start() {
158                if range.is_some() {
159                    return err!(
160                            "Previous range started with codepoint {:#06x} has not yet finished. File: {}",
161							range.unwrap().start.value(),
162                            ucd_parse::UnicodeData::file_path(ucd_dir)
163                                .to_str()
164                                .unwrap()
165                        );
166                }
167                range = Some(ucd_parse::CodepointRange {
168                    start: udata.codepoint,
169                    end: udata.codepoint,
170                });
171                continue;
172            }
173
174            let codepoints = match range {
175                Some(r) => ucd_parse::Codepoints::Range(r),
176                None => ucd_parse::Codepoints::Single(udata.codepoint),
177            };
178
179            let ucd = UnicodeData {
180                codepoints,
181                name: udata.name.clone(),
182                general_category: udata.general_category.clone(),
183                canonical_combining_class: udata.canonical_combining_class,
184                bidi_class: udata.bidi_class.clone(),
185                decomposition: udata.decomposition.clone(),
186                numeric_type_decimal: udata.numeric_type_decimal,
187                numeric_type_digit: udata.numeric_type_digit,
188                numeric_type_numeric: udata.numeric_type_numeric,
189                bidi_mirrored: udata.bidi_mirrored,
190                unicode1_name: udata.unicode1_name.clone(),
191                iso_comment: udata.iso_comment.clone(),
192                simple_uppercase_mapping: udata.simple_uppercase_mapping,
193                simple_lowercase_mapping: udata.simple_lowercase_mapping,
194                simple_titlecase_mapping: udata.simple_titlecase_mapping,
195            };
196
197            if udata.is_range_end() {
198                range = None;
199            }
200
201            xs.push(ucd);
202        }
203
204        Ok(xs)
205    }
206}