precis_tools/
ucd_parsers.rs1use crate::Error;
2use std::path::Path;
3use std::str::FromStr;
4use ucd_parse::UcdFile;
5
6#[derive(Clone, Debug, Default, Eq, PartialEq)]
10pub struct HangulSyllableType {
11 pub prop: ucd_parse::Property,
13}
14
15impl ucd_parse::UcdFile for HangulSyllableType {
16 fn relative_file_path() -> &'static Path {
17 Path::new("HangulSyllableType.txt")
18 }
19}
20
21impl ucd_parse::UcdFileByCodepoint for HangulSyllableType {
22 fn codepoints(&self) -> ucd_parse::CodepointIter {
23 self.prop.codepoints.into_iter()
24 }
25}
26
27impl FromStr for HangulSyllableType {
28 type Err = ucd_parse::Error;
29
30 fn from_str(line: &str) -> Result<HangulSyllableType, ucd_parse::Error> {
31 let prop = ucd_parse::Property::from_str(line)?;
32 Ok(HangulSyllableType { prop })
33 }
34}
35
36#[derive(Clone, Debug, Default, Eq, PartialEq)]
38pub struct DerivedJoiningType {
39 pub prop: ucd_parse::Property,
41}
42
43impl ucd_parse::UcdFile for DerivedJoiningType {
44 fn relative_file_path() -> &'static Path {
45 Path::new("extracted/DerivedJoiningType.txt")
46 }
47}
48
49impl ucd_parse::UcdFileByCodepoint for DerivedJoiningType {
50 fn codepoints(&self) -> ucd_parse::CodepointIter {
51 self.prop.codepoints.into_iter()
52 }
53}
54
55impl FromStr for DerivedJoiningType {
56 type Err = ucd_parse::Error;
57
58 fn from_str(line: &str) -> Result<DerivedJoiningType, ucd_parse::Error> {
59 let prop = ucd_parse::Property::from_str(line)?;
60 Ok(DerivedJoiningType { prop })
61 }
62}
63
64#[derive(Clone, Debug, Default, Eq, PartialEq)]
73pub struct UnicodeData {
74 pub codepoints: ucd_parse::Codepoints,
76 pub name: String,
78 pub general_category: String,
80 pub canonical_combining_class: u8,
85 pub bidi_class: String,
90 pub decomposition: ucd_parse::UnicodeDataDecomposition,
93 pub numeric_type_decimal: Option<u8>,
96 pub numeric_type_digit: Option<u8>,
101 pub numeric_type_numeric: Option<ucd_parse::UnicodeDataNumeric>,
104 pub bidi_mirrored: bool,
107 pub unicode1_name: String,
111 pub iso_comment: String,
114 pub simple_uppercase_mapping: Option<ucd_parse::Codepoint>,
116 pub simple_lowercase_mapping: Option<ucd_parse::Codepoint>,
118 pub simple_titlecase_mapping: Option<ucd_parse::Codepoint>,
120}
121
122impl UnicodeData {
123 pub fn parse(ucd_dir: &Path) -> Result<Vec<UnicodeData>, Error> {
125 let mut xs = vec![];
126
127 let raws: Vec<ucd_parse::UnicodeData> = ucd_parse::parse(ucd_dir)?;
128 let mut range: Option<ucd_parse::CodepointRange> = None;
129 for udata in raws.iter() {
130 match range.as_mut() {
131 Some(r) => {
132 if !udata.is_range_end() {
133 return err!("Expected end range after codepoint {:#06x}. Current codepoint{:#06x}. File: {}",
134 r.start.value(), udata.codepoint.value(), ucd_parse::UnicodeData::file_path(ucd_dir).to_str().unwrap());
135 }
136 r.end = udata.codepoint;
137 if r.start.value() > r.end.value() {
138 return err!(
139 "Start range {:#06x} is minor than end range {:#06x}. File: {}",
140 r.start.value(),
141 r.end.value(),
142 ucd_parse::UnicodeData::file_path(ucd_dir).to_str().unwrap()
143 );
144 }
145 }
146 None => {
147 if udata.is_range_end() {
148 return err!(
149 "Found end range without starting. Current codepoint {:#06x}. File: {}",
150 udata.codepoint.value(),
151 ucd_parse::UnicodeData::file_path(ucd_dir).to_str().unwrap()
152 );
153 }
154 }
155 }
156
157 if udata.is_range_start() {
158 if range.is_some() {
159 return err!(
160 "Previous range started with codepoint {:#06x} has not yet finished. File: {}",
161 range.unwrap().start.value(),
162 ucd_parse::UnicodeData::file_path(ucd_dir)
163 .to_str()
164 .unwrap()
165 );
166 }
167 range = Some(ucd_parse::CodepointRange {
168 start: udata.codepoint,
169 end: udata.codepoint,
170 });
171 continue;
172 }
173
174 let codepoints = match range {
175 Some(r) => ucd_parse::Codepoints::Range(r),
176 None => ucd_parse::Codepoints::Single(udata.codepoint),
177 };
178
179 let ucd = UnicodeData {
180 codepoints,
181 name: udata.name.clone(),
182 general_category: udata.general_category.clone(),
183 canonical_combining_class: udata.canonical_combining_class,
184 bidi_class: udata.bidi_class.clone(),
185 decomposition: udata.decomposition.clone(),
186 numeric_type_decimal: udata.numeric_type_decimal,
187 numeric_type_digit: udata.numeric_type_digit,
188 numeric_type_numeric: udata.numeric_type_numeric,
189 bidi_mirrored: udata.bidi_mirrored,
190 unicode1_name: udata.unicode1_name.clone(),
191 iso_comment: udata.iso_comment.clone(),
192 simple_uppercase_mapping: udata.simple_uppercase_mapping,
193 simple_lowercase_mapping: udata.simple_lowercase_mapping,
194 simple_titlecase_mapping: udata.simple_titlecase_mapping,
195 };
196
197 if udata.is_range_end() {
198 range = None;
199 }
200
201 xs.push(ucd);
202 }
203
204 Ok(xs)
205 }
206}