1use std::{
2 collections::BTreeMap,
3 fmt,
4 fs::File,
5 io::{self, BufRead},
6 path::{Path, PathBuf},
7 str::FromStr,
8};
9
10use crate::error::{Error, ErrorKind};
11
12pub fn parse<P, D>(ucd_dir: P) -> Result<Vec<D>, Error>
16where
17 P: AsRef<Path>,
18 D: UcdFile,
19{
20 let mut xs = vec![];
21 for result in D::from_dir(ucd_dir)? {
22 let x = result?;
23 xs.push(x);
24 }
25 Ok(xs)
26}
27
28pub fn parse_by_codepoint<P, D>(
32 ucd_dir: P,
33) -> Result<BTreeMap<Codepoint, D>, Error>
34where
35 P: AsRef<Path>,
36 D: UcdFileByCodepoint,
37{
38 let mut map = BTreeMap::new();
39 for result in D::from_dir(ucd_dir)? {
40 let x = result?;
41 for cp in x.codepoints() {
42 map.insert(cp, x.clone());
43 }
44 }
45 Ok(map)
46}
47
48pub fn parse_many_by_codepoint<P, D>(
57 ucd_dir: P,
58) -> Result<BTreeMap<Codepoint, Vec<D>>, Error>
59where
60 P: AsRef<Path>,
61 D: UcdFileByCodepoint,
62{
63 let mut map = BTreeMap::new();
64 for result in D::from_dir(ucd_dir)? {
65 let x = result?;
66 for cp in x.codepoints() {
67 map.entry(cp).or_insert(vec![]).push(x.clone());
68 }
69 }
70 Ok(map)
71}
72
73pub fn ucd_directory_version<D: ?Sized + AsRef<Path>>(
79 ucd_dir: &D,
80) -> Result<(u64, u64, u64), Error> {
81 fn ucd_directory_version_inner(
83 ucd_dir: &Path,
84 ) -> Result<(u64, u64, u64), Error> {
85 let re_version_rx = regex!(r"-([0-9]+).([0-9]+).([0-9]+).txt");
86
87 let proplist = ucd_dir.join("PropList.txt");
88 let contents = first_line(&proplist)?;
89 let caps = match re_version_rx.captures(&contents) {
90 Some(c) => c,
91 None => {
92 return err!("Failed to find version in line {:?}", contents)
93 }
94 };
95
96 let capture_to_num = |n| {
97 caps.get(n).unwrap().as_str().parse::<u64>().map_err(|e| Error {
98 kind: ErrorKind::Parse(format!(
99 "Failed to parse version from {:?} in PropList.txt: {}",
100 contents, e
101 )),
102 line: Some(0),
103 path: Some(proplist.clone()),
104 })
105 };
106 let major = capture_to_num(1)?;
107 let minor = capture_to_num(2)?;
108 let patch = capture_to_num(3)?;
109
110 Ok((major, minor, patch))
111 }
112 ucd_directory_version_inner(ucd_dir.as_ref())
113}
114
115fn first_line(path: &Path) -> Result<String, Error> {
116 let file = std::fs::File::open(path).map_err(|e| Error {
117 kind: ErrorKind::Io(e),
118 line: None,
119 path: Some(path.into()),
120 })?;
121
122 let mut reader = std::io::BufReader::new(file);
123 let mut line_contents = String::new();
124 reader.read_line(&mut line_contents).map_err(|e| Error {
125 kind: ErrorKind::Io(e),
126 line: None,
127 path: Some(path.into()),
128 })?;
129 Ok(line_contents)
130}
131
132pub fn parse_codepoint_association<'a>(
135 line: &'a str,
136) -> Result<(Codepoints, &'a str), Error> {
137 let re_parts = regex!(
138 r"(?x)
139 ^
140 \s*(?P<codepoints>[^\s;]+)\s*;
141 \s*(?P<property>[^;\x23]+)\s*
142 ",
143 );
144
145 let caps = match re_parts.captures(line.trim()) {
146 Some(caps) => caps,
147 None => return err!("invalid PropList line: '{}'", line),
148 };
149 let property = match caps.name("property") {
150 Some(property) => property.as_str().trim(),
151 None => {
152 return err!(
153 "could not find property name in PropList line: '{}'",
154 line
155 )
156 }
157 };
158 Ok((caps["codepoints"].parse()?, property))
159}
160
161pub fn parse_codepoint_sequence(s: &str) -> Result<Vec<Codepoint>, Error> {
164 let mut cps = vec![];
165 for cp in s.trim().split_whitespace() {
166 cps.push(cp.parse()?);
167 }
168 Ok(cps)
169}
170
171pub fn parse_break_test(line: &str) -> Result<(Vec<String>, String), Error> {
178 let re_parts = regex!(
179 r"(?x)
180 ^
181 (?:÷|×)
182 (?P<groups>(?:\s[0-9A-Fa-f]{4,5}\s(?:÷|×))+)
183 \s+
184 \#(?P<comment>.+)
185 $
186 ",
187 );
188 let re_group = regex!(
189 r"(?x)
190 (?P<codepoint>[0-9A-Fa-f]{4,5})\s(?P<kind>÷|×)
191 ",
192 );
193
194 let caps = match re_parts.captures(line.trim()) {
195 Some(caps) => caps,
196 None => return err!("invalid break test line: '{}'", line),
197 };
198 let comment = caps["comment"].trim().to_string();
199
200 let mut groups = vec![];
201 let mut cur = String::new();
202 for cap in re_group.captures_iter(&caps["groups"]) {
203 let cp: Codepoint = cap["codepoint"].parse()?;
204 let ch = match cp.scalar() {
205 Some(ch) => ch,
206 None => {
207 return err!(
208 "invalid codepoint '{:X}' in line: '{}'",
209 cp.value(),
210 line
211 )
212 }
213 };
214 cur.push(ch);
215 if &cap["kind"] == "÷" {
216 groups.push(cur);
217 cur = String::new();
218 }
219 }
220 Ok((groups, comment))
221}
222
223pub trait UcdFile:
225 Clone + fmt::Debug + Default + Eq + FromStr<Err = Error> + PartialEq
226{
227 fn relative_file_path() -> &'static Path;
230
231 fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf {
234 ucd_dir.as_ref().join(Self::relative_file_path())
235 }
236
237 fn from_dir<P: AsRef<Path>>(
241 ucd_dir: P,
242 ) -> Result<UcdLineParser<File, Self>, Error> {
243 UcdLineParser::from_path(Self::file_path(ucd_dir))
244 }
245}
246
/// A `UcdFile` whose records are each associated with one or more
/// codepoints.
pub trait UcdFileByCodepoint: UcdFile {
    /// Returns an iterator over the codepoints associated with this record.
    fn codepoints(&self) -> CodepointIter;
}
253
/// A line-oriented parser that reads from `R` and yields values of type
/// `D`, one per non-comment, non-blank line.
#[derive(Debug)]
pub struct UcdLineParser<R, D> {
    /// Path of the data being read, if known; attached to I/O errors.
    path: Option<PathBuf>,
    /// Buffered reader over the underlying source.
    rdr: io::BufReader<R>,
    /// Line buffer, cleared and refilled on every read.
    line: String,
    /// Number of the most recently read line; `0` before the first read.
    line_number: u64,
    /// Marker for the record type `D`; no `D` value is stored.
    _data: std::marker::PhantomData<D>,
}
272
273impl<D> UcdLineParser<File, D> {
274 pub(crate) fn from_path<P: AsRef<Path>>(
276 path: P,
277 ) -> Result<UcdLineParser<File, D>, Error> {
278 let path = path.as_ref();
279 let file = File::open(path).map_err(|e| Error {
280 kind: ErrorKind::Io(e),
281 line: None,
282 path: Some(path.to_path_buf()),
283 })?;
284 Ok(UcdLineParser::new(Some(path.to_path_buf()), file))
285 }
286}
287
288impl<R: io::Read, D> UcdLineParser<R, D> {
289 pub(crate) fn new(path: Option<PathBuf>, rdr: R) -> UcdLineParser<R, D> {
297 UcdLineParser {
298 path,
299 rdr: io::BufReader::new(rdr),
300 line: String::new(),
301 line_number: 0,
302 _data: std::marker::PhantomData,
303 }
304 }
305}
306
307impl<R: io::Read, D: FromStr<Err = Error>> Iterator for UcdLineParser<R, D> {
308 type Item = Result<D, Error>;
309
310 fn next(&mut self) -> Option<Result<D, Error>> {
311 loop {
312 self.line_number += 1;
313 self.line.clear();
314 let n = match self.rdr.read_line(&mut self.line) {
315 Err(err) => {
316 return Some(Err(Error {
317 kind: ErrorKind::Io(err),
318 line: None,
319 path: self.path.clone(),
320 }))
321 }
322 Ok(n) => n,
323 };
324 if n == 0 {
325 return None;
326 }
327 if !self.line.starts_with('#') && !self.line.trim().is_empty() {
328 break;
329 }
330 }
331 let line_number = self.line_number;
332 Some(self.line.parse().map_err(|mut err: Error| {
333 err.line = Some(line_number);
334 err
335 }))
336 }
337}
338
/// Either a single codepoint or a range of codepoints.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)]
pub enum Codepoints {
    /// A single codepoint.
    Single(Codepoint),
    /// A range of codepoints.
    Range(CodepointRange),
}
347
348impl Default for Codepoints {
349 fn default() -> Codepoints {
350 Codepoints::Single(Codepoint::default())
351 }
352}
353
354impl IntoIterator for Codepoints {
355 type IntoIter = CodepointIter;
356 type Item = Codepoint;
357
358 fn into_iter(self) -> CodepointIter {
359 match self {
360 Codepoints::Single(x) => x.into_iter(),
361 Codepoints::Range(x) => x.into_iter(),
362 }
363 }
364}
365
366impl FromStr for Codepoints {
367 type Err = Error;
368
369 fn from_str(s: &str) -> Result<Codepoints, Error> {
370 if s.contains("..") {
371 CodepointRange::from_str(s).map(Codepoints::Range)
372 } else {
373 Codepoint::from_str(s).map(Codepoints::Single)
374 }
375 }
376}
377
378impl fmt::Display for Codepoints {
379 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
380 match *self {
381 Codepoints::Single(ref x) => x.fmt(f),
382 Codepoints::Range(ref x) => x.fmt(f),
383 }
384 }
385}
386
387impl PartialEq<u32> for Codepoints {
388 fn eq(&self, other: &u32) -> bool {
389 match *self {
390 Codepoints::Single(ref x) => x == other,
391 Codepoints::Range(ref x) => x == &(*other, *other),
392 }
393 }
394}
395
396impl PartialEq<Codepoint> for Codepoints {
397 fn eq(&self, other: &Codepoint) -> bool {
398 match *self {
399 Codepoints::Single(ref x) => x == other,
400 Codepoints::Range(ref x) => x == &(*other, *other),
401 }
402 }
403}
404
405impl PartialEq<(u32, u32)> for Codepoints {
406 fn eq(&self, other: &(u32, u32)) -> bool {
407 match *self {
408 Codepoints::Single(ref x) => &(x.value(), x.value()) == other,
409 Codepoints::Range(ref x) => x == other,
410 }
411 }
412}
413
414impl PartialEq<(Codepoint, Codepoint)> for Codepoints {
415 fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
416 match *self {
417 Codepoints::Single(ref x) => &(*x, *x) == other,
418 Codepoints::Range(ref x) => x == other,
419 }
420 }
421}
422
/// A range of Unicode codepoints, inclusive at both ends when iterated.
#[derive(
    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
pub struct CodepointRange {
    /// The first codepoint in the range.
    pub start: Codepoint,
    /// The last codepoint in the range (yielded by iteration).
    pub end: Codepoint,
}
434
435impl IntoIterator for CodepointRange {
436 type IntoIter = CodepointIter;
437 type Item = Codepoint;
438
439 fn into_iter(self) -> CodepointIter {
440 CodepointIter { next: self.start.value(), range: self }
441 }
442}
443
444impl FromStr for CodepointRange {
445 type Err = Error;
446
447 fn from_str(s: &str) -> Result<CodepointRange, Error> {
448 let re_parts = regex!(r"^(?P<start>[A-Z0-9]+)\.\.(?P<end>[A-Z0-9]+)$");
449 let caps = match re_parts.captures(s) {
450 Some(caps) => caps,
451 None => return err!("invalid codepoint range: '{}'", s),
452 };
453 let start = caps["start"].parse().or_else(|err| {
454 err!("failed to parse '{}' as a codepoint range: {}", s, err)
455 })?;
456 let end = caps["end"].parse().or_else(|err| {
457 err!("failed to parse '{}' as a codepoint range: {}", s, err)
458 })?;
459 Ok(CodepointRange { start, end })
460 }
461}
462
463impl fmt::Display for CodepointRange {
464 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
465 write!(f, "{}..{}", self.start, self.end)
466 }
467}
468
469impl PartialEq<(u32, u32)> for CodepointRange {
470 fn eq(&self, other: &(u32, u32)) -> bool {
471 &(self.start.value(), self.end.value()) == other
472 }
473}
474
475impl PartialEq<(Codepoint, Codepoint)> for CodepointRange {
476 fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
477 &(self.start, self.end) == other
478 }
479}
480
/// A single Unicode codepoint, stored as its `u32` value.
///
/// `Codepoint::from_u32` rejects values above `0x10FFFF`. A codepoint is
/// not necessarily a Unicode scalar value; see `Codepoint::scalar`.
#[derive(
    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
pub struct Codepoint(u32);
491
492impl Codepoint {
493 pub fn from_u32(n: u32) -> Result<Codepoint, Error> {
498 if n > 0x10FFFF {
499 err!("{:x} is not a valid Unicode codepoint", n)
500 } else {
501 Ok(Codepoint(n))
502 }
503 }
504
505 pub fn value(self) -> u32 {
507 self.0
508 }
509
510 pub fn scalar(self) -> Option<char> {
514 char::from_u32(self.0)
515 }
516}
517
518impl IntoIterator for Codepoint {
519 type IntoIter = CodepointIter;
520 type Item = Codepoint;
521
522 fn into_iter(self) -> CodepointIter {
523 let range = CodepointRange { start: self, end: self };
524 CodepointIter { next: self.value(), range }
525 }
526}
527
528impl FromStr for Codepoint {
529 type Err = Error;
530
531 fn from_str(s: &str) -> Result<Codepoint, Error> {
532 match u32::from_str_radix(s, 16) {
533 Ok(n) => Codepoint::from_u32(n),
534 Err(err) => {
535 return err!(
536 "failed to parse '{}' as a hexadecimal codepoint: {}",
537 s,
538 err
539 );
540 }
541 }
542 }
543}
544
545impl fmt::Display for Codepoint {
546 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
547 write!(f, "{:04X}", self.0)
548 }
549}
550
551impl PartialEq<u32> for Codepoint {
552 fn eq(&self, other: &u32) -> bool {
553 self.0 == *other
554 }
555}
556
557impl PartialEq<Codepoint> for u32 {
558 fn eq(&self, other: &Codepoint) -> bool {
559 *self == other.0
560 }
561}
562
/// An iterator over the codepoints of a `CodepointRange`.
#[derive(Debug)]
pub struct CodepointIter {
    /// Value of the next codepoint to yield.
    next: u32,
    /// The range being iterated; iteration stops after `range.end`.
    range: CodepointRange,
}
569
570impl Iterator for CodepointIter {
571 type Item = Codepoint;
572
573 fn next(&mut self) -> Option<Codepoint> {
574 if self.next > self.range.end.value() {
575 return None;
576 }
577 let current = self.next;
578 self.next += 1;
579 Some(Codepoint::from_u32(current).unwrap())
580 }
581}