ucd_parse/
common.rs

1use std::{
2    collections::BTreeMap,
3    fmt,
4    fs::File,
5    io::{self, BufRead},
6    path::{Path, PathBuf},
7    str::FromStr,
8};
9
10use crate::error::{Error, ErrorKind};
11
12/// Parse a particular file in the UCD into a sequence of rows.
13///
14/// The given directory should be the directory to the UCD.
15pub fn parse<P, D>(ucd_dir: P) -> Result<Vec<D>, Error>
16where
17    P: AsRef<Path>,
18    D: UcdFile,
19{
20    let mut xs = vec![];
21    for result in D::from_dir(ucd_dir)? {
22        let x = result?;
23        xs.push(x);
24    }
25    Ok(xs)
26}
27
28/// Parse a particular file in the UCD into a map from codepoint to the record.
29///
30/// The given directory should be the directory to the UCD.
31pub fn parse_by_codepoint<P, D>(
32    ucd_dir: P,
33) -> Result<BTreeMap<Codepoint, D>, Error>
34where
35    P: AsRef<Path>,
36    D: UcdFileByCodepoint,
37{
38    let mut map = BTreeMap::new();
39    for result in D::from_dir(ucd_dir)? {
40        let x = result?;
41        for cp in x.codepoints() {
42            map.insert(cp, x.clone());
43        }
44    }
45    Ok(map)
46}
47
48/// Parse a particular file in the UCD into a map from codepoint to all
49/// records associated with that codepoint.
50///
51/// This is useful for files that have multiple records for each codepoint.
52/// For example, the `NameAliases.txt` file lists multiple aliases for some
53/// codepoints.
54///
55/// The given directory should be the directory to the UCD.
56pub fn parse_many_by_codepoint<P, D>(
57    ucd_dir: P,
58) -> Result<BTreeMap<Codepoint, Vec<D>>, Error>
59where
60    P: AsRef<Path>,
61    D: UcdFileByCodepoint,
62{
63    let mut map = BTreeMap::new();
64    for result in D::from_dir(ucd_dir)? {
65        let x = result?;
66        for cp in x.codepoints() {
67            map.entry(cp).or_insert(vec![]).push(x.clone());
68        }
69    }
70    Ok(map)
71}
72
73/// Given a path pointing at the root of the `ucd_dir`, attempts to determine
74/// it's unicode version.
75///
76/// This just checks the readme and the very first line of PropList.txt -- in
77/// practice this works for all versions of UCD since 4.1.0.
78pub fn ucd_directory_version<D: ?Sized + AsRef<Path>>(
79    ucd_dir: &D,
80) -> Result<(u64, u64, u64), Error> {
81    // Avoid duplication from generic path parameter.
82    fn ucd_directory_version_inner(
83        ucd_dir: &Path,
84    ) -> Result<(u64, u64, u64), Error> {
85        let re_version_rx = regex!(r"-([0-9]+).([0-9]+).([0-9]+).txt");
86
87        let proplist = ucd_dir.join("PropList.txt");
88        let contents = first_line(&proplist)?;
89        let caps = match re_version_rx.captures(&contents) {
90            Some(c) => c,
91            None => {
92                return err!("Failed to find version in line {:?}", contents)
93            }
94        };
95
96        let capture_to_num = |n| {
97            caps.get(n).unwrap().as_str().parse::<u64>().map_err(|e| Error {
98                kind: ErrorKind::Parse(format!(
99                    "Failed to parse version from {:?} in PropList.txt: {}",
100                    contents, e
101                )),
102                line: Some(0),
103                path: Some(proplist.clone()),
104            })
105        };
106        let major = capture_to_num(1)?;
107        let minor = capture_to_num(2)?;
108        let patch = capture_to_num(3)?;
109
110        Ok((major, minor, patch))
111    }
112    ucd_directory_version_inner(ucd_dir.as_ref())
113}
114
115fn first_line(path: &Path) -> Result<String, Error> {
116    let file = std::fs::File::open(path).map_err(|e| Error {
117        kind: ErrorKind::Io(e),
118        line: None,
119        path: Some(path.into()),
120    })?;
121
122    let mut reader = std::io::BufReader::new(file);
123    let mut line_contents = String::new();
124    reader.read_line(&mut line_contents).map_err(|e| Error {
125        kind: ErrorKind::Io(e),
126        line: None,
127        path: Some(path.into()),
128    })?;
129    Ok(line_contents)
130}
131
132/// A helper function for parsing a common record format that associates one
133/// or more codepoints with a string value.
134pub fn parse_codepoint_association<'a>(
135    line: &'a str,
136) -> Result<(Codepoints, &'a str), Error> {
137    let re_parts = regex!(
138        r"(?x)
139            ^
140            \s*(?P<codepoints>[^\s;]+)\s*;
141            \s*(?P<property>[^;\x23]+)\s*
142            ",
143    );
144
145    let caps = match re_parts.captures(line.trim()) {
146        Some(caps) => caps,
147        None => return err!("invalid PropList line: '{}'", line),
148    };
149    let property = match caps.name("property") {
150        Some(property) => property.as_str().trim(),
151        None => {
152            return err!(
153                "could not find property name in PropList line: '{}'",
154                line
155            )
156        }
157    };
158    Ok((caps["codepoints"].parse()?, property))
159}
160
161/// A helper function for parsing a sequence of space separated codepoints.
162/// The sequence is permitted to be empty.
163pub fn parse_codepoint_sequence(s: &str) -> Result<Vec<Codepoint>, Error> {
164    let mut cps = vec![];
165    for cp in s.trim().split_whitespace() {
166        cps.push(cp.parse()?);
167    }
168    Ok(cps)
169}
170
171/// A helper function for parsing a single test for the various break
172/// algorithms.
173///
174/// Upon success, this returns the UTF-8 encoded groups of codepoints along
175/// with the comment associated with the test. The comment is a human readable
176/// description of the test that may prove useful for debugging.
177pub fn parse_break_test(line: &str) -> Result<(Vec<String>, String), Error> {
178    let re_parts = regex!(
179        r"(?x)
180            ^
181            (?:÷|×)
182            (?P<groups>(?:\s[0-9A-Fa-f]{4,5}\s(?:÷|×))+)
183            \s+
184            \#(?P<comment>.+)
185            $
186            ",
187    );
188    let re_group = regex!(
189        r"(?x)
190            (?P<codepoint>[0-9A-Fa-f]{4,5})\s(?P<kind>÷|×)
191            ",
192    );
193
194    let caps = match re_parts.captures(line.trim()) {
195        Some(caps) => caps,
196        None => return err!("invalid break test line: '{}'", line),
197    };
198    let comment = caps["comment"].trim().to_string();
199
200    let mut groups = vec![];
201    let mut cur = String::new();
202    for cap in re_group.captures_iter(&caps["groups"]) {
203        let cp: Codepoint = cap["codepoint"].parse()?;
204        let ch = match cp.scalar() {
205            Some(ch) => ch,
206            None => {
207                return err!(
208                    "invalid codepoint '{:X}' in line: '{}'",
209                    cp.value(),
210                    line
211                )
212            }
213        };
214        cur.push(ch);
215        if &cap["kind"] == "÷" {
216            groups.push(cur);
217            cur = String::new();
218        }
219    }
220    Ok((groups, comment))
221}
222
223/// Describes a single UCD file.
224pub trait UcdFile:
225    Clone + fmt::Debug + Default + Eq + FromStr<Err = Error> + PartialEq
226{
227    /// The file path corresponding to this file, relative to the UCD
228    /// directory.
229    fn relative_file_path() -> &'static Path;
230
231    /// The full file path corresponding to this file given the UCD directory
232    /// path.
233    fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf {
234        ucd_dir.as_ref().join(Self::relative_file_path())
235    }
236
237    /// Create an iterator over each record in this UCD file.
238    ///
239    /// The parameter should correspond to the directory containing the UCD.
240    fn from_dir<P: AsRef<Path>>(
241        ucd_dir: P,
242    ) -> Result<UcdLineParser<File, Self>, Error> {
243        UcdLineParser::from_path(Self::file_path(ucd_dir))
244    }
245}
246
/// Describes a single UCD file where every record in the file is associated
/// with one or more codepoints.
///
/// The codepoint-indexed parsing helpers in this module use this trait to
/// decide which map keys a record is stored under.
pub trait UcdFileByCodepoint: UcdFile {
    /// Returns an iterator over the codepoints associated with this record.
    fn codepoints(&self) -> CodepointIter;
}
253
/// A line oriented parser for a particular UCD file.
///
/// Callers can build a line parser via the
/// [`UcdFile::from_dir`](trait.UcdFile.html) method.
///
/// The parser is an iterator: each item is the result of parsing the next
/// line that is neither blank nor a `#` comment.
///
/// The `R` type parameter refers to the underlying `io::Read` implementation
/// from which the UCD data is read.
///
/// The `D` type parameter refers to the type of the record parsed out of each
/// line.
#[derive(Debug)]
pub struct UcdLineParser<R, D> {
    // Path of the file being read, if known; attached to I/O errors.
    path: Option<PathBuf>,
    // Buffered wrapper around the caller's reader.
    rdr: io::BufReader<R>,
    // Scratch buffer holding the current line; cleared and reused per line.
    line: String,
    // Number of lines read so far, so the first line of the input is line 1.
    line_number: u64,
    // Ties the parser to its record type without storing a record.
    _data: std::marker::PhantomData<D>,
}
272
273impl<D> UcdLineParser<File, D> {
274    /// Create a new parser from the given file path.
275    pub(crate) fn from_path<P: AsRef<Path>>(
276        path: P,
277    ) -> Result<UcdLineParser<File, D>, Error> {
278        let path = path.as_ref();
279        let file = File::open(path).map_err(|e| Error {
280            kind: ErrorKind::Io(e),
281            line: None,
282            path: Some(path.to_path_buf()),
283        })?;
284        Ok(UcdLineParser::new(Some(path.to_path_buf()), file))
285    }
286}
287
288impl<R: io::Read, D> UcdLineParser<R, D> {
289    /// Create a new parser that parses the reader given.
290    ///
291    /// The type of data parsed is determined when the `parse_next` function
292    /// is called by virtue of the type requested.
293    ///
294    /// Note that the reader is buffered internally, so the caller does not
295    /// need to provide their own buffering.
296    pub(crate) fn new(path: Option<PathBuf>, rdr: R) -> UcdLineParser<R, D> {
297        UcdLineParser {
298            path,
299            rdr: io::BufReader::new(rdr),
300            line: String::new(),
301            line_number: 0,
302            _data: std::marker::PhantomData,
303        }
304    }
305}
306
307impl<R: io::Read, D: FromStr<Err = Error>> Iterator for UcdLineParser<R, D> {
308    type Item = Result<D, Error>;
309
310    fn next(&mut self) -> Option<Result<D, Error>> {
311        loop {
312            self.line_number += 1;
313            self.line.clear();
314            let n = match self.rdr.read_line(&mut self.line) {
315                Err(err) => {
316                    return Some(Err(Error {
317                        kind: ErrorKind::Io(err),
318                        line: None,
319                        path: self.path.clone(),
320                    }))
321                }
322                Ok(n) => n,
323            };
324            if n == 0 {
325                return None;
326            }
327            if !self.line.starts_with('#') && !self.line.trim().is_empty() {
328                break;
329            }
330        }
331        let line_number = self.line_number;
332        Some(self.line.parse().map_err(|mut err: Error| {
333            err.line = Some(line_number);
334            err
335        }))
336    }
337}
338
/// A representation of either a single codepoint or a range of codepoints.
///
/// When parsed from a string, any input containing `..` is treated as a
/// range and everything else as a single codepoint. Iterating yields every
/// codepoint covered by the value.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)]
pub enum Codepoints {
    /// A single codepoint.
    Single(Codepoint),
    /// A range of codepoints.
    Range(CodepointRange),
}
347
348impl Default for Codepoints {
349    fn default() -> Codepoints {
350        Codepoints::Single(Codepoint::default())
351    }
352}
353
354impl IntoIterator for Codepoints {
355    type IntoIter = CodepointIter;
356    type Item = Codepoint;
357
358    fn into_iter(self) -> CodepointIter {
359        match self {
360            Codepoints::Single(x) => x.into_iter(),
361            Codepoints::Range(x) => x.into_iter(),
362        }
363    }
364}
365
366impl FromStr for Codepoints {
367    type Err = Error;
368
369    fn from_str(s: &str) -> Result<Codepoints, Error> {
370        if s.contains("..") {
371            CodepointRange::from_str(s).map(Codepoints::Range)
372        } else {
373            Codepoint::from_str(s).map(Codepoints::Single)
374        }
375    }
376}
377
378impl fmt::Display for Codepoints {
379    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
380        match *self {
381            Codepoints::Single(ref x) => x.fmt(f),
382            Codepoints::Range(ref x) => x.fmt(f),
383        }
384    }
385}
386
387impl PartialEq<u32> for Codepoints {
388    fn eq(&self, other: &u32) -> bool {
389        match *self {
390            Codepoints::Single(ref x) => x == other,
391            Codepoints::Range(ref x) => x == &(*other, *other),
392        }
393    }
394}
395
396impl PartialEq<Codepoint> for Codepoints {
397    fn eq(&self, other: &Codepoint) -> bool {
398        match *self {
399            Codepoints::Single(ref x) => x == other,
400            Codepoints::Range(ref x) => x == &(*other, *other),
401        }
402    }
403}
404
405impl PartialEq<(u32, u32)> for Codepoints {
406    fn eq(&self, other: &(u32, u32)) -> bool {
407        match *self {
408            Codepoints::Single(ref x) => &(x.value(), x.value()) == other,
409            Codepoints::Range(ref x) => x == other,
410        }
411    }
412}
413
414impl PartialEq<(Codepoint, Codepoint)> for Codepoints {
415    fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
416        match *self {
417            Codepoints::Single(ref x) => &(*x, *x) == other,
418            Codepoints::Range(ref x) => x == other,
419        }
420    }
421}
422
/// A range of Unicode codepoints. The range is inclusive; both ends of the
/// range are guaranteed to be valid codepoints.
///
/// The string form is `start..end`, with each end written as a `Codepoint`.
///
/// NOTE(review): nothing visible in this type checks that `start <= end`;
/// iterating a range whose start is greater than its end yields nothing.
#[derive(
    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
pub struct CodepointRange {
    /// The start of the codepoint range.
    pub start: Codepoint,
    /// The end of the codepoint range.
    pub end: Codepoint,
}
434
435impl IntoIterator for CodepointRange {
436    type IntoIter = CodepointIter;
437    type Item = Codepoint;
438
439    fn into_iter(self) -> CodepointIter {
440        CodepointIter { next: self.start.value(), range: self }
441    }
442}
443
444impl FromStr for CodepointRange {
445    type Err = Error;
446
447    fn from_str(s: &str) -> Result<CodepointRange, Error> {
448        let re_parts = regex!(r"^(?P<start>[A-Z0-9]+)\.\.(?P<end>[A-Z0-9]+)$");
449        let caps = match re_parts.captures(s) {
450            Some(caps) => caps,
451            None => return err!("invalid codepoint range: '{}'", s),
452        };
453        let start = caps["start"].parse().or_else(|err| {
454            err!("failed to parse '{}' as a codepoint range: {}", s, err)
455        })?;
456        let end = caps["end"].parse().or_else(|err| {
457            err!("failed to parse '{}' as a codepoint range: {}", s, err)
458        })?;
459        Ok(CodepointRange { start, end })
460    }
461}
462
463impl fmt::Display for CodepointRange {
464    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
465        write!(f, "{}..{}", self.start, self.end)
466    }
467}
468
469impl PartialEq<(u32, u32)> for CodepointRange {
470    fn eq(&self, other: &(u32, u32)) -> bool {
471        &(self.start.value(), self.end.value()) == other
472    }
473}
474
475impl PartialEq<(Codepoint, Codepoint)> for CodepointRange {
476    fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
477        &(self.start, self.end) == other
478    }
479}
480
/// A single Unicode codepoint.
///
/// This type's string representation is a hexadecimal number. It is guaranteed
/// to be in the range `[0, 10FFFF]`.
///
/// Note that unlike Rust's `char` type, this may be a surrogate codepoint.
///
/// The wrapped value is private; values are built through
/// `Codepoint::from_u32` or by parsing a hexadecimal string.
#[derive(
    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
pub struct Codepoint(u32);
491
492impl Codepoint {
493    /// Create a new codepoint from a `u32`.
494    ///
495    /// If the given number is not a valid codepoint, then this returns an
496    /// error.
497    pub fn from_u32(n: u32) -> Result<Codepoint, Error> {
498        if n > 0x10FFFF {
499            err!("{:x} is not a valid Unicode codepoint", n)
500        } else {
501            Ok(Codepoint(n))
502        }
503    }
504
505    /// Return the underlying `u32` codepoint value.
506    pub fn value(self) -> u32 {
507        self.0
508    }
509
510    /// Attempt to convert this codepoint to a Unicode scalar value.
511    ///
512    /// If this is a surrogate codepoint, then this returns `None`.
513    pub fn scalar(self) -> Option<char> {
514        char::from_u32(self.0)
515    }
516}
517
518impl IntoIterator for Codepoint {
519    type IntoIter = CodepointIter;
520    type Item = Codepoint;
521
522    fn into_iter(self) -> CodepointIter {
523        let range = CodepointRange { start: self, end: self };
524        CodepointIter { next: self.value(), range }
525    }
526}
527
528impl FromStr for Codepoint {
529    type Err = Error;
530
531    fn from_str(s: &str) -> Result<Codepoint, Error> {
532        match u32::from_str_radix(s, 16) {
533            Ok(n) => Codepoint::from_u32(n),
534            Err(err) => {
535                return err!(
536                    "failed to parse '{}' as a hexadecimal codepoint: {}",
537                    s,
538                    err
539                );
540            }
541        }
542    }
543}
544
545impl fmt::Display for Codepoint {
546    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
547        write!(f, "{:04X}", self.0)
548    }
549}
550
551impl PartialEq<u32> for Codepoint {
552    fn eq(&self, other: &u32) -> bool {
553        self.0 == *other
554    }
555}
556
557impl PartialEq<Codepoint> for u32 {
558    fn eq(&self, other: &Codepoint) -> bool {
559        *self == other.0
560    }
561}
562
/// An iterator over a range of Unicode codepoints.
///
/// Values are produced through the `IntoIterator` implementations of
/// `Codepoint`, `CodepointRange` and `Codepoints`.
#[derive(Debug)]
pub struct CodepointIter {
    // Numeric value of the next codepoint to yield.
    next: u32,
    // The range being iterated; its `end` is the last codepoint yielded.
    range: CodepointRange,
}
569
570impl Iterator for CodepointIter {
571    type Item = Codepoint;
572
573    fn next(&mut self) -> Option<Codepoint> {
574        if self.next > self.range.end.value() {
575            return None;
576        }
577        let current = self.next;
578        self.next += 1;
579        Some(Codepoint::from_u32(current).unwrap())
580    }
581}