ucd_parse/
case_folding.rs

1use std::path::Path;
2
3use crate::{
4    common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
5    error::Error,
6};
7
8/// A single row in the `CaseFolding.txt` file.
9///
10/// The contents of `CaseFolding.txt` are a convenience derived from both
11/// `UnicodeData.txt` and `SpecialCasing.txt`.
12///
13/// Note that a single codepoint may be mapped multiple times. In particular,
14/// a single codepoint might have distinct `CaseStatus::Simple` and
15/// `CaseStatus::Full` mappings.
16#[derive(Clone, Debug, Default, Eq, PartialEq)]
17pub struct CaseFold {
18    /// The codepoint that is being mapped.
19    pub codepoint: Codepoint,
20    /// The case status of this mapping.
21    pub status: CaseStatus,
22    /// The actual case mapping, which is more than one codepoint if this is
23    /// a "full" mapping.
24    pub mapping: Vec<Codepoint>,
25}
26
27impl UcdFile for CaseFold {
28    fn relative_file_path() -> &'static Path {
29        Path::new("CaseFolding.txt")
30    }
31}
32
33impl UcdFileByCodepoint for CaseFold {
34    fn codepoints(&self) -> CodepointIter {
35        self.codepoint.into_iter()
36    }
37}
38
39impl std::str::FromStr for CaseFold {
40    type Err = Error;
41
42    fn from_str(line: &str) -> Result<CaseFold, Error> {
43        let re_parts = regex!(
44            r"(?x)
45                ^
46                \s*(?P<codepoint>[^\s;]+)\s*;
47                \s*(?P<status>[^\s;]+)\s*;
48                \s*(?P<mapping>[^;]+)\s*;
49                ",
50        );
51
52        let caps = match re_parts.captures(line.trim()) {
53            Some(caps) => caps,
54            None => return err!("invalid CaseFolding line: '{}'", line),
55        };
56        let mut mapping = vec![];
57        for cp in caps["mapping"].split_whitespace() {
58            mapping.push(cp.parse()?);
59        }
60        Ok(CaseFold {
61            codepoint: caps["codepoint"].parse()?,
62            status: caps["status"].parse()?,
63            mapping,
64        })
65    }
66}
67
/// The kind of a particular case mapping.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CaseStatus {
    /// A mapping that "simple" and "full" case mappings have in common.
    Common,
    /// A mapping that changes the number of codepoints.
    Full,
    /// A mapping that keeps the number of codepoints the same, given when it
    /// differs from the `Full` mapping.
    Simple,
    /// A special case (currently only used for Turkic mappings), typically
    /// excluded by default. Special cases don't change the number of
    /// codepoints, but may change the encoded (e.g., UTF-8) length in bytes.
    Special,
}

impl Default for CaseStatus {
    /// The default status is `CaseStatus::Common`.
    fn default() -> Self {
        Self::Common
    }
}

impl CaseStatus {
    /// Reports whether a mapping with this status leaves the number of
    /// codepoints unchanged. `CaseStatus::Full` is the only status for which
    /// this returns false.
    pub fn is_fixed(&self) -> bool {
        match *self {
            CaseStatus::Full => false,
            CaseStatus::Common
            | CaseStatus::Simple
            | CaseStatus::Special => true,
        }
    }
}
97
98impl std::str::FromStr for CaseStatus {
99    type Err = Error;
100
101    fn from_str(s: &str) -> Result<CaseStatus, Error> {
102        match s {
103            "C" => Ok(CaseStatus::Common),
104            "F" => Ok(CaseStatus::Full),
105            "S" => Ok(CaseStatus::Simple),
106            "T" => Ok(CaseStatus::Special),
107            _ => err!(
108                "unrecognized case status: '{}' \
109                 (must be one of C, F, S or T)",
110                s
111            ),
112        }
113    }
114}
115
#[cfg(test)]
mod tests {
    use super::{CaseFold, CaseStatus};

    /// Parses `line` as a `CaseFold`, panicking if parsing fails.
    fn parse_row(line: &str) -> CaseFold {
        line.parse().unwrap()
    }

    // A `C` row: one codepoint mapped to one codepoint.
    #[test]
    fn parse_common() {
        let fold = parse_row(
            "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE\n",
        );
        assert_eq!(fold.codepoint, 0x0150);
        assert_eq!(fold.status, CaseStatus::Common);
        assert_eq!(fold.mapping, vec![0x0151]);
    }

    // An `F` row: the mapping holds several codepoints.
    #[test]
    fn parse_full() {
        let fold = parse_row("03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS\n");
        assert_eq!(fold.codepoint, 0x03B0);
        assert_eq!(fold.status, CaseStatus::Full);
        assert_eq!(fold.mapping, vec![0x03C5, 0x0308, 0x0301]);
    }

    // An `S` row.
    #[test]
    fn parse_simple() {
        let fold = parse_row("1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI\n");
        assert_eq!(fold.codepoint, 0x1F8F);
        assert_eq!(fold.status, CaseStatus::Simple);
        assert_eq!(fold.mapping, vec![0x1F87]);
    }

    // A `T` row.
    #[test]
    fn parse_special() {
        let fold = parse_row("0049; T; 0131; # LATIN CAPITAL LETTER I\n");
        assert_eq!(fold.codepoint, 0x0049);
        assert_eq!(fold.status, CaseStatus::Special);
        assert_eq!(fold.mapping, vec![0x0131]);
    }
}
156}