ucd_parse/
special_casing.rs

1use std::path::Path;
2
3use crate::{
4    common::{
5        parse_codepoint_sequence, Codepoint, CodepointIter, UcdFile,
6        UcdFileByCodepoint,
7    },
8    error::Error,
9};
10
11/// A single row in the `SpecialCasing.txt` file.
12///
13/// Note that a single codepoint may be mapped multiple times. In particular,
14/// a single codepoint might have mappings based on distinct language sensitive
15/// conditions (e.g., `U+0307`).
16#[derive(Clone, Debug, Default, Eq, PartialEq)]
17pub struct SpecialCaseMapping {
18    /// The codepoint that is being mapped.
19    pub codepoint: Codepoint,
20    /// The lowercase mapping, which may be empty.
21    pub lowercase: Vec<Codepoint>,
22    /// The titlecase mapping, which may be empty.
23    pub titlecase: Vec<Codepoint>,
24    /// The uppercase mapping, which may be empty.
25    pub uppercase: Vec<Codepoint>,
26    /// A list of language specific conditions, see `SpecialCasing.txt` for
27    /// more details.
28    pub conditions: Vec<String>,
29}
30
31impl UcdFile for SpecialCaseMapping {
32    fn relative_file_path() -> &'static Path {
33        Path::new("SpecialCasing.txt")
34    }
35}
36
37impl UcdFileByCodepoint for SpecialCaseMapping {
38    fn codepoints(&self) -> CodepointIter {
39        self.codepoint.into_iter()
40    }
41}
42
43impl std::str::FromStr for SpecialCaseMapping {
44    type Err = Error;
45
46    fn from_str(line: &str) -> Result<SpecialCaseMapping, Error> {
47        let re_parts = regex!(
48            r"(?x)
49                ^
50                \s*(?P<codepoint>[^\s;]+)\s*;
51                \s*(?P<lower>[^;]+)\s*;
52                \s*(?P<title>[^;]+)\s*;
53                \s*(?P<upper>[^;]+)\s*;
54                \s*(?P<conditions>[^;\x23]+)?
55                ",
56        );
57
58        let caps = match re_parts.captures(line.trim()) {
59            Some(caps) => caps,
60            None => return err!("invalid SpecialCasing line: '{}'", line),
61        };
62        let conditions = caps
63            .name("conditions")
64            .map(|x| {
65                x.as_str()
66                    .trim()
67                    .split_whitespace()
68                    .map(|c| c.to_string())
69                    .collect()
70            })
71            .unwrap_or(vec![]);
72        Ok(SpecialCaseMapping {
73            codepoint: caps["codepoint"].parse()?,
74            lowercase: parse_codepoint_sequence(&caps["lower"])?,
75            titlecase: parse_codepoint_sequence(&caps["title"])?,
76            uppercase: parse_codepoint_sequence(&caps["upper"])?,
77            conditions,
78        })
79    }
80}
81
#[cfg(test)]
mod tests {
    use super::SpecialCaseMapping;

    /// A row without a condition list parses with empty `conditions`.
    #[test]
    fn parse_no_conds() {
        let input = "1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA\n";
        let mapping = input.parse::<SpecialCaseMapping>().unwrap();

        assert_eq!(mapping.codepoint, 0x1F52);
        assert_eq!(mapping.lowercase, vec![0x1F52]);
        assert_eq!(mapping.titlecase, vec![0x03A5, 0x0313, 0x0300]);
        assert_eq!(mapping.uppercase, vec![0x03A5, 0x0313, 0x0300]);
        assert!(mapping.conditions.is_empty());
    }

    /// A row with an empty lowercase field and two conditions.
    #[test]
    fn parse_conds() {
        let input = "0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE\n";
        let mapping = input.parse::<SpecialCaseMapping>().unwrap();

        assert_eq!(mapping.codepoint, 0x0307);
        assert!(mapping.lowercase.is_empty());
        assert_eq!(mapping.titlecase, vec![0x0307]);
        assert_eq!(mapping.uppercase, vec![0x0307]);
        assert_eq!(mapping.conditions, vec!["tr", "After_I"]);
    }
}