// icu_data/ucm/parser.rs

//! A Pest parser for the `.ucm` (UniCode Mapping) file format.
2use std::collections::HashMap;
3
4use super::{Codepoint, Encoding, EquivalenceType};
5
6use pest::error::{Error, ErrorVariant};
7use pest::iterators::Pair;
8pub use pest::Parser;
9use pest::Position;
10use pest_derive::Parser;
11
12/// This [`pest`] parser implements the grammar in `../resources/ucm.pest`.
13#[derive(Parser)]
14#[grammar = "../resources/ucm.pest"]
15pub struct UcmParser;
16
17/// Dumps a tree to `stderr` of the `.ucm` data in the argument as seen by the Pest parser. The
18/// tree comes from the [`pest_ascii_tree`] crate.
19pub fn parse_debug_dump(ucms: &str) {
20    let ast = UcmParser::parse(Rule::ucm, ucms).unwrap_or_else(|e|{panic!("{:?}", e);});
21    use pest_ascii_tree::into_ascii_tree;
22    eprintln!("{}", into_ascii_tree(ast).unwrap());
23}
24
25/// Parse a UCM document into an [`Encoding`].
26pub fn parse(ucms: &str) -> Result<Encoding, Error<Rule>> {
27    fn parse_bytestring(bs: &str) -> Vec<u8> {
28        bs.split("\\x")
29            .filter(|s| s.trim().len() != 0)
30            .map(|s| u8::from_str_radix(s.trim(), 16).unwrap())
31            .collect()
32    }
33
34    let ucm = match UcmParser::parse(Rule::ucm, ucms)?.next() {
35        Some(parsed) => parsed,
36        None => {
37            Err(Error::new_from_pos(
38                ErrorVariant::CustomError {
39                    message: "No rules in parsed file?".to_string(),
40                },
41                Position::new(ucms, 0).unwrap(),
42            ))?
43        }
44    };
45
46    let mut codepoints = vec![];
47    let mut metadata = HashMap::new();
48    let mut states = vec![];
49    for i in ucm.into_inner() {
50        let rules: Vec<Pair<_>> = i.clone().into_inner().into_iter().collect();
51        match i.as_rule() {
52            Rule::unicode_record => {
53                let (uni, bytestring, utype) = (&rules[0], &rules[1], &rules[2]);
54                debug_assert_eq!(uni.as_rule(), Rule::unicode_inner);
55                debug_assert!([Rule::type0, Rule::type1, Rule::type2, Rule::type3].iter().any(|r|utype.as_rule() == *r));
56                let uni = char::from_u32(u32::from_str_radix(uni.as_span().as_str(), 16).unwrap())
57                    .unwrap();
58                let eq_type = match utype.as_rule() {
59                    Rule::type0 => EquivalenceType::Type0,
60                    Rule::type1 => EquivalenceType::Type1,
61                    Rule::type2 => EquivalenceType::Type2,
62                    Rule::type3 => EquivalenceType::Type3,
63                    _ => unreachable!(),
64                };
65                let bytestring: Vec<u8> = parse_bytestring(bytestring.as_str());
66                codepoints.push(Codepoint {
67                    uni,
68                    eq_type,
69                    bytestring,
70                });
71            }
72            Rule::metadata_record => {
73                let (key, value) = (&rules[0], &rules[1]);
74                debug_assert_eq!(key.as_rule(), Rule::metadata_key);
75                metadata
76                    .insert(key.as_str().to_owned(), value.as_str().to_owned());
77            }
78            Rule::state_record => {
79                let state_row = &rules[0];
80                debug_assert_eq!(state_row.as_rule(), Rule::state_row);
81                states.push(state_row.as_str().to_owned());
82            }
83            _ => {}
84        }
85    }
86    Ok(Encoding { codepoints, metadata, states })
87}