icu_data/ucm/
types.rs

/// The "equivalence type" of the Unicode codepoint to the bytestring in the [`Encoding`]. The
/// equivalence types are defined by the Unicode consortium as such:
///
/// ```text
/// # The 1st column is the Unicode scalar value.
/// # The 2nd column is the codepage byte sequence.
/// # The 3rd column is the fallback indicator.
/// # The fallback indicator can have one of the following values:
/// #   |0 for exact 1-1 roundtrip mapping
/// #   |1 for the best fallback codepage byte sequence.
/// #   |2 for the substitution character
/// #   |3 for the best reverse fallback Unicode scalar value
/// ```
///
/// Variants are declared in indicator order, so the derived ordering gives
/// `Type0 < Type1 < Type2 < Type3`.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone)]
pub enum EquivalenceType {
    /// Fallback indicator `|0`: exact 1-1 roundtrip mapping.
    Type0,
    /// Fallback indicator `|1`: the best fallback codepage byte sequence.
    Type1,
    /// Fallback indicator `|2`: the substitution character.
    Type2,
    /// Fallback indicator `|3`: the best reverse fallback Unicode scalar value.
    Type3,
}
21
22/// This represents a `CHARMAP` row in a `.ucm` (UniCode Mapping) file.
23#[derive(Debug, Clone, PartialEq)]
24pub struct Codepoint {
25    pub uni: char,
26    pub eq_type: EquivalenceType,
27    pub bytestring: Vec<u8>,
28}
29
30impl Into<char> for Codepoint {
31    fn into(self) -> char {
32        self.uni
33    }
34}
35
36/// This represents a single `.ucm` (UniCode Mapping) file.
37#[derive(Debug, Clone, PartialEq)]
38pub struct Encoding {
39    /// Note: does not include `<icu:state>`. Unordered.
40    pub metadata: std::collections::HashMap<String, String>,
41    /// This list is guaranteed to be in the order it is in the file.
42    pub codepoints: Vec<Codepoint>,
43    /// Parsing of `states` is left to those who wish to implement them. We provide a more complete
44    /// parser of them by using [`UcmParser`](super::parser::UcmParser) with
45    /// [`Rule::state_row`](super::parser::Rule::state_row), but matching the rules is up to you.
46    /// For more information on their format, see [this page from the ICU User
47    /// Guide](https://unicode-org.github.io/icu/userguide/conversion/data.html#state-table-syntax-in-ucm-files).
48    pub states: Vec<String>,
49}