icu_data/ucm/types.rs
/// The "equivalence type" of the Unicode codepoint to the bytestring in the [`Encoding`]. The
/// equivalence types are defined by the Unicode consortium as such:
///
/// ```text
/// # The 1st column is the Unicode scalar value.
/// # The 2nd column is the codepage byte sequence.
/// # The 3rd column is the fallback indicator.
/// # The fallback indicator can have one of the following values:
/// # |0 for exact 1-1 roundtrip mapping
/// # |1 for the best fallback codepage byte sequence.
/// # |2 for the substitution character
/// # |3 for the best reverse fallback Unicode scaler value
/// ```
///
/// Variants are named after the fallback indicator values quoted above. The derived ordering
/// follows declaration order (`Type0 < Type1 < Type2 < Type3`), and the type is `Hash` so it can
/// be used as a key in hashed collections.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone)]
pub enum EquivalenceType {
    /// Indicator `|0`: exact 1-1 roundtrip mapping.
    Type0,
    /// Indicator `|1`: the best fallback codepage byte sequence.
    Type1,
    /// Indicator `|2`: the substitution character.
    Type2,
    /// Indicator `|3`: the best reverse fallback Unicode scalar value.
    Type3,
}
21
22/// This represents a `CHARMAP` row in a `.ucm` (UniCode Mapping) file.
23#[derive(Debug, Clone, PartialEq)]
24pub struct Codepoint {
25 pub uni: char,
26 pub eq_type: EquivalenceType,
27 pub bytestring: Vec<u8>,
28}
29
30impl Into<char> for Codepoint {
31 fn into(self) -> char {
32 self.uni
33 }
34}
35
36/// This represents a single `.ucm` (UniCode Mapping) file.
37#[derive(Debug, Clone, PartialEq)]
38pub struct Encoding {
39 /// Note: does not include `<icu:state>`. Unordered.
40 pub metadata: std::collections::HashMap<String, String>,
41 /// This list is guaranteed to be in the order it is in the file.
42 pub codepoints: Vec<Codepoint>,
43 /// Parsing of `states` is left to those who wish to implement them. We provide a more complete
44 /// parser of them by using [`UcmParser`](super::parser::UcmParser) with
45 /// [`Rule::state_row`](super::parser::Rule::state_row), but matching the rules is up to you.
46 /// For more information on their format, see [this page from the ICU User
47 /// Guide](https://unicode-org.github.io/icu/userguide/conversion/data.html#state-table-syntax-in-ucm-files).
48 pub states: Vec<String>,
49}