unicode_collation/
table.rs

1use pest::{self, Parser};
2use multistage::MultiStage;
3use std::{usize, u16, u32};
4use std::fs::File;
5use std::io::Read;
6use std::fmt;
7
8#[derive(Parser)]
9#[grammar = "syntax.pest"]
10struct TableParser;
11
12type Nodes<'a> = pest::iterators::Pairs<'a, Rule>;
13type Node<'a> = pest::iterators::Pair<'a, Rule>;
14
/// One set of collation weights, as produced from a single `element` node
/// of the parsed table.
pub struct WeightEntry {
    /// The four weights of this element, for levels 1-4.
    pub weights: [u16; 4],
}
19
20impl fmt::Debug for WeightEntry {
21    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
22        write!(f, "[")?;
23        for weight in &self.weights {
24            write!(f, ".{:04X}", weight)?;
25        }
26        write!(f, "]")?;
27        Ok(())
28    }
29}
30
31pub struct CollationTable {
32    /// Weights for all entries
33    weights: Vec<WeightEntry>,
34    table: MultiStage<PrimaryEntry>,
35}
36
37impl CollationTable {
38    fn new() -> Self {
39        Self {
40            weights: Vec::new(),
41            table: MultiStage::new(8),
42        }
43    }
44
45    pub fn from_text(data: &str) -> CollationTable {
46        let mut table: Nodes = TableParser::parse(Rule::table, data).unwrap();
47        let table = table.next().unwrap();
48        let entries = table
49            .into_inner()
50            .skip_while(|p| p.as_rule() != Rule::entry);
51
52        let mut max_contr = 0;
53
54        let mut table = CollationTable::new();
55        for entry in entries {
56            let mut it = entry.into_inner();
57            let mut codepoints = it.next().unwrap().into_inner();
58            let elements = it.next().unwrap();
59
60            // Record weights (FIXME: de-duplicate?)
61            let weights_idx = table.weights.len();
62            table.weights.extend(elements.into_inner().map(scan_weight));
63
64            let first = u32::from_str_radix(codepoints.next().unwrap().as_str(), 16).unwrap();
65            let entry = table.table.entry(first);
66
67            if codepoints.next().is_some() {
68                entry.contraction += 1;
69                max_contr = max_contr.max(entry.contraction);
70            } else {
71                // Record entry's own weights
72                debug_assert_eq!(entry.weights, usize::MAX);
73                entry.weights = weights_idx;
74                entry.len = (table.weights.len() - weights_idx) as u8;
75            }
76        }
77
78        table
79    }
80
81    pub fn from_text_file(path: &str) -> CollationTable {
82        let mut file = File::open(path).unwrap();
83        let mut buf = String::new();
84        file.read_to_string(&mut buf).unwrap();
85        CollationTable::from_text(&buf)
86    }
87
88    pub fn resolve(&self, c: char) -> &[WeightEntry] {
89        self.table
90            .get(c as u32)
91            .and_then(|e| {
92                if e.weights == usize::MAX {
93                    None
94                } else {
95                    Some(&self.weights[e.weights..e.weights + (e.len as usize)])
96                }
97            })
98            .unwrap_or(&[])
99    }
100}
101
/// Entry in the collation table corresponding to the first character
/// of one or more table entries.
#[derive(Clone)]
struct PrimaryEntry {
    /// Index of this entry's first weight in the weights array, or
    /// `usize::MAX` when the character has no weights of its own.
    weights: usize,
    /// Length of weights block. If `len` is 0, this entry does not have assigned weights
    /// and is only used as a beginning of some contraction.
    len: u8,
    /// Number of contraction sequences seen so far that begin with this
    /// character; 0 when there are none.
    ///
    /// NOTE(review): if this is meant to become an index into a contraction
    /// table with a "none" sentinel, the default value and the increment in
    /// `CollationTable::from_text` have to change together.
    contraction: u16,
}
114
115impl Default for PrimaryEntry {
116    fn default() -> Self {
117        Self {
118            weights: usize::MAX,
119            len: 0,
120            contraction: 0,
121        }
122    }
123}
124
125fn scan_weight(node: Node) -> WeightEntry {
126    debug_assert_eq!(node.as_rule(), Rule::element);
127
128    let mut it = node.into_inner();
129    let _alt = it.next().unwrap();
130
131    let mut weights = [0; 4];
132    for i in 0..4 {
133        weights[i] = it.next()
134            .map(|r| u16::from_str_radix(r.as_str(), 16).unwrap())
135            .unwrap_or(0);
136    }
137
138    WeightEntry { weights }
139}
140
#[cfg(test)]
mod tests {
    use super::CollationTable;

    /// Smoke test: loading the bundled table must not panic.
    #[test]
    pub fn parse() {
        let path = "data/allkeys.txt";
        let _table = CollationTable::from_text_file(path);
    }
}
149}