unicode_collation/
lib.rs

//! An implementation of the [Unicode Collation Algorithm](https://www.unicode.org/reports/tr10/)
//! as specified by Unicode Technical Standard #10.
3//!
4//! # Usage
5//!
6//! Add this to your *Cargo.toml*:
7//! ```toml
8//! [dependencies]
9//! unicode-collation = "0.1"
10//! ```
11//!
12//! # Examples
//! Generate a sort key for the given string:
14//!
15//! ```rust
16//! extern crate unicode_collation;
17//! use unicode_collation::{collate, CollationTable};
18//! 
19//! # pub fn main() {
20//! let table = CollationTable::from_text_file("data/allkeys.txt");
21//! let key = collate("Hello!!!", &table);
22//! assert_eq!(format!("{:?}", key), "[\
23//!     1D7E 1D10 1DDD 1DDD 1E43 0261 0261 0261 | \
24//!     0020 0020 0020 0020 0020 0020 0020 0020 | \
25//!     0008 0002 0002 0002 0002 0002 0002 0002 |]");
26//! # }
27//! ```
28extern crate pest;
29#[macro_use]
30extern crate pest_derive;
31extern crate unicode_normalization;
32
33use unicode_normalization::UnicodeNormalization;
34use std::fmt;
35use std::ops::Deref;
36
37mod table;
38mod multistage;
39
/// A collation sort key: a flat sequence of 16-bit weights.
///
/// Keys produced by [`collate`] list the non-zero weights of each level in
/// turn, with a `0` weight separating consecutive levels (trailing separators
/// are stripped).
///
/// Sort keys exist to be compared, so the type derives the comparison traits:
/// ordering is the lexicographic ordering of the underlying weights, which
/// makes `SortKey` directly usable with `sort`, `BTreeMap`, `HashMap`, etc.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct SortKey(Vec<u16>);
41pub use table::CollationTable;
42
43impl Deref for SortKey {
44    type Target = Vec<u16>;
45
46    fn deref(&self) -> &Self::Target {
47        &self.0
48    }
49}
50
51impl fmt::Debug for SortKey {
52    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
53        write!(f, "[")?;
54        for weight in &self.0 {
55            if *weight == 0 {
56                write!(f, "| ")?;
57            } else {
58                write!(f, "{:04X} ", weight)?;
59            }
60        }
61        write!(f, "|")?;
62        write!(f, "]")?;
63        Ok(())
64    }
65}
66
67
68pub fn collate(text: &str, table: &table::CollationTable) -> SortKey {
69    let mut weights = Vec::new();
70    for c in text.nfd() {
71        weights.extend(table.resolve(c));
72    }
73
74    let mut sort_key = Vec::with_capacity(weights.len());
75    // For all levels
76    for level in 0..4 {
77        for entry in &weights {
78            let weight = entry.weights[level];
79            if weight != 0 {
80                sort_key.push(weight);
81            }
82        }
83        sort_key.push(0);
84    }
85    while sort_key.last() == Some(&0) {
86        sort_key.pop();
87    }
88    
89    SortKey(sort_key)
90}
91
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::{BufRead, BufReader};
    use std::{char, u32};

    /// Checks `collate` against every entry of the
    /// `CollationTest_NON_IGNORABLE.txt` data file.
    ///
    /// Each data line has the form `<hex code points>;<comment>`, where the
    /// comment contains the expected sort key in square brackets, written in
    /// the same notation that `SortKey`'s `Debug` impl produces.
    #[test]
    fn test() {
        let table = CollationTable::from_text_file("data/allkeys.txt");

        let file = File::open("data/CollationTest/CollationTest_NON_IGNORABLE.txt")
            .expect("unable to open the collation conformance test file");
        let reader = BufReader::new(file);
        for (index, line) in reader.lines().enumerate() {
            // `enumerate` counts from zero; report the 1-based line number so
            // a failure points at the right line of the data file.
            let line_num = index + 1;
            let line = line.unwrap();
            let line = line.trim();
            // Skip comments and blank lines.
            if line.starts_with('#') || line.is_empty() {
                continue;
            }
            let mut parts = line.split(';');

            // First field: whitespace-separated hexadecimal code points.
            let codes = parts.next().unwrap();
            let text = codes
                .split_whitespace()
                .map(|s| u32::from_str_radix(s, 16).expect("code point is not valid hex"))
                .map(|c| char::from_u32(c).expect("code point is not a valid char"))
                .collect::<String>();

            let sort_key = collate(&text, &table);

            // Second field: a comment containing the expected key, delimited
            // by the first '[' and the last ']' (both inclusive).
            let comment = parts.next().unwrap();
            let from = comment.find('[').unwrap();
            let to = comment.rfind(']').unwrap();
            let expected = &comment[from..to + 1];

            let actual = format!("{:?}", sort_key);
            assert_eq!(expected, actual, "failed on line '{}': {}", line_num, line);
        }
    }
}