unicode_writing_script/
lib.rs

1//! Detection of writing scripts from Unicode codepoints.
2
3use std::collections::BTreeMap;
4
5use interavl::IntervalTree;
6
7include!(concat!(env!("OUT_DIR"), "/data.rs"));
8
/// A context.
///
/// Holds one interval tree per script, keyed by the script name. [`Context::default`] builds
/// the trees from the `DATA` table included from the build output directory.
#[derive(Clone)]
pub struct Context {
    // Script name -> the codepoint ranges belonging to that script.
    forest: BTreeMap<&'static str, Tree>,
}
14
/// A script.
///
/// The tally reported for one script by `Context::detect`. The type is `Debug` and comparable
/// so that results can be printed and asserted on directly.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct Script {
    /// The number of codepoints found in the script.
    pub count: usize,
    /// The total number of codepoints in the script.
    pub total: usize,
}
23
/// The codepoint ranges of a single script.
#[derive(Clone)]
struct Tree {
    // Total number of codepoints covered by the ranges the tree was built from.
    count: usize,
    // The ranges themselves, stored half-open (`start..end + 1`) with no payload.
    backend: IntervalTree<u32, ()>,
}
29
30impl Context {
31    /// Detect writing scripts given ranges of Unicode codepoints.
32    pub fn detect<T>(&self, codepoints: T) -> BTreeMap<&'static str, Script>
33    where
34        T: IntoIterator<Item = [u32; 2]>,
35    {
36        let mut scripts: BTreeMap<&'static str, Script> = Default::default();
37        for [start, end] in codepoints {
38            for (name, tree) in self.forest.iter() {
39                for (range, _) in tree.backend.iter_overlaps(&(start..end + 1)) {
40                    let count = range.end.min(end + 1) - range.start.max(start);
41                    let script = scripts.entry(*name).or_insert_with(|| Script {
42                        count: 0,
43                        total: tree.count,
44                    });
45                    script.count += count as usize;
46                }
47            }
48        }
49        scripts
50    }
51}
52
53impl Default for Context {
54    fn default() -> Self {
55        let mut forest = BTreeMap::default();
56        for (name, ranges) in DATA {
57            forest.insert(*name, Tree::from(*ranges));
58        }
59        Self { forest }
60    }
61}
62
63impl From<&[(u32, u32)]> for Tree {
64    fn from(ranges: &[(u32, u32)]) -> Self {
65        let mut count = 0;
66        let mut backend = IntervalTree::default();
67        for (start, end) in ranges.iter().cloned() {
68            count += (end - start + 1) as usize;
69            backend.insert(start..end + 1, ());
70        }
71        Self { count, backend }
72    }
73}
74
75#[cfg(test)]
76mod tests {
77    use serde::Deserialize;
78
79    use crate::Context;
80
    // Shorthand for calling `.unwrap()` on the given expression.
    macro_rules! ok(($result:expr) => ($result.unwrap()));
82
    /// One entry of a codepoint list as written in the JSON fixtures of these tests.
    ///
    /// Deserialized untagged, so the shape of the JSON value selects the variant.
    #[derive(Deserialize)]
    #[serde(untagged)]
    enum Range {
        /// A lone codepoint, written as a bare number such as `13`.
        Single(u32),
        /// A pair of codepoints, written as a two-element array such as `[32, 126]`.
        Double(u32, u32),
    }
89
90    impl From<Range> for [u32; 2] {
91        fn from(range: Range) -> Self {
92            match range {
93                Range::Single(value) => [value, value],
94                Range::Double(start, end) => [start, end],
95            }
96        }
97    }
98
    /// Checks the per-script `(name, count, total)` tallies detected for the `CODEPOINTS`
    /// fixture, whose largest named scripts are Latin, Cyrillic and Greek.
    #[test]
    fn proxima_nova() {
        const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 399, 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 601, 626, 688, [699, 700], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, [884, 885], 894, [900, 906], 908, [910, 929], [931, 974], 983, [1024, 1119], [1138, 1141], [1162, 1279], [1296, 1299], [1308, 1309], [1316, 1319], [1326, 1327], [7682, 7683], [7690, 7691], [7710, 7711], [7714, 7715], [7766, 7767], [7776, 7777], [7786, 7787], [7808, 7813], 7838, [7840, 7929], [8192, 8202], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8317, 8329], [8333, 8334], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8467, [8470, 8471], 8480, 8482, 8486, 8494, [8531, 8532], [8539, 8542], [8592, 8595], 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], [8882, 8883], 8901, 8984, 9632, 9650, 9654, 9660, 9664, 9674, 9679, 9733, [9744, 9745], 9830, 10003, 11800, [57348, 57349], [62522, 62523], [62560, 62579], [62616, 62623], [62662, 62663], [62668, 62669], [62674, 62679], [62730, 62731], [62734, 62771], [62774, 62777], [62780, 62783], 63031, 63171, 63197, [63199, 63219], 63743, [64256, 64260]]";
        compare(
            CODEPOINTS,
            vec![
                ("Cyrillic", 230, 518),
                ("Emoji", 27, 1589),
                ("Emoji_Component", 15, 156),
                ("Greek", 78, 554),
                ("Inherited", 20, 714),
                ("Latin", 395, 1528),
                ("PUA", 115, 137468),
                ("Unknown", 123, 954979),
            ],
        );
    }
116
    /// Checks the per-script `(name, count, total)` tallies detected for a `CODEPOINTS`
    /// fixture that includes Arabic coverage.
    #[test]
    fn proxima_nova_arabic() {
        const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 399, 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 601, 626, [699, 703], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, 894, 1548, 1563, 1567, [1569, 1594], [1600, 1622], [1632, 1645], [1648, 1649], 1662, 1700, 1722, [7682, 7683], [7690, 7695], [7710, 7717], [7722, 7723], [7732, 7733], [7766, 7767], [7776, 7779], [7786, 7791], [7808, 7813], [7826, 7827], 7830, 7838, [7840, 7929], [8192, 8202], [8204, 8205], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8320, 8329], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8467, [8470, 8471], 8480, 8482, 8486, 8494, 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], 9674, 9676, 11800, [57348, 57349], 63171, 63197, 63743, [64256, 64260], 64336, 64362, 64414, 64508, [64830, 64831], 65020, 65153, 65155, 65157, 65159, 65165, 65167, 65171, 65173, 65177, 65181, 65185, 65189, 65193, 65195, 65197, 65199, 65201, 65205, 65209, 65213, 65217, 65221, 65225, 65229, 65233, 65237, 65241, 65245, 65249, 65253, 65261, 65263, 65265]";

        compare(
            CODEPOINTS,
            vec![
                ("Arabic", 100, 1469),
                ("Emoji", 21, 1589),
                ("Emoji_Component", 16, 156),
                ("Greek", 2, 554),
                ("Hebrew", 1, 143),
                ("Inherited", 36, 714),
                ("Latin", 414, 1528),
                ("PUA", 5, 137468),
                ("Unknown", 8, 954979),
            ],
        );
    }
136
    /// Checks the per-script `(name, count, total)` tallies detected for a `CODEPOINTS`
    /// fixture that includes Devanagari coverage.
    #[test]
    fn proxima_nova_devanagari() {
        const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 626, [699, 700], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, 894, [2304, 2431], [7682, 7683], [7690, 7691], [7710, 7711], [7714, 7715], [7766, 7767], [7776, 7777], [7786, 7787], [7808, 7813], 7838, [7840, 7929], [8192, 8202], [8204, 8205], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8320, 8329], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8471, 8482, 8486, 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], 9674, 9676, 11800, 43259, [57348, 57349], 63171, 63197, [64256, 64260]]";

        compare(
            CODEPOINTS,
            vec![
                ("Arabic", 1, 1469),
                ("Devanagari", 125, 169),
                ("Emoji", 21, 1589),
                ("Emoji_Component", 16, 156),
                ("Greek", 2, 554),
                ("Inherited", 27, 714),
                ("Latin", 391, 1528),
                ("PUA", 4, 137468),
                ("Unknown", 7, 954979),
            ],
        );
    }
156
    /// Checks the per-script `(name, count, total)` tallies detected for a `CODEPOINTS`
    /// fixture that includes Hangul coverage.
    #[test]
    fn proxima_nova_hangeul() {
        const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 626, [699, 700], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, 894, [7682, 7683], [7690, 7691], [7710, 7711], [7714, 7715], [7766, 7767], [7776, 7777], [7786, 7787], [7808, 7813], 7838, [7840, 7929], [8192, 8202], [8204, 8205], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8320, 8329], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8471, 8482, 8486, 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], [9001, 9002], [9312, 9321], [9332, 9341], 9450, 9471, 9674, [10102, 10111], 11800, [12289, 12290], [12298, 12305], [12308, 12315], [12593, 12643], [12800, 12828], [12896, 12923], 12927, [44032, 55203], [57348, 57349], 63171, 63197, [64256, 64260], [65040, 65049], [65073, 65074], [65081, 65092]]";

        compare(
            CODEPOINTS,
            vec![
                ("Emoji", 22, 1589),
                ("Emoji_Component", 17, 156),
                ("Greek", 2, 554),
                ("Hangul", 11281, 11753),
                ("Inherited", 23, 714),
                ("Katakana", 1, 335),
                ("Latin", 391, 1528),
                ("PUA", 4, 137468),
                ("Unknown", 10, 954979),
            ],
        );
    }
176
    /// Checks the per-script `(name, count, total)` tallies detected for a `CODEPOINTS`
    /// fixture that includes Hebrew coverage.
    #[test]
    fn proxima_nova_hebrew() {
        const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 626, [699, 700], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, 894, [1456, 1468], [1470, 1476], 1479, [1488, 1514], [1520, 1524], [7682, 7683], [7690, 7691], [7710, 7711], [7714, 7717], [7730, 7731], [7766, 7767], [7776, 7777], [7786, 7789], [7806, 7813], [7826, 7827], 7838, [7840, 7929], [8192, 8202], [8204, 8207], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8320, 8329], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8471, 8482, 8486, 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], 9674, 9676, 11800, [57348, 57349], 63171, 63197, [64256, 64260], 64285, 64287, [64297, 64310], [64312, 64316], 64318, [64320, 64321], [64323, 64324], [64326, 64334]]";

        compare(
            CODEPOINTS,
            vec![
                ("Emoji", 21, 1589),
                ("Emoji_Component", 17, 156),
                ("Greek", 2, 554),
                ("Hebrew", 88, 143),
                ("Inherited", 23, 714),
                ("Latin", 401, 1528),
                ("PUA", 4, 137468),
                ("Unknown", 14, 954979),
            ],
        );
    }
195
    /// Checks the per-script `(name, count, total)` tallies detected for a `CODEPOINTS`
    /// fixture that includes Tamil coverage.
    #[test]
    fn proxima_nova_tamil() {
        const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 626, [699, 700], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, 894, [2404, 2405], [2946, 2947], [2949, 2954], [2958, 2960], [2962, 2965], [2969, 2970], 2972, [2974, 2975], [2979, 2980], [2984, 2986], [2990, 3001], [3006, 3010], [3014, 3016], [3018, 3021], 3024, 3031, [3046, 3066], [7682, 7683], [7690, 7691], [7710, 7711], [7714, 7715], [7732, 7735], [7738, 7739], [7748, 7753], [7766, 7767], [7774, 7779], [7786, 7789], [7808, 7813], 7838, [7840, 7929], [8192, 8202], [8204, 8205], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8320, 8329], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8471, 8482, 8486, 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], 9674, 9676, 11800, [57348, 57349], 63171, 63197, [64256, 64260]]";

        compare(
            CODEPOINTS,
            vec![
                ("Devanagari", 1, 169),
                ("Emoji", 21, 1589),
                ("Emoji_Component", 16, 156),
                ("Greek", 2, 554),
                ("Inherited", 22, 714),
                ("Latin", 409, 1528),
                ("PUA", 4, 137468),
                ("Tamil", 72, 141),
                ("Unknown", 23, 954979),
            ],
        );
    }
215
    /// Checks the per-script `(name, count, total)` tallies detected for a `CODEPOINTS`
    /// fixture that includes Thai coverage.
    #[test]
    fn proxima_nova_thai() {
        const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 626, [699, 700], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, 894, [3585, 3642], [3647, 3675], [7682, 7683], [7690, 7691], [7710, 7711], [7714, 7715], [7766, 7767], [7776, 7777], [7786, 7787], [7808, 7813], 7838, [7840, 7929], [8192, 8202], [8204, 8205], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8320, 8329], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8471, 8482, 8486, 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], 9674, 9676, 11800, [57348, 57349], 63171, 63197, [64256, 64260]]";

        compare(
            CODEPOINTS,
            vec![
                ("Emoji", 21, 1589),
                ("Emoji_Component", 16, 156),
                ("Greek", 2, 554),
                ("Inherited", 22, 714),
                ("Latin", 391, 1528),
                ("PUA", 4, 137468),
                ("Thai", 86, 88),
                ("Unknown", 9, 954979),
            ],
        );
    }
234
235    fn compare(codepoints: &str, expected: Vec<(&str, usize, usize)>) {
236        let codepoints: Vec<Range> = ok!(serde_json::from_str(codepoints));
237        let results = Context::default().detect(codepoints.into_iter().map(Into::into));
238        let actual = results
239            .into_iter()
240            .map(|(name, script)| (name, script.count, script.total))
241            .collect::<Vec<_>>();
242        assert_eq!(actual, expected);
243    }
244}