//! Detection of writing scripts from Unicode codepoints.
use std::collections::BTreeMap;
use interavl::IntervalTree;
include!(concat!(env!("OUT_DIR"), "/data.rs"));
/// A detection context holding one interval tree per script name.
///
/// The `Default` implementation fills the forest from the build-generated `DATA` table.
#[derive(Clone)]
pub struct Context {
// Maps each script name to the tree of codepoint ranges recorded for that script.
forest: BTreeMap<&'static str, Tree>,
}
/// Detection statistics for a single writing script.
///
/// Values of this type are produced by `Context::detect`, one per script that
/// intersects the queried codepoints.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct Script {
    /// The number of queried codepoints that fall within the script.
    pub count: usize,
    /// The total number of codepoints belonging to the script.
    pub total: usize,
}
/// The codepoint ranges recorded for a single script.
#[derive(Clone)]
struct Tree {
/// Total number of codepoints covered by the ranges the tree was built from.
count: usize,
/// Interval tree keyed by half-open ranges (`start..end + 1`); the values carry no data.
backend: IntervalTree<u32, ()>,
}
impl Context {
    /// Detect writing scripts given ranges of Unicode codepoints.
    ///
    /// Each item of `codepoints` is an inclusive range `[start, end]`. For every script
    /// whose ranges intersect at least one of the given ranges, the result holds the
    /// number of intersecting codepoints (`count`) and the size of the script (`total`).
    /// Scripts without any intersection are absent from the result.
    ///
    /// Every input range is processed independently, so codepoints that appear in several
    /// input ranges are counted once per appearance. Ranges with `start > end` cover
    /// nothing and are ignored.
    pub fn detect<T>(&self, codepoints: T) -> BTreeMap<&'static str, Script>
    where
        T: IntoIterator<Item = [u32; 2]>,
    {
        let mut scripts: BTreeMap<&'static str, Script> = BTreeMap::new();
        for [start, end] in codepoints {
            // An inverted range is empty; skipping it also keeps the overlap arithmetic
            // below from underflowing.
            if start > end {
                continue;
            }
            // Convert to a half-open range. Saturating instead of `end + 1` avoids an
            // overflow when `end == u32::MAX`.
            let query = start..end.saturating_add(1);
            for (name, tree) in &self.forest {
                for (range, _) in tree.backend.iter_overlaps(&query) {
                    // Size of the intersection of the stored range and the query.
                    let overlap = range
                        .end
                        .min(query.end)
                        .saturating_sub(range.start.max(query.start));
                    let script = scripts.entry(*name).or_insert_with(|| Script {
                        count: 0,
                        total: tree.count,
                    });
                    script.count += overlap as usize;
                }
            }
        }
        scripts
    }
}
impl Default for Context {
    /// Build a context with one tree per entry of the generated `DATA` table.
    fn default() -> Self {
        let forest: BTreeMap<_, _> = DATA
            .iter()
            .map(|(name, ranges)| (*name, Tree::from(*ranges)))
            .collect();
        Self { forest }
    }
}
impl From<&[(u32, u32)]> for Tree {
    /// Build a tree from inclusive `(start, end)` codepoint pairs.
    ///
    /// Each pair is stored as the half-open range `start..end + 1`, and `count`
    /// accumulates the number of codepoints over all pairs.
    fn from(ranges: &[(u32, u32)]) -> Self {
        let mut backend = IntervalTree::default();
        let count: usize = ranges
            .iter()
            .map(|&(first, last)| {
                let size = (last - first + 1) as usize;
                backend.insert(first..last + 1, ());
                size
            })
            .sum();
        Self { count, backend }
    }
}
#[cfg(test)]
mod tests {
use serde::Deserialize;
use crate::Context;
// Unwraps the given result expression, panicking on failure.
macro_rules! ok {
    ($result:expr) => {
        $result.unwrap()
    };
}
// One entry of a deserialized codepoint list, as used by the fixtures in this module:
// either a bare number or a two-element array.
#[derive(Deserialize)]
#[serde(untagged)]
enum Range {
// A single codepoint.
Single(u32),
// A pair of bounds, `start` and `end`.
Double(u32, u32),
}
impl From<Range> for [u32; 2] {
    // Converts an entry into a `[start, end]` pair; a single codepoint becomes a
    // range whose two bounds are equal.
    fn from(range: Range) -> Self {
        let (first, last) = match range {
            Range::Single(point) => (point, point),
            Range::Double(first, last) => (first, last),
        };
        [first, last]
    }
}
// Checks detection on a coverage list containing Latin, Greek, and Cyrillic codepoints
// (presumably the character coverage of a Proxima Nova font — TODO confirm).
// Each expected tuple is (script name, detected count, script total).
#[test]
fn proxima_nova() {
const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 399, 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 601, 626, 688, [699, 700], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, [884, 885], 894, [900, 906], 908, [910, 929], [931, 974], 983, [1024, 1119], [1138, 1141], [1162, 1279], [1296, 1299], [1308, 1309], [1316, 1319], [1326, 1327], [7682, 7683], [7690, 7691], [7710, 7711], [7714, 7715], [7766, 7767], [7776, 7777], [7786, 7787], [7808, 7813], 7838, [7840, 7929], [8192, 8202], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8317, 8329], [8333, 8334], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8467, [8470, 8471], 8480, 8482, 8486, 8494, [8531, 8532], [8539, 8542], [8592, 8595], 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], [8882, 8883], 8901, 8984, 9632, 9650, 9654, 9660, 9664, 9674, 9679, 9733, [9744, 9745], 9830, 10003, 11800, [57348, 57349], [62522, 62523], [62560, 62579], [62616, 62623], [62662, 62663], [62668, 62669], [62674, 62679], [62730, 62731], [62734, 62771], [62774, 62777], [62780, 62783], 63031, 63171, 63197, [63199, 63219], 63743, [64256, 64260]]";
compare(
CODEPOINTS,
vec![
("Cyrillic", 230, 518),
("Emoji", 27, 1589),
("Emoji_Component", 15, 156),
("Greek", 78, 554),
("Inherited", 20, 714),
("Latin", 395, 1528),
("PUA", 115, 137468),
("Unknown", 123, 954979),
],
);
}
// Checks detection on a coverage list that additionally contains Arabic codepoints.
// Each expected tuple is (script name, detected count, script total).
#[test]
fn proxima_nova_arabic() {
const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 399, 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 601, 626, [699, 703], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, 894, 1548, 1563, 1567, [1569, 1594], [1600, 1622], [1632, 1645], [1648, 1649], 1662, 1700, 1722, [7682, 7683], [7690, 7695], [7710, 7717], [7722, 7723], [7732, 7733], [7766, 7767], [7776, 7779], [7786, 7791], [7808, 7813], [7826, 7827], 7830, 7838, [7840, 7929], [8192, 8202], [8204, 8205], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8320, 8329], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8467, [8470, 8471], 8480, 8482, 8486, 8494, 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], 9674, 9676, 11800, [57348, 57349], 63171, 63197, 63743, [64256, 64260], 64336, 64362, 64414, 64508, [64830, 64831], 65020, 65153, 65155, 65157, 65159, 65165, 65167, 65171, 65173, 65177, 65181, 65185, 65189, 65193, 65195, 65197, 65199, 65201, 65205, 65209, 65213, 65217, 65221, 65225, 65229, 65233, 65237, 65241, 65245, 65249, 65253, 65261, 65263, 65265]";
compare(
CODEPOINTS,
vec![
("Arabic", 100, 1469),
("Emoji", 21, 1589),
("Emoji_Component", 16, 156),
("Greek", 2, 554),
("Hebrew", 1, 143),
("Inherited", 36, 714),
("Latin", 414, 1528),
("PUA", 5, 137468),
("Unknown", 8, 954979),
],
);
}
// Checks detection on a coverage list that additionally contains Devanagari codepoints.
// Each expected tuple is (script name, detected count, script total).
#[test]
fn proxima_nova_devanagari() {
const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 626, [699, 700], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, 894, [2304, 2431], [7682, 7683], [7690, 7691], [7710, 7711], [7714, 7715], [7766, 7767], [7776, 7777], [7786, 7787], [7808, 7813], 7838, [7840, 7929], [8192, 8202], [8204, 8205], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8320, 8329], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8471, 8482, 8486, 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], 9674, 9676, 11800, 43259, [57348, 57349], 63171, 63197, [64256, 64260]]";
compare(
CODEPOINTS,
vec![
("Arabic", 1, 1469),
("Devanagari", 125, 169),
("Emoji", 21, 1589),
("Emoji_Component", 16, 156),
("Greek", 2, 554),
("Inherited", 27, 714),
("Latin", 391, 1528),
("PUA", 4, 137468),
("Unknown", 7, 954979),
],
);
}
// Checks detection on a coverage list that additionally contains Hangul codepoints.
// Each expected tuple is (script name, detected count, script total).
#[test]
fn proxima_nova_hangeul() {
const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 626, [699, 700], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, 894, [7682, 7683], [7690, 7691], [7710, 7711], [7714, 7715], [7766, 7767], [7776, 7777], [7786, 7787], [7808, 7813], 7838, [7840, 7929], [8192, 8202], [8204, 8205], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8320, 8329], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8471, 8482, 8486, 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], [9001, 9002], [9312, 9321], [9332, 9341], 9450, 9471, 9674, [10102, 10111], 11800, [12289, 12290], [12298, 12305], [12308, 12315], [12593, 12643], [12800, 12828], [12896, 12923], 12927, [44032, 55203], [57348, 57349], 63171, 63197, [64256, 64260], [65040, 65049], [65073, 65074], [65081, 65092]]";
compare(
CODEPOINTS,
vec![
("Emoji", 22, 1589),
("Emoji_Component", 17, 156),
("Greek", 2, 554),
("Hangul", 11281, 11753),
("Inherited", 23, 714),
("Katakana", 1, 335),
("Latin", 391, 1528),
("PUA", 4, 137468),
("Unknown", 10, 954979),
],
);
}
// Checks detection on a coverage list that additionally contains Hebrew codepoints.
// Each expected tuple is (script name, detected count, script total).
#[test]
fn proxima_nova_hebrew() {
const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 626, [699, 700], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, 894, [1456, 1468], [1470, 1476], 1479, [1488, 1514], [1520, 1524], [7682, 7683], [7690, 7691], [7710, 7711], [7714, 7717], [7730, 7731], [7766, 7767], [7776, 7777], [7786, 7789], [7806, 7813], [7826, 7827], 7838, [7840, 7929], [8192, 8202], [8204, 8207], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8320, 8329], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8471, 8482, 8486, 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], 9674, 9676, 11800, [57348, 57349], 63171, 63197, [64256, 64260], 64285, 64287, [64297, 64310], [64312, 64316], 64318, [64320, 64321], [64323, 64324], [64326, 64334]]";
compare(
CODEPOINTS,
vec![
("Emoji", 21, 1589),
("Emoji_Component", 17, 156),
("Greek", 2, 554),
("Hebrew", 88, 143),
("Inherited", 23, 714),
("Latin", 401, 1528),
("PUA", 4, 137468),
("Unknown", 14, 954979),
],
);
}
// Checks detection on a coverage list that additionally contains Tamil codepoints.
// Each expected tuple is (script name, detected count, script total).
#[test]
fn proxima_nova_tamil() {
const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 626, [699, 700], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, 894, [2404, 2405], [2946, 2947], [2949, 2954], [2958, 2960], [2962, 2965], [2969, 2970], 2972, [2974, 2975], [2979, 2980], [2984, 2986], [2990, 3001], [3006, 3010], [3014, 3016], [3018, 3021], 3024, 3031, [3046, 3066], [7682, 7683], [7690, 7691], [7710, 7711], [7714, 7715], [7732, 7735], [7738, 7739], [7748, 7753], [7766, 7767], [7774, 7779], [7786, 7789], [7808, 7813], 7838, [7840, 7929], [8192, 8202], [8204, 8205], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8320, 8329], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8471, 8482, 8486, 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], 9674, 9676, 11800, [57348, 57349], 63171, 63197, [64256, 64260]]";
compare(
CODEPOINTS,
vec![
("Devanagari", 1, 169),
("Emoji", 21, 1589),
("Emoji_Component", 16, 156),
("Greek", 2, 554),
("Inherited", 22, 714),
("Latin", 409, 1528),
("PUA", 4, 137468),
("Tamil", 72, 141),
("Unknown", 23, 954979),
],
);
}
// Checks detection on a coverage list that additionally contains Thai codepoints.
// Each expected tuple is (script name, detected count, script total).
#[test]
fn proxima_nova_thai() {
const CODEPOINTS: &str = "[0, 13, [32, 126], [160, 383], 402, 413, [416, 417], [431, 432], [486, 487], [490, 491], [506, 511], [536, 539], [562, 563], 567, 626, [699, 700], [710, 711], 713, 715, [728, 733], [768, 772], [774, 780], 783, 785, 803, [806, 808], 814, 817, 894, [3585, 3642], [3647, 3675], [7682, 7683], [7690, 7691], [7710, 7711], [7714, 7715], [7766, 7767], [7776, 7777], [7786, 7787], [7808, 7813], 7838, [7840, 7929], [8192, 8202], [8204, 8205], [8211, 8213], [8216, 8222], [8224, 8226], 8230, 8240, [8242, 8243], [8249, 8250], [8253, 8254], 8260, 8304, [8308, 8313], [8320, 8329], 8353, [8355, 8356], [8358, 8364], 8372, [8376, 8378], [8380, 8381], 8471, 8482, 8486, 8706, 8710, 8719, [8721, 8722], 8725, [8729, 8730], 8734, 8747, 8776, 8800, [8804, 8805], 9674, 9676, 11800, [57348, 57349], 63171, 63197, [64256, 64260]]";
compare(
CODEPOINTS,
vec![
("Emoji", 21, 1589),
("Emoji_Component", 16, 156),
("Greek", 2, 554),
("Inherited", 22, 714),
("Latin", 391, 1528),
("PUA", 4, 137468),
("Thai", 86, 88),
("Unknown", 9, 954979),
],
);
}
// Parses `codepoints` as a JSON list of ranges, runs detection with a default
// context, and asserts that the (name, count, total) triples, in the order the
// result map yields them, equal `expected`.
fn compare(codepoints: &str, expected: Vec<(&str, usize, usize)>) {
    let ranges: Vec<Range> = ok!(serde_json::from_str(codepoints));
    let context = Context::default();
    let mut actual = Vec::new();
    for (name, script) in context.detect(ranges.into_iter().map(Into::into)) {
        actual.push((name, script.count, script.total));
    }
    assert_eq!(actual, expected);
}
}