Skip to main content

google_fonts_glyphsets/
lib.rs

1use std::collections::{HashMap, HashSet};
2
3use serde::Deserialize;
4
5use thiserror::Error;
6
/// Errors reported when looking up glyphset definitions.
#[derive(Debug, Error)]
pub enum GlyphsetError {
    /// The requested glyphset name is not present in the glyphset table.
    #[error("Glyphset not found")]
    GlyphsetNotFound,
    /// A glyphset lists an included glyphset that is itself not present in
    /// the glyphset table, i.e. the bundled definitions contradict each other.
    #[error("Bug in glyphset definitions")]
    InternalInconsistency,
}
14
/// Expands a four-letter script code into the script name used in glyphset
/// names (e.g. `Latn` becomes `Latin`).
///
/// Only the handful of scripts we actually ship are mapped; any other input
/// is returned unchanged.
fn script_name(s: &str) -> String {
    let expanded = match s {
        "Latn" => "Latin",
        "Arab" => "Arabic",
        "Cyrl" => "Cyrillic",
        unmapped => unmapped,
    };
    expanded.to_string()
}
24
25include!(concat!(env!("OUT_DIR"), "/data.rs"));
26
#[derive(Default, Debug, Clone, Deserialize, PartialEq)]
/// The definition of a glyph set used by Google Fonts
pub struct Glyphset {
    /// Name of the glyphset; also the key it is looked up by in the glyphset
    /// table. The second `_`-separated component is treated as the script.
    /// Not deserialized (`serde(skip)`).
    #[serde(skip)]
    name: String,
    /// Free-form description. Not read by any code in this file.
    #[serde(default)]
    description: String,
    /// Names of other glyphsets whose contents this glyphset includes.
    #[serde(default)]
    include_glyphsets: Vec<String>,
    /// Language codes explicitly listed for this glyphset.
    #[serde(default)]
    language_codes: Vec<String>,
    /// Codepoints belonging to this glyphset. Not deserialized
    /// (`serde(skip)`); presumably filled in by the generated data - TODO confirm.
    #[serde(skip)]
    codepoints: HashSet<u32>,
    /// Not read by any code in this file.
    /// NOTE(review): presumably controls whether auxiliary characters count - confirm.
    use_auxiliary: Option<bool>,
    /// Unless this is `Some(true)`, historical languages are skipped when
    /// languages are selected by region.
    historical: Option<bool>,
    /// Regions used to select additional languages; when empty, no
    /// region-based selection takes place.
    #[serde(default)]
    regions: Vec<String>,
    /// Language ids that region-based selection must never add.
    #[serde(default)]
    exclude_language_codes: Vec<String>,
    /// Minimum population a language needs to be picked up by region-based
    /// selection; treated as 0 when absent.
    population: Option<u32>,
}
48
49impl Glyphset {
50    fn script(&self) -> String {
51        self.name
52            .split("_")
53            .skip(1)
54            .take(1)
55            .next()
56            .expect("Malformed glyphset name - no script")
57            .to_string()
58    }
59
60    /// Get an iterator over all codepoints in this glyphset, including those
61    /// from included glyphsets.
62    pub fn iter_codepoints(&self) -> impl Iterator<Item = u32> {
63        // Must recursively include codepoints from included glyphsets
64        // We can't make a recursive iterator, rustc goes crazy, so we'll
65        // just put everything in an owned hashset.
66
67        // Infinite loops are bad.
68        let mut seen_glyphsets = HashSet::new();
69        let mut codepoints = HashSet::new();
70        let mut to_process = vec![self.name.clone()];
71        while let Some(glyphset_name) = to_process.pop() {
72            if seen_glyphsets.contains(&glyphset_name) {
73                continue;
74            }
75            seen_glyphsets.insert(glyphset_name.clone());
76            let glyphset = GLYPHSETS
77                .get(&glyphset_name)
78                .expect("Malformed glyphset name in include_glyphsets");
79            for cp in &glyphset.codepoints {
80                codepoints.insert(*cp);
81            }
82        }
83        codepoints.into_iter()
84    }
85}
86
/// A struct to hold coverage information for a single glyphset
#[derive(Default, Debug, Clone)]
pub struct Coverage {
    /// The set of codepoints from the glyphset present in the provided codepoint set
    pub has: HashSet<u32>,
    /// The set of codepoints from the glyphset which are not present in the provided codepoint set
    pub missing: HashSet<u32>,
    /// The fraction of the glyphset present (0.0 to 1.0).
    ///
    /// As computed by `get_coverage`, this is not always `has / (has + missing)`:
    /// for glyphsets other than `GF_Latin_Core` and `GF_Latin_Kernel` that are
    /// not fully covered, it is measured only over the codepoints left after
    /// removing the Latin core (or kernel) codepoints.
    pub fraction: f32,
}
97
98pub fn get_coverage<T>(codepoints: &T, glyphset: &str) -> Option<Coverage>
99where
100    for<'a> &'a T: IntoIterator<Item = &'a u32>,
101{
102    let codepoints_set: HashSet<u32> = codepoints.into_iter().copied().collect();
103    GLYPHSETS.get(glyphset).map(|gs| {
104        let our_codepoints = &gs.codepoints;
105        let mut coverage = Coverage::default();
106        let our_codepoints_set: HashSet<u32> = our_codepoints.iter().copied().collect();
107        coverage.has = our_codepoints_set
108            .intersection(&codepoints_set)
109            .copied()
110            .collect();
111        coverage.missing = our_codepoints_set
112            .difference(&codepoints_set)
113            .copied()
114            .collect();
115        // Percentage calculations are a little involved...
116        if !coverage.has.is_empty() && coverage.missing.is_empty() {
117            coverage.fraction = 1.0;
118        } else if glyphset == "GF_Latin_Core" || glyphset == "GF_Latin_Kernel" {
119            coverage.fraction = coverage.has.len() as f32 / our_codepoints.len() as f32;
120        } else {
121            let our_codepoints_without_core = our_codepoints_set
122                .difference(&GF_LATIN_CORE_CODEPOINTS.into())
123                .copied()
124                .collect::<HashSet<_>>();
125            let our_codepoints_without_kernel = our_codepoints_set
126                .difference(&GF_LATIN_KERNEL_CODEPOINTS.into())
127                .copied()
128                .collect::<HashSet<_>>();
129            let unicodes_unique_in_glyphset =
130                if our_codepoints_without_core.is_superset(&our_codepoints_without_kernel) {
131                    our_codepoints_without_core
132                } else {
133                    our_codepoints_without_kernel
134                };
135            if unicodes_unique_in_glyphset.is_empty() {
136                coverage.fraction = 0.0;
137            } else {
138                let has = coverage
139                    .has
140                    .intersection(&unicodes_unique_in_glyphset)
141                    .count();
142                coverage.fraction = has as f32 / unicodes_unique_in_glyphset.len() as f32;
143            }
144        }
145        coverage
146    })
147}
148
149pub fn get_glyphset_coverage<T>(codepoints: &T) -> HashMap<String, Coverage>
150where
151    for<'a> &'a T: IntoIterator<Item = &'a u32>,
152{
153    GLYPHSETS
154        .keys()
155        .map(|glyphset| {
156            let coverage = get_coverage(codepoints, glyphset).unwrap();
157            (glyphset.to_string(), coverage)
158        })
159        .collect()
160}
161
162pub fn languages_per_glyphset(gs: &str) -> Result<Vec<String>, GlyphsetError> {
163    let glyphset = GLYPHSETS.get(gs).ok_or(GlyphsetError::GlyphsetNotFound)?;
164    let mut codes = glyphset.language_codes.clone();
165    for include in &glyphset.include_glyphsets {
166        let include_glyphset = GLYPHSETS
167            .get(include)
168            .ok_or(GlyphsetError::InternalInconsistency)?;
169        codes.extend(include_glyphset.language_codes.clone());
170    }
171    if !glyphset.regions.is_empty() {
172        for language in google_fonts_languages::LANGUAGES.values() {
173            if glyphset
174                .exclude_language_codes
175                .contains(&language.id().to_string())
176            {
177                continue;
178            }
179            if language.historical() && !glyphset.historical.unwrap_or(false) {
180                continue;
181            }
182            // Why did I make language.population signed? That's just not smart.
183            if glyphset.population.unwrap_or(0) as i32 > language.population() {
184                continue;
185            }
186            if glyphset.script() == script_name(language.script()) {
187                let language_region_set: HashSet<_> = language.region.iter().collect();
188                if glyphset
189                    .regions
190                    .iter()
191                    .any(|region| language_region_set.contains(region))
192                {
193                    codes.push(language.id().to_string());
194                }
195            }
196        }
197    }
198    Ok(codes)
199}
200
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects every codepoint reported for the named built-in glyphset.
    fn codepoints_of(name: &str) -> HashSet<u32> {
        GLYPHSETS.get(name).unwrap().iter_codepoints().collect()
    }

    #[test]
    fn test_name() {
        let arabic_plus = languages_per_glyphset("GF_Arabic_Plus").unwrap();
        assert!(arabic_plus.len() >= 8);
        let latin_african = languages_per_glyphset("GF_Latin_African").unwrap();
        assert!(latin_african.len() >= 617);
    }

    #[test]
    fn test_codepoints() {
        let kernel = codepoints_of("GF_Latin_Kernel");
        assert!(kernel.contains(&0x0041)); // A
        assert!(!kernel.contains(&0x00E9)); // é

        // Arabic core includes kernel plus basic Arabic glyphs
        let arabic_core = codepoints_of("GF_Arabic_Core");
        assert!(arabic_core.is_superset(&kernel));
        assert!(arabic_core.contains(&0x0627)); // ALEF

        // Arabic Plus includes Latin Kernel + Arabic Core plus more
        let arabic_plus = codepoints_of("GF_Arabic_Plus");
        assert!(arabic_plus.is_superset(&arabic_core));
        assert!(arabic_plus.is_superset(&kernel));
        assert!(arabic_plus.contains(&0x06B3)); // ARABIC LETTER GUEH
    }
}