// typg_core/scriptmap.rs

//! Script-tag resolution for the multi-wise `--scripts` filter.
//!
//! A user-supplied script tag may be an **ISO 15924** code (`Latn`, `Deva`,
//! `Latf`) or an **OpenType** script tag (`latn`, `dev2`, `arab`). Each input
//! is resolved into a [`ScriptRequirement`] carrying two independent ways a
//! font can satisfy it:
//!
//! 1. **OpenType** — a *group* of OpenType script tags. The font's GSUB/GPOS
//!    script list must contain *at least one* tag from the group. So `deva`
//!    expands to `["deva", "dev2", "dev3"]` and any one of them suffices.
//!
//! 2. **Unicode** — a single Unicode script (UAX #24). The font's `cmap` is
//!    considered to cover that script when it maps at least
//!    [`MIN_UNICODE_COVERAGE`] codepoints belonging to it.
//!
//! The whole-font decision (see [`crate::query::Query::matches`]) is: a font
//! supports the requested list if it satisfies *every* requirement through the
//! OpenType path, **or** it satisfies *every* requirement through the Unicode
//! path.
//!
//! Resolution is best-effort: when an input cannot be mapped, it is treated as
//! a literal OpenType script tag (and, where possible, as a Unicode script via
//! its title-cased spelling).
//!
//! Made by FontLab <https://www.fontlab.com/>
use std::collections::HashSet;

use read_fonts::types::Tag;
use unicode_script::{Script, UnicodeScript};

use crate::tags::tag4;

/// Smallest number of `cmap` codepoints that must belong to a Unicode script
/// before the font's `cmap` counts as covering that script.
pub const MIN_UNICODE_COVERAGE: usize = 4;

37/// One resolved script requirement: an OpenType tag group plus an optional
38/// Unicode script. Built by [`resolve_scripts`].
39#[derive(Debug, Clone)]
40pub struct ScriptRequirement {
41    /// The original input string, preserved for diagnostics.
42    input: String,
43    /// OpenType script tags; the font satisfies the OpenType path if its
44    /// GSUB/GPOS script list contains *any* of these.
45    ot_tags: Vec<Tag>,
46    /// Unicode script the font's `cmap` must cover (≥ [`MIN_UNICODE_COVERAGE`]
47    /// codepoints) to satisfy the Unicode path. `None` when the input maps to
48    /// no known Unicode script.
49    unicode_script: Option<Script>,
50}
51
52impl ScriptRequirement {
53    /// The original input tag string.
54    pub fn input(&self) -> &str {
55        &self.input
56    }
57
58    /// The OpenType tag group; any one present in the font satisfies the OT path.
59    pub fn ot_tags(&self) -> &[Tag] {
60        &self.ot_tags
61    }
62
63    /// The Unicode script the font's `cmap` must cover, if any.
64    pub fn unicode_script(&self) -> Option<Script> {
65        self.unicode_script
66    }
67
68    /// Whether the font's GSUB/GPOS script tags satisfy the OpenType path.
69    pub fn ot_satisfied(&self, font_scripts: &HashSet<Tag>) -> bool {
70        self.ot_tags.iter().any(|tag| font_scripts.contains(tag))
71    }
72
73    /// Whether the given codepoints cover this requirement's Unicode script.
74    ///
75    /// Returns `false` when the input mapped to no Unicode script. Otherwise,
76    /// counts codepoints belonging to that script and stops early once
77    /// [`MIN_UNICODE_COVERAGE`] is reached.
78    pub fn unicode_satisfied<I>(&self, codepoints: I) -> bool
79    where
80        I: IntoIterator<Item = char>,
81    {
82        let Some(script) = self.unicode_script else {
83            return false;
84        };
85        let mut count = 0usize;
86        for ch in codepoints {
87            if ch.script() == script {
88                count += 1;
89                if count >= MIN_UNICODE_COVERAGE {
90                    return true;
91                }
92            }
93        }
94        false
95    }
96}
97
98/// Resolve a list of raw script-tag strings into [`ScriptRequirement`]s.
99///
100/// Empty/whitespace-only entries are skipped. Each remaining entry maps to one
101/// requirement via [`resolve_one`].
102pub fn resolve_scripts(raw: &[String]) -> Vec<ScriptRequirement> {
103    raw.iter()
104        .filter_map(|s| {
105            let trimmed = s.trim();
106            if trimmed.is_empty() {
107                None
108            } else {
109                Some(resolve_one(trimmed))
110            }
111        })
112        .collect()
113}
114
115/// Resolve a single script-tag string into a [`ScriptRequirement`].
116fn resolve_one(input: &str) -> ScriptRequirement {
117    let key = input.to_ascii_lowercase();
118
119    let (ot_strings, unicode_short): (Vec<&str>, Option<&str>) = match lookup(&key) {
120        Some((tags, uni)) => (tags.to_vec(), Some(uni)),
121        None => (vec![key.as_str()], None),
122    };
123
124    let ot_tags: Vec<Tag> = ot_strings.iter().filter_map(|t| tag4(t).ok()).collect();
125
126    // Unicode script: explicit mapping if present, else the title-cased input
127    // (works for the many scripts whose OpenType/ISO base equals the Unicode
128    // 4-letter code, e.g. "latn" → "Latn", "cyrl" → "Cyrl", "hani" → "Hani").
129    let unicode_script = unicode_short
130        .and_then(Script::from_short_name)
131        .or_else(|| Script::from_short_name(&title_case(&key)))
132        // `Zzzz` and other uncoded inputs resolve to `Unknown`, which would
133        // spuriously match unassigned/private-use codepoints. Treat as "none".
134        .filter(|s| *s != Script::Unknown);
135
136    ScriptRequirement {
137        input: input.to_string(),
138        ot_tags,
139        unicode_script,
140    }
141}
142
/// Title-cases a script code: first character ASCII-uppercased, the rest
/// ASCII-lowercased (`"latn"` → `"Latn"`, `"LATN"` → `"Latn"`).
///
/// Non-ASCII characters are left untouched. An empty input yields an empty
/// string.
fn title_case(s: &str) -> String {
    let mut out = s.to_ascii_lowercase();
    // `get_mut(0..1)` is `None` for an empty string and when the first
    // character is multi-byte (non-ASCII); in both cases nothing is changed.
    if let Some(head) = out.get_mut(0..1) {
        head.make_ascii_uppercase();
    }
    out
}

/// Static map from a lowercased input tag to (OpenType tag group, Unicode
/// short name). Covers the cases the title-case fallback cannot:
///
/// - **Multi-generation shaping tags** — Devanagari, Bengali, Myanmar, etc.
///   each have several OpenType tags; any one satisfies the script.
/// - **ISO 15924 aliases** whose OpenType tag (and/or Unicode script) differs
///   from a simple lowercasing, e.g. `latf`/`latg` → `latn`, `hira` → `kana`.
/// - **Short OpenType tags** that are space-padded to four bytes (`lao `,
///   `yi  `, `nko `); both the 3/2-letter and the ISO spelling are accepted.
/// - **`DFLT`**, the one uppercase OpenType script tag, which the caller's
///   lowercasing would otherwise turn into a tag that is not the registered
///   one.
///
/// Every alternate spelling of a script maps to the *same* full OpenType group,
/// so `dev2` and `deva` behave identically. Returns `None` for keys that need
/// no special handling.
fn lookup(key: &str) -> Option<(&'static [&'static str], &'static str)> {
    // Multi-tag groups. Extra/non-existent OpenType tags in a group are
    // harmless — no font declares them.
    const DEVA: &[&str] = &["deva", "dev2", "dev3"];
    const BENG: &[&str] = &["beng", "bng2"];
    const GUJR: &[&str] = &["gujr", "gjr2"];
    const GURU: &[&str] = &["guru", "gur2"];
    const KNDA: &[&str] = &["knda", "knd2"];
    const MLYM: &[&str] = &["mlym", "mlm2"];
    const ORYA: &[&str] = &["orya", "ory2"];
    const TAML: &[&str] = &["taml", "tml2"];
    const TELU: &[&str] = &["telu", "tel2"];
    const MYMR: &[&str] = &["mymr", "mym2"];

    // Single-script alias targets.
    const LATN: &[&str] = &["latn"];
    const ARAB: &[&str] = &["arab"];
    const SYRC: &[&str] = &["syrc"];
    const HANI: &[&str] = &["hani"];
    // OpenType registers Hiragana under `kana`; the literal `hira` is kept so
    // anything that matched before this mapping existed still matches.
    const HIRA: &[&str] = &["kana", "hira"];

    // Space-padded short tags.
    const LAO: &[&str] = &["lao "];
    const YI: &[&str] = &["yi  "];
    const NKO: &[&str] = &["nko "];

    // The default script tag is uppercase in fonts; the lowercase literal is
    // kept alongside it for backward compatibility.
    const DFLT: &[&str] = &["DFLT", "dflt"];

    let entry = match key {
        "deva" | "dev2" | "dev3" => (DEVA, "Deva"),
        "beng" | "bng2" => (BENG, "Beng"),
        "gujr" | "gjr2" => (GUJR, "Gujr"),
        "guru" | "gur2" => (GURU, "Guru"),
        "knda" | "knd2" => (KNDA, "Knda"),
        "mlym" | "mlm2" => (MLYM, "Mlym"),
        "orya" | "ory2" => (ORYA, "Orya"),
        "taml" | "tml2" => (TAML, "Taml"),
        "telu" | "tel2" => (TELU, "Telu"),
        "mymr" | "mym2" => (MYMR, "Mymr"),
        // ISO 15924 aliases whose OpenType/Unicode mapping is not a lowercasing.
        "latf" | "latg" => (LATN, "Latn"),
        "aran" => (ARAB, "Arab"),
        "syre" | "syrj" | "syrn" => (SYRC, "Syrc"),
        "hans" | "hant" => (HANI, "Hani"),
        "hira" => (HIRA, "Hira"),
        "lao" | "laoo" => (LAO, "Laoo"),
        "yi" | "yiii" => (YI, "Yiii"),
        "nko" | "nkoo" => (NKO, "Nkoo"),
        // `DFLT` has no Unicode script. `Zzzz` is the "unknown" code, which
        // the caller filters out, so only the OpenType path can match.
        "dflt" => (DFLT, "Zzzz"),
        _ => return None,
    };
    Some(entry)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Resolves string literals without `.to_string()` noise at each call.
    fn resolve(inputs: &[&str]) -> Vec<ScriptRequirement> {
        let owned: Vec<String> = inputs.iter().map(|s| (*s).to_owned()).collect();
        resolve_scripts(&owned)
    }

    /// Renders a requirement's OpenType tags as strings, with the space
    /// padding of short tags trimmed off.
    fn tags(req: &ScriptRequirement) -> Vec<String> {
        req.ot_tags()
            .iter()
            .map(|t| String::from_utf8_lossy(&t.to_be_bytes()).trim().to_string())
            .collect()
    }

    #[test]
    fn resolve_iso_devanagari_expands_to_ot_group() {
        let reqs = resolve(&["deva"]);
        assert_eq!(reqs.len(), 1);
        assert_eq!(tags(&reqs[0]), vec!["deva", "dev2", "dev3"]);
        assert_eq!(reqs[0].unicode_script(), Some(Script::Devanagari));
    }

    #[test]
    fn resolve_v2_tag_maps_to_same_group_and_script() {
        let reqs = resolve(&["dev2"]);
        assert_eq!(tags(&reqs[0]), vec!["deva", "dev2", "dev3"]);
        assert_eq!(reqs[0].unicode_script(), Some(Script::Devanagari));
    }

    #[test]
    fn resolve_latin_fraktur_alias_maps_to_latn() {
        let reqs = resolve(&["latf"]);
        assert_eq!(tags(&reqs[0]), vec!["latn"]);
        assert_eq!(reqs[0].unicode_script(), Some(Script::Latin));
    }

    #[test]
    fn resolve_plain_opentype_tag_via_fallback() {
        let reqs = resolve(&["latn"]);
        assert_eq!(tags(&reqs[0]), vec!["latn"]);
        assert_eq!(reqs[0].unicode_script(), Some(Script::Latin));
    }

    #[test]
    fn resolve_is_case_insensitive() {
        let reqs = resolve(&["LATN", "Deva"]);
        assert_eq!(tags(&reqs[0]), vec!["latn"]);
        assert_eq!(reqs[0].unicode_script(), Some(Script::Latin));
        assert_eq!(reqs[1].unicode_script(), Some(Script::Devanagari));
    }

    #[test]
    fn resolve_unknown_tag_falls_back_to_literal_ot() {
        let reqs = resolve(&["zzzz"]);
        assert_eq!(tags(&reqs[0]), vec!["zzzz"]);
        assert_eq!(reqs[0].unicode_script(), None);
    }

    #[test]
    fn empty_and_blank_entries_are_skipped() {
        let reqs = resolve(&["", "   ", "latn"]);
        assert_eq!(reqs.len(), 1);
    }

    #[test]
    fn ot_satisfied_matches_any_group_member() {
        let reqs = resolve(&["deva"]);
        let font: HashSet<Tag> = [tag4("dev2").unwrap()].into_iter().collect();
        assert!(reqs[0].ot_satisfied(&font));

        let empty: HashSet<Tag> = HashSet::new();
        assert!(!reqs[0].ot_satisfied(&empty));
    }

    #[test]
    fn unicode_satisfied_needs_min_coverage() {
        let reqs = resolve(&["latn"]);
        // Three Latin letters: below threshold.
        assert!(!reqs[0].unicode_satisfied(['a', 'b', 'c']));
        // Four Latin letters: meets threshold.
        assert!(reqs[0].unicode_satisfied(['a', 'b', 'c', 'd']));
        // Non-Latin codepoints never count toward Latin.
        assert!(!reqs[0].unicode_satisfied(['α', 'β', 'γ', 'δ']));
    }
}