oxideav-otf 0.1.3

Pure-Rust OpenType/CFF font parser for the oxideav framework — sfnt + CFF Top/Private DICT + Type 2 charstrings (cubic-Bezier outlines)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
//! Adobe Glyph List (AGL) — the canonical PostScript glyph-name to
//! Unicode-scalar-value mapping shipped by Adobe.
//!
//! Source: `data/agl-glyphlist.txt`, a verbatim copy of the AGL 2.0
//! table (September 20, 2002) staged under
//! `docs/text/opentype/spec/agl-glyphlist.txt`. The table format is
//! described in `docs/text/opentype/spec/agl-aglfn-README.md` and is a
//! plain-text two-column listing:
//!
//! ```text
//! # comment lines start with '#'
//! A;0041
//! AE;00C6
//! dalethatafpatah;05D3 05B2
//! ```
//!
//! - Column 1: PostScript glyph name (ASCII letters + digits only).
//! - Column 2: one or more space-separated 4-uppercase-hex-digit
//!   Unicode scalar values. AGL 2.0 ships **4200** single-codepoint
//!   entries plus **81** multi-codepoint sequences (mostly Hebrew
//!   base+point combinations) — **4281 total**.
//!
//! Multiple glyph names can map to the same codepoint sequence (most
//! common case: Hebrew vowel-pointing variants);
//! [`codepoint_to_name`] therefore picks the *first* name in
//! ASCII-sorted order (which is the file's on-disk order — the AGL
//! README documents it as sorted by glyph name in increasing ASCII
//! order).
//!
//! ## Scope
//!
//! This module is **only** the static AGL table lookup. The full AGL
//! Specification §6 ("Mapping glyph names to character sequences")
//! defines an algorithm for decomposing component glyph names like
//! `f_f_i` (→ `ffi`) and the `uniXXXX` / `uXXXXX` hex-encoded forms.
//! Those decomposition rules are not implemented here because the AGL
//! Specification document itself is not staged in `docs/text/opentype/`
//! — only the raw AGL table and its `agl-aglfn-README.md` companion are.
//! Once the AGL Specification is staged, [`name_to_codepoints`] can
//! be extended with the §6 algorithm without an API change (the
//! existing exact-match path stays correct as the spec's step 2
//! "look up the name in AGL").
//!
//! ## Build-time vs. runtime
//!
//! The 78 KB table is parsed at first use via [`std::sync::OnceLock`];
//! no parsing happens at crate-load time. The lookup tables cost
//! ~250 KB resident once initialised. Both lookups are O(1) average.

use std::collections::HashMap;
use std::sync::OnceLock;

/// Raw AGL 2.0 table, included verbatim from `data/agl-glyphlist.txt`.
///
/// The text is embedded into the binary at compile time via
/// `include_str!`, so no file I/O happens at runtime.
///
/// Format: `name;XXXX[ XXXX...]\n` per line, comment lines start with
/// `#`, blank lines are ignored. See module docs for the source
/// description.
const AGL_TEXT: &str = include_str!("../data/agl-glyphlist.txt");

/// AGL entry kind: a single Unicode codepoint or a sequence of
/// codepoints (the latter for ~81 Hebrew base + vowel-pointing
/// combinations in AGL 2.0).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Codepoints<'a> {
    /// The glyph name maps to exactly one Unicode scalar value.
    Single(char),
    /// The glyph name maps to a sequence of two or more Unicode
    /// scalar values. The slice is borrowed from the static AGL
    /// table.
    Sequence(&'a [char]),
}

impl<'a> Codepoints<'a> {
    /// Returns the first codepoint of the sequence (or the single
    /// codepoint for a [`Codepoints::Single`]).
    ///
    /// # Panics
    ///
    /// Panics if `self` is a [`Codepoints::Sequence`] wrapping an
    /// empty slice. Values produced by this module's lookups never
    /// hit that case (the parser only emits a `Sequence` when at
    /// least two codepoints are present), but the variant is public
    /// and can be constructed by hand.
    pub fn first(&self) -> char {
        match *self {
            Codepoints::Single(c) => c,
            Codepoints::Sequence(slice) => slice[0],
        }
    }

    /// Returns the borrowed codepoint slice of a
    /// [`Codepoints::Sequence`].
    ///
    /// A [`Codepoints::Single`] stores its `char` by value, so there
    /// is no storage living for `'a` that a one-element slice could
    /// borrow from. Callers that may hold either variant should
    /// `match` on the value or use [`Codepoints::first`] /
    /// [`Codepoints::len`] instead.
    ///
    /// # Panics
    ///
    /// Panics when called on a [`Codepoints::Single`].
    pub fn as_slice(&self) -> &'a [char] {
        match *self {
            Codepoints::Sequence(s) => s,
            // This branch is reachable through the public API, so it
            // is reported as a plain contract violation rather than
            // as "unreachable code".
            Codepoints::Single(_) => {
                panic!("call as_slice() only on Sequence variants; use first() or match")
            }
        }
    }

    /// Length of the codepoint sequence (always 1 for
    /// [`Codepoints::Single`]).
    pub fn len(&self) -> usize {
        match self {
            Codepoints::Single(_) => 1,
            Codepoints::Sequence(s) => s.len(),
        }
    }

    /// `true` when the sequence is empty. Never the case for entries
    /// returned by the AGL lookups; only a hand-built
    /// `Codepoints::Sequence(&[])` reports `true`.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

/// Parsed entry stored in the static table: one or more codepoints.
///
/// This is the owned counterpart of [`Codepoints`]: the lookup
/// functions convert a `StaticEntry` into a `Codepoints` value that
/// copies the single `char` or borrows the stored `Vec` as a slice.
#[derive(Debug)]
enum StaticEntry {
    /// The line carried exactly one codepoint.
    Single(char),
    /// The line carried more than one codepoint; the `Vec` owns them.
    Sequence(Vec<char>),
}

/// Lazily-built `glyph name → entry` lookup table.
///
/// The map is constructed on the first call by walking
/// [`raw_entries`] and is cached in a `OnceLock` for every later
/// call.
fn name_table() -> &'static HashMap<&'static str, StaticEntry> {
    static TABLE: OnceLock<HashMap<&'static str, StaticEntry>> = OnceLock::new();
    TABLE.get_or_init(|| {
        let mut by_name: HashMap<&'static str, StaticEntry> = HashMap::with_capacity(4400);
        for (glyph, chars) in raw_entries() {
            // One codepoint is stored inline; anything longer keeps
            // the parsed vector.
            let parsed = match chars.len() {
                1 => StaticEntry::Single(chars[0]),
                _ => StaticEntry::Sequence(chars),
            };
            // Glyph names are unique in the AGL. Should a duplicate
            // ever appear, the earliest line wins so the result stays
            // deterministic.
            by_name.entry(glyph).or_insert(parsed);
        }
        by_name
    })
}

/// Lazily-built reverse table: `codepoint → first glyph name`.
///
/// Only entries that map to exactly one codepoint are indexed.
/// Multi-codepoint sequences are skipped because a lone `char` key
/// cannot identify them. The map is built on the first call and
/// cached in a `OnceLock`.
fn codepoint_table() -> &'static HashMap<u32, &'static str> {
    static TABLE: OnceLock<HashMap<u32, &'static str>> = OnceLock::new();
    TABLE.get_or_init(|| {
        let mut by_codepoint: HashMap<u32, &'static str> = HashMap::with_capacity(4300);
        let singles = raw_entries().filter(|(_, chars)| chars.len() == 1);
        for (glyph, chars) in singles {
            // Entries arrive in on-disk (ASCII-sorted) order, so
            // keeping the first insertion selects the alphabetically
            // first alias for a shared codepoint.
            by_codepoint.entry(u32::from(chars[0])).or_insert(glyph);
        }
        by_codepoint
    })
}

/// Parse the AGL into `(name, Vec<char>)` pairs, in on-disk order.
///
/// Comment lines, blank lines and malformed lines are skipped. The
/// `Vec` allocations are paid each time the iterator is walked; both
/// `name_table` and `codepoint_table` walk it once at init.
fn raw_entries() -> impl Iterator<Item = (&'static str, Vec<char>)> {
    AGL_TEXT.lines().filter_map(parse_agl_line)
}

/// Parse one `name;XXXX[ XXXX...]` line of the AGL table.
///
/// Returns `None` for blank lines, `#` comment lines, lines without a
/// `;` separator, and lines in which any codepoint field is
/// malformed.
fn parse_agl_line(line: &str) -> Option<(&str, Vec<char>)> {
    let line = line.trim_end_matches('\r');
    if line.is_empty() || line.starts_with('#') {
        return None;
    }
    let (name, hex_field) = line.split_once(';')?;
    // `split(' ')` always yields at least one token, and every token
    // must parse, so a successfully collected vector is never empty.
    let codepoints = hex_field
        .split(' ')
        .map(parse_hex4)
        .collect::<Option<Vec<char>>>()?;
    Some((name, codepoints))
}

/// Parse a single codepoint field: exactly four hexadecimal digits
/// that denote a valid Unicode scalar value.
fn parse_hex4(field: &str) -> Option<char> {
    // `u32::from_str_radix` tolerates a leading `+`, so the digits are
    // validated explicitly; otherwise a field such as `+041` would be
    // accepted as U+0041.
    if field.len() != 4 || !field.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }
    let scalar = u32::from_str_radix(field, 16).ok()?;
    char::from_u32(scalar)
}

/// Look up an Adobe Glyph List glyph name and return the Unicode
/// scalar value(s) it maps to.
///
/// The lookup is an exact match against the AGL 2.0 table; names
/// that are not listed yield `None`. The component-name
/// decomposition algorithm from §6 of the AGL Specification
/// (`f_f_i` → `ffi`, `uniXXXX` → `U+XXXX`, etc.) is not implemented
/// because that specification document is not staged under
/// `docs/text/opentype/`.
///
/// The result is [`Codepoints::Single`] for most entries and
/// [`Codepoints::Sequence`] for the ~81 Hebrew base +
/// vowel-pointing combinations.
///
/// ```
/// use oxideav_otf::agl::{name_to_codepoints, Codepoints};
///
/// assert_eq!(name_to_codepoints("A"),          Some(Codepoints::Single('A')));
/// assert_eq!(name_to_codepoints("AE"),         Some(Codepoints::Single('\u{00C6}')));
/// assert_eq!(name_to_codepoints("zero"),       Some(Codepoints::Single('0')));
/// assert!(matches!(
///     name_to_codepoints("dalethatafpatah"),
///     Some(Codepoints::Sequence(s)) if s == ['\u{05D3}', '\u{05B2}']
/// ));
/// assert_eq!(name_to_codepoints("not_a_glyph"), None);
/// ```
pub fn name_to_codepoints(name: &str) -> Option<Codepoints<'static>> {
    name_table().get(name).map(|entry| match entry {
        StaticEntry::Single(c) => Codepoints::Single(*c),
        StaticEntry::Sequence(seq) => Codepoints::Sequence(seq.as_slice()),
    })
}

/// Look up an Adobe Glyph List glyph name and return its *single*
/// Unicode scalar value.
///
/// Yields `None` both when the name is absent from the AGL **and**
/// when it maps to a multi-codepoint sequence; use
/// [`name_to_codepoints`] to obtain the full sequence in that case.
///
/// This is the common-case helper: most font / PDF consumers only
/// care about names that round-trip to one Unicode scalar.
pub fn name_to_codepoint(name: &str) -> Option<char> {
    name_table().get(name).and_then(|entry| match entry {
        StaticEntry::Single(c) => Some(*c),
        StaticEntry::Sequence(_) => None,
    })
}

/// Look up the canonical Adobe Glyph List name for a Unicode
/// codepoint.
///
/// Yields `None` when no single-codepoint AGL entry maps to
/// `codepoint`. If several glyph names share the codepoint (most
/// common case: Hebrew vowel-pointing combining marks; ~17 names
/// share U+05B8 in AGL 2.0), the *first* name in the AGL's on-disk
/// (ASCII-sorted) order is returned; iterate [`entries`] to see
/// every alias. Multi-codepoint sequence entries (e.g.
/// `dalethatafpatah → [U+05D3, U+05B2]`) do **not** take part in
/// reverse lookup, since a single `char` cannot identify them.
///
/// ```
/// use oxideav_otf::agl::codepoint_to_name;
///
/// assert_eq!(codepoint_to_name('A'),         Some("A"));
/// assert_eq!(codepoint_to_name('\u{00C6}'),  Some("AE"));
/// assert_eq!(codepoint_to_name('0'),         Some("zero"));
/// // U+FFFE is a Unicode noncharacter and has no AGL entry.
/// assert_eq!(codepoint_to_name('\u{FFFE}'),  None);
/// ```
pub fn codepoint_to_name(codepoint: char) -> Option<&'static str> {
    let key = u32::from(codepoint);
    codepoint_table().get(&key).copied()
}

/// Total number of `(name, codepoints)` entries in the AGL table.
/// Constant per AGL version (4281 in AGL 2.0).
///
/// This is the size of the lazily-built name table, so the first
/// call triggers parsing of the embedded AGL text.
pub fn entry_count() -> usize {
    name_table().len()
}

/// Number of *distinct* codepoints reachable through
/// [`codepoint_to_name`]. AGL 2.0 has 4200 single-codepoint entries
/// but only **3680** distinct codepoints among them — many AGL names
/// alias the same codepoint (Hebrew vowel-pointing combinations, and
/// the Mac / Windows / PUA legacy duplicates).
///
/// This is the size of the lazily-built reverse table; multi-codepoint
/// sequence entries are not counted because they are not stored there.
pub fn distinct_codepoint_count() -> usize {
    codepoint_table().len()
}

/// Iterate over all `(name, codepoints)` pairs of the AGL, in
/// on-disk (ASCII-sorted-by-name) order.
///
/// Single-codepoint and multi-codepoint entries are both yielded.
pub fn entries() -> impl Iterator<Item = (&'static str, Codepoints<'static>)> {
    // Fetching the table up front forces the one-time parse, so the
    // returned iterator only performs lookups.
    let by_name = name_table();
    // The hash map has no stable order, so the file is re-walked to
    // recover on-disk order; each name is then resolved through the
    // table so sequences borrow the static storage.
    raw_entries().filter_map(move |(glyph, _)| {
        by_name.get(glyph).map(|entry| {
            let value = match entry {
                StaticEntry::Single(c) => Codepoints::Single(*c),
                StaticEntry::Sequence(seq) => Codepoints::Sequence(seq.as_slice()),
            };
            (glyph, value)
        })
    })
}

// Unit tests for the AGL lookups. They pin the table size, spot-check
// well-known glyph names in both lookup directions, and exercise the
// `Codepoints` accessors.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn entry_count_matches_agl_2_0() {
        // AGL 2.0 ships 4281 (name, codepoints) entries — see the
        // `agl-glyphlist.txt` source (sum of non-comment, non-blank
        // lines). Any change here means the source table was edited;
        // the test is the canary.
        assert_eq!(entry_count(), 4281);
    }

    #[test]
    fn distinct_codepoint_count_is_3680() {
        // AGL 2.0 has 4200 single-codepoint entries but only **3680**
        // *distinct* codepoints — many AGL names alias the same
        // codepoint (Hebrew vowel-pointing variants, Mac / Windows
        // legacy duplicates, and `Acutesmall` / `acutesmall`-style
        // case pairs that share a PUA slot). The remaining 81 of
        // 4281 are multi-codepoint sequences.
        assert_eq!(distinct_codepoint_count(), 3680);
    }

    #[test]
    fn sequence_entry_count_is_81() {
        // Count entries that surface as `Codepoints::Sequence`.
        let n = entries()
            .filter(|(_, c)| !matches!(c, Codepoints::Single(_)))
            .count();
        assert_eq!(n, 81);
    }

    #[test]
    fn ascii_uppercase_letters_round_trip() {
        // Each uppercase ASCII letter is its own glyph name, and the
        // reverse lookup must return that same name.
        for c in 'A'..='Z' {
            let name = c.to_string();
            assert_eq!(name_to_codepoint(&name), Some(c), "AGL miss: {name}");
            let reverse = codepoint_to_name(c).expect("ASCII letter has AGL name");
            assert_eq!(reverse, name.as_str(), "reverse lookup for {c}");
        }
    }

    #[test]
    fn ascii_digits_round_trip() {
        // Digits are named by their English word; check both
        // directions of the lookup.
        let pairs: &[(char, &str)] = &[
            ('0', "zero"),
            ('1', "one"),
            ('2', "two"),
            ('3', "three"),
            ('4', "four"),
            ('5', "five"),
            ('6', "six"),
            ('7', "seven"),
            ('8', "eight"),
            ('9', "nine"),
        ];
        for &(c, name) in pairs {
            assert_eq!(name_to_codepoint(name), Some(c));
            assert_eq!(codepoint_to_name(c), Some(name));
        }
    }

    #[test]
    fn common_punctuation_pst_names() {
        // Worked PostScript-name landmarks from the AGL.
        let pairs: &[(&str, char)] = &[
            ("space", ' '),
            ("exclam", '!'),
            ("quotedbl", '"'),
            ("numbersign", '#'),
            ("dollar", '$'),
            ("percent", '%'),
            ("ampersand", '&'),
            ("parenleft", '('),
            ("parenright", ')'),
            ("comma", ','),
            ("hyphen", '-'),
            ("period", '.'),
            ("slash", '/'),
        ];
        for &(name, c) in pairs {
            assert_eq!(name_to_codepoint(name), Some(c), "AGL miss: {name}");
        }
    }

    #[test]
    fn pua_small_caps_landmarks() {
        // Small-cap forms live in the Adobe Corporate Use Subarea
        // (`F6xx`..`F7xx`). Direct spec landmarks read from the
        // shipped AGL table.
        assert_eq!(name_to_codepoint("Acutesmall"), Some('\u{F7B4}'));
        assert_eq!(name_to_codepoint("Asmall"), Some('\u{F761}'));
        assert_eq!(name_to_codepoint("AEsmall"), Some('\u{F7E6}'));
    }

    #[test]
    fn ligatures_with_bmp_codepoints() {
        // AGL ligatures that have real BMP codepoints (not PUA).
        assert_eq!(name_to_codepoint("AE"), Some('\u{00C6}'));
        assert_eq!(name_to_codepoint("ae"), Some('\u{00E6}'));
        assert_eq!(name_to_codepoint("OE"), Some('\u{0152}'));
        assert_eq!(name_to_codepoint("oe"), Some('\u{0153}'));
        // The ASCII `ffi` ligature maps to the FB03 presentation
        // form per AGL (AGLFN omits it because §6 of the AGL spec
        // decomposes it to f+f+i — but we don't have the spec
        // staged, so the raw AGL entry is what we surface).
        assert_eq!(name_to_codepoint("ffi"), Some('\u{FB03}'));
    }

    #[test]
    fn cjk_landmarks() {
        // AGL covers Japanese kana with the same `hiragana` /
        // `katakana` suffix convention. Spec-listed entries from the
        // file's tail.
        assert_eq!(name_to_codepoint("ahiragana"), Some('\u{3042}'));
        assert_eq!(name_to_codepoint("akatakana"), Some('\u{30A2}'));
        // Last-listed entry in the AGL (per file tail).
        assert_eq!(name_to_codepoint("zukatakana"), Some('\u{30BA}'));
    }

    #[test]
    fn unknown_name_returns_none() {
        assert_eq!(name_to_codepoint(""), None);
        assert_eq!(name_to_codepoint("not_a_real_glyph_name"), None);
        // Names with embedded whitespace are not in AGL (the spec
        // restricts glyph-name characters to letters + digits).
        assert_eq!(name_to_codepoint("A B"), None);
        // Case matters — `a` and `A` are distinct glyph names.
        assert!(name_to_codepoint("A").is_some());
        assert!(name_to_codepoint("a").is_some());
        assert_ne!(name_to_codepoint("A"), name_to_codepoint("a"));
    }

    #[test]
    fn multi_codepoint_sequence_entry() {
        // `dalethatafpatah` is the canonical AGL multi-codepoint
        // example: a Hebrew DALET (U+05D3) plus the HATAF PATAH
        // combining vowel (U+05B2).
        let cp = name_to_codepoints("dalethatafpatah").expect("entry exists");
        match cp {
            Codepoints::Sequence(s) => {
                assert_eq!(s, ['\u{05D3}', '\u{05B2}']);
                assert_eq!(s.len(), 2);
            }
            Codepoints::Single(_) => panic!("expected a Sequence"),
        }
        // The single-codepoint shortcut returns None for sequence
        // entries.
        assert_eq!(name_to_codepoint("dalethatafpatah"), None);
        // Reverse lookup doesn't surface sequence entries either
        // (a single `char` argument can't disambiguate).
        assert_ne!(codepoint_to_name('\u{05D3}'), Some("dalethatafpatah"));
    }

    #[test]
    fn codepoints_first_and_len() {
        // Accessors on a hand-built `Single`.
        let single = Codepoints::Single('A');
        assert_eq!(single.first(), 'A');
        assert_eq!(single.len(), 1);
        assert!(!single.is_empty());

        // Accessors on a hand-built `Sequence`; `as_slice` is only
        // exercised on this variant because it panics on `Single`.
        let slice: &[char] = &['\u{05D3}', '\u{05B2}'];
        let seq = Codepoints::Sequence(slice);
        assert_eq!(seq.first(), '\u{05D3}');
        assert_eq!(seq.len(), 2);
        assert!(!seq.is_empty());
        assert_eq!(seq.as_slice(), slice);
    }

    #[test]
    fn codepoint_to_name_first_in_sort_order() {
        // U+05B8 (HEBREW POINT QAMATS) is shared by ~17 AGL aliases.
        // The codepoint→name table returns the alphabetically-first
        // one (in ASCII sort order). All such names must round-trip
        // through `name_to_codepoint`.
        let chosen = codepoint_to_name('\u{05B8}').expect("U+05B8 has AGL aliases");
        assert_eq!(name_to_codepoint(chosen), Some('\u{05B8}'));
        // Sanity: the chosen alias is among the ASCII-sorted names
        // mapping to U+05B8.
        let all_aliases: Vec<&str> = entries()
            .filter_map(|(n, c)| match c {
                Codepoints::Single('\u{05B8}') => Some(n),
                _ => None,
            })
            .collect();
        assert!(all_aliases.len() >= 2, "expected multiple aliases");
        assert_eq!(*all_aliases.first().unwrap(), chosen);
    }

    #[test]
    fn entries_yields_all_pairs() {
        let collected: Vec<(&'static str, Codepoints<'static>)> = entries().collect();
        assert_eq!(collected.len(), 4281);
        // First entry is `A;0041` per the file head.
        assert_eq!(collected[0].0, "A");
        assert_eq!(collected[0].1.first(), 'A');
        // Last entry is `zukatakana;30BA` per the file tail.
        assert_eq!(collected[4280].0, "zukatakana");
        assert_eq!(collected[4280].1.first(), '\u{30BA}');
    }

    #[test]
    fn reverse_lookup_for_unencoded_codepoint() {
        // U+FFFE is a Unicode noncharacter and is not in AGL.
        assert_eq!(codepoint_to_name('\u{FFFE}'), None);
        // Astral planes are not covered by AGL 2.0 at all (the
        // shipped table is BMP-only).
        assert_eq!(codepoint_to_name('\u{1F600}'), None);
    }

    #[test]
    fn glyph_names_are_ascii() {
        // AGL format spec: "Glyph name—upper/lowercase letters and
        // digits." Every parsed name must be pure ASCII alphanumeric
        // (the AGL file itself is ASCII-only). This is a defence
        // against future source-file corruption.
        for (name, _) in entries() {
            assert!(
                name.bytes().all(|b| b.is_ascii_alphanumeric()),
                "non-alphanumeric glyph name: {name}"
            );
            assert!(!name.is_empty());
        }
    }
}