use std::collections::HashMap;
use std::sync::OnceLock;
/// Glyph list text embedded into the binary at compile time from
/// `../data/agl-glyphlist.txt` (path relative to this source file).
///
/// `raw_entries` walks this text line by line; it treats each record as
/// `name;HEX[ HEX...]` and skips empty lines and lines starting with `#`.
const AGL_TEXT: &str = include_str!("../data/agl-glyphlist.txt");
/// The Unicode value(s) a glyph name maps to.
///
/// A name maps either to exactly one scalar value ([`Codepoints::Single`]) or to
/// an ordered run of scalar values ([`Codepoints::Sequence`]) borrowed for `'a`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Codepoints<'a> {
    /// Exactly one Unicode scalar value, stored inline.
    Single(char),
    /// A borrowed, ordered slice of Unicode scalar values.
    Sequence(&'a [char]),
}

impl<'a> Codepoints<'a> {
    /// Returns the first (or only) scalar value.
    ///
    /// # Panics
    ///
    /// Panics if `self` is a [`Codepoints::Sequence`] wrapping an empty slice.
    pub fn first(&self) -> char {
        match *self {
            Codepoints::Single(c) => c,
            Codepoints::Sequence(slice) => slice[0],
        }
    }

    /// Returns the borrowed slice of a [`Codepoints::Sequence`].
    ///
    /// A `Single` stores its `char` inline, so there is no `'a`-lived slice to
    /// hand out for it; use [`Codepoints::first`] or `match` on the value instead.
    ///
    /// # Panics
    ///
    /// Panics if `self` is a [`Codepoints::Single`].
    #[track_caller]
    pub fn as_slice(&self) -> &'a [char] {
        match *self {
            Codepoints::Sequence(s) => s,
            // This arm is reachable by any caller holding a `Single`, so it is a
            // precondition failure (`panic!`), not an internal invariant breach
            // (`unreachable!`).
            Codepoints::Single(_) => {
                panic!("call as_slice() only on Sequence variants; use first() or match")
            }
        }
    }

    /// Returns the number of scalar values: `1` for `Single`, the slice length
    /// for `Sequence`.
    pub fn len(&self) -> usize {
        match *self {
            Codepoints::Single(_) => 1,
            Codepoints::Sequence(s) => s.len(),
        }
    }

    /// Returns `true` when [`Codepoints::len`] is zero, which can only happen
    /// for a `Sequence` wrapping an empty slice.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
/// Owned value stored per glyph name in the table built by `name_table`.
///
/// The public lookup functions convert it into a borrowed [`Codepoints`].
#[derive(Debug)]
enum StaticEntry {
/// The parsed codepoint list held exactly one `char`.
Single(char),
/// The parsed codepoint list did not hold exactly one `char`; kept in file order.
Sequence(Vec<char>),
}
/// Returns the lazily built, process-wide map from glyph name to its value(s).
///
/// The map is populated on first use from `raw_entries()`. If the same name
/// appears more than once, the first occurrence is kept.
fn name_table() -> &'static HashMap<&'static str, StaticEntry> {
    static TABLE: OnceLock<HashMap<&'static str, StaticEntry>> = OnceLock::new();
    TABLE.get_or_init(|| {
        let mut by_name = HashMap::with_capacity(4400);
        for (glyph, scalars) in raw_entries() {
            // Exactly one scalar is stored inline; anything else keeps the Vec.
            let value = match scalars.len() {
                1 => StaticEntry::Single(scalars[0]),
                _ => StaticEntry::Sequence(scalars),
            };
            by_name.entry(glyph).or_insert(value);
        }
        by_name
    })
}
/// Returns the lazily built, process-wide reverse map from a scalar value
/// (as `u32`) to a glyph name.
///
/// Only entries with exactly one codepoint take part. When several names share
/// a codepoint, the one yielded first by `raw_entries()` is kept.
fn codepoint_table() -> &'static HashMap<u32, &'static str> {
    static TABLE: OnceLock<HashMap<u32, &'static str>> = OnceLock::new();
    TABLE.get_or_init(|| {
        let mut by_scalar = HashMap::with_capacity(4300);
        let singles = raw_entries().filter(|(_, scalars)| scalars.len() == 1);
        for (glyph, scalars) in singles {
            by_scalar.entry(u32::from(scalars[0])).or_insert(glyph);
        }
        by_scalar
    })
}
/// Iterates over the well-formed records of `AGL_TEXT` in file order.
///
/// Each item is `(glyph_name, codepoints)`, where `codepoints` holds at least
/// one `char`. Lines that are not well-formed records are skipped silently;
/// see `parse_agl_line` for the exact rules.
fn raw_entries() -> impl Iterator<Item = (&'static str, Vec<char>)> {
    AGL_TEXT.lines().filter_map(parse_agl_line)
}

/// Parses one `name;HEX[ HEX...]` record.
///
/// Returns `None` for empty lines, `#` comment lines, lines without a `;`, and
/// lines where any space-separated field is not a valid four-digit hex scalar.
fn parse_agl_line(line: &str) -> Option<(&str, Vec<char>)> {
    let line = line.trim_end_matches('\r');
    if line.is_empty() || line.starts_with('#') {
        return None;
    }
    let (name, hex_field) = line.split_once(';')?;
    // `split` always yields at least one field and an empty field is rejected
    // by `parse_hex4`, so a successful collect is never empty.
    let codepoints = hex_field
        .split(' ')
        .map(parse_hex4)
        .collect::<Option<Vec<char>>>()?;
    Some((name, codepoints))
}

/// Parses exactly four ASCII hex digits into a Unicode scalar value.
///
/// The digits are checked explicitly because `u32::from_str_radix` also accepts
/// a leading `+`, which would let a field such as `+041` pass a bare length
/// check. Values that are not scalar values (surrogates) yield `None`.
fn parse_hex4(field: &str) -> Option<char> {
    if field.len() != 4 || !field.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }
    u32::from_str_radix(field, 16).ok().and_then(char::from_u32)
}
/// Looks up a glyph name and returns its Unicode value(s).
///
/// Returns `None` when `name` is not in the table. Matching is an exact
/// string comparison. A `Sequence` result borrows from the static table.
pub fn name_to_codepoints(name: &str) -> Option<Codepoints<'static>> {
    name_table().get(name).map(|entry| match entry {
        StaticEntry::Single(c) => Codepoints::Single(*c),
        StaticEntry::Sequence(v) => Codepoints::Sequence(v.as_slice()),
    })
}
/// Looks up a glyph name that maps to exactly one scalar value.
///
/// Returns `None` when `name` is not in the table or when it maps to a
/// sequence of codepoints.
pub fn name_to_codepoint(name: &str) -> Option<char> {
    name_table().get(name).and_then(|entry| match entry {
        StaticEntry::Single(c) => Some(*c),
        StaticEntry::Sequence(_) => None,
    })
}
/// Reverse lookup: returns a glyph name recorded for `codepoint`, if any.
///
/// Only names that map to exactly one codepoint are present in the reverse table.
pub fn codepoint_to_name(codepoint: char) -> Option<&'static str> {
    let key = u32::from(codepoint);
    let name = codepoint_table().get(&key)?;
    Some(*name)
}
/// Returns how many distinct glyph names the name table holds.
pub fn entry_count() -> usize {
    let table = name_table();
    table.len()
}
/// Returns how many distinct codepoints have a name in the reverse table.
pub fn distinct_codepoint_count() -> usize {
    let table = codepoint_table();
    table.len()
}
/// Iterates over `(glyph_name, codepoints)` pairs in the order `raw_entries()`
/// yields them.
///
/// The value for each name is taken from the name table, not from the freshly
/// parsed record; names absent from the table are skipped.
pub fn entries() -> impl Iterator<Item = (&'static str, Codepoints<'static>)> {
    let by_name = name_table();
    raw_entries().filter_map(move |(glyph, _)| {
        by_name.get(glyph).map(|entry| {
            let value = match entry {
                StaticEntry::Single(c) => Codepoints::Single(*c),
                StaticEntry::Sequence(v) => Codepoints::Sequence(v.as_slice()),
            };
            (glyph, value)
        })
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The name table is expected to hold 4281 entries.
    #[test]
    fn entry_count_matches_agl_2_0() {
        let total = entry_count();
        assert_eq!(total, 4281);
    }

    /// The reverse table is expected to hold 3680 codepoints.
    #[test]
    fn distinct_codepoint_count_is_3680() {
        let distinct = distinct_codepoint_count();
        assert_eq!(distinct, 3680);
    }

    /// Counts the entries that are not `Single`.
    #[test]
    fn sequence_entry_count_is_81() {
        let mut n = 0;
        for (_, value) in entries() {
            match value {
                Codepoints::Single(_) => {}
                Codepoints::Sequence(_) => n += 1,
            }
        }
        assert_eq!(n, 81);
    }

    /// `A`..`Z` map to themselves in both directions.
    #[test]
    fn ascii_uppercase_letters_round_trip() {
        for c in 'A'..='Z' {
            let name = String::from(c);
            let forward = name_to_codepoint(&name);
            assert_eq!(forward, Some(c), "AGL miss: {name}");
            let reverse = codepoint_to_name(c).expect("ASCII letter has AGL name");
            assert_eq!(reverse, name.as_str(), "reverse lookup for {c}");
        }
    }

    /// Digits use their English names in both directions.
    #[test]
    fn ascii_digits_round_trip() {
        let names = [
            "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
        ];
        for (digit, name) in ('0'..='9').zip(names.iter().copied()) {
            assert_eq!(name_to_codepoint(name), Some(digit));
            assert_eq!(codepoint_to_name(digit), Some(name));
        }
    }

    /// Forward lookups for a handful of punctuation names.
    #[test]
    fn common_punctuation_pst_names() {
        let pairs = [
            ("space", ' '),
            ("exclam", '!'),
            ("quotedbl", '"'),
            ("numbersign", '#'),
            ("dollar", '$'),
            ("percent", '%'),
            ("ampersand", '&'),
            ("parenleft", '('),
            ("parenright", ')'),
            ("comma", ','),
            ("hyphen", '-'),
            ("period", '.'),
            ("slash", '/'),
        ];
        for (name, c) in pairs.iter().copied() {
            assert_eq!(name_to_codepoint(name), Some(c), "AGL miss: {name}");
        }
    }

    /// Small-cap names resolve to their private-use codepoints.
    #[test]
    fn pua_small_caps_landmarks() {
        let landmarks = [
            ("Acutesmall", '\u{F7B4}'),
            ("Asmall", '\u{F761}'),
            ("AEsmall", '\u{F7E6}'),
        ];
        for (name, expected) in landmarks.iter().copied() {
            assert_eq!(name_to_codepoint(name), Some(expected));
        }
    }

    /// Ligature names resolve to single BMP codepoints.
    #[test]
    fn ligatures_with_bmp_codepoints() {
        let landmarks = [
            ("AE", '\u{00C6}'),
            ("ae", '\u{00E6}'),
            ("OE", '\u{0152}'),
            ("oe", '\u{0153}'),
            ("ffi", '\u{FB03}'),
        ];
        for (name, expected) in landmarks.iter().copied() {
            assert_eq!(name_to_codepoint(name), Some(expected));
        }
    }

    /// Kana names resolve to their codepoints.
    #[test]
    fn cjk_landmarks() {
        let landmarks = [
            ("ahiragana", '\u{3042}'),
            ("akatakana", '\u{30A2}'),
            ("zukatakana", '\u{30BA}'),
        ];
        for (name, expected) in landmarks.iter().copied() {
            assert_eq!(name_to_codepoint(name), Some(expected));
        }
    }

    /// Unknown names miss, and lookups are case-sensitive.
    #[test]
    fn unknown_name_returns_none() {
        for missing in ["", "not_a_real_glyph_name", "A B"].iter().copied() {
            assert_eq!(name_to_codepoint(missing), None);
        }
        let upper = name_to_codepoint("A");
        let lower = name_to_codepoint("a");
        assert!(upper.is_some());
        assert!(lower.is_some());
        assert_ne!(upper, lower);
    }

    /// A sequence entry is only reachable through `name_to_codepoints`.
    #[test]
    fn multi_codepoint_sequence_entry() {
        let cp = name_to_codepoints("dalethatafpatah").expect("entry exists");
        let Codepoints::Sequence(s) = cp else {
            panic!("expected a Sequence")
        };
        assert_eq!(s, ['\u{05D3}', '\u{05B2}']);
        assert_eq!(s.len(), 2);
        assert_eq!(name_to_codepoint("dalethatafpatah"), None);
        assert_ne!(codepoint_to_name('\u{05D3}'), Some("dalethatafpatah"));
    }

    /// Accessor behavior for both `Codepoints` variants.
    #[test]
    fn codepoints_first_and_len() {
        let single = Codepoints::Single('A');
        assert_eq!(single.first(), 'A');
        assert_eq!(single.len(), 1);
        assert!(!single.is_empty());

        let slice: &[char] = &['\u{05D3}', '\u{05B2}'];
        let seq = Codepoints::Sequence(slice);
        assert_eq!(seq.first(), '\u{05D3}');
        assert_eq!(seq.len(), 2);
        assert!(!seq.is_empty());
        assert_eq!(seq.as_slice(), slice);
    }

    /// With several aliases for one codepoint, the reverse lookup returns the
    /// alias that `entries()` yields first.
    #[test]
    fn codepoint_to_name_first_in_sort_order() {
        let target = '\u{05B8}';
        let chosen = codepoint_to_name(target).expect("U+05B8 has AGL aliases");
        assert_eq!(name_to_codepoint(chosen), Some(target));
        let all_aliases: Vec<&str> = entries()
            .filter(|(_, value)| *value == Codepoints::Single(target))
            .map(|(n, _)| n)
            .collect();
        assert!(all_aliases.len() >= 2, "expected multiple aliases");
        assert_eq!(all_aliases[0], chosen);
    }

    /// `entries()` yields every pair, starting at `A` and ending at `zukatakana`.
    #[test]
    fn entries_yields_all_pairs() {
        let collected: Vec<(&'static str, Codepoints<'static>)> = entries().collect();
        assert_eq!(collected.len(), 4281);

        let (head_name, head_value) = collected[0];
        assert_eq!(head_name, "A");
        assert_eq!(head_value.first(), 'A');

        let (tail_name, tail_value) = collected[4280];
        assert_eq!(tail_name, "zukatakana");
        assert_eq!(tail_value.first(), '\u{30BA}');
    }

    /// Codepoints without a name miss in the reverse table.
    #[test]
    fn reverse_lookup_for_unencoded_codepoint() {
        for unnamed in ['\u{FFFE}', '\u{1F600}'].iter().copied() {
            assert_eq!(codepoint_to_name(unnamed), None);
        }
    }

    /// Every glyph name is non-empty ASCII alphanumeric.
    #[test]
    fn glyph_names_are_ascii() {
        for (name, _) in entries() {
            let alphanumeric = name.bytes().all(|b| b.is_ascii_alphanumeric());
            assert!(alphanumeric, "non-alphanumeric glyph name: {name}");
            assert!(!name.is_empty());
        }
    }
}