use std::sync::LazyLock;
/// Raw tab-separated corpus text, embedded into the binary at compile time from
/// `../data/corpus.tsv` (path resolved relative to this source file by `include_str!`).
static CORPUS_TSV: &str = include_str!("../data/corpus.tsv");
/// Parsed corpus table, built from `CORPUS_TSV` by `parse_corpus` the first time it is accessed
/// and reused afterwards. The entries borrow their text directly from the embedded string.
static ENTRIES: LazyLock<Vec<Entry<'static>>> = LazyLock::new(|| parse_corpus(CORPUS_TSV));
pub fn parse_corpus(tsv: &str) -> Vec<Entry<'_>> {
let mut entries: Vec<Entry<'_>> = tsv.lines().filter_map(Entry::parse).collect();
entries.sort_unstable_by_key(|e| e.codepoint);
entries
}
/// One row of the tab-separated corpus, borrowing every text field from the input.
///
/// Only `codepoint` is interpreted (parsed as a decimal `u32`); every other column is stored as
/// the raw text found in the row. Columns 0-3 are mandatory; columns 4-15 are optional and are
/// stored as `""` when the row is too short to contain them.
///
/// NOTE(review): the per-field meanings below, beyond "column N of the row", are inferred from
/// the field names and the sample rows in the unit tests -- confirm against the corpus format.
#[derive(Debug, Clone)]
pub struct Entry<'a> {
    /// Column 0: the code point, written in decimal in the corpus.
    pub codepoint: u32,
    /// Column 1: the character itself, as text.
    pub glyph: &'a str,
    /// Column 2: the character name (e.g. `LATIN CAPITAL LETTER A`).
    pub name: &'a str,
    /// Column 3: origin of the row (the test samples use `unicode`).
    pub source: &'a str,
    /// Column 4: presumably the general category (samples: `Lu`, `So`).
    pub category: &'a str,
    /// Column 5: presumably the combining class (samples: `0`).
    pub combining: &'a str,
    /// Column 6: presumably the bidirectional class (samples: `L`, `ON`).
    pub bidi: &'a str,
    /// Column 7: presumably the decomposition mapping; may be empty.
    pub decomp: &'a str,
    /// Column 8: presumably the decimal-digit value; may be empty.
    pub decimal: &'a str,
    /// Column 9: presumably the digit value; may be empty.
    pub digit: &'a str,
    /// Column 10: presumably the numeric value; may be empty.
    pub numeric: &'a str,
    /// Column 11: presumably the mirrored flag (samples: `N`).
    pub mirrored: &'a str,
    /// Column 12: presumably an alternative/legacy name; may be empty.
    pub alt_name: &'a str,
    /// Column 13: presumably the uppercase mapping; may be empty.
    pub uppercase: &'a str,
    /// Column 14: presumably the lowercase mapping; may be empty.
    pub lowercase: &'a str,
    /// Column 15: presumably the titlecase mapping; may be empty.
    pub titlecase: &'a str,
}

impl<'a> Entry<'a> {
    /// Parses one tab-separated row into an `Entry` that borrows from `line`.
    ///
    /// Returns `None` when the row has fewer than four columns or when the first column is not a
    /// valid decimal `u32`. Missing optional columns (4-15) become `""`; columns past the
    /// sixteenth are ignored.
    fn parse(line: &'a str) -> Option<Self> {
        // Walk the columns lazily instead of collecting them into a `Vec`: this runs once per
        // corpus row, so avoiding a heap allocation here is worthwhile.
        let mut cols = line.split('\t');

        // The four leading columns are mandatory; `?` rejects rows that lack any of them.
        let codepoint = cols.next()?.parse::<u32>().ok()?;
        let glyph = cols.next()?;
        let name = cols.next()?;
        let source = cols.next()?;

        // `Split` is a fused iterator: once exhausted it keeps yielding `None`, so every missing
        // trailing column falls back to the empty string.
        let mut optional = || cols.next().unwrap_or("");

        // Struct fields are evaluated in the order written, which matches the column order.
        Some(Entry {
            codepoint,
            glyph,
            name,
            source,
            category: optional(),
            combining: optional(),
            bidi: optional(),
            decomp: optional(),
            decimal: optional(),
            digit: optional(),
            numeric: optional(),
            mirrored: optional(),
            alt_name: optional(),
            uppercase: optional(),
            lowercase: optional(),
            titlecase: optional(),
        })
    }
}
/// Returns the whole parsed corpus as a slice.
///
/// The table is built by `parse_corpus`, so it is ordered by ascending code point. The first
/// access triggers the one-time parse of the embedded TSV; every call returns the same slice.
pub fn entries() -> &'static [Entry<'static>] {
    ENTRIES.as_slice()
}
/// Finds the corpus entry whose code point equals `cp`.
///
/// Performs a binary search over the code-point-sorted table and returns `None` when no row
/// carries that code point.
pub fn lookup(cp: u32) -> Option<&'static Entry<'static>> {
    let table: &'static [Entry<'static>] = ENTRIES.as_slice();
    match table.binary_search_by(|entry| entry.codepoint.cmp(&cp)) {
        Ok(idx) => table.get(idx),
        Err(_) => None,
    }
}
pub fn lookup_str(s: &str) -> Option<&'static Entry<'static>> {
let cp = parse_cp_str(s)?;
lookup(cp)
}
/// Interprets a user-supplied string as a code point.
///
/// After trimming surrounding whitespace, the following forms are tried in order:
///
/// 1. `U+XXXX` / `u+XXXX` / `0xXXXX` / `0XXXXX` -- the remainder is parsed as hexadecimal.
/// 2. A single character (`"A"`, `"😀"`) -- that character's own code point is returned. This
///    wins over hex, so `"A"` is U+0041, not U+000A.
/// 3. Any string starting with a non-ASCII character -- the first character's code point is
///    returned and the rest of the string is ignored.
/// 4. Anything else -- the whole string is parsed as bare hexadecimal (`"0041"`, `"AB"`).
///
/// Returns `None` for empty/whitespace-only input, for invalid hex digits, for a signed hex
/// value such as `U++41`, and for values that overflow `u32`. The result is not checked against
/// the Unicode range, so surrogates and values above `0x10FFFF` are returned as-is.
pub fn parse_cp_str(s: &str) -> Option<u32> {
    /// Prefixes that mark the remainder of the string as hexadecimal.
    const HEX_PREFIXES: [&str; 4] = ["U+", "u+", "0x", "0X"];

    /// Parses unsigned hexadecimal digits, rejecting the leading `+` that
    /// `u32::from_str_radix` would otherwise tolerate.
    fn parse_hex(digits: &str) -> Option<u32> {
        if digits.starts_with('+') {
            return None;
        }
        u32::from_str_radix(digits, 16).ok()
    }

    let s = s.trim();

    if let Some(rest) = HEX_PREFIXES
        .iter()
        .find_map(|prefix| s.strip_prefix(*prefix))
    {
        return parse_hex(rest);
    }

    let first = s.chars().next()?;
    // A lone character, or any text that begins with a non-ASCII character, is taken literally.
    if s.len() == first.len_utf8() || !first.is_ascii() {
        return Some(u32::from(first));
    }

    // Multi-character ASCII input without a prefix is treated as bare hex.
    parse_hex(s)
}
#[cfg(test)]
mod tests {
    use super::*;

    // Small self-contained corpus for the `parse_corpus` tests. The rows must stay at column 0:
    // only the whitespace directly after the leading `\` is skipped, so indenting the later rows
    // would put spaces in front of their code point column and make them unparsable.
    // The last row is U+1F600 GRINNING FACE, i.e. 128512 in decimal.
    const SAMPLE_TSV: &str = "\
65\tA\tLATIN CAPITAL LETTER A\tunicode\tLu\t0\tL\t\t\t\t\tN\t\tA\tA\tA
66\tB\tLATIN CAPITAL LETTER B\tunicode\tLu\t0\tL\t\t\t\t\tN\t\tB\tB\tB
128512\t\u{1F600}\tGRINNING FACE\tunicode\tSo\t0\tON\t\t\t\t\tN\t\t\t\t
";

    // --- `parse_corpus` on in-memory samples -------------------------------------------------

    #[test]
    fn parse_corpus_sorted_by_codepoint() {
        let entries = parse_corpus(SAMPLE_TSV);
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].codepoint, 65);
        assert_eq!(entries[1].codepoint, 66);
        assert_eq!(entries[2].codepoint, 128512);
    }

    #[test]
    fn parse_corpus_short_line_skipped() {
        // Only three columns: below the four-column minimum.
        let entries = parse_corpus("1\tA\tfoo");
        assert!(entries.is_empty());
    }

    #[test]
    fn parse_corpus_non_numeric_codepoint_skipped() {
        let entries = parse_corpus("xyz\tA\tfoo\tbar");
        assert!(entries.is_empty());
    }

    // --- `lookup` against the embedded corpus ------------------------------------------------
    // These tests assume the embedded corpus contains U+0041 and U+1F600 and has no row for
    // U+D800 or U+0378.

    #[test]
    fn lookup_hit_lower_bound() {
        let e = lookup(0x0041).expect("A should exist");
        assert_eq!(e.name, "LATIN CAPITAL LETTER A");
    }

    #[test]
    fn lookup_hit_emoji() {
        let e = lookup(0x1F600).expect("grinning face should exist");
        assert_eq!(e.name, "GRINNING FACE");
    }

    #[test]
    fn lookup_miss_surrogate() {
        assert!(lookup(0xD800).is_none());
    }

    #[test]
    fn lookup_miss_above_range() {
        assert!(lookup(0xFFFFFF).is_none());
    }

    #[test]
    fn lookup_miss_unassigned() {
        assert!(lookup(0x0378).is_none());
    }

    // --- `lookup_str` input formats ----------------------------------------------------------

    #[test]
    fn lookup_str_uplus_format() {
        let e = lookup_str("U+0041").expect("U+0041 should resolve");
        assert_eq!(e.codepoint, 0x41);
    }

    #[test]
    fn lookup_str_uplus_lowercase() {
        let e = lookup_str("u+0041").expect("u+0041 should resolve");
        assert_eq!(e.codepoint, 0x41);
    }

    #[test]
    fn lookup_str_0x_format() {
        let e = lookup_str("0x0041").expect("0x0041 should resolve");
        assert_eq!(e.codepoint, 0x41);
    }

    #[test]
    fn lookup_str_hex_only() {
        let e = lookup_str("0041").expect("0041 should resolve");
        assert_eq!(e.codepoint, 0x41);
    }

    #[test]
    fn lookup_str_single_ascii_char() {
        // A lone character is taken literally, not as a hex digit.
        let e = lookup_str("A").expect("A should resolve");
        assert_eq!(e.codepoint, 0x41);
    }

    #[test]
    fn lookup_str_single_non_ascii_char() {
        let e = lookup_str("😀").expect("emoji should resolve");
        assert_eq!(e.codepoint, 0x1F600);
    }

    #[test]
    fn lookup_str_trimmed() {
        let e = lookup_str(" U+0041 ").expect("trimmed should resolve");
        assert_eq!(e.codepoint, 0x41);
    }

    #[test]
    fn lookup_str_not_found() {
        assert!(lookup_str("ZZZZ_NOT_A_CODEPOINT").is_none());
    }

    // --- `parse_cp_str` edge cases -----------------------------------------------------------

    #[test]
    fn parse_cp_str_bare_hex_multi_char_ascii() {
        assert_eq!(parse_cp_str("0041"), Some(0x0041));
    }

    #[test]
    fn parse_cp_str_too_long_multi_char_ascii() {
        // More than one ASCII character cannot be a literal character, so it is read as hex.
        assert_eq!(parse_cp_str("AB"), Some(0xAB));
    }

    #[test]
    fn parse_cp_str_empty() {
        assert_eq!(parse_cp_str(""), None);
    }

    // --- `entries` ---------------------------------------------------------------------------

    #[test]
    fn entries_returns_sorted() {
        let e = entries();
        for w in e.windows(2) {
            assert!(w[0].codepoint <= w[1].codepoint, "entries not sorted");
        }
    }

    #[test]
    fn entries_are_static() {
        // Both calls must hand back the very same slice (same address and length).
        let e1: &[Entry<'static>] = entries();
        let e2: &[Entry<'static>] = entries();
        assert!(std::ptr::eq(e1, e2));
    }
}