use std::collections::BTreeMap;
use std::env;
use std::fmt::Write as _;
use std::fs;
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};
/// Build-script entry point: reads the TSV tables under `src/tables/data` and
/// writes the generated Rust sources into Cargo's `OUT_DIR`.
///
/// Each table is sanity-checked (minimum entry counts, ASCII-only values,
/// code-point ranges) before its code is generated, so a damaged or misplaced
/// data directory fails the build instead of producing an incomplete table.
///
/// # Panics
///
/// Panics (failing the build) when `OUT_DIR` is not set, when a table cannot
/// be read or a generated file cannot be written, or when any of the sanity
/// checks below fails.
fn main() {
    // Fails the build unless `value` is pure ASCII; `file` only labels the message.
    fn assert_ascii(file: &str, cp: u32, value: &str) {
        assert!(
            value.is_ascii(),
            "{file}: non-ASCII value {value:?} for U+{cp:04X}"
        );
    }

    // Writes one generated source file, naming the path when the write fails.
    fn write_generated(path: &Path, code: &str) {
        fs::write(path, code)
            .unwrap_or_else(|e| panic!("Failed to write {}: {e}", path.display()));
    }

    // `var_os` keeps a non-UTF-8 OUT_DIR usable; it is only ever used as a path.
    let out_dir = PathBuf::from(
        env::var_os("OUT_DIR").expect("OUT_DIR is not set; build.rs must be run by Cargo"),
    );
    let data_dir = Path::new("src/tables/data");

    println!("cargo:rerun-if-changed=src/tables/data");
    println!("cargo:rerun-if-changed=build.rs");

    // Hanzi -> pinyin: dense array over U+4E00..=U+9FFF with interned values.
    {
        let entries = read_char_str_tsv(&data_dir.join("hanzi_pinyin.tsv"));
        assert!(
            entries.len() >= 20_000,
            "hanzi_pinyin.tsv: expected ≥20,000 entries, got {}",
            entries.len()
        );
        for (&cp, value) in &entries {
            assert_ascii("hanzi_pinyin.tsv", cp, value);
            assert!(
                (0x4E00..=0x9FFF).contains(&cp),
                "hanzi_pinyin.tsv: U+{cp:04X} outside the CJK Unified block; \
                 the dense table (#237 item 2) covers only U+4E00–U+9FFF"
            );
        }
        let code = build_dense_interned_array(
            &entries,
            0x4E00,
            0x9FFF - 0x4E00 + 1,
            "HANZI_PINYIN",
            "pub",
        );
        write_generated(&out_dir.join("hanzi_pinyin_phf.rs"), &code);
    }

    generate_char_str_map(
        &data_dir.join("hanzi_pinyin_toned.tsv"),
        &out_dir.join("hanzi_pinyin_toned_phf.rs"),
        "HANZI_PINYIN_TONED",
        "pub",
    );

    // Confusables -> Latin.
    {
        let entries = read_char_str_tsv(&data_dir.join("confusables_to_latin.tsv"));
        assert!(
            entries.len() >= 1_000,
            "confusables_to_latin.tsv: expected ≥1,000 entries, got {}",
            entries.len()
        );
        let code = build_char_str_map(&entries, "TO_LATIN", "");
        write_generated(&out_dir.join("confusables_phf.rs"), &code);
    }

    // Confusables -> Cyrillic.
    {
        let entries = read_char_str_tsv(&data_dir.join("confusables_to_cyrillic.tsv"));
        assert!(
            !entries.is_empty(),
            "confusables_to_cyrillic.tsv: expected ≥1 entries, got 0",
        );
        let code = build_char_str_map(&entries, "TO_CYRILLIC", "");
        write_generated(&out_dir.join("confusables_to_cyrillic_phf.rs"), &code);
    }

    generate_char_str_map(
        &data_dir.join("emoji_single.tsv"),
        &out_dir.join("emoji_single_phf.rs"),
        "EMOJI_SINGLE",
        "pub",
    );
    // emoji_multi.tsv feeds both the trie and the phf map.
    generate_emoji_trie(
        &data_dir.join("emoji_multi.tsv"),
        &out_dir.join("emoji_multi_trie.rs"),
        "EMOJI_MULTI_TRIE",
    );
    generate_str_str_map(
        &data_dir.join("emoji_multi.tsv"),
        &out_dir.join("emoji_multi_phf.rs"),
        "EMOJI_MULTI",
        "pub",
    );
    generate_char_set(
        &data_dir.join("emoji_starters.tsv"),
        &out_dir.join("emoji_starters_phf.rs"),
        "EMOJI_MULTI_STARTERS",
        "pub",
    );
    generate_char_str_map(
        &data_dir.join("case_folding.tsv"),
        &out_dir.join("case_folding_phf.rs"),
        "CASE_FOLD",
        "pub",
    );

    // Default transliteration table: validate first, then emit the paged array.
    {
        let default_entries = read_char_str_tsv(&data_dir.join("translit_default.tsv"));
        assert!(
            default_entries.len() >= 5_000,
            "translit_default.tsv: expected ≥5,000 entries, got {}",
            default_entries.len()
        );
        for (&cp, value) in &default_entries {
            assert_ascii("translit_default.tsv", cp, value);
        }
    }
    generate_translit_flat_array(
        &data_dir.join("translit_default.tsv"),
        &out_dir.join("translit_default_flat.rs"),
    );

    // Supplementary table: validated the same way, emitted as a phf map.
    {
        let smp_entries = read_char_str_tsv(&data_dir.join("translit_default_smp.tsv"));
        for (&cp, value) in &smp_entries {
            assert_ascii("translit_default_smp.tsv", cp, value);
        }
    }
    generate_char_str_map(
        &data_dir.join("translit_default_smp.tsv"),
        &out_dir.join("translit_default_smp_phf.rs"),
        "DEFAULT_SMP",
        "",
    );

    // Discover the per-language override tables (translit_lang_<code>.tsv).
    let mut lang_tables: Vec<(String, String)> = Vec::new();
    for entry in fs::read_dir(data_dir).expect("read src/tables/data") {
        let name = entry
            .expect("data dir entry")
            .file_name()
            .to_string_lossy()
            .into_owned();
        if let Some(code) = name
            .strip_prefix("translit_lang_")
            .and_then(|s| s.strip_suffix(".tsv"))
        {
            let file_stem = format!("lang_{code}");
            let const_name = file_stem.to_uppercase();
            lang_tables.push((file_stem, const_name));
        }
    }
    assert!(
        lang_tables.len() >= 20,
        "expected ≥20 translit_lang_*.tsv override tables, found {} — wrong data dir?",
        lang_tables.len()
    );
    lang_tables.push(("iso9".to_string(), "ISO9".to_string()));
    lang_tables.push(("gost7034".to_string(), "GOST7034".to_string()));
    // Sort so the generated file does not depend on directory iteration order.
    lang_tables.sort();

    let mut all_lang_code = String::new();
    for (file_stem, const_name) in &lang_tables {
        let tsv_name = format!("translit_{file_stem}.tsv");
        let entries = read_char_str_tsv(&data_dir.join(&tsv_name));
        for (&cp, value) in &entries {
            assert_ascii(&tsv_name, cp, value);
        }
        all_lang_code.push_str(&build_char_str_map(&entries, const_name, ""));
        all_lang_code.push('\n');
    }
    write_generated(&out_dir.join("translit_langs_phf.rs"), &all_lang_code);

    // Reverse transliteration tables, all emitted into a single file.
    let reverse_tables = [
        ("reverse_ru", "REVERSE_RU"),
        ("reverse_uk", "REVERSE_UK"),
        ("reverse_el", "REVERSE_EL"),
    ];
    let mut all_reverse_code = String::new();
    for (file_stem, const_name) in &reverse_tables {
        let tsv_path = data_dir.join(format!("{file_stem}.tsv"));
        let entries = read_str_str_tsv(&tsv_path);
        // Values are rendered as quoted literals before being handed to the builder.
        let formatted: Vec<(&str, String)> = entries
            .iter()
            .map(|(key, value)| (key.as_str(), format!("\"{}\"", escape_str(value))))
            .collect();
        let mut builder = phf_codegen::Map::<&str>::new();
        for (key, v) in &formatted {
            builder.entry(*key, v);
        }
        write!(
            all_reverse_code,
            "static {const_name}: phf::Map<&'static str, &'static str> = {};\n\n",
            builder.build()
        )
        .unwrap();
    }
    write_generated(&out_dir.join("reverse_translit_phf.rs"), &all_reverse_code);

    generate_width_ranges(
        &data_dir.join("char_width.tsv"),
        &out_dir.join("char_width_ranges.rs"),
    );
    generate_range_set(
        &data_dir.join("emoji_presentation.tsv"),
        &out_dir.join("emoji_presentation_ranges.rs"),
        "EMOJI_PRESENTATION_RANGES",
    );
}
/// Emits the `WIDTH_RANGES` table from a `start<TAB>end<TAB>class` TSV.
///
/// `start` and `end` are hexadecimal; `class` is `Z`, `W` or `A`, stored as
/// `0`, `2` and `3` respectively. Blank lines and lines starting with `#` are
/// skipped, and rows are sorted before being written to `out_path`.
///
/// # Panics
///
/// Panics if the TSV cannot be read, a bound is not valid hex, the class is
/// not one of the three letters, or the output cannot be written.
fn generate_width_ranges(tsv_path: &Path, out_path: &Path) {
    let text = match fs::read_to_string(tsv_path) {
        Ok(text) => text,
        Err(e) => panic!("Failed to read {}: {e}", tsv_path.display()),
    };

    let mut ranges: Vec<(u32, u32, u8)> = text
        .lines()
        .map(str::trim)
        .filter(|row| !row.is_empty() && !row.starts_with('#'))
        .map(|row| {
            let mut fields = row.split('\t');
            // A missing column is parsed as "" so it fails inside `parse_hex`.
            let first = parse_hex(fields.next().unwrap_or(""), tsv_path);
            let last = parse_hex(fields.next().unwrap_or(""), tsv_path);
            let class = match fields.next().unwrap_or("").trim() {
                "Z" => 0u8,
                "W" => 2,
                "A" => 3,
                other => panic!("bad width class {other:?} in {}", tsv_path.display()),
            };
            (first, last, class)
        })
        .collect();
    ranges.sort_unstable();

    let mut generated = String::from("static WIDTH_RANGES: &[(u32, u32, u8)] = &[\n");
    for &(first, last, class) in &ranges {
        writeln!(generated, " ({first}, {last}, {class}),").unwrap();
    }
    generated.push_str("];\n");

    if let Err(e) = fs::write(out_path, generated) {
        panic!("write {}: {e}", out_path.display());
    }
}
/// Emits a sorted `static {name}: &[(u32, u32)]` table from a
/// `start<TAB>end` TSV of hexadecimal bounds.
///
/// Blank lines and lines starting with `#` are skipped.
///
/// # Panics
///
/// Panics if the TSV cannot be read, a bound is not valid hex, or the output
/// cannot be written.
fn generate_range_set(tsv_path: &Path, out_path: &Path, name: &str) {
    let text = match fs::read_to_string(tsv_path) {
        Ok(text) => text,
        Err(e) => panic!("Failed to read {}: {e}", tsv_path.display()),
    };

    let mut ranges: Vec<(u32, u32)> = text
        .lines()
        .map(str::trim)
        .filter(|row| !row.is_empty() && !row.starts_with('#'))
        .map(|row| {
            let mut fields = row.split('\t');
            // A missing column is parsed as "" so it fails inside `parse_hex`.
            let first = parse_hex(fields.next().unwrap_or(""), tsv_path);
            let last = parse_hex(fields.next().unwrap_or(""), tsv_path);
            (first, last)
        })
        .collect();
    ranges.sort_unstable();

    let mut generated = format!("static {name}: &[(u32, u32)] = &[\n");
    for &(first, last) in &ranges {
        writeln!(generated, " ({first}, {last}),").unwrap();
    }
    generated.push_str("];\n");

    if let Err(e) = fs::write(out_path, generated) {
        panic!("write {}: {e}", out_path.display());
    }
}
/// Parses `hex` (surrounding whitespace ignored) as a base-16 `u32`.
///
/// # Panics
///
/// Panics with the offending text and `path` when `hex` is not valid
/// hexadecimal or does not fit in a `u32`.
fn parse_hex(hex: &str, path: &Path) -> u32 {
    match u32::from_str_radix(hex.trim(), 16) {
        Ok(value) => value,
        Err(e) => panic!("Bad hex '{hex}' in {}: {e}", path.display()),
    }
}
/// Loads a `hex-code-point<TAB>value` TSV into a map ordered by code point.
///
/// Leading whitespace is stripped from each line; blank lines and lines
/// starting with `#` are skipped. A line with no tab maps its code point to
/// the empty string. Values are passed through `unescape_rust_str`, and a
/// code point that appears more than once keeps its last value.
///
/// # Panics
///
/// Panics if the file cannot be read or a code point is not valid hex.
fn read_char_str_tsv(path: &Path) -> BTreeMap<u32, String> {
    let text = match fs::read_to_string(path) {
        Ok(text) => text,
        Err(e) => panic!("Failed to read {}: {e}", path.display()),
    };

    let mut table = BTreeMap::new();
    let rows = text
        .lines()
        .map(str::trim_start)
        .filter(|row| !row.is_empty() && !row.starts_with('#'));
    for row in rows {
        // Only the key side is trimmed at the end; a value keeps its trailing whitespace.
        let (hex, raw_value) = match row.split_once('\t') {
            Some(columns) => columns,
            None => (row.trim_end(), ""),
        };
        let cp = match u32::from_str_radix(hex.trim(), 16) {
            Ok(cp) => cp,
            Err(e) => panic!("Bad hex '{hex}' in {}: {e}", path.display()),
        };
        table.insert(cp, unescape_rust_str(raw_value));
    }
    table
}
/// Loads a `key<TAB>value` TSV as a list of pairs in file order.
///
/// Each line is trimmed on both sides; blank lines and lines starting with
/// `#` are skipped. The split happens at the first tab, so a value may itself
/// contain tabs.
///
/// # Panics
///
/// Panics if the file cannot be read or a data line contains no tab.
fn read_str_str_tsv(path: &Path) -> Vec<(String, String)> {
    let text = match fs::read_to_string(path) {
        Ok(text) => text,
        Err(e) => panic!("Failed to read {}: {e}", path.display()),
    };

    text.lines()
        .map(str::trim)
        .filter(|line| !line.is_empty() && !line.starts_with('#'))
        .map(|line| match line.split_once('\t') {
            Some((key, value)) => (key.to_string(), value.to_string()),
            None => panic!("Bad line in {}: {line}", path.display()),
        })
        .collect()
}
/// Loads a file with one hexadecimal code point per line, in file order.
///
/// Each line is trimmed; blank lines and lines starting with `#` are skipped.
///
/// # Panics
///
/// Panics if the file cannot be read or a line is not valid hex.
fn read_char_set_tsv(path: &Path) -> Vec<u32> {
    let text = match fs::read_to_string(path) {
        Ok(text) => text,
        Err(e) => panic!("Failed to read {}: {e}", path.display()),
    };

    text.lines()
        .map(str::trim)
        .filter(|line| !line.is_empty() && !line.starts_with('#'))
        .map(|line| match u32::from_str_radix(line, 16) {
            Ok(cp) => cp,
            Err(e) => panic!("Bad hex '{line}' in {}: {e}", path.display()),
        })
        .collect()
}
/// Renders `entries` as the source of a `phf::Map<char, &'static str>` static
/// called `name`.
///
/// `vis` is placed in front of `static` when non-empty (for example `"pub"`).
/// Values are escaped with `escape_str` and wrapped in double quotes.
///
/// # Panics
///
/// Panics if a key is not a valid Unicode scalar value.
fn build_char_str_map(entries: &BTreeMap<u32, String>, name: &str, vis: &str) -> String {
    // Render every value literal up front; the builder is fed references to them.
    let mut literals: Vec<(char, String)> = Vec::with_capacity(entries.len());
    for (&cp, text) in entries {
        let key = match char::from_u32(cp) {
            Some(key) => key,
            None => panic!("Invalid codepoint U+{cp:04X}"),
        };
        literals.push((key, format!("\"{}\"", escape_str(text))));
    }

    let mut map = phf_codegen::Map::<char>::new();
    for (key, literal) in &literals {
        map.entry(*key, literal);
    }

    let visibility = match vis {
        "" => String::new(),
        other => format!("{other} "),
    };
    format!(
        "{visibility}static {name}: phf::Map<char, &'static str> = {};\n",
        map.build()
    )
}
/// Renders `entries` as a dense lookup over `[base, base + len)`.
///
/// Three items are generated, each prefixed with `vis` when it is non-empty:
///
/// * `{name}_VALUES` — the distinct values in first-seen order; slot 0 is
///   always `""`,
/// * `{name}_IDS` — one `u16` per code point in the range, indexing
///   `{name}_VALUES`; code points without an entry hold 0,
/// * `{name}_BASE` — the first code point of the range.
///
/// # Panics
///
/// Panics if an entry lies outside the range or if the number of distinct
/// values does not fit in a `u16`.
fn build_dense_interned_array(
    entries: &BTreeMap<u32, String>,
    base: u32,
    len: usize,
    name: &str,
    vis: &str,
) -> String {
    use std::collections::HashMap;

    let mut pool: Vec<&str> = vec![""];
    let mut pool_index: HashMap<&str, u16> = HashMap::new();
    let mut slots = vec![0u16; len];

    for (&cp, text) in entries {
        let slot = match cp.checked_sub(base) {
            Some(delta) if (delta as usize) < len => delta as usize,
            _ => panic!("U+{cp:04X} outside dense range [{base:#X}, +{len})"),
        };
        let id = match pool_index.get(text.as_str()) {
            Some(&known) => known,
            None => {
                let fresh =
                    u16::try_from(pool.len()).expect("interned value count exceeds u16");
                pool.push(text.as_str());
                pool_index.insert(text.as_str(), fresh);
                fresh
            }
        };
        slots[slot] = id;
    }

    let visibility = match vis {
        "" => String::new(),
        other => format!("{other} "),
    };

    let mut code = String::with_capacity(len * 6 + pool.len() * 8);
    writeln!(
        code,
        "{visibility}static {name}_VALUES: [&str; {}] = [",
        pool.len()
    )
    .unwrap();
    for text in &pool {
        writeln!(code, " \"{}\",", escape_str(text)).unwrap();
    }
    code.push_str("];\n");

    writeln!(code, "{visibility}static {name}_IDS: [u16; {len}] = [").unwrap();
    // 32 ids per row; only a complete row is terminated with a newline here.
    for row in slots.chunks(32) {
        code.push_str(" ");
        for id in row {
            write!(code, "{id},").unwrap();
        }
        if row.len() == 32 {
            code.push('\n');
        }
    }
    code.push_str("\n];\n");

    writeln!(code, "{visibility}const {name}_BASE: u32 = {base:#X};").unwrap();
    code
}
fn generate_char_str_map(tsv_path: &Path, out_path: &Path, name: &str, vis: &str) {
let entries = read_char_str_tsv(tsv_path);
let code = build_char_str_map(&entries, name, vis);
let mut file = BufWriter::new(fs::File::create(out_path).unwrap_or_else(|e| {
panic!("Failed to create {}: {e}", out_path.display());
}));
file.write_all(code.as_bytes()).unwrap();
}
/// Reads a `key<TAB>value` TSV and writes the matching
/// `phf::Map<&'static str, &'static str>` static named `name` to `out_path`.
///
/// `vis` is placed in front of `static` when non-empty. Values are escaped
/// with `escape_str` and wrapped in double quotes.
///
/// # Panics
///
/// Panics if the TSV cannot be read or parsed, or if `out_path` cannot be
/// written.
fn generate_str_str_map(tsv_path: &Path, out_path: &Path, name: &str, vis: &str) {
    let entries = read_str_str_tsv(tsv_path);
    // Render every value literal up front; the builder is fed references to them.
    let formatted: Vec<(&str, String)> = entries
        .iter()
        .map(|(key, value)| (key.as_str(), format!("\"{}\"", escape_str(value))))
        .collect();
    let mut builder = phf_codegen::Map::<&str>::new();
    for (key, v) in &formatted {
        builder.entry(*key, v);
    }
    let vis_prefix = if vis.is_empty() {
        String::new()
    } else {
        format!("{vis} ")
    };
    let code = format!(
        "{vis_prefix}static {name}: phf::Map<&'static str, &'static str> = {};\n",
        builder.build()
    );
    // `fs::write` reports create and write failures alike. The previous
    // `BufWriter` was dropped without `flush`, which ignores a failed flush.
    fs::write(out_path, code)
        .unwrap_or_else(|e| panic!("Failed to write {}: {e}", out_path.display()));
}
fn generate_emoji_trie(tsv_path: &Path, out_path: &Path, name: &str) {
struct Node {
edges: BTreeMap<u32, usize>,
value: Option<usize>,
}
fn emit_u32(out: &mut String, arr: &[u32], decl: &str) {
out.push_str(decl);
for (i, v) in arr.iter().enumerate() {
if i % 16 == 0 {
out.push_str("\n ");
}
write!(out, "{v},").unwrap();
}
out.push_str("\n];\n");
}
let entries = read_str_str_tsv(tsv_path);
let mut nodes: Vec<Node> = vec![Node {
edges: BTreeMap::new(),
value: None,
}];
let mut value_of: BTreeMap<String, usize> = BTreeMap::new();
let mut values: Vec<String> = Vec::new();
for (key, emoji_name) in &entries {
let mut node = 0usize;
for hex in key.split('_') {
let cp = u32::from_str_radix(hex, 16).unwrap_or_else(|_| {
panic!("emoji_multi.tsv: bad hex code point {hex:?} in {key:?}")
});
node = if let Some(&child) = nodes[node].edges.get(&cp) {
child
} else {
let child = nodes.len();
nodes.push(Node {
edges: BTreeMap::new(),
value: None,
});
nodes[node].edges.insert(cp, child);
child
};
}
let vidx = *value_of.entry(emoji_name.clone()).or_insert_with(|| {
values.push(emoji_name.clone());
values.len() - 1
});
nodes[node].value = Some(vidx);
}
let mut edge_start: Vec<u32> = Vec::with_capacity(nodes.len() + 1);
let mut edge_cp: Vec<u32> = Vec::new();
let mut edge_target: Vec<u32> = Vec::new();
let mut node_value: Vec<u32> = Vec::with_capacity(nodes.len());
for node in &nodes {
edge_start.push(u32::try_from(edge_cp.len()).expect("edge count fits u32"));
for (&cp, &child) in &node.edges {
edge_cp.push(cp);
edge_target.push(u32::try_from(child).expect("node id fits u32"));
}
node_value.push(node.value.map_or(u32::MAX, |v| {
u32::try_from(v).expect("value index fits u32")
}));
}
edge_start.push(u32::try_from(edge_cp.len()).expect("edge count fits u32"));
let mut out = String::with_capacity(edge_cp.len() * 8 + values.len() * 16);
emit_u32(
&mut out,
&edge_start,
&format!(
"pub static {name}_EDGE_START: [u32; {}] = [",
edge_start.len()
),
);
emit_u32(
&mut out,
&edge_cp,
&format!("pub static {name}_EDGE_CP: [u32; {}] = [", edge_cp.len()),
);
emit_u32(
&mut out,
&edge_target,
&format!(
"pub static {name}_EDGE_TARGET: [u32; {}] = [",
edge_target.len()
),
);
emit_u32(
&mut out,
&node_value,
&format!(
"pub static {name}_NODE_VALUE: [u32; {}] = [",
node_value.len()
),
);
writeln!(
out,
"pub static {name}_VALUES: [&str; {}] = [",
values.len()
)
.unwrap();
for v in &values {
writeln!(out, " \"{}\",", escape_str(v)).unwrap();
}
out.push_str("];\n");
fs::write(out_path, out)
.unwrap_or_else(|e| panic!("Failed to write {}: {e}", out_path.display()));
}
/// Reads a file of hexadecimal code points (one per line) and writes the
/// matching `phf::Set<char>` static named `name` to `out_path`.
///
/// `vis` is placed in front of `static` when non-empty.
///
/// # Panics
///
/// Panics if the TSV cannot be read or parsed, a value is not a valid Unicode
/// scalar value, or `out_path` cannot be written.
fn generate_char_set(tsv_path: &Path, out_path: &Path, name: &str, vis: &str) {
    let entries = read_char_set_tsv(tsv_path);
    let mut builder = phf_codegen::Set::<char>::new();
    for &cp in &entries {
        let ch = char::from_u32(cp).unwrap_or_else(|| {
            panic!("Invalid codepoint U+{cp:04X}");
        });
        builder.entry(ch);
    }
    let vis_prefix = if vis.is_empty() {
        String::new()
    } else {
        format!("{vis} ")
    };
    let code = format!(
        "{vis_prefix}static {name}: phf::Set<char> = {};\n",
        builder.build()
    );
    // `fs::write` reports create and write failures alike. The previous
    // `BufWriter` was dropped without `flush`, which ignores a failed flush.
    fs::write(out_path, code)
        .unwrap_or_else(|e| panic!("Failed to write {}: {e}", out_path.display()));
}
/// Generates the paged lookup for the default transliteration table,
/// covering code points `0x80..0x10000`.
///
/// Three statics are written to `out_path`:
///
/// * `DEFAULT_BMP_BLOB` — every distinct non-empty value, concatenated,
/// * `DEFAULT_BMP_ENTRIES` — 256-cell pages; a cell is `u32::MAX` for "no
///   entry", otherwise `(offset << 8) | len` into the blob (a `len` of 0 is
///   the empty string),
/// * `DEFAULT_BMP_PAGES` — for each high byte of the code point, the index in
///   `DEFAULT_BMP_ENTRIES` where that page starts. Pages with no entries all
///   point at index 0, a shared page filled with `u32::MAX`.
///
/// Before anything is written, the table is decoded again for every covered
/// code point and compared with the input.
///
/// # Panics
///
/// Panics if the TSV cannot be read or parsed, a value is longer than 255
/// bytes, the blob exceeds the 24-bit offset budget, the self-check fails, or
/// the output cannot be created, written or flushed.
fn generate_translit_flat_array(tsv_path: &Path, out_path: &Path) {
    // Cell value meaning "no mapping for this code point".
    const NONE_ENTRY: u32 = u32::MAX;
    // Cells per page: one page spans 256 consecutive code points.
    const PAGE_LEN: usize = 256;

    let entries = read_char_str_tsv(tsv_path);

    // Intern each distinct non-empty value once and remember its byte offset.
    let mut blob = String::new();
    let mut offset_of: std::collections::HashMap<&str, u32> = std::collections::HashMap::new();
    offset_of.insert("", 0);
    for value in entries.values() {
        if value.is_empty() {
            continue;
        }
        offset_of.entry(value.as_str()).or_insert_with(|| {
            let off = u32::try_from(blob.len()).expect("BMP blob offset fits u32");
            blob.push_str(value);
            off
        });
    }
    // Offsets live in the upper 24 bits of a cell.
    assert!(
        blob.len() <= 0x00FF_FFFF,
        "DEFAULT_BMP blob {} B exceeds the 16 MB u24 offset budget",
        blob.len()
    );

    // Packs the value for `cp` into a cell, or NONE_ENTRY when it has none.
    let encode = |cp: u32| -> u32 {
        match entries.get(&cp) {
            None => NONE_ENTRY,
            Some(v) => {
                let len = u32::try_from(v.len()).expect("BMP value len fits u32");
                assert!(len <= 255, "DEFAULT_BMP value too long for u8 len: {v:?}");
                (offset_of[v.as_str()] << 8) | len
            }
        }
    };

    // Cells 0..PAGE_LEN form the shared empty page used by unpopulated pages.
    let mut pages_entries: Vec<u32> = vec![NONE_ENTRY; PAGE_LEN];
    let mut page_base: Vec<u32> = vec![0u32; 256];
    for (page, base_slot) in page_base.iter_mut().enumerate() {
        let lo = (page as u32) << 8;
        let populated = (0u32..256).any(|o| {
            let cp = lo | o;
            (0x80..0x10000).contains(&cp) && entries.contains_key(&cp)
        });
        if !populated {
            continue;
        }
        *base_slot = u32::try_from(pages_entries.len()).expect("ENTRIES base fits u32");
        for o in 0u32..256 {
            let cp = lo | o;
            // Code points below 0x80 are never stored, even on a populated page.
            pages_entries.push(if (0x80..0x10000).contains(&cp) {
                encode(cp)
            } else {
                NONE_ENTRY
            });
        }
    }

    // Self-check: decode every covered code point and compare with the input.
    let decode = |cp: u32| -> Option<String> {
        let base = page_base[(cp >> 8) as usize] as usize;
        let cell = pages_entries[base + (cp & 0xFF) as usize];
        if cell == NONE_ENTRY {
            None
        } else {
            let off = (cell >> 8) as usize;
            let len = (cell & 0xFF) as usize;
            Some(blob[off..off + len].to_string())
        }
    };
    for cp in 0x80u32..0x10000 {
        let expected = entries.get(&cp).cloned();
        assert_eq!(
            decode(cp),
            expected,
            "DEFAULT_BMP trie self-check failed at U+{cp:04X}"
        );
    }

    let mut file = BufWriter::new(fs::File::create(out_path).unwrap_or_else(|e| {
        panic!("Failed to create {}: {e}", out_path.display());
    }));
    writeln!(
        file,
        "/// Two-level BMP transliteration trie (#237 item 1), generated by build.rs\n\
         /// from translit_default.tsv. `u32::MAX` = None; else `(offset << 8) | len`\n\
         /// into DEFAULT_BMP_BLOB, with len==0 meaning Some(\"\")."
    )
    .unwrap();
    writeln!(
        file,
        "static DEFAULT_BMP_BLOB: &str = \"{}\";",
        escape_str(&blob)
    )
    .unwrap();
    writeln!(
        file,
        "static DEFAULT_BMP_ENTRIES: [u32; {}] = [",
        pages_entries.len()
    )
    .unwrap();
    for (i, cell) in pages_entries.iter().enumerate() {
        if i % 16 == 0 {
            write!(file, " ").unwrap();
        }
        write!(file, "{cell},").unwrap();
        if i % 16 == 15 {
            writeln!(file).unwrap();
        }
    }
    writeln!(file, "\n];").unwrap();
    writeln!(file, "static DEFAULT_BMP_PAGES: [u32; 256] = [").unwrap();
    for (i, b) in page_base.iter().enumerate() {
        if i % 16 == 0 {
            write!(file, " ").unwrap();
        }
        write!(file, "{b},").unwrap();
        if i % 16 == 15 {
            writeln!(file).unwrap();
        }
    }
    writeln!(file, "\n];").unwrap();

    // Flush explicitly: dropping a `BufWriter` ignores a failed final flush,
    // which could leave a truncated file behind without failing the build.
    file.flush()
        .unwrap_or_else(|e| panic!("Failed to write {}: {e}", out_path.display()));
}
/// Decodes the backslash escapes used in TSV values.
///
/// Recognised escapes are `\u{HEX}`, `\"`, `\\`, `\n`, `\r` and `\t`. Any
/// other backslash sequence, and a lone backslash at the end of the input, is
/// copied through unchanged.
///
/// # Panics
///
/// Panics on a malformed `\u` escape: a missing `{` or `}`, non-hex digits,
/// or a value that is not a Unicode scalar value.
fn unescape_rust_str(s: &str) -> String {
    let mut decoded = String::with_capacity(s.len());
    let mut rest = s.chars();

    while let Some(current) = rest.next() {
        if current != '\\' {
            decoded.push(current);
            continue;
        }

        match rest.next() {
            Some('u') => {
                assert!(
                    rest.next() == Some('{'),
                    "Malformed \\u escape in TSV: expected '{{' after \\u"
                );
                // Collect digits up to the closing brace, noting whether one was found.
                let mut hex = String::new();
                let mut closed = false;
                loop {
                    match rest.next() {
                        Some('}') => {
                            closed = true;
                            break;
                        }
                        Some(digit) => hex.push(digit),
                        None => break,
                    }
                }
                assert!(
                    closed,
                    "Malformed \\u escape in TSV: missing closing '}}' (got '\\u{{{hex}')"
                );
                let cp = match u32::from_str_radix(&hex, 16) {
                    Ok(cp) => cp,
                    Err(e) => panic!("Invalid hex in \\u{{...}} escape: '{hex}': {e}"),
                };
                match char::from_u32(cp) {
                    Some(c) => decoded.push(c),
                    None => panic!("Invalid Unicode scalar value: U+{cp:04X}"),
                }
            }
            Some(literal @ ('"' | '\\')) => decoded.push(literal),
            Some('n') => decoded.push('\n'),
            Some('r') => decoded.push('\r'),
            Some('t') => decoded.push('\t'),
            Some(other) => {
                // Unknown escape: keep both characters as written.
                decoded.push('\\');
                decoded.push(other);
            }
            // Trailing backslash with nothing after it.
            None => decoded.push('\\'),
        }
    }
    decoded
}
/// Escapes `s` for use inside a double-quoted Rust string literal.
///
/// `"`, `\`, newline, carriage return and tab are replaced by their
/// backslash escapes; every other character is copied unchanged.
fn escape_str(s: &str) -> String {
    s.chars()
        .fold(String::with_capacity(s.len()), |mut escaped, ch| {
            let replacement = match ch {
                '"' => Some("\\\""),
                '\\' => Some("\\\\"),
                '\n' => Some("\\n"),
                '\r' => Some("\\r"),
                '\t' => Some("\\t"),
                _ => None,
            };
            match replacement {
                Some(text) => escaped.push_str(text),
                None => escaped.push(ch),
            }
            escaped
        })
}