use formatting::Context;
use std::{
char, cmp,
collections::{hash_map, HashMap},
fs::{self, File},
io::{self, prelude::*, BufWriter},
iter::repeat,
path::Path,
};
macro_rules! w {
($ctxt: expr, $($tt: tt)*) => {
(write!($ctxt.out, $($tt)*)).unwrap()
}
}
mod formatting;
mod phf;
mod trie;
mod util;
const SPLITTERS: &[u8] = b"-";
struct TableData {
codepoint_names: Vec<(char, &'static str)>,
cjk_ideograph_ranges: Vec<(char, char)>,
}
fn get_table_data(unicode_data: &'static str) -> TableData {
fn extract(line: &'static str) -> Option<(char, &'static str)> {
let splits: Vec<_> = line.splitn(15, ';').collect();
assert_eq!(splits.len(), 15);
let s = splits[0];
let cp = u32::from_str_radix(s, 16)
.ok()
.unwrap_or_else(|| panic!("invalid {}", line));
let c = char::from_u32(cp)?;
let name = splits[1];
Some((c, name))
}
let mut iter = unicode_data.lines();
let mut codepoint_names = vec![];
let mut cjk_ideograph_ranges = vec![];
while let Some(l) = iter.next() {
if l.is_empty() {
break;
}
let (cp, name) = if let Some(extracted) = extract(l.trim()) {
extracted
} else {
continue;
};
if name.starts_with('<') {
assert!(name.ends_with('>'), "should >: {}", name);
let name = &name[1..name.len() - 1];
if name.starts_with("CJK Ideograph") {
assert!(name.ends_with("First"));
let line2 = iter.next().expect("unclosed ideograph range");
let (cp2, name2) = if let Some(extracted) = extract(line2.trim()) {
extracted
} else {
continue;
};
assert_eq!(&*name.replace("First", "Last"), &name2[1..name2.len() - 1]);
cjk_ideograph_ranges.push((cp, cp2));
} else if name.starts_with("Hangul Syllable") {
if name.ends_with("First") {
assert_eq!(cp, '\u{AC00}');
} else if name.ends_with("Last") {
assert_eq!(cp, '\u{D7A3}');
} else {
panic!("unknown hangul syllable {}", name)
}
}
} else {
codepoint_names.push((cp, name))
}
}
TableData {
codepoint_names,
cjk_ideograph_ranges,
}
}
pub struct Alias {
pub code: &'static str,
pub alias: &'static str,
pub category: &'static str,
}
pub fn get_aliases(name_aliases: &'static str) -> Vec<Alias> {
let mut aliases = Vec::new();
for line in name_aliases.lines() {
if line.is_empty() | line.starts_with('#') {
continue;
}
let mut parts = line.splitn(3, ';');
let code = parts.next().expect(line);
let alias = parts.next().expect(code);
let category = parts.next().expect(alias);
aliases.push(Alias {
code,
alias,
category,
});
}
aliases
}
fn write_cjk_ideograph_ranges(ctxt: &mut Context, ranges: &[(char, char)]) {
ctxt.write_debugs("CJK_IDEOGRAPH_RANGES", "(char, char)", ranges)
}
fn create_lexicon_and_offsets(
mut codepoint_names: Vec<(char, &str)>,
) -> (String, Vec<(usize, Vec<u8>, usize)>) {
codepoint_names.sort_by(|a, b| a.1.len().cmp(&b.1.len()).reverse());
let mut t = trie::Trie::new();
let mut output = String::new();
let mut substring_overlaps = 0;
let mut substring_o_bytes = 0;
for &(_, name) in codepoint_names.iter() {
for n in util::split(name, SPLITTERS) {
if n.len() == 1 && SPLITTERS.contains(&n.as_bytes()[0]) {
continue;
}
let (already, previous_was_exact) = t.insert(n.bytes(), None, false);
if already {
if !previous_was_exact {
substring_overlaps += 1;
substring_o_bytes += n.len();
}
} else {
let offset = output.len();
t.set_offset(n.bytes(), offset);
output.push_str(n);
for i in 1..n.len() {
if t.insert(n[i..].bytes(), Some(offset + i), true).0 {
break;
}
}
}
}
}
let words: Vec<_> = t
.iter()
.map(|(a, b, c)| (a, b, c.expect("unset offset?")))
.collect();
println!(
"Lexicon: # words {}, byte size {}, with {} ({} bytes) non-exact matches",
words.len(),
output.len(),
substring_overlaps,
substring_o_bytes
);
(output, words)
}
fn bin_data(dat: &[u32]) -> (Vec<u32>, Vec<u32>, usize) {
let mut smallest = 0xFFFFFFFF;
let mut data = (vec![], vec![], 0);
let mut cache = HashMap::new();
for shift in 0..14 {
cache.clear();
let mut t1 = vec![];
let mut t2 = vec![];
for chunk in dat.chunks(1 << shift) {
let index = *match cache.entry(chunk) {
hash_map::Entry::Occupied(o) => o.into_mut(),
hash_map::Entry::Vacant(v) => {
let index = t2.len();
t2.extend(chunk.iter().cloned());
v.insert(index)
}
};
t1.push((index >> shift) as u32)
}
let my_size = t1.len() * util::smallest_type(t1.iter().copied())
+ t2.len() * util::smallest_type(t2.iter().copied());
println!("binning: shift {}, size {}", shift, my_size);
if my_size < smallest {
data = (t1, t2, shift);
smallest = my_size
}
}
{
let (ref t1, ref t2, shift) = data;
let mask = (1 << shift) - 1;
for (i, &elem) in dat.iter().enumerate() {
assert_eq!(
elem,
t2[((t1[i >> shift] << shift) + (i as u32 & mask)) as usize]
)
}
}
data
}
fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, &str)>) {
let (lexicon_string, mut lexicon_words) = create_lexicon_and_offsets(codepoint_names.clone());
let num_escapes = (lexicon_words.len() + 255) / 256;
let short = 128 - SPLITTERS.len() - num_escapes;
lexicon_words.sort_by(|a, b| a.cmp(b).reverse());
lexicon_words[short..].sort_by(|(_, a, _), (_, b, _)| a.len().cmp(&b.len()));
let mut word_encodings = HashMap::new();
for (i, x) in SPLITTERS.iter().enumerate() {
word_encodings.insert(vec![*x], vec![128 - 1 - i as u32]);
}
let mut lexicon_offsets = vec![];
let mut lexicon_short_lengths = vec![];
let mut iter = lexicon_words.into_iter().enumerate();
for (i, (_, word, offset)) in iter.by_ref().take(short) {
lexicon_offsets.push(offset);
lexicon_short_lengths.push(word.len());
assert!(word_encodings.insert(word, vec![i as u32]).is_none())
}
let mut lexicon_ordered_lengths = vec![];
let mut previous_len = 0xFFFF;
for (i, (_, word, offset)) in iter {
let (hi, lo) = (short + i / 256, i % 256);
assert!(short <= hi && hi < 128 - SPLITTERS.len());
lexicon_offsets.push(offset);
let len = word.len();
if len != previous_len {
if previous_len != 0xFFFF {
lexicon_ordered_lengths.push((i, previous_len));
}
previous_len = len;
}
assert!(word_encodings
.insert(word, vec![hi as u32, lo as u32])
.is_none());
}
lexicon_ordered_lengths.push((lexicon_offsets.len(), previous_len));
let mut phrasebook = vec![0u32];
let mut phrasebook_offsets = repeat(0).take(0x10FFFF + 1).collect::<Vec<_>>();
let mut longest_name = 0;
for &(cp, name) in codepoint_names.iter() {
longest_name = cmp::max(name.len(), longest_name);
let start = phrasebook.len() as u32;
phrasebook_offsets[cp as usize] = start;
let mut last_len = 0;
for w in util::split(name, SPLITTERS) {
let data = word_encodings
.get(w.as_bytes())
.expect(concat!("option on ", line!()));
last_len = data.len();
phrasebook.extend(data.iter().cloned())
}
let idx = phrasebook.len() - last_len;
phrasebook[idx] |= 0b1000_0000;
}
let (t1, t2, shift) = bin_data(&phrasebook_offsets);
w!(
ctxt,
"pub const MAX_NAME_LENGTH: usize = {};\n",
longest_name
);
ctxt.write_plain_string("LEXICON", &lexicon_string);
ctxt.write_debugs("LEXICON_OFFSETS", "u32", &lexicon_offsets);
ctxt.write_debugs("LEXICON_SHORT_LENGTHS", "u8", &lexicon_short_lengths);
ctxt.write_debugs(
"LEXICON_ORDERED_LENGTHS",
"(usize, u8)",
&lexicon_ordered_lengths,
);
w!(ctxt, "pub static PHRASEBOOK_SHORT: u8 = {};\n", short);
ctxt.write_debugs("PHRASEBOOK", "u8", &phrasebook);
w!(
ctxt,
"pub static PHRASEBOOK_OFFSET_SHIFT: usize = {};\n",
shift
);
ctxt.write_debugs(
"PHRASEBOOK_OFFSETS1",
&util::smallest_u(t1.iter().copied()),
&t1,
);
ctxt.write_debugs(
"PHRASEBOOK_OFFSETS2",
&util::smallest_u(t2.iter().copied()),
&t2,
);
}
fn make_context(path: Option<&Path>) -> Context {
let mut ctxt = Context {
out: match path {
Some(p) => Box::new(BufWriter::new(
File::create(p.with_extension("tmp")).unwrap(),
)) as Box<dyn Write>,
None => Box::new(io::sink()) as Box<dyn Write>,
},
};
let _ = ctxt
.out
.write(b"// autogenerated by generator.rs\n")
.unwrap();
ctxt
}
#[allow(clippy::type_complexity)]
fn get_truncated_table_data(
unicode_data: &'static str,
truncate: Option<usize>,
) -> (Vec<(char, &'static str)>, Vec<(char, char)>) {
let TableData {
mut codepoint_names,
cjk_ideograph_ranges: cjk,
} = get_table_data(unicode_data);
if let Some(n) = truncate {
codepoint_names.truncate(n)
}
(codepoint_names, cjk)
}
pub fn generate_phf(
unicode_data: &'static str,
path: Option<&Path>,
truncate: Option<usize>,
lambda: usize,
tries: usize,
) {
let (codepoint_names, _) = get_truncated_table_data(unicode_data, truncate);
let mut ctxt = make_context(path);
let (n, disps, data) = phf::create_phf(&codepoint_names, lambda, tries);
w!(ctxt, "pub static NAME2CODE_N: u64 = {};\n", n);
ctxt.write_debugs("NAME2CODE_DISP", "(u16, u16)", &disps);
ctxt.write_debugs("NAME2CODE_CODE", "char", &data);
if let Some(path) = path {
fs::rename(path.with_extension("tmp"), path).unwrap()
}
}
pub fn generate(unicode_data: &'static str, path: Option<&Path>, truncate: Option<usize>) {
let (codepoint_names, cjk) = get_truncated_table_data(unicode_data, truncate);
let mut ctxt = make_context(path);
write_cjk_ideograph_ranges(&mut ctxt, &cjk);
let _ = ctxt.out.write(b"\n").unwrap();
write_codepoint_maps(&mut ctxt, codepoint_names);
if let Some(path) = path {
fs::rename(path.with_extension("tmp"), path).unwrap()
}
}
pub fn generate_aliases(name_aliases: &'static str, path: &Path) {
let mut aliases = phf_codegen::Map::new();
for Alias { code, alias, .. } in get_aliases(name_aliases).into_iter() {
let formatted = format!("'\\u{{{code}}}'");
aliases.entry(alias, &formatted);
}
let aliases = aliases.build().to_string().replace("(\"", "(b\"");
writeln!(BufWriter::new(File::create(path).unwrap()), "{aliases}",).unwrap();
}