#![allow(clippy::type_complexity)]
use std::collections::HashMap;
use std::fmt::{self, Write};
use std::fs;
use std::path::Path;
/// Regenerates every trie binary and the generated language module, then
/// fails if anything on disk was out of date.
///
/// Each table entry is `(name, iso code, alias codes, script, pattern file,
/// left minimum, right minimum)`. For every language the pattern file under
/// `patterns/` is parsed, turned into a compressed trie and written to
/// `tries/{iso}.bin`; afterwards the language module is written to
/// `src/lang.rs`.
///
/// The outputs are always brought up to date before the final check, so a
/// failing run leaves refreshed files behind and the next run passes.
///
/// # Panics
///
/// Panics if a pattern file cannot be read (the message names the file), if
/// an output cannot be written, or if any output differed from what was on
/// disk before.
#[test]
fn generate_code() {
    let mut languages: [(&str, &str, &[&str], &str, &str, u8, u8); 48] = [
        ("Afrikaans", "af", &[], "Latn", "hyph-af.tex", 1, 2),
        ("Assamese", "as", &[], "Beng", "hyph-as.tex", 2, 2),
        ("Belarusian", "be", &[], "Cyrl", "hyph-be.tex", 2, 2),
        ("Bengali", "bn", &[], "Beng", "hyph-bn.tex", 2, 2),
        ("Bulgarian", "bg", &[], "Cyrl", "hyph-bg.tex", 2, 2),
        ("Catalan", "ca", &[], "Latn", "hyph-ca.tex", 2, 2),
        ("Czech", "cs", &[], "Latn", "hyph-cs-sojka.tex", 2, 2),
        ("Danish", "da", &[], "Latn", "hyph-da.tex", 2, 2),
        ("Galician", "gl", &[], "Latn", "hyph-gl.tex", 2, 2),
        ("German", "de", &[], "Latn", "hyph-de-1996.tex", 2, 2),
        ("Greek", "el", &[], "Grek", "hyph-el-monoton.tex", 1, 1),
        ("English", "en", &[], "Latn", "hyph-en-us.tex", 2, 3),
        ("Spanish", "es", &[], "Latn", "hyph-es.tex", 2, 2),
        ("Estonian", "et", &[], "Latn", "hyph-et.tex", 2, 3),
        ("Finnish", "fi", &[], "Latn", "hyph-fi.tex", 2, 2),
        ("French", "fr", &[], "Latn", "hyph-fr.tex", 2, 2),
        ("Gujarati", "gu", &[], "Gujr", "hyph-gu.tex", 2, 2),
        ("Hindi", "hi", &[], "Deva", "hyph-hi.tex", 2, 2),
        ("Croatian", "hr", &[], "Latn", "hyph-hr.tex", 2, 2),
        ("Hungarian", "hu", &[], "Latn", "hyph-hu.tex", 2, 2),
        ("Icelandic", "is", &[], "Latn", "hyph-is.tex", 2, 2),
        ("Italian", "it", &[], "Latn", "hyph-it.tex", 2, 2),
        ("Kannada", "kn", &[], "Knda", "hyph-kn.tex", 2, 2),
        ("Georgian", "ka", &[], "Geor", "hyph-ka.tex", 1, 2),
        ("Kurmanji", "ku", &[], "Latn", "hyph-kmr.tex", 2, 2),
        ("Latin", "la", &[], "Latn", "hyph-la.tex", 2, 2),
        ("Lithuanian", "lt", &[], "Latn", "hyph-lt.tex", 2, 2),
        ("Malayalam", "ml", &[], "Mlym", "hyph-ml.tex", 2, 2),
        ("Marathi", "mr", &[], "Deva", "hyph-mr.tex", 2, 2),
        ("Mongolian", "mn", &[], "Cyrl", "hyph-mn.tex", 2, 2),
        ("Dutch", "nl", &[], "Latn", "hyph-nl.tex", 2, 2),
        ("Norwegian", "no", &["nb", "nn"], "Latn", "hyph-no.tex", 2, 2),
        ("Oriya", "or", &[], "Orya", "hyph-or.tex", 2, 2),
        ("Panjabi", "pa", &[], "Guru", "hyph-pa.tex", 2, 2),
        ("Polish", "pl", &[], "Latn", "hyph-pl.tex", 2, 2),
        ("Portuguese", "pt", &[], "Latn", "hyph-pt.tex", 2, 3),
        ("Russian", "ru", &[], "Cyrl", "hyph-ru.tex", 2, 2),
        ("Sanskrit", "sa", &[], "Deva", "hyph-sa.tex", 2, 2),
        ("Serbian", "sr", &[], "Cyrl", "hyph-sh-cyrl.tex", 2, 2),
        ("Slovak", "sk", &[], "Latn", "hyph-sk.tex", 2, 3),
        ("Slovenian", "sl", &[], "Latn", "hyph-sl.tex", 2, 2),
        ("Albanian", "sq", &[], "Latn", "hyph-sq.tex", 2, 2),
        ("Swedish", "sv", &[], "Latn", "hyph-sv.tex", 2, 2),
        ("Tamil", "ta", &[], "Taml", "hyph-ta.tex", 2, 2),
        ("Telugu", "te", &[], "Telu", "hyph-te.tex", 2, 2),
        ("Turkmen", "tk", &[], "Latn", "hyph-tk.tex", 2, 2),
        ("Turkish", "tr", &[], "Latn", "hyph-tr.tex", 2, 2),
        ("Ukrainian", "uk", &[], "Cyrl", "hyph-uk.tex", 2, 2),
    ];

    // Tuples order by their first field, so the generated items end up
    // alphabetical by language name regardless of the table order above.
    languages.sort();

    // Stays `true` only while every output matched what was already on disk.
    let mut fresh = true;

    for &(_, iso, _, _, filename, ..) in &languages {
        let path = Path::new("patterns").join(filename);
        // Name the offending file: a bare `unwrap` only reports the OS error.
        let tex = fs::read_to_string(&path)
            .unwrap_or_else(|err| panic!("failed to read {}: {}", path.display(), err));

        let mut builder = TrieBuilder::new();
        parse(&tex, |pat| builder.insert(pat));
        builder.compress();
        let trie = builder.encode();

        let path = format!("tries/{iso}.bin");
        fresh &= write_check(&path, trie);
    }

    let mut text = String::new();
    write_lang(&mut text, &languages).unwrap();
    fresh &= write_check("src/lang.rs", text.into_bytes());

    if !fresh {
        panic!("Trie data or generated code was outdated.");
    }
}
/// Brings the file at `path` up to date with `data` and reports whether it
/// already was.
///
/// Returns `true` when the previous contents equal `data`. A file that cannot
/// be read counts as empty for this comparison.
///
/// The file is only written when its contents actually differ or when it
/// could not be read, so an up-to-date output keeps its modification time
/// instead of being rewritten with identical bytes on every run.
///
/// # Panics
///
/// Panics, naming `path`, if the file has to be written and writing fails.
fn write_check(path: &str, data: Vec<u8>) -> bool {
    // `None` means "missing or unreadable"; keep that distinct from an
    // existing empty file so that a missing file is still created when
    // `data` is empty.
    let prev = fs::read(path).ok();

    if prev.as_deref() != Some(data.as_slice()) {
        fs::write(path, &data)
            .unwrap_or_else(|err| panic!("failed to write {}: {}", path, err));
    }

    prev.unwrap_or_default() == data
}
/// Writes the source text of the generated language module into `w`.
///
/// Each entry of `languages` is `(name, iso code, alias codes, script,
/// pattern file, left minimum, right minimum)`. The emitted text contains:
///
/// - the `Lang` enum with one variant per language,
/// - `Lang::from_iso`, mapping the primary code and every alias to a variant,
/// - `Lang::bounds`, returning `(left minimum, right minimum)`,
/// - `Lang::hyphenation_character`, returning `None` for languages whose
///   script is accepted by `is_indic_script` and a soft hyphen otherwise,
/// - `Lang::root`, embedding `../tries/{iso}.bin` via `include_bytes!`.
///
/// Every variant and every match arm naming one is preceded by a
/// `#[cfg(feature = "...")]` attribute whose feature is the lowercased
/// language name.
///
/// # Errors
///
/// Propagates any formatting error from writing into `w`.
fn write_lang(
    w: &mut String,
    languages: &[(&str, &str, &[&str], &str, &str, u8, u8)],
) -> fmt::Result {
    writeln!(w, "// This file is generated by tests/generate.rs")?;
    writeln!(w, "// Do not edit by hand!")?;
    writeln!(w)?;

    // The enum itself.
    writeln!(w, "/// A language you can hyphenate in.")?;
    writeln!(w, "///")?;
    writeln!(w, "/// Lists for each language also the ISO 639-1 two")?;
    writeln!(w, "/// letter language code and the ISO 15924 four letter")?;
    writeln!(w, "/// script code.")?;
    writeln!(w, "#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]")?;
    writeln!(w, "#[non_exhaustive]")?;
    writeln!(w, "pub enum Lang {{")?;
    for &(name, iso, isos, script, ..) in languages {
        let feature = name.to_lowercase();
        write!(w, " /// Hyphenation for _{name}._ (Code: `{iso}`, ")?;
        for code in isos {
            write!(w, "Alias: `{code}`, ")?;
        }
        // Every label in this line uses the `Label: value` form.
        writeln!(w, "Script: `{script}`, Feature: `{feature}`)")?;
        write!(w, " ")?;
        write_cfg(w, &feature)?;
        writeln!(w, " {name},")?;
    }
    writeln!(w, "}}")?;
    writeln!(w)?;

    // Lookup by language code; aliases get the same `cfg` as the primary code.
    writeln!(w, "impl Lang {{")?;
    writeln!(w, " /// Select a language using its ISO 639-1 code.")?;
    writeln!(w, " pub fn from_iso(code: [u8; 2]) -> Option<Self> {{")?;
    writeln!(w, " match &code {{")?;
    for &(name, iso, isos, ..) in languages {
        let feature = name.to_lowercase();
        write!(w, " ")?;
        write_cfg(w, &feature)?;
        writeln!(w, r#" b"{iso}" => Some(Self::{name}),"#)?;
        for code in isos {
            write!(w, " ")?;
            write_cfg(w, &feature)?;
            writeln!(w, r#" b"{code}" => Some(Self::{name}),"#)?;
        }
    }
    writeln!(w, " _ => None,")?;
    writeln!(w, " }}")?;
    writeln!(w, " }}")?;
    writeln!(w)?;

    // Default left/right minimums.
    writeln!(w, " /// The default number of chars to each side between")?;
    writeln!(w, " /// which breaking is forbidden.")?;
    writeln!(w, " ///")?;
    writeln!(w, " /// This follows typographic conventions.")?;
    writeln!(w, " pub fn bounds(self) -> (usize, usize) {{")?;
    writeln!(w, " match self {{")?;
    for (name, .., lmin, rmin) in languages {
        let feature = name.to_lowercase();
        write!(w, " ")?;
        write_cfg(w, &feature)?;
        writeln!(w, " Self::{name} => ({lmin}, {rmin}),")?;
    }
    writeln!(w, " }}")?;
    writeln!(w, " }}")?;
    writeln!(w)?;

    // Joining character: only the exceptions get explicit arms, everything
    // else is covered by the wildcard arm.
    writeln!(w, " /// The default character used to join syllables.")?;
    writeln!(w, " ///")?;
    writeln!(w, " /// Returns `Some('\\u{{ad}}')` (SOFT HYPHEN) for most languages, but `None`")?;
    writeln!(
        w,
        " /// for Indic scripts where visual hyphenation is not conventional."
    )?;
    writeln!(w, " pub fn hyphenation_character(self) -> Option<char> {{")?;
    writeln!(w, " match self {{")?;
    for &(name, _, _, script, ..) in languages {
        if !is_indic_script(script) {
            continue;
        }
        let feature = name.to_lowercase();
        write!(w, " ")?;
        write_cfg(w, &feature)?;
        writeln!(w, " Self::{name} => None,")?;
    }
    writeln!(w, " _ => Some('\\u{{ad}}'),")?;
    writeln!(w, " }}")?;
    writeln!(w, " }}")?;
    writeln!(w)?;

    // Embedded trie data per language.
    writeln!(w, " fn root(self) -> State<'static> {{")?;
    writeln!(w, " match self {{")?;
    for (name, iso, ..) in languages {
        let feature = name.to_lowercase();
        write!(w, " ")?;
        write_cfg(w, &feature)?;
        write!(w, " Self::{name} => State::root(")?;
        writeln!(w, "include_bytes!(\"../tries/{iso}.bin\")),")?;
    }
    writeln!(w, " }}")?;
    writeln!(w, " }}")?;
    writeln!(w, "}}")
}
/// Reports whether `script` is one of the four-letter script codes this
/// generator treats as Indic.
///
/// The comparison is exact and case-sensitive.
fn is_indic_script(script: &str) -> bool {
    const INDIC_SCRIPTS: [&str; 9] = [
        "Beng", "Deva", "Gujr", "Guru", "Knda", "Mlym", "Orya", "Taml", "Telu",
    ];
    INDIC_SCRIPTS.contains(&script)
}
/// Appends the line `#[cfg(feature = "<feature>")]`, including the trailing
/// newline, to `w`.
///
/// Always returns `Ok(())`.
fn write_cfg(w: &mut String, feature: &str) -> fmt::Result {
    w.push_str("#[cfg(feature = \"");
    w.push_str(feature);
    w.push_str("\")]\n");
    Ok(())
}
/// Extracts the patterns from every `\patterns{...}` block in `tex` and
/// passes them to `f` one at a time, in order of appearance.
///
/// Inside a block, patterns are separated by whitespace. A `%` starts a
/// comment that runs to the end of the line, both inside and outside of
/// blocks. Everything else outside of `\patterns{...}` blocks is ignored.
///
/// A block that is still open when the input ends is treated as if it were
/// closed there, so truncated input terminates instead of looping forever.
pub fn parse<F>(tex: &str, mut f: F)
where
    F: FnMut(&str),
{
    let mut s = Scanner(tex);
    while let Some(c) = s.eat() {
        match c {
            // Comment outside of a pattern block.
            '%' => {
                s.eat_while(|c| c != '\n');
            }
            '\\' if s.eat_if("patterns{") => loop {
                let pat = s.eat_while(|c| c != '}' && c != '%' && !c.is_whitespace());
                if !pat.is_empty() {
                    f(pat);
                }

                // Inspect the character that ended the pattern.
                match s.eat() {
                    // End of block, or end of input inside an unterminated
                    // block: nothing more can follow.
                    Some('}') | None => break,
                    // Comment inside the block.
                    Some('%') => {
                        s.eat_while(|c| c != '\n');
                    }
                    // Whitespace between patterns.
                    Some(_) => {
                        s.eat_while(char::is_whitespace);
                    }
                }
            },
            _ => {}
        }
    }
}
/// A minimal cursor over a string slice; the wrapped slice is the text that
/// has not been consumed yet.
struct Scanner<'a>(&'a str);

impl<'a> Scanner<'a> {
    /// Consumes and returns the next character, or returns `None` (consuming
    /// nothing) when no text is left.
    fn eat(&mut self) -> Option<char> {
        let mut rest = self.0.chars();
        let first = rest.next()?;
        self.0 = rest.as_str();
        Some(first)
    }

    /// Consumes `pat` if the remaining text starts with it and reports
    /// whether it did.
    fn eat_if(&mut self, pat: &str) -> bool {
        match self.0.strip_prefix(pat) {
            Some(rest) => {
                self.0 = rest;
                true
            }
            None => false,
        }
    }

    /// Consumes the longest prefix whose characters all satisfy `f` and
    /// returns it; the result is empty when the first character fails `f`
    /// or no text is left.
    fn eat_while(&mut self, f: fn(char) -> bool) -> &'a str {
        let text: &'a str = self.0;
        // Byte index of the first character rejected by `f`, or the end.
        let split = text.find(|c: char| !f(c)).unwrap_or(text.len());
        let (head, tail) = text.split_at(split);
        self.0 = tail;
        head
    }
}
/// Builds a byte-level trie from hyphenation patterns and encodes it into a
/// compact binary form.
struct TrieBuilder {
    /// Index of the root node within `nodes`. Starts at 0 and is updated by
    /// `compress`.
    root: usize,
    /// All nodes of the trie; transitions refer to nodes by index into this
    /// vector.
    nodes: Vec<Node>,
    /// Pool of `(distance, level)` pairs shared by all patterns. The level is
    /// the value of a digit in a pattern and the distance is the number of
    /// non-digit bytes between it and the previous digit (or the pattern
    /// start). Nodes refer to a sub-range of this pool.
    levels: Vec<(usize, u8)>,
}
/// A single trie node. Hashing and equality cover all fields, which is what
/// `TrieBuilder::compress_node` uses to merge identical nodes.
#[derive(Debug, Default, Clone, PartialEq, Eq, Hash)]
struct Node {
    /// Bytes that label the outgoing transitions.
    trans: Vec<u8>,
    /// Destination node index for each transition, parallel to `trans`.
    targets: Vec<usize>,
    /// `(offset, len)` of this node's levels within the builder's level pool,
    /// or `None` if no pattern ends at this node.
    levels: Option<(usize, usize)>,
}
impl TrieBuilder {
    /// Creates a builder whose trie consists of a single empty root node.
    fn new() -> Self {
        Self {
            root: 0,
            nodes: vec![Node::default()],
            levels: Vec::new(),
        }
    }

    /// Inserts one pattern into the trie.
    ///
    /// ASCII digits are levels and do not create transitions; every other
    /// byte is followed (or added) as a transition. The node reached at the
    /// end records where the pattern's levels live in the shared level pool.
    /// Inserting a pattern that ends at an already used node replaces that
    /// node's levels.
    fn insert(&mut self, pattern: &str) {
        let mut current = 0;
        // Non-digit bytes seen since the previous digit.
        let mut gap = 0;
        let mut pattern_levels: Vec<(usize, u8)> = Vec::new();

        for byte in pattern.bytes() {
            if byte.is_ascii_digit() {
                pattern_levels.push((gap, byte - b'0'));
                gap = 0;
                continue;
            }

            // Index a newly created node would get.
            let fresh = self.nodes.len();
            let node = &mut self.nodes[current];
            match node.trans.iter().position(|&t| t == byte) {
                Some(slot) => current = node.targets[slot],
                None => {
                    node.trans.push(byte);
                    node.targets.push(fresh);
                    self.nodes.push(Node::default());
                    current = fresh;
                }
            }
            gap += 1;
        }

        // Reuse the first run in the pool that starts with this pattern's
        // levels; append them only when no such run exists.
        let pool_len = self.levels.len();
        let offset = (0..pool_len)
            .find(|&start| self.levels[start..].starts_with(&pattern_levels))
            .unwrap_or(pool_len);
        if offset == pool_len {
            self.levels.extend_from_slice(&pattern_levels);
        }

        self.nodes[current].levels = Some((offset, pattern_levels.len()));
    }

    /// Merges identical nodes, replacing `nodes` with the deduplicated set
    /// and updating `root` accordingly.
    fn compress(&mut self) {
        let mut seen = HashMap::new();
        let mut compacted = Vec::new();
        self.root = self.compress_node(0, &mut seen, &mut compacted);
        self.nodes = compacted;
    }

    /// Compresses the node at index `node` and, recursively, everything
    /// reachable from it.
    ///
    /// Children are handled first so that a node's targets already refer to
    /// indices in `new`; the rewritten node is then looked up in `map` and
    /// only appended to `new` when no equal node exists yet. Returns the
    /// node's index in `new`.
    fn compress_node(
        &self,
        node: usize,
        map: &mut HashMap<Node, usize>,
        new: &mut Vec<Node>,
    ) -> usize {
        let mut rewritten = self.nodes[node].clone();
        for target in &mut rewritten.targets {
            *target = self.compress_node(*target, map, new);
        }

        if let Some(&existing) = map.get(&rewritten) {
            return existing;
        }

        let idx = new.len();
        new.push(rewritten.clone());
        map.insert(rewritten, idx);
        idx
    }

    /// Serializes the trie.
    ///
    /// Layout, in order:
    ///
    /// 1. the root node's address as a big-endian `u32`,
    /// 2. one byte per pooled level, `distance * 10 + level`,
    /// 3. every node: a header byte (bit 7: has levels, bits 5-6: stride,
    ///    bits 0-4: transition count capped at 31), an extra count byte when
    ///    the count is 31 or more, two bytes with the level offset (12 bits)
    ///    and level count (4 bits) when the node has levels, the transition
    ///    bytes, and finally one `stride`-byte delta per target, relative to
    ///    the node's own address.
    ///
    /// # Panics
    ///
    /// Panics if a value does not fit its encoding: a level distance above
    /// 24, a level above 9, a level offset of 4096 or more, 16 or more levels
    /// on one node, more than 255 transitions, a root address beyond `u32`,
    /// or an address delta that needs more than three bytes.
    fn encode(&self) -> Vec<u8> {
        // Encoded size of a node whose target deltas take `stride` bytes each.
        let encoded_len = |node: &Node, stride: usize| -> usize {
            let count = node.trans.len();
            1 + usize::from(count >= 31)
                + 2 * usize::from(node.levels.is_some())
                + (1 + stride) * count
        };

        let start = 4 + self.levels.len();

        // First pass: address estimates assuming the widest stride for every
        // node. The real strides depend on the addresses and vice versa, so
        // the estimates are used to pick the strides.
        let mut estimates = Vec::with_capacity(self.nodes.len());
        let mut cursor = start;
        for node in &self.nodes {
            estimates.push(cursor);
            cursor += encoded_len(node, 3);
        }

        // Second pass: choose each node's stride from the estimated deltas
        // and derive the final addresses.
        let mut addrs = Vec::with_capacity(self.nodes.len());
        let mut strides = Vec::with_capacity(self.nodes.len());
        let mut cursor = start;
        for (node, &estimate) in self.nodes.iter().zip(&estimates) {
            let stride = node
                .targets
                .iter()
                .map(|&t| how_many_bytes(estimates[t] as isize - estimate as isize))
                .max()
                .unwrap_or(1);
            addrs.push(cursor);
            strides.push(stride);
            cursor += encoded_len(node, stride);
        }

        let mut data = Vec::with_capacity(cursor);

        // Root address.
        data.extend(u32::try_from(addrs[self.root]).unwrap().to_be_bytes());

        // Level pool.
        for &(dist, level) in &self.levels {
            assert!(dist <= 24, "too high level distance");
            assert!(level < 10, "too high level");
            data.push(dist as u8 * 10 + level);
        }

        // Nodes.
        for (i, node) in self.nodes.iter().enumerate() {
            let addr = addrs[i];
            let stride = strides[i];
            let count = node.trans.len();

            let has_levels = u8::from(node.levels.is_some());
            data.push(has_levels << 7 | (stride as u8) << 5 | (count.min(31) as u8));

            if count >= 31 {
                data.push(u8::try_from(count).expect("too many transitions"));
            }

            if let Some((offset, len)) = node.levels {
                // Level offsets are stored relative to the start of the data,
                // i.e. after the four root-address bytes.
                let offset = 4 + offset;
                assert!(offset < 4096, "too high level offset");
                assert!(len < 16, "too high level count");
                data.push((offset >> 4) as u8);
                data.push(((offset & 15) << 4) as u8 | len as u8);
            }

            data.extend(&node.trans);

            for &target in &node.targets {
                let delta = addrs[target] as isize - addr as isize;
                to_be_bytes(&mut data, delta, stride);
            }
        }

        data
    }
}
/// Returns how many bytes (1, 2 or 3) are needed to store the signed number
/// `num`.
///
/// # Panics
///
/// Panics with "too large number" if `num` lies outside of the signed 24-bit
/// range.
fn how_many_bytes(num: isize) -> usize {
    match num {
        // Fits an `i8`.
        -0x80..=0x7f => 1,
        // Fits an `i16`.
        -0x8000..=0x7fff => 2,
        // Fits 24 signed bits.
        -0x80_0000..=0x7f_ffff => 3,
        _ => panic!("too large number"),
    }
}
/// Appends the signed number `num` to `buf` using exactly `stride` bytes in
/// big-endian order.
///
/// Strides 1 and 2 store `num` as a two's complement `i8` / `i16`. Stride 3
/// stores `num + 2^23` as an unsigned 24-bit value.
///
/// # Panics
///
/// Panics if `num` does not fit into `stride` bytes or if `stride` is not 1,
/// 2 or 3. The three-byte case is checked explicitly so that it fails like
/// the other widths instead of silently wrapping.
fn to_be_bytes(buf: &mut Vec<u8>, num: isize, stride: usize) {
    match stride {
        1 => buf.extend(i8::try_from(num).unwrap().to_be_bytes()),
        2 => buf.extend(i16::try_from(num).unwrap().to_be_bytes()),
        3 => {
            const BIAS: isize = 1 << 23;
            assert!(
                (-BIAS..BIAS).contains(&num),
                "number does not fit into three bytes"
            );
            // In range, so the biased value lies in `0..2^24` and the cast
            // is lossless; drop the (zero) most significant byte.
            let biased = (num + BIAS) as u32;
            buf.extend_from_slice(&biased.to_be_bytes()[1..]);
        }
        _ => panic!("invalid stride"),
    }
}