use std::collections::BTreeMap;
use std::env;
use std::fs;
use std::io::Write;
use std::path::Path;
/// One pronunciation of a word, as collected by `main` while parsing the
/// dictionary data file.
struct BuildPronunciation {
/// Rust source expressions for the phonemes, in spoken order, exactly as
/// returned by `arpabet_to_phoneme_expr` (e.g. `Phoneme::VowelAsh`).
phoneme_exprs: Vec<String>,
/// Value of a preceding `@freq=` annotation, or `None` when the entry had none.
frequency: Option<f32>,
/// Raw code from a preceding `@region=` annotation (e.g. `GA`), or `None`.
/// It is translated into a `Region::…` expression by `region_to_expr` when
/// the entry is written out.
region: Option<String>,
}
/// All pronunciations gathered for a single (lower-cased) dictionary word.
struct BuildEntry {
/// Pronunciations in the order their lines appear in the data file.
pronunciations: Vec<BuildPronunciation>,
}
/// Build-script entry point.
///
/// Reads `data/cmudict-5k.txt`, parses it into per-word pronunciation lists
/// and writes `generated_dict.rs` into `OUT_DIR`. When the `phf` feature is
/// enabled it additionally calls `generate_phf_dict`.
///
/// # Panics
///
/// Panics if the data file cannot be read or is malformed, if `OUT_DIR` is
/// not set, or if the output file cannot be created or written.
fn main() {
    println!("cargo::rerun-if-changed=data/cmudict-5k.txt");
    let data =
        fs::read_to_string("data/cmudict-5k.txt").expect("failed to read data/cmudict-5k.txt");
    let entries = parse_dictionary(&data);

    let out_dir = env::var("OUT_DIR").unwrap();
    let dest = Path::new(&out_dir).join("generated_dict.rs");
    let mut out = fs::File::create(&dest).expect("failed to create generated_dict.rs");
    let num_batches = write_generated_dict(&mut out, &entries);

    let total_prons: usize = entries.values().map(|e| e.pronunciations.len()).sum();
    eprintln!(
        "shabdakosh build: generated {} entries ({} pronunciations, {} batches)",
        entries.len(),
        total_prons,
        num_batches,
    );

    // --- PHF static dictionary (when phf feature is enabled) ---
    #[cfg(feature = "phf")]
    generate_phf_dict(&entries, &out_dir);
}

/// Parses the dictionary text into a word -> pronunciations map.
///
/// Blank lines are skipped. Lines starting with `;;;` are comments; any
/// `@freq=<f32>` / `@region=<code>` tokens inside them are remembered and
/// attached to the next pronunciation line, after which they are cleared.
/// Every other line is `<word> <phonemes...>`; anything from the first `(`
/// in the word onward is dropped and the word is lower-cased.
///
/// # Panics
///
/// Panics (with the 1-based line number) on an unparsable or non-finite
/// `@freq` value, a line without a separator, a line without phonemes, or an
/// ARPABET symbol that `arpabet_to_phoneme_expr` does not know.
fn parse_dictionary(data: &str) -> BTreeMap<String, BuildEntry> {
    let mut entries: BTreeMap<String, BuildEntry> = BTreeMap::new();
    let mut pending_freq: Option<f32> = None;
    let mut pending_region: Option<String> = None;

    for (idx, raw_line) in data.lines().enumerate() {
        let line_no = idx + 1;
        let line = raw_line.trim();
        if line.is_empty() {
            continue;
        }

        if let Some(comment) = line.strip_prefix(";;;") {
            for token in comment.split_whitespace() {
                if let Some(val) = token.strip_prefix("@freq=") {
                    // `f32::from_str` accepts "nan"/"inf"/"infinity"; those would be
                    // emitted as `NaN_f32` / `inf_f32`, which are not valid literals
                    // in the generated file, so reject them here with a line number.
                    let freq = val
                        .parse::<f32>()
                        .ok()
                        .filter(|f| f.is_finite())
                        .unwrap_or_else(|| panic!("line {line_no}: invalid @freq value: {val}"));
                    pending_freq = Some(freq);
                } else if let Some(val) = token.strip_prefix("@region=") {
                    pending_region = Some(val.to_string());
                }
            }
            continue;
        }

        // NOTE(review): the message talks about a two-space separator but the
        // split is on a single space; phonemes are re-split on whitespace below,
        // so both layouts parse. Confirm which format is intended.
        let Some((word_part, phonemes_str)) = line.split_once(" ") else {
            panic!("line {line_no}: missing two-space separator: {line}");
        };

        // `split` always yields at least one piece, so the fallback is never
        // taken in practice; it only avoids an `unwrap`.
        let headword = word_part.trim();
        let word = headword
            .split('(')
            .next()
            .unwrap_or(headword)
            .to_lowercase();

        let phoneme_exprs: Vec<String> = phonemes_str
            .split_whitespace()
            .map(|sym| arpabet_to_phoneme_expr(sym, line_no))
            .collect();
        if phoneme_exprs.is_empty() {
            panic!("line {line_no}: no phonemes for word '{word}'");
        }

        // `take()` consumes the pending annotations so they apply to this
        // pronunciation only.
        let pronunciation = BuildPronunciation {
            phoneme_exprs,
            frequency: pending_freq.take(),
            region: pending_region.take(),
        };
        entries
            .entry(word)
            .or_insert_with(|| BuildEntry {
                pronunciations: Vec::new(),
            })
            .pronunciations
            .push(pronunciation);
    }

    entries
}

/// Writes the generated dictionary source to `out` and returns the number of
/// batch functions emitted.
///
/// Entries are split into `generated_batch_N` functions of at most
/// `BATCH_SIZE` inserts each, followed by `generated_english_entries`, which
/// builds the map and calls every batch in order.
fn write_generated_dict(out: &mut fs::File, entries: &BTreeMap<String, BuildEntry>) -> usize {
    const BATCH_SIZE: usize = 500;

    writeln!(
        out,
        "// Auto-generated by build.rs from data/cmudict-5k.txt"
    )
    .unwrap();
    writeln!(
        out,
        "// DO NOT EDIT — changes will be overwritten on next build."
    )
    .unwrap();
    writeln!(out, "//").unwrap();
    writeln!(out, "// {} entries", entries.len()).unwrap();
    writeln!(out).unwrap();

    // Repeated at the top of every batch function body.
    let imports = [
        "use alloc::string::String;",
        "use alloc::vec;",
        "use svara::phoneme::Phoneme;",
        "#[allow(unused_imports)]",
        "use crate::dictionary::entry::{DictEntry, Pronunciation, Region};",
    ]
    .join("\n");

    // Collect entries into a Vec so we can chunk them.
    let entry_vec: Vec<_> = entries.iter().collect();
    let num_batches = entry_vec.chunks(BATCH_SIZE).len();

    // Write each batch function.
    for (batch_idx, chunk) in entry_vec.chunks(BATCH_SIZE).enumerate() {
        writeln!(
            out,
            "fn generated_batch_{batch_idx}(m: &mut hashbrown::HashMap<alloc::string::String, crate::dictionary::entry::DictEntry>) {{"
        )
        .unwrap();
        writeln!(out, "{imports}").unwrap();
        for &(word, entry) in chunk {
            write_entry(out, word, entry);
        }
        writeln!(out, "}}").unwrap();
        writeln!(out).unwrap();
    }

    // Write the top-level function that calls all batches.
    writeln!(
        out,
        "fn generated_english_entries() -> hashbrown::HashMap<alloc::string::String, crate::dictionary::entry::DictEntry> {{"
    )
    .unwrap();
    writeln!(
        out,
        " let mut m = hashbrown::HashMap::with_capacity({});",
        entries.len()
    )
    .unwrap();
    for batch_idx in 0..num_batches {
        writeln!(out, " generated_batch_{batch_idx}(&mut m);").unwrap();
    }
    writeln!(out, " m").unwrap();
    writeln!(out, "}}").unwrap();

    num_batches
}
/// Writes `generated_phf_dict.rs` into `out_dir` (only built with the `phf`
/// feature).
///
/// For every word the file contains one `PHF_PH_<NAME>_<i>` phoneme slice per
/// pronunciation and one `PHF_PR_<NAME>` slice of `StaticPronunciation`s,
/// followed by a single `PHF_ENGLISH_DICT` map built with `phf_codegen`.
///
/// # Panics
///
/// Panics if the file cannot be created or written, or if a pronunciation
/// carries a region code that `region_to_expr` rejects.
#[cfg(feature = "phf")]
fn generate_phf_dict(entries: &BTreeMap<String, BuildEntry>, out_dir: &str) {
    let dest = Path::new(out_dir).join("generated_phf_dict.rs");
    let mut out = fs::File::create(&dest).expect("failed to create generated_phf_dict.rs");

    // File header, matching the one written to `generated_dict.rs`. Both lines
    // must be emitted as `//` comments so the generated file stays valid Rust.
    writeln!(
        out,
        "// Auto-generated by build.rs from data/cmudict-5k.txt"
    )
    .unwrap();
    writeln!(
        out,
        "// DO NOT EDIT — changes will be overwritten on next build."
    )
    .unwrap();
    writeln!(out).unwrap();

    for (word, entry) in entries {
        // Turn the word into an upper-case identifier fragment.
        // NOTE(review): only `-` and `'` are replaced; any other character that
        // is not valid in an identifier would produce uncompilable output, and
        // words differing only in those two characters would collide.
        let safe_name = word.replace(['-', '\''], "_").to_uppercase();

        // One static phoneme slice per pronunciation.
        for (i, pron) in entry.pronunciations.iter().enumerate() {
            let phonemes: Vec<String> = pron
                .phoneme_exprs
                .iter()
                .map(|e| format!("svara::phoneme::{e}"))
                .collect();
            writeln!(
                out,
                "static PHF_PH_{safe_name}_{i}: &[svara::phoneme::Phoneme] = &[{}];",
                phonemes.join(", ")
            )
            .unwrap();
        }

        // One static slice tying the phoneme slices to their metadata.
        let pron_entries: Vec<String> = entry
            .pronunciations
            .iter()
            .enumerate()
            .map(|(i, pron)| {
                let freq = match pron.frequency {
                    Some(f) => format!("Some({f}_f32)"),
                    None => "None".to_string(),
                };
                let region = match &pron.region {
                    Some(r) => {
                        let expr = region_to_expr(r);
                        format!("Some(crate::dictionary::entry::{expr})")
                    }
                    None => "None".to_string(),
                };
                format!(
                    "crate::dictionary::static_dict::StaticPronunciation {{ phonemes: PHF_PH_{safe_name}_{i}, frequency: {freq}, region: {region} }}"
                )
            })
            .collect();
        writeln!(
            out,
            "static PHF_PR_{safe_name}: &[crate::dictionary::static_dict::StaticPronunciation] = &[{}];",
            pron_entries.join(", ")
        )
        .unwrap();
    }
    writeln!(out).unwrap();

    // Map each word to a `StaticEntry` referencing its `PHF_PR_*` slice.
    let mut map = phf_codegen::Map::new();
    for word in entries.keys() {
        let safe_name = word.replace(['-', '\''], "_").to_uppercase();
        map.entry(
            word.as_str(),
            &format!(
                "crate::dictionary::static_dict::StaticEntry {{ pronunciations: PHF_PR_{safe_name} }}"
            ),
        );
    }
    writeln!(
        out,
        "static PHF_ENGLISH_DICT: phf::Map<&'static str, crate::dictionary::static_dict::StaticEntry> = {};",
        map.build()
    )
    .unwrap();

    eprintln!(
        "shabdakosh build: generated PHF dictionary ({} entries)",
        entries.len()
    );
}
/// Emits the `m.insert(...)` statement for one dictionary word.
///
/// A word with exactly one pronunciation is written with `DictEntry::new`;
/// any other count goes through `DictEntry::from_pronunciations(vec![...])`.
/// Each pronunciation is rendered as `Pronunciation::new(vec![...])` with
/// optional `.with_frequency(..)` / `.with_region(..)` calls appended.
///
/// # Panics
///
/// Panics if writing to `out` fails or if a region code is rejected by
/// `region_to_expr`.
fn write_entry(out: &mut fs::File, word: &str, entry: &BuildEntry) {
    // Render every pronunciation up front so both output shapes share the
    // same builder-chain text.
    let rendered: Vec<String> = entry
        .pronunciations
        .iter()
        .map(|pron| {
            let mut chain = format!("Pronunciation::new(vec![{}])", pron.phoneme_exprs.join(", "));
            if let Some(freq) = pron.frequency {
                chain.push_str(&format!(".with_frequency({freq}_f32)"));
            }
            if let Some(code) = pron.region.as_deref() {
                chain.push_str(&format!(".with_region({})", region_to_expr(code)));
            }
            chain
        })
        .collect();

    match rendered.as_slice() {
        [only] => writeln!(
            out,
            " m.insert(String::from({word:?}), DictEntry::new({only}));"
        )
        .unwrap(),
        several => writeln!(
            out,
            " m.insert(String::from({word:?}), DictEntry::from_pronunciations(vec![{}]).expect(\"non-empty\"));",
            several.join(", ")
        )
        .unwrap(),
    }
}
/// Translates a region code from an `@region=` annotation into the source
/// text of the matching `Region` variant.
///
/// # Panics
///
/// Panics with `unknown region code: <code>` for anything other than `GA`
/// or `RP`.
fn region_to_expr(code: &str) -> &'static str {
    // (annotation code, generated expression)
    const REGION_EXPRS: &[(&str, &str)] = &[
        ("GA", "Region::GeneralAmerican"),
        ("RP", "Region::ReceivedPronunciation"),
    ];

    REGION_EXPRS
        .iter()
        .find(|pair| pair.0 == code)
        .map(|pair| pair.1)
        .unwrap_or_else(|| panic!("unknown region code: {code}"))
}
/// Converts one ARPABET symbol (optionally ending in a stress digit) into the
/// source text of a `Phoneme` variant, e.g. `"AE1"` -> `"Phoneme::VowelAsh"`.
///
/// The stress digit is removed with `strip_stress` before lookup. It only
/// affects `AH`: stress `0` yields `VowelSchwa`, anything else `VowelCupV`.
///
/// `line` is the 1-based data-file line, used solely for the panic message.
///
/// # Panics
///
/// Panics with `line <line>: unknown ARPABET symbol '<symbol>'` when the
/// stripped symbol is not in either table.
fn arpabet_to_phoneme_expr(symbol: &str, line: usize) -> String {
    /// Vowel and diphthong symbols.
    fn vowel(base: &str, unstressed: bool) -> Option<&'static str> {
        let variant = match base {
            "AA" => "VowelOpenA",
            "AE" => "VowelAsh",
            "AH" => {
                if unstressed {
                    "VowelSchwa"
                } else {
                    "VowelCupV"
                }
            }
            "AO" => "VowelOpenO",
            "AW" => "DiphthongAU",
            "AY" => "DiphthongAI",
            "EH" => "VowelOpenE",
            "ER" => "VowelBird",
            "EY" => "DiphthongEI",
            "IH" => "VowelNearI",
            "IY" => "VowelE",
            "OW" => "DiphthongOU",
            "OY" => "DiphthongOI",
            "UH" => "VowelNearU",
            "UW" => "VowelU",
            _ => return None,
        };
        Some(variant)
    }

    /// Consonant symbols.
    fn consonant(base: &str) -> Option<&'static str> {
        let variant = match base {
            "B" => "PlosiveB",
            "CH" => "AffricateCh",
            "D" => "PlosiveD",
            "DH" => "FricativeDh",
            "F" => "FricativeF",
            "G" => "PlosiveG",
            "HH" => "FricativeH",
            "JH" => "AffricateJ",
            "K" => "PlosiveK",
            "L" => "LateralL",
            "M" => "NasalM",
            "N" => "NasalN",
            "NG" => "NasalNg",
            "P" => "PlosiveP",
            "R" => "ApproximantR",
            "S" => "FricativeS",
            "SH" => "FricativeSh",
            "T" => "PlosiveT",
            "TH" => "FricativeTh",
            "V" => "FricativeV",
            "W" => "ApproximantW",
            "Y" => "ApproximantJ",
            "Z" => "FricativeZ",
            "ZH" => "FricativeZh",
            _ => return None,
        };
        Some(variant)
    }

    let (base, stress) = strip_stress(symbol);
    match vowel(base, stress == Some(0)).or_else(|| consonant(base)) {
        Some(variant) => format!("Phoneme::{variant}"),
        None => panic!("line {line}: unknown ARPABET symbol '{symbol}'"),
    }
}
/// Splits a trailing stress digit off an ARPABET symbol.
///
/// Returns the symbol without its final `0`, `1` or `2` together with that
/// digit's value. If the symbol is empty or ends in anything else, it is
/// returned unchanged with `None`. Only the last character is examined.
fn strip_stress(symbol: &str) -> (&str, Option<u8>) {
    for (digit, level) in [('0', 0u8), ('1', 1), ('2', 2)] {
        if let Some(base) = symbol.strip_suffix(digit) {
            return (base, Some(level));
        }
    }
    (symbol, None)
}