shabdakosh 2.0.0

shabdakosh — Pronunciation dictionary with ARPABET/CMUdict support for svara phonemes
Documentation
//! Build script for shabdakosh: generates the English pronunciation dictionary
//! from `data/cmudict-5k.txt` at compile time.
//!
//! The ARPABET-to-Phoneme mapping here mirrors `src/arpabet.rs` — keep both in sync.
//!
//! ## Data file format
//!
//! - `WORD  PH1 PH2 PH3` — single pronunciation (two-space separator)
//! - `WORD(n)  PH1 PH2` — variant pronunciation (n >= 2)
//! - `;;; @freq=0.85` — frequency annotation for the next entry line
//! - `;;; @region=GA` — region annotation for the next entry line (GA or RP)
//! - `;;;` — regular comment (ignored)

use std::collections::BTreeMap;
use std::env;
use std::fs;
use std::io::Write;
use std::path::Path;

/// One pronunciation parsed from a single entry line of the data file.
struct BuildPronunciation {
    /// Rust expression strings (e.g. `Phoneme::VowelAsh`), one per ARPABET
    /// symbol on the line, in order, as produced by `arpabet_to_phoneme_expr`.
    phoneme_exprs: Vec<String>,
    /// Value of a preceding `;;; @freq=` annotation, if one was pending.
    frequency: Option<f32>,
    /// Raw code of a preceding `;;; @region=` annotation (e.g. `GA`, `RP`),
    /// if one was pending. Translated to Rust by `region_to_expr` at emit time.
    region: Option<String>,
}

/// All pronunciations collected for one lowercased headword.
struct BuildEntry {
    /// Pronunciations in the order their lines appear in the data file;
    /// `WORD` and `WORD(n)` lines for the same word land in the same entry.
    pronunciations: Vec<BuildPronunciation>,
}

/// Build-script entry point.
///
/// Reads `data/cmudict-5k.txt`, parses it into per-word entries, and writes
/// `$OUT_DIR/generated_dict.rs` containing batched `generated_batch_N`
/// functions plus a `generated_english_entries` function that calls them all.
/// When the `phf` feature is enabled, also generates the static PHF dictionary.
///
/// # Panics
///
/// Panics (failing the build) if the data file cannot be read, is malformed,
/// or the output file cannot be created or written.
fn main() {
    println!("cargo::rerun-if-changed=data/cmudict-5k.txt");

    let data =
        fs::read_to_string("data/cmudict-5k.txt").expect("failed to read data/cmudict-5k.txt");

    let entries = parse_dictionary(&data);

    // Generate Rust source.
    //
    // Split inserts across batched helper functions to keep LLVM memory usage
    // manageable on CI runners with limited RAM (GitHub ubuntu = 7 GB).
    const BATCH_SIZE: usize = 500;

    let out_dir = env::var("OUT_DIR").expect("OUT_DIR is set by cargo for build scripts");
    let dest = Path::new(&out_dir).join("generated_dict.rs");
    let mut out = fs::File::create(&dest).expect("failed to create generated_dict.rs");

    writeln!(
        out,
        "// Auto-generated by build.rs from data/cmudict-5k.txt"
    )
    .unwrap();
    writeln!(
        out,
        "// DO NOT EDIT — changes will be overwritten on next build."
    )
    .unwrap();
    writeln!(out, "//").unwrap();
    writeln!(out, "// {} entries", entries.len()).unwrap();
    writeln!(out).unwrap();

    // Common imports used by every batch function.
    let imports = "\
    use alloc::string::String;\n\
    use alloc::vec;\n\
    use svara::phoneme::Phoneme;\n\
    #[allow(unused_imports)]\n\
    use crate::dictionary::entry::{DictEntry, Pronunciation, Region};";

    // Collect entries into a Vec so we can chunk them.
    let entry_vec: Vec<_> = entries.iter().collect();
    let num_batches = entry_vec.chunks(BATCH_SIZE).len();

    // Write each batch function.
    for (batch_idx, chunk) in entry_vec.chunks(BATCH_SIZE).enumerate() {
        writeln!(
            out,
            "fn generated_batch_{batch_idx}(m: &mut hashbrown::HashMap<alloc::string::String, crate::dictionary::entry::DictEntry>) {{"
        )
        .unwrap();
        writeln!(out, "{imports}").unwrap();

        for (word, entry) in chunk {
            write_entry(&mut out, word, entry);
        }

        writeln!(out, "}}").unwrap();
        writeln!(out).unwrap();
    }

    // Write the top-level function that calls all batches.
    writeln!(
        out,
        "fn generated_english_entries() -> hashbrown::HashMap<alloc::string::String, crate::dictionary::entry::DictEntry> {{"
    )
    .unwrap();
    writeln!(
        out,
        "    let mut m = hashbrown::HashMap::with_capacity({});",
        entries.len()
    )
    .unwrap();
    for batch_idx in 0..num_batches {
        writeln!(out, "    generated_batch_{batch_idx}(&mut m);").unwrap();
    }
    writeln!(out, "    m").unwrap();
    writeln!(out, "}}").unwrap();

    let total_prons: usize = entries.values().map(|e| e.pronunciations.len()).sum();
    eprintln!(
        "shabdakosh build: generated {} entries ({} pronunciations, {} batches)",
        entries.len(),
        total_prons,
        num_batches,
    );

    // --- PHF static dictionary (when phf feature is enabled) ---
    #[cfg(feature = "phf")]
    generate_phf_dict(&entries, &out_dir);
}

/// Parses the CMUdict-style data file contents into entries keyed by the
/// lowercased headword (sorted, because the map is a `BTreeMap`).
///
/// `;;; @freq=` / `;;; @region=` annotations are remembered and attached to
/// the next entry line; other `;;;` comments and blank lines are skipped.
///
/// # Panics
///
/// Panics with the 1-based line number when an `@freq` value is not a finite
/// number, an entry line lacks the two-space separator, an entry has no
/// phonemes, or an ARPABET symbol is unknown.
fn parse_dictionary(data: &str) -> BTreeMap<String, BuildEntry> {
    let mut entries: BTreeMap<String, BuildEntry> = BTreeMap::new();
    let mut pending_freq: Option<f32> = None;
    let mut pending_region: Option<String> = None;

    for (idx, raw_line) in data.lines().enumerate() {
        let line_no = idx + 1;
        let line = raw_line.trim();
        if line.is_empty() {
            continue;
        }

        // Handle comment lines (may contain annotations)
        if let Some(comment) = line.strip_prefix(";;;") {
            for token in comment.split_whitespace() {
                if let Some(val) = token.strip_prefix("@freq=") {
                    // `parse::<f32>` accepts "NaN" and "inf"; those would be
                    // emitted as `NaN_f32` / `inf_f32`, which are not valid
                    // Rust literals, so reject them here with a line number.
                    let freq = val
                        .parse::<f32>()
                        .ok()
                        .filter(|f| f.is_finite())
                        .unwrap_or_else(|| {
                            panic!("line {line_no}: invalid @freq value: {val}")
                        });
                    pending_freq = Some(freq);
                } else if let Some(val) = token.strip_prefix("@region=") {
                    pending_region = Some(val.to_string());
                }
            }
            continue;
        }

        // Parse entry line: WORD  PH1 PH2 or WORD(n)  PH1 PH2
        let Some((word_part, phonemes_str)) = line.split_once("  ") else {
            panic!("line {line_no}: missing two-space separator: {line}");
        };

        // Strip (n) variant suffix to get base word
        let word_part = word_part.trim();
        let word = word_part
            .split('(')
            .next()
            .unwrap_or(word_part)
            .to_lowercase();

        let phoneme_exprs: Vec<String> = phonemes_str
            .split_whitespace()
            .map(|sym| arpabet_to_phoneme_expr(sym, line_no))
            .collect();

        if phoneme_exprs.is_empty() {
            panic!("line {line_no}: no phonemes for word '{word}'");
        }

        // Annotations apply to exactly one entry line, hence `take()`.
        let pronunciation = BuildPronunciation {
            phoneme_exprs,
            frequency: pending_freq.take(),
            region: pending_region.take(),
        };

        entries
            .entry(word)
            .or_insert_with(|| BuildEntry {
                pronunciations: Vec::new(),
            })
            .pronunciations
            .push(pronunciation);
    }

    // An annotation after the last entry would otherwise vanish silently.
    if pending_freq.is_some() || pending_region.is_some() {
        println!(
            "cargo::warning=data/cmudict-5k.txt: trailing @freq/@region annotation has no following entry and was ignored"
        );
    }

    entries
}

/// Writes `generated_phf_dict.rs` into `out_dir`: one static phoneme slice per
/// pronunciation, one static pronunciation slice per word, and a
/// `PHF_ENGLISH_DICT` `phf::Map` from each word to its `StaticEntry`.
///
/// Static names are derived from the entry's position in the (sorted) map
/// rather than from the word text. Word-derived names are only valid when the
/// word consists of identifier characters, and distinct words such as `a-b`
/// and `a'b` would map to the same name; positional names are always valid
/// and unique. A `// "word"` comment precedes each group for readability.
///
/// # Panics
///
/// Panics if the output file cannot be created or written, or if a
/// pronunciation carries a region code unknown to `region_to_expr`.
#[cfg(feature = "phf")]
fn generate_phf_dict(entries: &BTreeMap<String, BuildEntry>, out_dir: &str) {
    let dest = Path::new(out_dir).join("generated_phf_dict.rs");
    let mut out = fs::File::create(&dest).expect("failed to create generated_phf_dict.rs");

    writeln!(
        out,
        "// Auto-generated PHF dictionary by build.rs from data/cmudict-5k.txt"
    )
    .unwrap();
    writeln!(
        out,
        "// DO NOT EDIT — changes will be overwritten on next build."
    )
    .unwrap();
    writeln!(out).unwrap();

    // Generate static phoneme arrays and pronunciation arrays for each word.
    for (word_idx, (word, entry)) in entries.iter().enumerate() {
        // Debug formatting escapes the word, so it cannot break out of the comment.
        writeln!(out, "// {word:?}").unwrap();

        for (i, pron) in entry.pronunciations.iter().enumerate() {
            let phonemes: Vec<String> = pron
                .phoneme_exprs
                .iter()
                .map(|e| format!("svara::phoneme::{e}"))
                .collect();
            writeln!(
                out,
                "static PHF_PH_{word_idx}_{i}: &[svara::phoneme::Phoneme] = &[{}];",
                phonemes.join(", ")
            )
            .unwrap();
        }

        // Generate static pronunciation array.
        let pron_entries: Vec<String> = entry
            .pronunciations
            .iter()
            .enumerate()
            .map(|(i, pron)| {
                let freq = match pron.frequency {
                    Some(f) => format!("Some({f}_f32)"),
                    None => "None".to_string(),
                };
                let region = match &pron.region {
                    Some(r) => {
                        let expr = region_to_expr(r);
                        format!("Some(crate::dictionary::entry::{expr})")
                    }
                    None => "None".to_string(),
                };
                format!(
                    "crate::dictionary::static_dict::StaticPronunciation {{ phonemes: PHF_PH_{word_idx}_{i}, frequency: {freq}, region: {region} }}"
                )
            })
            .collect();
        writeln!(
            out,
            "static PHF_PR_{word_idx}: &[crate::dictionary::static_dict::StaticPronunciation] = &[{}];",
            pron_entries.join(", ")
        )
        .unwrap();
    }

    writeln!(out).unwrap();

    // Generate the phf map. `keys()` iterates in the same sorted order as the
    // loop above, so `word_idx` refers to the matching `PHF_PR_*` static.
    let mut map = phf_codegen::Map::new();
    for (word_idx, word) in entries.keys().enumerate() {
        map.entry(
            word.as_str(),
            &format!(
                "crate::dictionary::static_dict::StaticEntry {{ pronunciations: PHF_PR_{word_idx} }}"
            ),
        );
    }

    writeln!(
        out,
        "static PHF_ENGLISH_DICT: phf::Map<&'static str, crate::dictionary::static_dict::StaticEntry> = {};",
        map.build()
    )
    .unwrap();

    eprintln!(
        "shabdakosh build: generated PHF dictionary ({} entries)",
        entries.len()
    );
}

/// Emits one `m.insert(...)` statement for `word` into the generated source.
///
/// Each pronunciation is rendered as a `Pronunciation::new(vec![...])`
/// expression, followed by `.with_frequency(..)` and `.with_region(..)` when
/// those annotations are present. An entry with exactly one pronunciation is
/// wrapped in `DictEntry::new`; any other count goes through
/// `DictEntry::from_pronunciations(vec![...]).expect("non-empty")`.
///
/// # Panics
///
/// Panics if writing fails or a region code is unknown to `region_to_expr`.
fn write_entry(out: &mut fs::File, word: &str, entry: &BuildEntry) {
    // Render every pronunciation up front so both shapes share one code path.
    let rendered: Vec<String> = entry
        .pronunciations
        .iter()
        .map(|pron| {
            let mut expr = format!(
                "Pronunciation::new(vec![{}])",
                pron.phoneme_exprs.join(", ")
            );
            if let Some(freq) = pron.frequency {
                expr.push_str(&format!(".with_frequency({freq}_f32)"));
            }
            if let Some(code) = pron.region.as_deref() {
                expr.push_str(&format!(".with_region({})", region_to_expr(code)));
            }
            expr
        })
        .collect();

    let constructor = match rendered.as_slice() {
        [only] => format!("DictEntry::new({only})"),
        several => format!(
            "DictEntry::from_pronunciations(vec![{}]).expect(\"non-empty\")",
            several.join(", ")
        ),
    };

    writeln!(out, "    m.insert(String::from({word:?}), {constructor});").unwrap();
}

/// Translates a region code from an `@region=` annotation into the Rust
/// expression naming the corresponding `Region` variant.
///
/// # Panics
///
/// Panics if `code` is neither `GA` nor `RP` (matching is case-sensitive).
fn region_to_expr(code: &str) -> &'static str {
    // (annotation code, generated expression)
    const REGIONS: [(&str, &str); 2] = [
        ("GA", "Region::GeneralAmerican"),
        ("RP", "Region::ReceivedPronunciation"),
    ];

    REGIONS
        .iter()
        .find(|(abbr, _)| *abbr == code)
        .map(|&(_, expr)| expr)
        .unwrap_or_else(|| panic!("unknown region code: {code}"))
}

/// Turns one ARPABET symbol, with or without a trailing stress digit, into
/// the source text of the matching `Phoneme` variant (e.g. `Phoneme::VowelAsh`).
///
/// The stress digit only matters for `AH`: `AH0` becomes `VowelSchwa`, every
/// other `AH` becomes `VowelCupV`. `line` is used solely for the panic message.
///
/// Mirrors the mapping in `src/arpabet.rs` — keep both in sync.
///
/// # Panics
///
/// Panics if the symbol, after stress stripping, is not in the table below.
fn arpabet_to_phoneme_expr(symbol: &str, line: usize) -> String {
    // (ARPABET base symbol, `Phoneme` variant name)
    const MAPPING: &[(&str, &str)] = &[
        // Vowels
        ("AA", "VowelOpenA"),
        ("AE", "VowelAsh"),
        ("AH", "VowelCupV"), // unstressed `AH0` is handled separately below
        ("AO", "VowelOpenO"),
        ("AW", "DiphthongAU"),
        ("AY", "DiphthongAI"),
        ("EH", "VowelOpenE"),
        ("ER", "VowelBird"),
        ("EY", "DiphthongEI"),
        ("IH", "VowelNearI"),
        ("IY", "VowelE"),
        ("OW", "DiphthongOU"),
        ("OY", "DiphthongOI"),
        ("UH", "VowelNearU"),
        ("UW", "VowelU"),
        // Consonants
        ("B", "PlosiveB"),
        ("CH", "AffricateCh"),
        ("D", "PlosiveD"),
        ("DH", "FricativeDh"),
        ("F", "FricativeF"),
        ("G", "PlosiveG"),
        ("HH", "FricativeH"),
        ("JH", "AffricateJ"),
        ("K", "PlosiveK"),
        ("L", "LateralL"),
        ("M", "NasalM"),
        ("N", "NasalN"),
        ("NG", "NasalNg"),
        ("P", "PlosiveP"),
        ("R", "ApproximantR"),
        ("S", "FricativeS"),
        ("SH", "FricativeSh"),
        ("T", "PlosiveT"),
        ("TH", "FricativeTh"),
        ("V", "FricativeV"),
        ("W", "ApproximantW"),
        ("Y", "ApproximantJ"),
        ("Z", "FricativeZ"),
        ("ZH", "FricativeZh"),
    ];

    let (base, stress) = strip_stress(symbol);

    let variant = if base == "AH" && stress == Some(0) {
        "VowelSchwa"
    } else {
        MAPPING
            .iter()
            .find(|(arpabet, _)| *arpabet == base)
            .map(|&(_, name)| name)
            .unwrap_or_else(|| panic!("line {line}: unknown ARPABET symbol '{symbol}'"))
    };

    format!("Phoneme::{variant}")
}

/// Splits a trailing ARPABET stress digit (`0`, `1` or `2`) off `symbol`.
///
/// Returns the symbol without the digit together with the stress level, or
/// the symbol unchanged and `None` when it does not end in one of those digits
/// (including when it is empty).
fn strip_stress(symbol: &str) -> (&str, Option<u8>) {
    for (digit, level) in [('0', 0u8), ('1', 1), ('2', 2)] {
        if let Some(base) = symbol.strip_suffix(digit) {
            return (base, Some(level));
        }
    }
    (symbol, None)
}