espeak-ng 0.1.1

Pure Rust port of eSpeak NG text-to-speech
Documentation
#!/usr/bin/env python3

from __future__ import annotations

import shutil
from pathlib import Path


# Directory two levels above this script file (the parent of the script's own directory).
ROOT = Path(__file__).resolve().parent.parent
# Directory that is globbed for `*_dict` source dictionaries.
SOURCE_DATA_DIR = ROOT / "espeak-ng-data"
# Directory under which one crate directory per language is written.
TARGET_ROOT = ROOT / "data-crates"
# Root manifest whose marker-delimited sections are rewritten by this script.
ROOT_CARGO_TOML = ROOT / "Cargo.toml"
# Generated Rust source file that is overwritten on every run.
ROOT_BUNDLED_DATA_RS = ROOT / "src" / "bundled_data_generated.rs"
# Text written verbatim as `build.rs` into every generated per-language crate.
#
# This is an ordinary (non-raw) Python string, so Python interprets the backslash
# escapes before the text is written out: `\"` becomes `"`, `\\\\` becomes `\\`,
# and `\n` becomes a real line break inside the emitted Rust string literals.
# NOTE(review): the emitted Rust therefore contains multi-line string literals
# where `\n` escapes were probably intended — confirm this is acceptable.
BUILD_RS = """// build.rs — generates `files.rs` in OUT_DIR.
//
// The generated file contains:
//   pub static ALL_FILES: &[(&str, &[u8])] = &[…];
//
// Every file under data/ is embedded with include_bytes! so the crate is
// fully self-contained and works without any installed data at runtime.

use std::fs;
use std::path::Path;

fn main() {
    let out_dir = std::env::var("OUT_DIR").unwrap();
    let data_dir = Path::new("data");

    // Collect all file paths relative to data/
    let mut entries: Vec<String> = Vec::new();
    collect_files(data_dir, data_dir, &mut entries);
    entries.sort();

    // Generate the static array
    let mut code = String::new();
    code.push_str("/// All data files embedded in this crate.\n");
    code.push_str("///\n");
    code.push_str("/// Each entry is `(relative_path, file_bytes)`.\n");
    code.push_str("pub static ALL_FILES: &[(&str, &[u8])] = &[\n");
    for rel in &entries {
        let abs_expr = format!(
            r#"concat!(env!(\"CARGO_MANIFEST_DIR\"), \"/data/{rel}\")"#
        );
        code.push_str(&format!("    ({rel:?}, include_bytes!({abs_expr})),\n"));
    }
    code.push_str("];\n");

    fs::write(format!("{out_dir}/files.rs"), code)
        .expect("could not write generated files.rs");

    println!("cargo:rerun-if-changed=data");
    println!("cargo:rerun-if-changed=build.rs");
}

fn collect_files(base: &Path, current: &Path, out: &mut Vec<String>) {
    let mut entries: Vec<_> = fs::read_dir(current)
        .unwrap_or_else(|e| panic!("cannot read dir {}: {e}", current.display()))
        .map(|e| e.unwrap().path())
        .collect();
    entries.sort();

    for path in entries {
        if path.is_dir() {
            collect_files(base, &path, out);
        } else {
            let rel = path
                .strip_prefix(base)
                .unwrap()
                .to_string_lossy()
                .replace('\\\\', "/");
            out.push(rel);
        }
    }
}
"""

# Marker lines looked up in the root Cargo.toml; the text between each
# BEGIN/END pair is replaced with freshly generated content.
CARGO_FEATURES_BEGIN = "# BEGIN generated per-language features"
CARGO_FEATURES_END = "# END generated per-language features"
CARGO_DEPS_BEGIN = "# BEGIN generated per-language dependencies"
CARGO_DEPS_END = "# END generated per-language dependencies"


def cargo_toml(crate_name: str, dict_name: str, lang: str) -> str:
    """Render the Cargo.toml text for one per-language data crate.

    Args:
        crate_name: Value of the manifest's ``name`` field.
        dict_name: Dictionary file name; used in the description and in the
            ``include`` entry under ``data/``.
        lang: Language code inserted into the ``keywords`` list.

    Returns:
        The manifest text, terminated by a single newline.
    """
    manifest_lines = [
        "[package]",
        f'name        = "{crate_name}"',
        'version     = "0.1.0"',
        'edition     = "2021"',
        f'description = "eSpeak NG dictionary data for {dict_name}"',
        'license     = "GPL-3.0-or-later"',
        f'keywords    = ["espeak", "tts", "{lang}", "dictionary", "data"]',
        'categories  = ["multimedia::audio", "accessibility"]',
        "",
        "include = [",
        '    "build.rs",',
        '    "src/**",',
        f'    "data/{dict_name}",',
        "]",
    ]
    return "\n".join(manifest_lines) + "\n"


def lib_rs(crate_name: str, dict_name: str) -> str:
    """Render the `src/lib.rs` text for one per-language data crate.

    Args:
        crate_name: Cargo package name; hyphens are turned into underscores
            to form the Rust path used in the embedded usage example.
        dict_name: Dictionary file name mentioned in the generated docs.

    Returns:
        Rust source that includes the build-script output `files.rs`,
        defines an `install` function and a small test module.
    """
    # Rust paths cannot contain hyphens, so the example uses the underscore form.
    crate_ident = crate_name.replace("-", "_")
    return f'''//! eSpeak NG dictionary data for `{dict_name}`, embedded at compile time.
//!
//! ## Usage
//!
//! ```rust,no_run
//! use std::path::Path;
//!
//! let data_dir = Path::new("/tmp/espeak-data");
//! {crate_ident}::install(data_dir).unwrap();
//! ```

include!(concat!(env!("OUT_DIR"), "/files.rs"));

/// Install the embedded `{dict_name}` into `dest_dir`.
pub fn install(dest_dir: &std::path::Path) -> std::io::Result<()> {{
    for (rel_path, data) in ALL_FILES {{
        let dest = dest_dir.join(rel_path);
        if let Some(parent) = dest.parent() {{
            std::fs::create_dir_all(parent)?;
        }}
        std::fs::write(dest, data)?;
    }}
    Ok(())
}}

#[cfg(test)]
mod tests {{
    use super::*;

    #[test]
    fn all_files_nonempty() {{
        assert!(!ALL_FILES.is_empty(), "ALL_FILES must not be empty");
        for (path, data) in ALL_FILES {{
            assert!(!data.is_empty(), "embedded file {{path:?}} is empty");
        }}
    }}
}}
'''


def replace_between(text: str, begin: str, end: str, body: str) -> str:
    """Replace whatever lies between two marker strings with ``body``.

    The markers themselves are kept. The replacement is placed on its own
    lines: a newline follows ``begin``, then ``body`` with trailing whitespace
    stripped, then a newline, then ``end`` and the remainder of ``text``.

    Args:
        text: Document containing both markers.
        begin: Marker that opens the replaced region (first occurrence is used).
        end: Marker that closes the replaced region (first occurrence *after*
            ``begin`` is used).
        body: New content for the region.

    Returns:
        The document with the region between the markers replaced.

    Raises:
        ValueError: If ``begin`` is not found, or ``end`` does not occur
            after ``begin``.
    """
    begin_at = text.find(begin)
    if begin_at < 0:
        raise ValueError(f"begin marker not found: {begin!r}")
    start = begin_at + len(begin)

    # Look for the end marker only past the begin marker; matching an earlier
    # occurrence would make the slices overlap and duplicate document content.
    finish = text.find(end, start)
    if finish < 0:
        raise ValueError(f"end marker not found after begin marker: {end!r}")

    return text[:start] + "\n" + body.rstrip() + "\n" + text[finish:]


def root_dependency_name(lang: str) -> str:
    """Return the Cargo package name of the dictionary crate for ``lang``.

    Every language, including ``"ru"``, maps to ``espeak-ng-data-dict-<lang>``.
    """
    return "espeak-ng-data-dict-" + lang


def feature_name(lang: str) -> str:
    """Return the Cargo feature name for ``lang``: ``bundled-data-<lang>``."""
    return "bundled-data-" + lang


def generate_root_feature_block(langs: list[str]) -> str:
    """Build the `[features]` lines for the root manifest, one per language.

    Each line enables the shared phoneme crate plus that language's
    dictionary crate. Lines are joined with newlines; there is no trailing
    newline, and an empty ``langs`` yields an empty string.
    """
    entries = []
    for lang in langs:
        feature = feature_name(lang)
        dependency = root_dependency_name(lang)
        entries.append(
            f'{feature} = ["dep:espeak-ng-data-phonemes", "dep:{dependency}"]'
        )
    return "\n".join(entries)


def generate_root_dependency_block(langs: list[str]) -> str:
    """Build the optional path-dependency lines for the root manifest.

    One line is produced per language except ``"ru"``, which is skipped
    (presumably its dependency is declared by hand outside the generated
    section — confirm against the root Cargo.toml). Lines are joined with
    newlines and carry no trailing newline.
    """
    dep_names = (root_dependency_name(lang) for lang in langs if lang != "ru")
    return "\n".join(
        f'{name} = {{ version = "0.1.0", path = "data-crates/{name}", optional = true }}'
        for name in dep_names
    )


def generate_bundled_data_rs(langs: list[str]) -> str:
    """Render the Rust source of the generated bundled-data module.

    Three fragments are derived from ``langs`` and substituted into a fixed
    template:

    * a comma-separated ``feature = "..."`` list used inside ``cfg(any(...))``,
    * the feature-gated entries of ``BUNDLED_LANGUAGES``,
    * the feature-gated ``match`` arms that dispatch to each dictionary
      crate's ``install`` function.

    Args:
        langs: Language codes, in the order they should appear in the output.

    Returns:
        The complete Rust source text.
    """
    any_feature_cfg = ", ".join(
        f'feature = "{feature_name(lang)}"' for lang in langs
    )

    # Two lines per language: the cfg attribute, then the string entry it guards.
    constants_block = "\n".join(
        line
        for lang in langs
        for line in (
            f'    #[cfg(feature = "{feature_name(lang)}")]',
            f'    "{lang}",',
        )
    )

    # Two lines per language: the cfg attribute, then the match arm it guards.
    # Crate names use hyphens; the Rust path needs underscores.
    arms_block = "\n".join(
        line
        for lang in langs
        for line in (
            f'        #[cfg(feature = "{feature_name(lang)}")]',
            f'        "{lang}" => {root_dependency_name(lang).replace("-", "_")}::install(dest_dir),',
        )
    )

    return f'''// This file is generated by scripts/generate_dict_crates.py.
// Do not edit manually.

use std::io;
use std::path::Path;

pub const BUNDLED_LANGUAGES: &[&str] = &[
{constants_block}
];

pub fn bundled_languages() -> &'static [&'static str] {{
    BUNDLED_LANGUAGES
}}

pub fn has_bundled_language(lang: &str) -> bool {{
    BUNDLED_LANGUAGES.contains(&lang)
}}

fn unsupported_language_error(lang: &str) -> io::Error {{
    let available = if BUNDLED_LANGUAGES.is_empty() {{
        "none".to_string()
    }} else {{
        BUNDLED_LANGUAGES.join(", ")
    }};
    io::Error::new(
        io::ErrorKind::InvalidInput,
        format!("language {{lang:?}} is not bundled in this build; enabled bundled languages: {{available}}"),
    )
}}

#[allow(dead_code)]
#[cfg(any({any_feature_cfg}))]
fn install_selected_dictionary(dest_dir: &Path, lang: &str) -> io::Result<()> {{
    match lang {{
{arms_block}
        _ => Err(unsupported_language_error(lang)),
    }}
}}

#[allow(dead_code)]
#[cfg(not(any({any_feature_cfg})))]
fn install_selected_dictionary(_dest_dir: &Path, lang: &str) -> io::Result<()> {{
    Err(unsupported_language_error(lang))
}}

#[cfg(any({any_feature_cfg}))]
pub fn install_bundled_language(dest_dir: &Path, lang: &str) -> io::Result<()> {{
    espeak_ng_data_phonemes::install(dest_dir)?;
    install_selected_dictionary(dest_dir, lang)
}}

#[cfg(not(any({any_feature_cfg})))]
pub fn install_bundled_language(_dest_dir: &Path, lang: &str) -> io::Result<()> {{
    Err(unsupported_language_error(lang))
}}

#[cfg(any({any_feature_cfg}))]
pub fn install_bundled_languages(dest_dir: &Path, languages: &[&str]) -> io::Result<()> {{
    espeak_ng_data_phonemes::install(dest_dir)?;
    for &lang in languages {{
        install_selected_dictionary(dest_dir, lang)?;
    }}
    Ok(())
}}

#[cfg(not(any({any_feature_cfg})))]
pub fn install_bundled_languages(_dest_dir: &Path, languages: &[&str]) -> io::Result<()> {{
    if let Some(lang) = languages.first() {{
        Err(unsupported_language_error(lang))
    }} else {{
        Ok(())
    }}
}}
'''


def update_root_files(langs: list[str]) -> None:
    """Regenerate the language-dependent parts of the root crate.

    Rewrites the marker-delimited feature and dependency sections of the root
    Cargo.toml, then overwrites the generated bundled-data Rust source. Both
    files are read and written as UTF-8.

    Args:
        langs: Language codes to generate entries for.
    """
    manifest = ROOT_CARGO_TOML.read_text(encoding="utf-8")

    # Each section is (begin marker, end marker, generator of the new body);
    # features are replaced first, dependencies second.
    sections = (
        (CARGO_FEATURES_BEGIN, CARGO_FEATURES_END, generate_root_feature_block),
        (CARGO_DEPS_BEGIN, CARGO_DEPS_END, generate_root_dependency_block),
    )
    for begin_marker, end_marker, render in sections:
        manifest = replace_between(manifest, begin_marker, end_marker, render(langs))

    ROOT_CARGO_TOML.write_text(manifest, encoding="utf-8")
    ROOT_BUNDLED_DATA_RS.write_text(generate_bundled_data_rs(langs), encoding="utf-8")


def _write_if_changed(path: Path, content: str) -> bool:
    """Write ``content`` as UTF-8 to ``path`` unless it already holds those bytes; return True if written."""
    encoded = content.encode("utf-8")
    if path.exists() and path.read_bytes() == encoded:
        return False
    path.write_bytes(encoded)
    return True


def main() -> None:
    """Generate or refresh one data crate per `*_dict` file and update the root crate.

    Steps:
      1. Collect the `*_dict` files from the source data directory (sorted).
      2. Rewrite the generated sections of the root crate for all languages.
      3. For every language except ``"ru"``, make sure its crate directory
         holds an up-to-date Cargo.toml, build.rs, src/lib.rs and a copy of
         the dictionary under data/. Files are only rewritten when their
         bytes differ.
      4. Print how many crates were touched.

    Raises:
        SystemExit: If no `*_dict` file is found.
    """
    dict_files = sorted(SOURCE_DATA_DIR.glob("*_dict"))
    if not dict_files:
        raise SystemExit("no *_dict files found under espeak-ng-data")

    langs = [dict_path.name.removesuffix("_dict") for dict_path in dict_files]
    update_root_files(langs)

    touched = 0  # crates in which at least one file was created or rewritten
    total = 0    # dictionaries actually processed (counted, not assumed)

    for dict_path, lang in zip(dict_files, langs):
        # No crate is generated for Russian (presumably maintained separately
        # — confirm against the repository layout).
        if lang == "ru":
            continue
        total += 1

        crate_name = root_dependency_name(lang)
        crate_dir = TARGET_ROOT / crate_name
        data_dir = crate_dir / "data"
        src_dir = crate_dir / "src"
        # parents=True also creates crate_dir itself.
        data_dir.mkdir(parents=True, exist_ok=True)
        src_dir.mkdir(parents=True, exist_ok=True)

        files = {
            crate_dir / "Cargo.toml": cargo_toml(crate_name, dict_path.name, lang),
            crate_dir / "build.rs": BUILD_RS,
            src_dir / "lib.rs": lib_rs(crate_name, dict_path.name),
        }

        changed = False
        for path, content in files.items():
            # Every file must be checked, so the helper call is not short-circuited.
            changed |= _write_if_changed(path, content)

        target_dict_path = data_dir / dict_path.name
        source_bytes = dict_path.read_bytes()
        if not target_dict_path.exists() or target_dict_path.read_bytes() != source_bytes:
            shutil.copyfile(dict_path, target_dict_path)
            changed = True

        if changed:
            touched += 1

    print(f"generated or updated {touched} per-language crates ({total} non-Russian dictionaries)")


# Run the generator only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()