tanaka 0.1.0

A Rust interface to the Tanaka Corpus of parallel Japanese-English sentences
Documentation
use xz_decom::decompress;

use std::{env, fs, path::PathBuf};

// xz-compressed example data embedded at compile time via `include_bytes!`.
// Each archive is gated on the same feature as the function that consumes it,
// so a disabled feature neither embeds the file nor leaves an unused static.
#[cfg(feature = "include")]
static EXAMPLES_XZ: &[u8] = include_bytes!("examples.utf.xz");
#[cfg(feature = "include_subset")]
static EXAMPLES_SUBSET_XZ: &[u8] = include_bytes!("examples_subset.utf.xz");

/// Entry point: generates each example file whose feature is enabled.
///
/// With neither `include` nor `include_subset` enabled, this does nothing.
fn main() {
    // Only compiled in when the `include` feature is enabled.
    #[cfg(feature = "include")]
    generate_examples();

    // Only compiled in when the `include_subset` feature is enabled.
    #[cfg(feature = "include_subset")]
    generate_examples_subset();
}

/// Decompresses `bytes` and writes the result to the file named `fnm` inside
/// the directory given by the `OUT_DIR` environment variable.
///
/// The decompressed data is validated as UTF-8 before it is written, so an
/// output file is only produced for valid text.
///
/// # Panics
///
/// Panics if `OUT_DIR` is not set, if decompression fails, if the decompressed
/// data is not valid UTF-8, or if the output file cannot be written.
fn decompress_to(bytes: &[u8], fnm: &str) {
    // `var_os` instead of `var`: a directory path need not be valid Unicode,
    // and `var` would reject such a value and trigger the "no OUT_DIR" panic
    // even though the variable is set.
    let out_dir = env::var_os("OUT_DIR").expect("no OUT_DIR in environment");
    let path = PathBuf::from(out_dir).join(fnm);

    let decompressed = decompress(bytes).expect("failed to decompress examples");
    let text = std::str::from_utf8(&decompressed)
        .expect("failed to parse decompressed examples as UTF-8");

    fs::write(path, text).expect("failed to write decompressed examples to file");
}

/// Decompresses the embedded `EXAMPLES_XZ` archive into `examples.utf`.
///
/// Only compiled when the `include` feature is enabled.
#[cfg(feature = "include")]
fn generate_examples() {
    let target = "examples.utf";
    decompress_to(EXAMPLES_XZ, target);
}

/// Decompresses the embedded `EXAMPLES_SUBSET_XZ` archive into
/// `examples_subset.utf`.
///
/// Only compiled when the `include_subset` feature is enabled.
#[cfg(feature = "include_subset")]
fn generate_examples_subset() {
    let target = "examples_subset.utf";
    decompress_to(EXAMPLES_SUBSET_XZ, target);
}