aozora-encoding 0.4.1

Aozora Bunko notation: Shift_JIS decoding and gaiji (外字) resolution. Internal crate; depend on the aozora umbrella crate instead.
Documentation
// build.rs runs once per Cargo invocation (when inputs change) and
// emits a Rust source fragment to OUT_DIR. The pedantic clippy
// lints the workspace applies to library code surface a lot of
// noise here that doesn't affect downstream builds — relax them
// at the file level.
#![allow(
    clippy::too_many_lines,
    clippy::redundant_clone,
    clippy::format_collect,
    clippy::missing_assert_message,
    clippy::absolute_paths,
    reason = "build.rs is dev-tooling code that emits source to OUT_DIR; \
              the workspace's pedantic lint profile aimed at library APIs \
              fires noisy here without improving anything downstream"
)]
//! Compile-time gaiji table generator.
//!
//! Reads the four TSV data files in `data/`, runs `phf_codegen`
//! to build the perfect hashes, and writes the result to
//! `$OUT_DIR/jisx0213_table.rs`. The library `include!()`s that
//! file from `gaiji.rs`.
//!
//! ## Why `phf_codegen` instead of `phf::phf_map!`
//!
//! The macro form makes rustc do the perfect-hash construction at
//! macro-expansion time. For the 4 329-entry single-char map +
//! 8 883-entry description map (combined ~13 k entries) this costs
//! ~30–60 seconds of `cargo check` on cold caches and a noticeable
//! pause on every `rust-analyzer` reload. `phf_codegen` runs the
//! same algorithm at build-time inside `build.rs`; Cargo caches the
//! emitted file and only re-runs it when the input TSVs change, so
//! repeat-build cost drops to "include the cached `.rs`".
//!
//! The generated source uses `phf::Map<&'static str, ...>` —
//! identical runtime types to the macro path, so the lookup hot
//! path is unchanged.

use std::env;
use std::fs;
use std::io::{BufWriter, Write};
use std::path::PathBuf;

/// Panic message shared by every write to the generated table.
///
/// The writes go through a `BufWriter` over a real file in `OUT_DIR`, so
/// they can fail (disk full, permissions, ...). The message names the
/// file being written rather than claiming the write cannot fail.
const WRITE_FAILED: &str = "write OUT_DIR/jisx0213_table.rs";

/// Build-script entry point.
///
/// Parses the four TSV files under `$CARGO_MANIFEST_DIR/data`, merges the
/// two description tables (hand-curated specials take precedence), and
/// writes three `phf::Map` statics plus four count constants to
/// `$OUT_DIR/jisx0213_table.rs`.
///
/// # Panics
///
/// Panics if `CARGO_MANIFEST_DIR` or `OUT_DIR` is unset, if any TSV file
/// cannot be read or parsed, or if the output file cannot be created,
/// written, or flushed.
fn main() {
    let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR"));
    let data_dir = manifest_dir.join("data");
    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
    let out_path = out_dir.join("jisx0213_table.rs");

    let single_tsv = data_dir.join("jisx0213-2004.tsv");
    let combo_tsv = data_dir.join("jisx0213-combo.tsv");
    let dict_tsv = data_dir.join("aozora-gaiji-chuki.tsv");
    let special_tsv = data_dir.join("aozora-gaiji-special.tsv");

    // Tell Cargo exactly which inputs invalidate the cached output.
    println!("cargo:rerun-if-changed=build.rs");
    println!("cargo:rerun-if-changed={}", single_tsv.display());
    println!("cargo:rerun-if-changed={}", combo_tsv.display());
    println!("cargo:rerun-if-changed={}", dict_tsv.display());
    println!("cargo:rerun-if-changed={}", special_tsv.display());

    let single = parse_single_tsv(&single_tsv);
    let combo = parse_combo_tsv(&combo_tsv);
    let dict = parse_description_tsv(&dict_tsv);
    let special = parse_description_tsv(&special_tsv);

    // Hand-curated specials win on conflict — same precedence as
    // the previous `xtask gaiji-gen` path. Only the special keys are
    // recorded, so a key repeated inside the dictionary file itself is
    // still passed on to `phf_codegen` rather than silently dropped.
    let seen: std::collections::HashSet<String> =
        special.iter().map(|e| e.description.clone()).collect();
    let mut description: Vec<DescriptionEntry> = special;
    description.extend(
        dict.into_iter()
            .filter(|entry| !seen.contains(&entry.description)),
    );
    // Stable order across builds for diff readability if the file
    // is ever inspected by a human.
    description.sort_by(|a, b| a.description.cmp(&b.description));

    let plane1_count = single.iter().filter(|e| e.plane == 1).count();
    let plane2_count = single.iter().filter(|e| e.plane == 2).count();
    let combo_count = combo.len();
    let description_count = description.len();

    let file = fs::File::create(&out_path).expect("create OUT_DIR/jisx0213_table.rs");
    let mut out = BufWriter::new(file);

    writeln!(
        out,
        "// AUTO-GENERATED by `crates/aozora-encoding/build.rs`. \
         DO NOT EDIT BY HAND.\n\
         // Source TSVs in `crates/aozora-encoding/data/`; rerun by \
         touching any of them.\n",
    )
    .expect(WRITE_FAILED);

    // ---- single-char map ----
    // Keys and values are materialised first so the builder can borrow
    // them as `&str` for its whole lifetime.
    let mut single_builder = phf_codegen::Map::<&str>::new();
    let single_keys: Vec<String> = single
        .iter()
        .map(|e| mencode(e.plane, e.row, e.cell))
        .collect();
    let single_values: Vec<String> = single
        .iter()
        .map(|e| format!("'\\u{{{:04X}}}'", e.codepoint))
        .collect();
    for (k, v) in single_keys.iter().zip(single_values.iter()) {
        single_builder.entry(k.as_str(), v.as_str());
    }
    writeln!(
        out,
        "pub(crate) static JISX0213_MENCODE_TO_CHAR: phf::Map<&'static str, char> = {};",
        single_builder.build(),
    )
    .expect(WRITE_FAILED);
    writeln!(out).expect(WRITE_FAILED);

    // ---- combo (multi-codepoint) map ----
    let mut combo_builder = phf_codegen::Map::<&str>::new();
    let combo_keys: Vec<String> = combo
        .iter()
        .map(|e| mencode(e.plane, e.row, e.cell))
        .collect();
    let combo_values: Vec<String> = combo
        .iter()
        .map(|e| {
            // Each value is a Rust string literal made of `\u{...}` escapes.
            let body: String = e
                .codepoints
                .iter()
                .map(|cp| format!("\\u{{{cp:04X}}}"))
                .collect();
            format!("\"{body}\"")
        })
        .collect();
    for (k, v) in combo_keys.iter().zip(combo_values.iter()) {
        combo_builder.entry(k.as_str(), v.as_str());
    }
    writeln!(
        out,
        "pub(crate) static JISX0213_MENCODE_TO_STR: phf::Map<&'static str, &'static str> = {};",
        combo_builder.build(),
    )
    .expect(WRITE_FAILED);
    writeln!(out).expect(WRITE_FAILED);

    // ---- description map ----
    // phf_codegen takes `&str` keys; the descriptions live in
    // `description[*].description` so we can borrow directly.
    // `phf_codegen` source-prints the keys with `Debug`, which
    // already escapes embedded `"` / `\` correctly.
    let mut description_builder = phf_codegen::Map::<&str>::new();
    let description_values: Vec<String> = description
        .iter()
        .map(|e| format!("'\\u{{{:04X}}}'", e.codepoint))
        .collect();
    for (entry, value) in description.iter().zip(description_values.iter()) {
        description_builder.entry(entry.description.as_str(), value.as_str());
    }
    writeln!(
        out,
        "pub(crate) static DESCRIPTION_TO_CHAR: phf::Map<&'static str, char> = {};",
        description_builder.build(),
    )
    .expect(WRITE_FAILED);
    writeln!(out).expect(WRITE_FAILED);

    // ---- count constants (used by tests) ----
    writeln!(
        out,
        "#[allow(dead_code, reason = \"pinned for table-size tests\")]\n\
         pub(crate) const JISX0213_PLANE1_COUNT: usize = {plane1_count};\n\
         #[allow(dead_code, reason = \"pinned for table-size tests\")]\n\
         pub(crate) const JISX0213_PLANE2_COUNT: usize = {plane2_count};\n\
         #[allow(dead_code, reason = \"pinned for table-size tests\")]\n\
         pub(crate) const JISX0213_COMBO_COUNT: usize = {combo_count};\n\
         #[allow(dead_code, reason = \"pinned for table-size tests\")]\n\
         pub(crate) const DESCRIPTION_COUNT: usize = {description_count};",
    )
    .expect(WRITE_FAILED);
    writeln!(out).expect(WRITE_FAILED);
    writeln!(
        out,
        "// Summary: 第3水準={plane1_count}, 第4水準={plane2_count}, \
         combo={combo_count}, description={description_count}, \
         total={total}.",
        total = plane1_count + plane2_count + combo_count + description_count,
    )
    .expect(WRITE_FAILED);

    // `BufWriter`'s `Drop` discards flush errors; flush explicitly so a
    // failed final write aborts the build instead of leaving a truncated
    // table behind for `include!`.
    out.flush().expect("flush OUT_DIR/jisx0213_table.rs");
}

/// One data row of the single-codepoint table, as produced by
/// `parse_single_tsv`.
#[derive(Clone, Copy)]
struct SingleEntry {
    /// Plane number from the first TSV column; `main` counts rows with
    /// plane 1 and plane 2 separately.
    plane: u8,
    /// Row number from the second TSV column.
    row: u8,
    /// Cell number from the third TSV column.
    cell: u8,
    /// Code point parsed from the hexadecimal fourth TSV column; emitted
    /// by `main` as a `'\u{...}'` char literal.
    codepoint: u32,
}

/// One data row of the multi-codepoint ("combo") table, as produced by
/// `parse_combo_tsv`.
struct ComboEntry {
    /// Plane number from the first TSV column.
    plane: u8,
    /// Row number from the second TSV column.
    row: u8,
    /// Cell number from the third TSV column.
    cell: u8,
    /// Code points parsed from the comma-separated hexadecimal fourth
    /// column, in file order; emitted by `main` as one string literal.
    codepoints: Vec<u32>,
}

/// One data row of a description table, as produced by
/// `parse_description_tsv`.
#[derive(Clone)]
struct DescriptionEntry {
    /// Text of the first TSV column; used verbatim as the map key and as
    /// the sort / de-duplication key in `main`.
    description: String,
    /// Code point parsed from the hexadecimal second TSV column.
    codepoint: u32,
}

/// Parses the single-codepoint table at `path`.
///
/// Blank lines and lines whose first non-whitespace character is `#` are
/// skipped. Every other line must hold exactly four tab-separated
/// columns: plane, row and cell as decimal `u8` values, then one
/// hexadecimal code point.
///
/// # Panics
///
/// Panics with a `path:line` prefix if a line has the wrong number of
/// columns, a number fails to parse, or the code point is not a Unicode
/// scalar value (it is later emitted as a `char` literal, which must be
/// one). Also panics if the file cannot be read.
fn parse_single_tsv(path: &std::path::Path) -> Vec<SingleEntry> {
    let text =
        fs::read_to_string(path).unwrap_or_else(|err| panic!("read {}: {err}", path.display()));
    let mut out = Vec::new();
    for (idx, line) in text.lines().enumerate() {
        let lineno = idx + 1;
        let trimmed = line.trim();
        if trimmed.is_empty() || trimmed.starts_with('#') {
            continue;
        }
        let cols: Vec<&str> = trimmed.split('\t').collect();
        let &[plane, row, cell, hex] = cols.as_slice() else {
            panic!(
                "{}:{} expected 4 cols, got {}",
                path.display(),
                lineno,
                cols.len(),
            );
        };
        // Shared parser for the three decimal coordinate columns; names
        // the column and the offending text in the panic message.
        let coord = |name: &str, raw: &str| -> u8 {
            raw.parse().unwrap_or_else(|err| {
                panic!("{}:{lineno} {name} {raw:?}: {err}", path.display())
            })
        };
        let plane = coord("plane", plane);
        let row = coord("row", row);
        let cell = coord("cell", cell);
        let codepoint = u32::from_str_radix(hex, 16).unwrap_or_else(|err| {
            panic!("{}:{lineno} codepoint hex {hex:?}: {err}", path.display())
        });
        // Fail here rather than as a compile error inside the generated file.
        assert!(
            char::from_u32(codepoint).is_some(),
            "{}:{lineno} U+{codepoint:04X} is not a Unicode scalar value",
            path.display(),
        );
        out.push(SingleEntry {
            plane,
            row,
            cell,
            codepoint,
        });
    }
    out
}

/// Parses the multi-codepoint ("combo") table at `path`.
///
/// Blank lines and lines whose first non-whitespace character is `#` are
/// skipped. Every other line must hold exactly four tab-separated
/// columns: plane, row and cell as decimal `u8` values, then a
/// comma-separated list of hexadecimal code points.
///
/// # Panics
///
/// Panics with a `path:line` prefix if a line has the wrong number of
/// columns, a number fails to parse, or a code point is not a Unicode
/// scalar value (each is later emitted as a `\u{...}` escape, which
/// requires one). Also panics if the file cannot be read.
fn parse_combo_tsv(path: &std::path::Path) -> Vec<ComboEntry> {
    let text =
        fs::read_to_string(path).unwrap_or_else(|err| panic!("read {}: {err}", path.display()));
    let mut out = Vec::new();
    for (idx, line) in text.lines().enumerate() {
        let lineno = idx + 1;
        let trimmed = line.trim();
        if trimmed.is_empty() || trimmed.starts_with('#') {
            continue;
        }
        let cols: Vec<&str> = trimmed.split('\t').collect();
        let &[plane, row, cell, hex_list] = cols.as_slice() else {
            panic!(
                "{}:{lineno} expected 4 cols, got {}",
                path.display(),
                cols.len(),
            );
        };
        // Shared parser for the three decimal coordinate columns; names
        // the column and the offending text in the panic message.
        let coord = |name: &str, raw: &str| -> u8 {
            raw.parse().unwrap_or_else(|err| {
                panic!("{}:{lineno} {name} {raw:?}: {err}", path.display())
            })
        };
        let plane = coord("plane", plane);
        let row = coord("row", row);
        let cell = coord("cell", cell);
        let codepoints: Vec<u32> = hex_list
            .split(',')
            .map(|hex| {
                let cp = u32::from_str_radix(hex, 16).unwrap_or_else(|err| {
                    panic!("{}:{lineno} codepoint hex {hex:?}: {err}", path.display())
                });
                // Fail here rather than as a compile error inside the generated file.
                assert!(
                    char::from_u32(cp).is_some(),
                    "{}:{lineno} U+{cp:04X} is not a Unicode scalar value",
                    path.display(),
                );
                cp
            })
            .collect();
        out.push(ComboEntry {
            plane,
            row,
            cell,
            codepoints,
        });
    }
    out
}

/// Parses a description table at `path`.
///
/// Only trailing whitespace is trimmed, so leading whitespace stays part
/// of the description. Blank lines and lines starting with `#` are
/// skipped. Every other line must hold exactly two tab-separated
/// columns: the description text and one hexadecimal code point.
///
/// # Panics
///
/// Panics with a `path:line` prefix if a line has the wrong number of
/// columns, the code point fails to parse, or it is not a Unicode scalar
/// value (it is later emitted as a `char` literal, which must be one).
/// Also panics if the file cannot be read.
fn parse_description_tsv(path: &std::path::Path) -> Vec<DescriptionEntry> {
    let text =
        fs::read_to_string(path).unwrap_or_else(|err| panic!("read {}: {err}", path.display()));
    let mut out = Vec::new();
    for (idx, line) in text.lines().enumerate() {
        let lineno = idx + 1;
        let trimmed = line.trim_end();
        if trimmed.is_empty() || trimmed.starts_with('#') {
            continue;
        }
        let cols: Vec<&str> = trimmed.split('\t').collect();
        let &[description, hex] = cols.as_slice() else {
            panic!(
                "{}:{lineno} expected 2 cols, got {}",
                path.display(),
                cols.len(),
            );
        };
        let codepoint = u32::from_str_radix(hex, 16).unwrap_or_else(|err| {
            panic!("{}:{lineno} codepoint hex {hex:?}: {err}", path.display())
        });
        // Fail here rather than as a compile error inside the generated file.
        assert!(
            char::from_u32(codepoint).is_some(),
            "{}:{lineno} U+{codepoint:04X} is not a Unicode scalar value",
            path.display(),
        );
        out.push(DescriptionEntry {
            description: description.to_owned(),
            codepoint,
        });
    }
    out
}

/// Builds the map key for a plane/row/cell triple, e.g. `3水準1-2-3`.
///
/// Plane 1 is labelled level 3; every other plane value is labelled
/// level 4.
fn mencode(plane: u8, row: u8, cell: u8) -> String {
    let level_digit = match plane {
        1 => '3',
        _ => '4',
    };
    format!("{level_digit}水準{plane}-{row}-{cell}")
}