grift_unicode 1.4.0

Unicode character operations for the Grift Scheme language
Documentation
//! Build script for grift_unicode.
//!
//! Parses Unicode CaseFolding.txt at compile time and generates the
//! `lookup_full_casefold()` function containing all status 'F' entries.
//! This ensures accuracy, completeness, and easy updates when Unicode
//! versions change — just replace CaseFolding.txt and rebuild.

use std::collections::BTreeMap;
use std::fs;
use std::io::Write;
use std::path::Path;

/// Name of the Unicode data file, resolved relative to the build script's
/// working directory.
const CASEFOLDING_FILE: &str = "CaseFolding.txt";

/// Build-script entry point.
///
/// Reads [`CASEFOLDING_FILE`], collects every status `F` (full case folding)
/// entry, and writes `casefold_generated.rs` into the directory named by the
/// `OUT_DIR` environment variable. The generated file defines
/// `lookup_full_casefold()`.
///
/// # Panics
///
/// Panics (failing the build) if the data file cannot be read, contains a
/// malformed `F` entry, `OUT_DIR` is unset, or the output cannot be written.
fn main() {
    println!("cargo:rerun-if-changed={}", CASEFOLDING_FILE);
    println!("cargo:rerun-if-changed=build.rs");

    let contents = fs::read_to_string(Path::new(CASEFOLDING_FILE))
        .unwrap_or_else(|e| panic!("Failed to read {}: {}", CASEFOLDING_FILE, e));
    let entries = parse_full_casefold(&contents);

    // `var_os` rather than `var`: the output directory only has to be a valid
    // path, not valid UTF-8.
    let out_dir = std::env::var_os("OUT_DIR").expect("OUT_DIR not set");
    let out_path = Path::new(&out_dir).join("casefold_generated.rs");
    let file = fs::File::create(&out_path)
        .unwrap_or_else(|e| panic!("Failed to create {}: {}", out_path.display(), e));

    // Buffer the many small writes and flush explicitly: an error during the
    // implicit flush on drop would otherwise be silently discarded.
    let mut out = std::io::BufWriter::new(file);
    write_casefold_source(&mut out, &entries)
        .and_then(|()| out.flush())
        .unwrap_or_else(|e| panic!("Failed to write {}: {}", out_path.display(), e));
}

/// Parses the text of CaseFolding.txt and returns its status `F` entries as
/// `code_point -> (mapping code points, trailing comment)`, ordered by code point.
///
/// Everything from the first `#` to the end of a line is treated as a comment;
/// the remaining text is split into `;`-separated fields
/// (`code; status; mapping`). Lines with fewer than three fields (blank lines,
/// comment-only lines) and entries whose status is not `F` are skipped. The
/// comment is optional: an entry without one is kept with an empty comment.
/// If a code point occurs more than once, the last occurrence wins.
///
/// # Panics
///
/// Panics if the code point or any mapping value of an `F` entry is not a
/// valid hexadecimal number.
fn parse_full_casefold(contents: &str) -> BTreeMap<u32, (Vec<u32>, String)> {
    let mut entries = BTreeMap::new();

    for line in contents.lines() {
        // Separate data from the trailing comment before splitting fields, so
        // the comment's presence never decides whether the entry is kept.
        let (data, comment) = match line.split_once('#') {
            Some((data, comment)) => (data, comment.trim_start_matches('#').trim()),
            None => (line, ""),
        };

        let mut fields = data.split(';').map(str::trim);
        let (code, status, mapping) = match (fields.next(), fields.next(), fields.next()) {
            (Some(code), Some(status), Some(mapping)) => (code, status, mapping),
            _ => continue,
        };
        if status != "F" {
            continue;
        }

        let code_point = u32::from_str_radix(code, 16)
            .unwrap_or_else(|e| panic!("Invalid code point '{}': {}", code, e));

        let mapping: Vec<u32> = mapping
            .split_whitespace()
            .map(|s| {
                u32::from_str_radix(s, 16)
                    .unwrap_or_else(|e| panic!("Invalid mapping code point '{}': {}", s, e))
            })
            .collect();

        entries.insert(code_point, (mapping, comment.to_string()));
    }

    entries
}

/// Writes the Rust source of `lookup_full_casefold()` for `entries` to `out`.
///
/// Each entry becomes one `match` arm yielding a three-element `char` array
/// (two-element mappings are padded with `'\0'`) together with the mapping
/// length.
///
/// # Errors
///
/// Returns any I/O error reported by `out`.
///
/// # Panics
///
/// Panics if a mapping does not have 2 or 3 code points, or if a mapping value
/// is not a Unicode scalar value (it could not be written as a `char` literal).
fn write_casefold_source<W: Write>(
    out: &mut W,
    entries: &BTreeMap<u32, (Vec<u32>, String)>,
) -> std::io::Result<()> {
    writeln!(out, "// Auto-generated from CaseFolding.txt — do not edit manually.")?;
    writeln!(out, "// Source: Unicode CaseFolding.txt (status 'F' entries)")?;
    writeln!(out, "// Total entries: {}", entries.len())?;
    writeln!(out)?;
    writeln!(out, "/// Lookup table for Unicode full case folding (CaseFolding.txt status 'F').")?;
    writeln!(out, "/// These are the characters that expand to multiple characters under full case folding.")?;
    writeln!(out, "///")?;
    writeln!(out, "/// Auto-generated by build.rs from the official Unicode CaseFolding.txt file.")?;
    writeln!(out, "fn lookup_full_casefold(c: char) -> Option<CaseMapResult> {{")?;
    writeln!(out, "    let (chars, len) = match c as u32 {{")?;

    for (cp, (mapping, comment)) in entries {
        let len = mapping.len();
        assert!(
            (2..=3).contains(&len),
            "Unexpected mapping length {} for U+{:04X}",
            len,
            cp
        );

        // Always emit three slots; positions past the mapping are '\0' padding.
        let slots: Vec<String> = (0..3)
            .map(|i| match mapping.get(i) {
                Some(&m) => {
                    assert!(
                        char::from_u32(m).is_some(),
                        "Mapping U+{:04X} for U+{:04X} is not a Unicode scalar value",
                        m,
                        cp
                    );
                    format!("'\\u{{{:04X}}}'", m)
                }
                None => String::from("'\\0'"),
            })
            .collect();

        write!(out, "        0x{:04X} => ([{}], {}),", cp, slots.join(", "), len)?;
        if comment.is_empty() {
            writeln!(out)?;
        } else {
            writeln!(out, " // {}", comment)?;
        }
    }

    writeln!(out, "        _ => return None,")?;
    writeln!(out, "    }};")?;
    writeln!(out, "    Some(CaseMapResult {{ chars, len }})")?;
    writeln!(out, "}}")?;
    Ok(())
}