langsan 0.0.14

A library for sanitizing language model input and output.
use std::path::PathBuf;

/// A build script that parses the Unicode range JSON and generates a Rust file
/// with those ranges, each one gated on its corresponding Cargo feature.
use serde_json;

/// Crate root directory, taken from `CARGO_MANIFEST_DIR` when this build
/// script is compiled. The relative paths below are joined onto it.
const CRATE_ROOT: &str = env!("CARGO_MANIFEST_DIR");
/// Git submodule path to the unicode range json file, relative to
/// [`CRATE_ROOT`].
const JSON_PATH: &str = "unicode-range-json/unicode-ranges.json";
/// Rust file to generate with the unicode ranges, relative to [`CRATE_ROOT`].
const RANGES_RS: &str = "src/ranges.rs";
/// Cargo.toml content, so we can generate the features.
///
/// This is only the hand-written prefix of the manifest: `gen_ranges` appends
/// one `<feature> = []` line per Unicode block after the trailing comment.
/// `main` currently discards the assembled manifest instead of writing it
/// back (the write is commented out there because it breaks crates.io).
const CARGO_TOML: &str = r#"# WARNING: This file is generated by build.rs
[package]
name = "langsan"
version = "0.0.14"
edition = "2021"
authors = ["Michael de Gans <michael.john.degans@gmail.com>"]
description = "A library for sanitizing language model input and output."
homepage = "https://github.com/mdegans/langsan"
repository = "https://github.com/mdegans/langsan"
readme = "README.md"
keywords = ["sanitization", "language", "model"]
categories = [
    "text-processing",
]
license = "MIT"

[dependencies]
serde = { version = "1", features = ["derive"], optional = true }

[build-dependencies]
serde_json = "1"
serde = { version = "1", features = ["derive"] }
static_assertions = "1"

[dev-dependencies]
serde_json = "1"

[features]
default = ["general-punctuation"]
bidi = []
cow = []
verbose = []
serde = ["dep:serde"]

# Languages
english = []
spanish = ["latin-1-supplement"]
french = ["latin-1-supplement"]
german = ["latin-1-supplement"]
italian = ["latin-1-supplement"]
dutch = ["latin-1-supplement"]
portuguese = ["latin-1-supplement"]
russian = ["cyrillic"]
emoji = ["emoticons-emoji"]

# Unicode ranges. Note that whitespace and basic-latin are enabled by default.
# "tags" are included for completion sake but very much not recommended for use.
"#;

/// One entry of the unicode range json file: a named Unicode block together
/// with its code point bounds. `main` deserializes the file into a
/// `Vec<NamedRange>`.
#[derive(serde::Deserialize)]
struct NamedRange {
    /// Human-readable block name. `feature_name` turns it into the Cargo
    /// feature name, and it is echoed as the doc comment of the generated
    /// constant.
    category: String,
    /// `[first, last]` code points of the block. Both ends are inclusive: the
    /// generated constant is `first..=last`.
    range: [u32; 2],
}

/// Derive the Cargo feature name for a Unicode block category, e.g.
/// `Latin-1 Supplement` -> `latin-1-supplement`.
///
/// The category is lowercased, every space becomes a hyphen and parentheses
/// are dropped. This must stay in sync with the transform used to declare the
/// features.
fn feature_name(category: &str) -> String {
    category
        .to_lowercase()
        .chars()
        .filter_map(|ch| match ch {
            ' ' => Some('-'),
            '(' | ')' => None,
            other => Some(other),
        })
        .collect()
}

/// Returns `(ranges.rs, Cargo.toml)`. We have a lot of features to generate
/// so we don't want to write them all out
fn gen_ranges(ranges: &[NamedRange]) -> Result<(String, String), Box<dyn std::error::Error>> {
    let features: Vec<String> = ranges.iter().map(|r| feature_name(&r.category)).collect();
    let const_names: Vec<String> = features
        .iter()
        .map(|feature| feature.to_uppercase().replace('-', "_"))
        .collect();
    let mut cargo_toml = CARGO_TOML.to_string();
    let mut code = r#"// WARNING: This file is generated by build.rs
// Do not modify this file directly.
/// Unicode ranges
use core::ops::RangeInclusive;

// Constants for unicode ranges

/// Whitespace other than space
pub const WHITESPACE: RangeInclusive<u32> = 0x00009..=0x0000C;
/// Basic latin, excluding control characters
pub const BASIC_LATIN: RangeInclusive<u32> = 0x00020..=0x0007E; // 0x7F is DEL
"#
    .to_string();

    // The `cfg` attribute is not supported on expressions, so we have to
    // generate constants for each feature.

    for ((feature, range), const_name) in features
        .iter()
        .zip(ranges.iter())
        .zip(const_names.iter())
        .skip(2)
    {
        code.push_str(&format!("/// {}\n", range.category));
        code.push_str(&format!("#[cfg(feature = \"{feature}\")]\n",));
        code.push_str(&format!(
            "pub const {}: RangeInclusive<u32> = {:#07X}..={:#07X};\n",
            const_name, range.range[0], range.range[1]
        ));

        cargo_toml.push_str(&format!("{feature} = []\n",));
    }

    code.push_str(
        r#"/// Enabled unicode ranges.
pub const ENABLED_RANGES: &[RangeInclusive<u32>] = &[
    WHITESPACE,
    BASIC_LATIN,
"#,
    );

    for (feature, const_name) in features.iter().zip(const_names.iter()).skip(2) {
        code.push_str(&format!("    #[cfg(feature = \"{feature}\")]\n",));
        code.push_str(&format!("    {},\n", const_name));
    }

    code.push_str("];\n");

    Ok((code, cargo_toml))
}

/// Generate the feature-resolved two-level allow-set table consumed by
/// [`san::is_allowed`].
///
/// `ranges.rs` is feature-*agnostic* (every block `cfg`-gated, resolved by the
/// compiler) so the committed file is stable. The table is the opposite: it is
/// baked for exactly the features enabled in *this* build, read from the
/// `CARGO_FEATURE_*` environment Cargo sets, which is what lets us deduplicate
/// it into a compact `O(1)` lookup. It therefore lives in `OUT_DIR`, never the
/// source tree.
///
/// Layout: the code point's high bits (`cp >> 8`) index `STAGE1`, yielding a
/// leaf index into `STAGE2`; each leaf is a 256-bit (`[u8; 32]`) allow bitmap
/// for that block. Empty/full blocks collapse to a single shared leaf, so the
/// table stays a few KB even with the whole symbol/CJK range enabled. Code
/// points above the highest enabled range fall off the end of `STAGE1` and are
/// denied — see the `.get()` in `is_allowed`.
///
/// Takes the parsed Unicode blocks (the first two entries are skipped, as in
/// `gen_ranges`) and returns the Rust source text declaring
/// `SANITIZE_STAGE1` and `SANITIZE_STAGE2`.
fn gen_table(ranges: &[NamedRange]) -> String {
    // Whitespace and Basic Latin are always enabled (see `gen_ranges`); every
    // other block is gated on its `CARGO_FEATURE_*` flag, which Cargo also sets
    // for transitively-enabled blocks (e.g. `french` -> `latin-1-supplement`).
    let mut allowed: Vec<(u32, u32)> = vec![(0x0009, 0x000C), (0x0020, 0x007E)];
    for range in ranges.iter().skip(2) {
        let env = format!(
            "CARGO_FEATURE_{}",
            feature_name(&range.category)
                .to_uppercase()
                .replace('-', "_")
        );
        if std::env::var_os(&env).is_some() {
            allowed.push((range.range[0], range.range[1]));
        }
    }

    // One deduplicated 256-code-point bitmap per high block, up to the highest
    // enabled code point (everything beyond is denied by omission).
    let max_cp = allowed.iter().map(|&(_, e)| e).max().unwrap_or(0);
    let num_blocks = (max_cp >> 8) as usize + 1;

    let mut stage1: Vec<u16> = Vec::with_capacity(num_blocks);
    let mut stage2: Vec<[u8; 32]> = Vec::new();
    let mut seen: std::collections::HashMap<[u8; 32], u16> = std::collections::HashMap::new();

    for block in 0..num_blocks {
        let base = (block as u32) << 8;
        // The low byte of `base` is zero, so this is the block's last code
        // point and cannot overflow.
        let last = base | 0xFF;
        let mut leaf = [0u8; 32];
        // Mark each enabled range's overlap with this block instead of testing
        // all 256 code points against every range: the work is proportional
        // to the bits actually set, which matters once high planes are enabled.
        for &(start, end) in &allowed {
            // Empty when the range misses this block (or is inverted).
            for cp in start.max(base)..=end.min(last) {
                let lo = cp - base;
                leaf[(lo >> 3) as usize] |= 1u8 << (lo & 7);
            }
        }
        let idx = *seen.entry(leaf).or_insert_with(|| {
            // Cannot truncate for valid Unicode: code points up to 0x10FFFF
            // span at most 0x1100 blocks, hence at most that many leaves.
            let i = stage2.len() as u16;
            stage2.push(leaf);
            i
        });
        stage1.push(idx);
    }

    let mut out = String::from(
        "// WARNING: generated by build.rs for this build's feature set.\n\
         // Two-level allow-set table; see `gen_table` in build.rs and\n\
         // `san::is_allowed`.\n",
    );
    out.push_str(&format!(
        "pub(crate) static SANITIZE_STAGE1: [u16; {}] = [",
        stage1.len()
    ));
    for (i, idx) in stage1.iter().enumerate() {
        // Sixteen leaf indices per line.
        out.push_str(if i % 16 == 0 { "\n    " } else { " " });
        out.push_str(&format!("{idx},"));
    }
    out.push_str("\n];\n");
    out.push_str(&format!(
        "pub(crate) static SANITIZE_STAGE2: [[u8; 32]; {}] = [\n",
        stage2.len()
    ));
    for leaf in &stage2 {
        out.push_str("    [");
        for b in leaf {
            out.push_str(&format!("{b:#04x},"));
        }
        out.push_str("],\n");
    }
    out.push_str("];\n");
    out
}

/// Build-script entry point.
///
/// Reads the Unicode range JSON from the git submodule, regenerates
/// `src/ranges.rs` from it, and writes the feature-resolved allow-set table
/// to `$OUT_DIR/sanitize_table.rs`.
///
/// # Errors
///
/// Fails if the JSON cannot be read or parsed, if `OUT_DIR` is not set, or if
/// either output file cannot be written.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Input json file
    let json_path = PathBuf::from(CRATE_ROOT).join(JSON_PATH);
    // Output `ranges.rs` file
    let ranges_path = PathBuf::from(CRATE_ROOT).join(RANGES_RS);
    // Output `Cargo.toml` file (breaks crates.io)
    // let cargo_toml_path = PathBuf::from(CRATE_ROOT).join("Cargo.toml");

    // Name the path in the error: a bare "No such file or directory" gives no
    // hint that the git submodule has not been checked out.
    let json = std::fs::read_to_string(&json_path)
        .map_err(|e| format!("failed to read {}: {e}", json_path.display()))?;
    let ranges: Vec<NamedRange> = serde_json::from_str(&json)?;

    let (ranges_rs, _cargo_toml) = gen_ranges(&ranges)?;
    // `ranges.rs` lives in the source tree, so only touch it when the content
    // actually changed. Rewriting an identical file bumps its mtime on every
    // run and fails outright when the source tree is read-only.
    let up_to_date = std::fs::read_to_string(&ranges_path)
        .map(|existing| existing == ranges_rs)
        .unwrap_or(false);
    if !up_to_date {
        std::fs::write(&ranges_path, &ranges_rs)?;
    }
    // std::fs::write(cargo_toml_path, cargo_toml)?;

    // The allow-set table is feature-resolved, so it goes in OUT_DIR.
    // `var_os` keeps this working when OUT_DIR is not valid UTF-8.
    let out_dir = std::env::var_os("OUT_DIR").ok_or("OUT_DIR is not set")?;
    let table_path = PathBuf::from(out_dir).join("sanitize_table.rs");
    std::fs::write(table_path, gen_table(&ranges))?;

    Ok(())
}