aft-tokenizer 0.35.1

Claude lookup-encoding tokenizer for Agent File Tools
Documentation
//! Optional regenerator for `src/claude_data.rs`.
//!
//! The generated file is checked into the repo so the crate builds from a clean
//! clone without any extra setup. This build script is only used by developers
//! who want to refresh the data from a newer `ai-tokenizer` release:
//!
//!   mkdir -p tmp/ai-tokenizer-pkg && cd tmp/ai-tokenizer-pkg
//!   npm pack ai-tokenizer && tar xzf ai-tokenizer-*.tgz
//!
//! When that directory (or `AFT_TOKENIZER_CLAUDE_JS`) is absent, this script
//! does nothing — Cargo proceeds against the vendored file.

use regex::Regex;
use std::env;
use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};

/// Build-script entry point: optionally regenerates `src/claude_data.rs`.
///
/// The regeneration source is the path in `AFT_TOKENIZER_CLAUDE_JS` when that
/// variable is set, otherwise whatever `locate_default_source` finds under the
/// workspace root. If no source is available (or it cannot be read) the script
/// returns without touching the vendored file.
///
/// # Panics
///
/// Panics if `CARGO_MANIFEST_DIR` is unset, if the crate is not nested two
/// directories below the workspace root, if the parsed entry counts differ
/// from the expected table sizes, or if the output file cannot be written.
fn main() {
    let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR"));
    let workspace_root = manifest_dir
        .parent()
        .and_then(Path::parent)
        .expect("crate should live under crates/aft-tokenizer");

    println!("cargo:rerun-if-env-changed=AFT_TOKENIZER_CLAUDE_JS");
    println!(
        "cargo:rerun-if-changed={}",
        manifest_dir.join("src/claude_data.rs").display()
    );

    // Resolve the optional regen source. Missing source = no-op build script.
    let source = match env::var_os("AFT_TOKENIZER_CLAUDE_JS").map(PathBuf::from) {
        Some(explicit) => {
            if !explicit.exists() {
                println!(
                    "cargo:warning=AFT_TOKENIZER_CLAUDE_JS={} does not exist; using vendored claude_data.rs",
                    explicit.display()
                );
                return;
            }
            explicit
        }
        None => match locate_default_source(workspace_root) {
            Some(path) => path,
            None => return,
        },
    };

    println!("cargo:rerun-if-changed={}", source.display());

    let js = match fs::read_to_string(&source) {
        Ok(js) => js,
        Err(err) => {
            println!(
                "cargo:warning=failed to read {}: {err}; using vendored claude_data.rs",
                source.display()
            );
            return;
        }
    };

    let string_entries = parse_string_encoder(&js);
    let binary_entries = parse_binary_encoder(&js);

    // Validate before opening the destination so a bad parse never truncates
    // the checked-in file.
    assert_eq!(
        string_entries.len(),
        64_241,
        "unexpected Claude stringEncoder entry count"
    );
    assert_eq!(
        binary_entries.len(),
        754,
        "unexpected Claude binaryEncoder entry count"
    );

    let dest = manifest_dir.join("src/claude_data.rs");
    // Tens of thousands of small writes follow; buffer them instead of issuing
    // one syscall per line.
    let mut out =
        std::io::BufWriter::new(fs::File::create(&dest).expect("create vendored Claude data"));
    const WRITE_FAILED: &str = "write vendored Claude data";

    writeln!(
        out,
        "// Auto-generated by build.rs from ai-tokenizer Claude encoding.",
    )
    .expect(WRITE_FAILED);
    writeln!(
        out,
        "// Do not edit by hand; run `cargo build` after refreshing tmp/ai-tokenizer-pkg/."
    )
    .expect(WRITE_FAILED);
    writeln!(
        out,
        "pub static STRING_ENCODER_ENTRIES: &[(&str, u32)] = &["
    )
    .expect(WRITE_FAILED);
    for (token, rank) in &string_entries {
        // `{:?}` on a `String` yields a valid, escaped Rust string literal.
        writeln!(out, "    ({token:?}, {rank}),").expect(WRITE_FAILED);
    }
    writeln!(out, "];\n").expect(WRITE_FAILED);

    writeln!(
        out,
        "pub static BINARY_ENCODER_ENTRIES: &[(&[u8], u32)] = &["
    )
    .expect(WRITE_FAILED);
    for (bytes, rank) in &binary_entries {
        let bytes = bytes
            .iter()
            .map(u8::to_string)
            .collect::<Vec<_>>()
            .join(", ");
        writeln!(out, "    (&[{bytes}], {rank}),").expect(WRITE_FAILED);
    }
    writeln!(out, "];").expect(WRITE_FAILED);

    // Dropping a `BufWriter` discards flush errors; flush explicitly so a
    // short write fails the build instead of leaving a truncated file.
    out.flush().expect("flush vendored Claude data");

    println!(
        "cargo:warning=regenerated {} from {}",
        dest.display(),
        source.display()
    );
}

/// Looks for a `claude-*.js` encoding file in the default unpack location,
/// `<workspace_root>/tmp/ai-tokenizer-pkg/package/dist/encoding`.
///
/// Returns `None` when the directory cannot be read or contains no matching
/// entry. Entries whose names are not valid UTF-8 are ignored. When several
/// entries match, the lexicographically smallest path is returned so the
/// result does not depend on the platform's directory iteration order.
fn locate_default_source(workspace_root: &Path) -> Option<PathBuf> {
    let encoding_dir = workspace_root.join("tmp/ai-tokenizer-pkg/package/dist/encoding");

    fs::read_dir(&encoding_dir)
        .ok()?
        // Skip entries that failed to be read rather than aborting the scan.
        .flatten()
        .map(|entry| entry.path())
        .filter(|path| {
            matches!(
                path.file_name().and_then(|name| name.to_str()),
                Some(name) if name.starts_with("claude-") && name.ends_with(".js")
            )
        })
        .min()
}

/// Extracts the `(token, rank)` pairs from the `stringEncoder` object literal.
///
/// The object body is taken to be everything between the opening brace of
/// `const stringEncoder = {` and the `\n};\nconst binaryEncoder` marker, with
/// one `key: value` entry per line. Keys may be double-quoted JS string
/// literals or bare identifiers/numbers; entries are returned in file order.
///
/// # Panics
///
/// Panics if either marker is missing or a line is not a `key: value` pair
/// with a numeric value.
fn parse_string_encoder(js: &str) -> Vec<(String, u32)> {
    let decl_offset = js
        .find("const stringEncoder = {")
        .expect("stringEncoder start");
    let tail = &js[decl_offset..];
    let body_from = tail.find('{').expect("stringEncoder open brace") + 1;
    let body_to = tail
        .find("\n};\nconst binaryEncoder")
        .expect("stringEncoder closing brace");

    tail[body_from..body_to]
        .lines()
        .map(str::trim)
        .filter(|raw| !raw.is_empty())
        .map(|raw| {
            // Drop the trailing comma separating object members, if any.
            let entry = raw.strip_suffix(',').unwrap_or(raw).trim();

            let (token, value) = if entry.starts_with('"') {
                // Quoted keys may themselves contain ':' so they need a real
                // string-literal parse rather than a split.
                let (token, consumed) = parse_js_string_literal(entry);
                let value = entry[consumed..]
                    .trim_start()
                    .strip_prefix(':')
                    .expect("quoted stringEncoder key should be followed by colon");
                (token, value)
            } else {
                let (token, value) = entry
                    .split_once(':')
                    .expect("unquoted stringEncoder key/value");
                (token.trim().to_owned(), value)
            };

            (token, parse_js_u32(value.trim()))
        })
        .collect()
}

/// Extracts the `(bytes, rank)` pairs from the `binaryEncoder` array literal.
///
/// Only the text between `const binaryEncoder = [` and the
/// `\n];\nconst decoder` marker is scanned. Each
/// `[new Uint8Array([..]), rank]` element found there yields one entry, in
/// file order.
///
/// # Panics
///
/// Panics if either marker is missing, if a byte value does not fit in `u8`,
/// or if a rank is not a valid `u32`.
fn parse_binary_encoder(js: &str) -> Vec<(Vec<u8>, u32)> {
    let decl_offset = js
        .find("const binaryEncoder = [")
        .expect("binaryEncoder start");
    let tail = &js[decl_offset..];
    let body_len = tail
        .find("\n];\nconst decoder")
        .expect("binaryEncoder end");
    let body = &tail[..body_len];

    // Group 1: comma-separated byte list; group 2: rank, optionally written
    // with an exponent.
    let pattern = Regex::new(
        r"(?s)\[\s*new Uint8Array\(\s*\[([0-9,\s]+)\]\s*\)\s*,\s*([0-9]+(?:e[0-9]+)?)\s*\]",
    )
    .expect("binaryEncoder regex");

    let mut entries = Vec::new();
    for caps in pattern.captures_iter(body) {
        let mut bytes = Vec::new();
        for piece in caps[1].split(',') {
            let piece = piece.trim();
            // Tolerate trailing commas / stray whitespace inside the list.
            if piece.is_empty() {
                continue;
            }
            bytes.push(piece.parse::<u8>().expect("binaryEncoder byte"));
        }
        let rank = parse_js_u32(&caps[2]);
        entries.push((bytes, rank));
    }
    entries
}

/// Parses a JavaScript numeric literal that must denote a `u32`.
///
/// Plain decimal integers are parsed directly. Anything else (for example
/// exponent notation such as `1e3`) is parsed as `f64` and accepted only when
/// it is a finite, non-negative whole number no larger than `u32::MAX`.
///
/// # Panics
///
/// Panics if `input` is not a number or does not denote a `u32`.
fn parse_js_u32(input: &str) -> u32 {
    match input.parse::<u32>() {
        Ok(exact) => exact,
        Err(_) => {
            let number: f64 = match input.parse() {
                Ok(number) => number,
                Err(err) => panic!("invalid JS numeric literal {input:?}: {err}"),
            };

            let in_range = (0.0..=f64::from(u32::MAX)).contains(&number);
            if !(number.is_finite() && number.fract() == 0.0 && in_range) {
                panic!("JS numeric literal is not a u32: {input:?}");
            }

            // Checked above: whole, finite and within u32 range, so the cast
            // is exact.
            number as u32
        }
    }
}

/// Decodes the double-quoted JavaScript string literal at the start of `input`.
///
/// Returns the decoded text together with the number of bytes of `input` the
/// literal occupies, including both quotes, so the caller can continue parsing
/// right after it.
///
/// Supported escapes: `\"`, `\'`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`,
/// `\v`, `\0`, `\xHH`, `\uHHHH` and `\u{H…}`. A high-surrogate `\u` escape
/// immediately followed by a low-surrogate `\u` escape is combined into the
/// single code point the UTF-16 pair encodes.
///
/// # Panics
///
/// Panics if `input` does not start with `"`, if the literal or an escape is
/// unterminated, on an unsupported escape, on a non-hex digit inside a hex or
/// unicode escape, or on a surrogate that is not part of a valid pair (a Rust
/// `String` cannot hold lone surrogates).
fn parse_js_string_literal(input: &str) -> (String, usize) {
    let mut chars = input.char_indices();
    assert_eq!(chars.next().map(|(_, ch)| ch), Some('"'));

    let mut output = String::new();
    while let Some((idx, ch)) = chars.next() {
        match ch {
            // Closing quote: report the byte offset just past it.
            '"' => return (output, idx + ch.len_utf8()),
            '\\' => {
                let (_, escaped) = chars.next().expect("unterminated JS string escape");
                match escaped {
                    '"' => output.push('"'),
                    '\'' => output.push('\''),
                    '\\' => output.push('\\'),
                    '/' => output.push('/'),
                    'b' => output.push('\u{0008}'),
                    'f' => output.push('\u{000c}'),
                    'n' => output.push('\n'),
                    'r' => output.push('\r'),
                    't' => output.push('\t'),
                    'v' => output.push('\u{000b}'),
                    '0' => output.push('\0'),
                    'x' => {
                        let high = chars.next().expect("missing first hex escape digit").1;
                        let low = chars.next().expect("missing second hex escape digit").1;
                        let value = hex_value(high) * 16 + hex_value(low);
                        output.push(char::from_u32(value).expect("valid hex escape"));
                    }
                    'u' => output.push(parse_unicode_escape(&mut chars)),
                    other => panic!("unsupported JS string escape: \\{other}"),
                }
            }
            other => output.push(other),
        }
    }

    panic!("unterminated JS string literal")
}

/// Decodes the remainder of a `\u` escape (the `\u` itself already consumed),
/// merging a UTF-16 surrogate pair spelled as two consecutive escapes.
fn parse_unicode_escape(chars: &mut std::str::CharIndices<'_>) -> char {
    let first = read_unicode_escape_value(chars);

    let code_point = if (0xD800..=0xDBFF).contains(&first) {
        // JS strings are UTF-16, so astral characters are commonly emitted as
        // `\uD83D\uDE00`; the second escape must follow immediately.
        let backslash = chars.next().map(|(_, ch)| ch);
        let marker = chars.next().map(|(_, ch)| ch);
        assert!(
            backslash == Some('\\') && marker == Some('u'),
            "unpaired high surrogate in JS unicode escape"
        );
        let second = read_unicode_escape_value(chars);
        assert!(
            (0xDC00..=0xDFFF).contains(&second),
            "unpaired high surrogate in JS unicode escape"
        );
        0x10000 + ((first - 0xD800) << 10) + (second - 0xDC00)
    } else {
        first
    };

    // Rejects lone low surrogates and out-of-range code points.
    char::from_u32(code_point).expect("valid unicode escape")
}

/// Reads the numeric payload of one `\u` escape: either exactly four hex
/// digits or a braced `{H…}` sequence.
fn read_unicode_escape_value(chars: &mut std::str::CharIndices<'_>) -> u32 {
    let first = chars.next().expect("missing unicode escape digit").1;

    if first == '{' {
        let mut value: u32 = 0;
        let mut digits = 0usize;
        loop {
            let ch = chars
                .next()
                .expect("unterminated unicode code point escape")
                .1;
            if ch == '}' {
                break;
            }
            value = value
                .checked_mul(16)
                .and_then(|scaled| scaled.checked_add(hex_value(ch)))
                .expect("unicode code point escape out of range");
            digits += 1;
        }
        assert!(digits > 0, "empty unicode code point escape");
        value
    } else {
        let mut value = hex_value(first);
        for _ in 0..3 {
            let digit = chars.next().expect("missing unicode escape digit").1;
            value = value * 16 + hex_value(digit);
        }
        value
    }
}

/// Returns the numeric value of a single hexadecimal digit.
///
/// # Panics
///
/// Panics if `ch` is not one of `0-9`, `a-f` or `A-F`.
fn hex_value(ch: char) -> u32 {
    ch.to_digit(16)
        .unwrap_or_else(|| panic!("invalid hex escape digit: {ch}"))
}