use regex::Regex;
use std::env;
use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};
/// Build-script entry point: regenerates `src/claude_data.rs` from the
/// ai-tokenizer Claude encoding bundle when that bundle is available.
///
/// The JavaScript source is taken from `AFT_TOKENIZER_CLAUDE_JS` when that
/// variable is set, otherwise from the first match found by
/// `locate_default_source`. When no source can be found or read, the script
/// returns without touching the vendored `src/claude_data.rs`.
///
/// # Panics
///
/// Panics when `CARGO_MANIFEST_DIR` is unset, when the crate is not nested two
/// directories below the workspace root, when the parsed tables do not have the
/// expected entry counts, or when the regenerated file cannot be written.
fn main() {
    let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR"));
    let workspace_root = manifest_dir
        .parent()
        .and_then(Path::parent)
        .expect("crate should live under crates/aft-tokenizer");
    let dest = manifest_dir.join("src/claude_data.rs");

    println!("cargo:rerun-if-env-changed=AFT_TOKENIZER_CLAUDE_JS");
    println!("cargo:rerun-if-changed={}", dest.display());

    let source = match env::var_os("AFT_TOKENIZER_CLAUDE_JS").map(PathBuf::from) {
        Some(explicit) => {
            if !explicit.exists() {
                println!(
                    "cargo:warning=AFT_TOKENIZER_CLAUDE_JS={} does not exist; using vendored claude_data.rs",
                    explicit.display()
                );
                return;
            }
            explicit
        }
        None => match locate_default_source(workspace_root) {
            Some(path) => path,
            None => return,
        },
    };
    println!("cargo:rerun-if-changed={}", source.display());

    let js = match fs::read_to_string(&source) {
        Ok(js) => js,
        Err(err) => {
            println!(
                "cargo:warning=failed to read {}: {err}; using vendored claude_data.rs",
                source.display()
            );
            return;
        }
    };

    let string_entries = parse_string_encoder(&js);
    let binary_entries = parse_binary_encoder(&js);
    // Guard against a silently truncated or reshaped upstream bundle.
    assert_eq!(
        string_entries.len(),
        64_241,
        "unexpected Claude stringEncoder entry count"
    );
    assert_eq!(
        binary_entries.len(),
        754,
        "unexpected Claude binaryEncoder entry count"
    );

    let out = render_claude_data(&string_entries, &binary_entries);

    // `dest` is itself registered with `rerun-if-changed` above. Rewriting it
    // with identical bytes would still bump its modification time, so leave it
    // alone when nothing changed to avoid marking the crate dirty on every build.
    if matches!(fs::read(&dest), Ok(existing) if existing == out) {
        return;
    }

    // Write to a process-unique sibling first, then rename over the target so
    // readers never observe a partially written file.
    let tmp = dest.with_extension(format!("rs.tmp.{}", std::process::id()));
    fs::write(&tmp, &out).expect("write temp Claude data");
    if let Err(err) = fs::rename(&tmp, &dest) {
        // Best effort: do not leave the temp file lying next to the sources.
        let _ = fs::remove_file(&tmp);
        panic!("atomically publish Claude data: {err}");
    }
    println!(
        "cargo:warning=regenerated {} from {}",
        dest.display(),
        source.display()
    );
}

/// Renders the Rust source text of `claude_data.rs` for the given tables.
fn render_claude_data(
    string_entries: &[(String, u32)],
    binary_entries: &[(Vec<u8>, u32)],
) -> Vec<u8> {
    const INFALLIBLE: &str = "writing to an in-memory buffer cannot fail";
    let mut out: Vec<u8> = Vec::new();
    writeln!(
        out,
        "// Auto-generated by build.rs from ai-tokenizer Claude encoding.",
    )
    .expect(INFALLIBLE);
    writeln!(
        out,
        "// Do not edit by hand; run `cargo build` after refreshing tmp/ai-tokenizer-pkg/."
    )
    .expect(INFALLIBLE);
    writeln!(
        out,
        "pub static STRING_ENCODER_ENTRIES: &[(&str, u32)] = &["
    )
    .expect(INFALLIBLE);
    for (token, rank) in string_entries {
        // `{:?}` emits a quoted, escaped Rust string literal for the token.
        writeln!(out, " ({token:?}, {rank}),").expect(INFALLIBLE);
    }
    writeln!(out, "];\n").expect(INFALLIBLE);
    writeln!(
        out,
        "pub static BINARY_ENCODER_ENTRIES: &[(&[u8], u32)] = &["
    )
    .expect(INFALLIBLE);
    for (bytes, rank) in binary_entries {
        let bytes = bytes
            .iter()
            .map(u8::to_string)
            .collect::<Vec<_>>()
            .join(", ");
        writeln!(out, " (&[{bytes}], {rank}),").expect(INFALLIBLE);
    }
    writeln!(out, "];").expect(INFALLIBLE);
    out
}
/// Finds the default Claude encoding bundle below `workspace_root`.
///
/// Looks in `tmp/ai-tokenizer-pkg/package/dist/encoding` for entries whose
/// file name starts with `claude-` and ends with `.js`. Returns `None` when the
/// directory cannot be read or contains no such entry.
///
/// Directory iteration order is not specified, so when several candidates
/// exist the smallest path (by `PathBuf` ordering) is returned to keep the
/// choice stable from one build to the next.
fn locate_default_source(workspace_root: &Path) -> Option<PathBuf> {
    let encoding_dir = workspace_root.join("tmp/ai-tokenizer-pkg/package/dist/encoding");
    fs::read_dir(&encoding_dir)
        .ok()?
        // Entries that fail to read are skipped rather than aborting the search.
        .flatten()
        .map(|entry| entry.path())
        .filter(|path| {
            matches!(
                path.file_name().and_then(|name| name.to_str()),
                Some(name) if name.starts_with("claude-") && name.ends_with(".js")
            )
        })
        .min()
}
/// Extracts the `stringEncoder` object literal from the bundle text.
///
/// The object body is taken to be everything between the opening brace of
/// `const stringEncoder = {` and the `\n};\nconst binaryEncoder` terminator,
/// with one `key: rank` entry per non-blank line. Keys may be bare or
/// double-quoted; quoted keys are decoded with `parse_js_string_literal`.
/// Entries are returned in source order.
///
/// # Panics
///
/// Panics when the expected markers are missing or a line is not a valid entry.
fn parse_string_encoder(js: &str) -> Vec<(String, u32)> {
    let decl_offset = js
        .find("const stringEncoder = {")
        .expect("stringEncoder start");
    let tail = &js[decl_offset..];
    let body_start = tail.find('{').expect("stringEncoder open brace") + 1;
    let body_end = tail
        .find("\n};\nconst binaryEncoder")
        .expect("stringEncoder closing brace");

    tail[body_start..body_end]
        .lines()
        .map(str::trim)
        .filter(|raw| !raw.is_empty())
        .map(|raw| {
            // Drop the entry separator, if any, before splitting key from value.
            let entry = raw.strip_suffix(',').unwrap_or(raw).trim();
            let (token, value) = if entry.starts_with('"') {
                let (token, consumed) = parse_js_string_literal(entry);
                let value = entry[consumed..]
                    .trim_start()
                    .strip_prefix(':')
                    .expect("quoted stringEncoder key should be followed by colon");
                (token, value)
            } else {
                let (name, value) = entry
                    .split_once(':')
                    .expect("unquoted stringEncoder key/value");
                (name.trim().to_owned(), value)
            };
            (token, parse_js_u32(value.trim()))
        })
        .collect()
}
/// Extracts the `binaryEncoder` array literal from the bundle text.
///
/// Scans the text from `const binaryEncoder = [` up to the
/// `\n];\nconst decoder` terminator for `[new Uint8Array([..]), rank]` entries
/// and returns each byte sequence with its rank, in source order.
///
/// # Panics
///
/// Panics when the expected markers are missing, when a byte does not fit in
/// `u8`, or when a rank is rejected by `parse_js_u32`.
fn parse_binary_encoder(js: &str) -> Vec<(Vec<u8>, u32)> {
    let decl_offset = js
        .find("const binaryEncoder = [")
        .expect("binaryEncoder start");
    let tail = &js[decl_offset..];
    let terminator = tail
        .find("\n];\nconst decoder")
        .expect("binaryEncoder end");
    let pattern = Regex::new(
        r"(?s)\[\s*new Uint8Array\(\s*\[([0-9,\s]+)\]\s*\)\s*,\s*([0-9]+(?:e[0-9]+)?)\s*\]",
    )
    .expect("binaryEncoder regex");

    let mut entries = Vec::new();
    for caps in pattern.captures_iter(&tail[..terminator]) {
        // Capture 1 is the comma-separated byte list; blank pieces (for
        // example after a trailing comma) are ignored.
        let mut bytes = Vec::new();
        for piece in caps[1].split(',') {
            let piece = piece.trim();
            if piece.is_empty() {
                continue;
            }
            bytes.push(piece.parse::<u8>().expect("binaryEncoder byte"));
        }
        // Capture 2 is the rank, possibly written with an exponent.
        let rank = parse_js_u32(&caps[2]);
        entries.push((bytes, rank));
    }
    entries
}
/// Parses a JavaScript numeric literal that must denote a `u32`.
///
/// Plain decimal integers are parsed directly. Anything else is parsed as an
/// `f64` (which covers exponent forms such as `1e3`) and accepted only when it
/// is a finite, non-negative whole number no larger than `u32::MAX`.
///
/// # Panics
///
/// Panics when `input` is not a number or does not denote a `u32`.
fn parse_js_u32(input: &str) -> u32 {
    match input.parse::<u32>() {
        Ok(exact) => exact,
        Err(_) => {
            let number: f64 = match input.parse() {
                Ok(number) => number,
                Err(err) => panic!("invalid JS numeric literal {input:?}: {err}"),
            };
            let is_whole = number.is_finite() && number.fract() == 0.0;
            let in_range = (0.0..=u32::MAX as f64).contains(&number);
            assert!(
                is_whole && in_range,
                "JS numeric literal is not a u32: {input:?}"
            );
            // Lossless: the value was just checked to be a whole number in range.
            number as u32
        }
    }
}
/// Decodes the double-quoted JavaScript string literal at the start of `input`.
///
/// Returns the decoded text together with the number of bytes of `input` that
/// the literal occupies, including both quotes, so the caller can continue
/// parsing right after it.
///
/// Supported escapes: `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`, `\v`,
/// `\0`, `\xHH`, `\uHHHH` (including a high/low surrogate pair, which is
/// combined into one code point) and `\u{H...}`.
///
/// # Panics
///
/// Panics when `input` does not start with `"`, when the literal or an escape
/// is unterminated, on an unsupported escape, on a non-hex digit, or when an
/// escape does not denote a Unicode scalar value (for example a lone
/// surrogate, which a Rust `String` cannot hold).
fn parse_js_string_literal(input: &str) -> (String, usize) {
    let mut chars = input.char_indices();
    assert_eq!(chars.next().map(|(_, ch)| ch), Some('"'));
    let mut output = String::new();
    // `while let` rather than `for`: escape handling advances `chars` itself.
    while let Some((idx, ch)) = chars.next() {
        match ch {
            '"' => return (output, idx + ch.len_utf8()),
            '\\' => {
                let (_, escaped) = chars.next().expect("unterminated JS string escape");
                match escaped {
                    '"' => output.push('"'),
                    '\\' => output.push('\\'),
                    '/' => output.push('/'),
                    'b' => output.push('\u{0008}'),
                    'f' => output.push('\u{000c}'),
                    'n' => output.push('\n'),
                    'r' => output.push('\r'),
                    't' => output.push('\t'),
                    'v' => output.push('\u{000b}'),
                    '0' => output.push('\0'),
                    'x' => {
                        let high = chars.next().expect("missing first hex escape digit").1;
                        let low = chars.next().expect("missing second hex escape digit").1;
                        let value = hex_value(high) * 16 + hex_value(low);
                        output.push(char::from_u32(value).expect("valid hex escape"));
                    }
                    'u' => {
                        let code_point = parse_unicode_escape(&mut chars);
                        output.push(char::from_u32(code_point).expect("valid unicode escape"));
                    }
                    other => panic!("unsupported JS string escape: \\{other}"),
                }
            }
            other => output.push(other),
        }
    }
    panic!("unterminated JS string literal")
}

/// Reads the remainder of a `\u` escape (the `\u` itself already consumed) and
/// returns the code point it denotes, joining surrogate pairs.
fn parse_unicode_escape(chars: &mut std::str::CharIndices<'_>) -> u32 {
    // Braced form: `\u{H...}` with one or more hex digits.
    let mut lookahead = chars.clone();
    if matches!(lookahead.next(), Some((_, '{'))) {
        *chars = lookahead;
        let mut value: u32 = 0;
        let mut digits = 0usize;
        loop {
            let ch = chars.next().expect("missing unicode escape digit").1;
            if ch == '}' {
                break;
            }
            value = value * 16 + hex_value(ch);
            // Checking after every digit also keeps `value` far from overflow.
            assert!(value <= 0x10_FFFF, "unicode escape out of range");
            digits += 1;
        }
        assert!(digits > 0, "empty unicode escape");
        return value;
    }

    let first = read_hex4(chars);
    if !(0xD800..=0xDBFF).contains(&first) {
        return first;
    }

    // A high surrogate is only meaningful when a `\uHHHH` low surrogate follows.
    let mut lookahead = chars.clone();
    let backslash = lookahead.next().map(|(_, ch)| ch);
    let marker = lookahead.next().map(|(_, ch)| ch);
    assert!(
        backslash == Some('\\') && marker == Some('u'),
        "unpaired high surrogate in unicode escape"
    );
    *chars = lookahead;
    let second = read_hex4(chars);
    assert!(
        (0xDC00..=0xDFFF).contains(&second),
        "high surrogate not followed by low surrogate in unicode escape"
    );
    0x1_0000 + ((first - 0xD800) << 10) + (second - 0xDC00)
}

/// Reads exactly four hex digits and returns their value.
fn read_hex4(chars: &mut std::str::CharIndices<'_>) -> u32 {
    let mut value = 0;
    for _ in 0..4 {
        let digit = chars.next().expect("missing unicode escape digit").1;
        value = value * 16 + hex_value(digit);
    }
    value
}

/// Returns the numeric value of a hex digit, panicking on any other character.
fn hex_value(ch: char) -> u32 {
    ch.to_digit(16)
        .unwrap_or_else(|| panic!("invalid hex escape digit: {ch}"))
}