use crate::error::{CliError, Result};
use aprender::format::v2::{stamp_provenance_bytes, AprV2Reader, ProvenancePatch};
use std::fs;
use std::path::Path;
/// Entry point for `apr stamp`: patches provenance, identity and tokenizer
/// metadata into an APR v2 file and writes the result to `output`.
///
/// Every `Option` argument is one field of the patch; `None` means "not
/// requested". `tokenizer_dir`, when given, is resolved through
/// `load_tokenizer_files` into vocabulary / merges / model-type values.
///
/// The stamped bytes are parsed back with [`AprV2Reader::from_bytes`] *before*
/// anything is written, so a stamp that produces an unreadable image never
/// reaches disk and never replaces an existing `output` under `--force`.
///
/// On success a summary is printed to stdout: pretty-printed JSON when
/// `json_output` is set, a short human-readable report otherwise (which is
/// also preceded by a `Reading …` line on stderr).
///
/// # Errors
///
/// * `CliError::ValidationFailed` if no patch field was supplied at all.
/// * `CliError::FileNotFound` if `file` does not exist.
/// * `CliError::ValidationFailed` if `output` exists and `force` is `false`.
/// * Whatever `load_tokenizer_files` returns for an unusable `tokenizer_dir`.
/// * `CliError::ValidationFailed` if reading, stamping, verifying or writing
///   fails.
// Each argument is a separate, independently optional stamp input, so the
// parameter list is intentionally kept flat.
#[allow(clippy::too_many_arguments)]
pub(crate) fn run(
    file: &Path,
    license: Option<&str>,
    data_source: Option<&str>,
    data_license: Option<&str>,
    hf_architecture: Option<&str>,
    hf_model_type: Option<&str>,
    architecture: Option<&str>,
    tokenizer_dir: Option<&Path>,
    output: &Path,
    force: bool,
    json_output: bool,
) -> Result<()> {
    // Refuse to rewrite the file when there is nothing to change.
    if license.is_none()
        && data_source.is_none()
        && data_license.is_none()
        && hf_architecture.is_none()
        && hf_model_type.is_none()
        && architecture.is_none()
        && tokenizer_dir.is_none()
    {
        return Err(CliError::ValidationFailed(
            "apr stamp: at least one of --license, --data-source, --data-license, \
             --hf-architecture, --hf-model-type, --architecture, --tokenizer must \
             be specified — refusing to rewrite without changes"
                .to_string(),
        ));
    }
    if !file.exists() {
        return Err(CliError::FileNotFound(file.to_path_buf()));
    }
    if output.exists() && !force {
        return Err(CliError::ValidationFailed(format!(
            "Output file '{}' already exists. Use --force to overwrite.",
            output.display()
        )));
    }

    // Resolve the tokenizer before touching the model so that a bad
    // --tokenizer directory fails without reading the (possibly large) input.
    let (tok_vocab, tok_merges, tok_model_type) = match tokenizer_dir {
        Some(dir) => load_tokenizer_files(dir)?,
        None => (None, None, None),
    };

    if !json_output {
        eprintln!("Reading {}", file.display());
    }
    let input =
        fs::read(file).map_err(|e| CliError::ValidationFailed(format!("read failed: {e}")))?;

    let patch = ProvenancePatch {
        license: license.map(str::to_string),
        data_source: data_source.map(str::to_string),
        data_license: data_license.map(str::to_string),
        hf_architecture: hf_architecture.map(str::to_string),
        hf_model_type: hf_model_type.map(str::to_string),
        architecture: architecture.map(str::to_string),
        tokenizer_vocab: tok_vocab,
        tokenizer_merges: tok_merges,
        tokenizer_model_type: tok_model_type,
    };
    let stamped = stamp_provenance_bytes(&input, &patch)
        .map_err(|e| CliError::ValidationFailed(format!("stamp failed: {e:?}")))?;

    // Verify first, write second: the check operates on the in-memory bytes,
    // so running it before `fs::write` guarantees a failed verification cannot
    // leave a broken file behind or destroy a previous output.
    let verify_reader = AprV2Reader::from_bytes(&stamped)
        .map_err(|e| CliError::ValidationFailed(format!("post-stamp verify failed: {e:?}")))?;
    fs::write(output, &stamped)
        .map_err(|e| CliError::ValidationFailed(format!("write failed: {e}")))?;

    // Everything reported below is read back from the verified image, not
    // echoed from the request.
    let metadata = verify_reader.metadata();
    let tensor_count = verify_reader.tensor_names().len();

    if json_output {
        let summary = serde_json::json!({
            "command": "stamp",
            "input": file.display().to_string(),
            "output": output.display().to_string(),
            "input_bytes": input.len(),
            "output_bytes": stamped.len(),
            "tensor_count": tensor_count,
            "stamped": {
                "license": metadata.license,
                "data_source": metadata.data_source,
                "data_license": metadata.data_license,
                "hf_architecture": metadata.hf_architecture,
                "hf_model_type": metadata.hf_model_type,
                "architecture": metadata.architecture,
            },
            "header_flags_bits": verify_reader.header().flags.bits(),
        });
        println!(
            "{}",
            serde_json::to_string_pretty(&summary).unwrap_or_default()
        );
    } else {
        println!(
            "✓ Stamped {} → {} ({} tensors, {} → {} bytes)",
            file.display(),
            output.display(),
            tensor_count,
            input.len(),
            stamped.len(),
        );
        println!(" license: {:?}", metadata.license);
        println!(" data_source: {:?}", metadata.data_source);
        println!(" data_license: {:?}", metadata.data_license);
        println!(" hf_architecture: {:?}", metadata.hf_architecture);
        println!(" hf_model_type: {:?}", metadata.hf_model_type);
        println!(" architecture: {:?}", metadata.architecture);
    }
    Ok(())
}
/// Loads tokenizer data from `dir` for embedding into a stamped model.
///
/// Resolution order:
/// 1. `tokenizer.json` — if present, it is handed to `load_unified_tokenizer`
///    and `vocab.json` / `merges.txt` are not consulted.
/// 2. `vocab.json` (required) plus `merges.txt` (optional).
///
/// Returns `(vocab, merges, model_type)`:
/// * `vocab` — token strings ordered by ascending id. Entries whose value is
///   not an unsigned integer are skipped.
/// * `merges` — the merge rules in file order, or `None` when `merges.txt` is
///   absent or contains no rules.
/// * `model_type` — `"BPE"` when merges were found, otherwise `None`.
///
/// # Errors
///
/// `CliError::ValidationFailed` when `dir` is not a directory, when neither
/// `tokenizer.json` nor `vocab.json` exists, when a file cannot be read, when
/// `vocab.json` is not a JSON object, or when no usable vocabulary entry
/// remains.
fn load_tokenizer_files(
    dir: &Path,
) -> Result<(Option<Vec<String>>, Option<Vec<String>>, Option<String>)> {
    if !dir.is_dir() {
        return Err(CliError::ValidationFailed(format!(
            "apr stamp --tokenizer: {} is not a directory",
            dir.display()
        )));
    }
    let unified = dir.join("tokenizer.json");
    if unified.is_file() {
        return load_unified_tokenizer(&unified);
    }
    let vocab_path = dir.join("vocab.json");
    let merges_path = dir.join("merges.txt");
    if !vocab_path.is_file() {
        return Err(CliError::ValidationFailed(format!(
            "apr stamp --tokenizer: neither tokenizer.json nor vocab.json found in {}",
            dir.display()
        )));
    }
    let vocab_str = fs::read_to_string(&vocab_path).map_err(|e| {
        CliError::ValidationFailed(format!(
            "apr stamp --tokenizer: read vocab.json failed: {e}"
        ))
    })?;
    let vocab_map: serde_json::Map<String, serde_json::Value> = serde_json::from_str(&vocab_str)
        .map_err(|e| {
            CliError::ValidationFailed(format!(
                "apr stamp --tokenizer: vocab.json is not a valid JSON object: {e}"
            ))
        })?;
    // vocab.json maps token -> id; flip it into an id-ordered token list.
    // NOTE(review): the resulting position equals the token id only if the ids
    // are dense and start at 0 — confirm that is what the consumer expects.
    let mut pairs: Vec<(u64, String)> = vocab_map
        .iter()
        .filter_map(|(tok, id)| id.as_u64().map(|n| (n, tok.clone())))
        .collect();
    pairs.sort_by_key(|(id, _)| *id);
    let vocab: Vec<String> = pairs.into_iter().map(|(_, tok)| tok).collect();
    if vocab.is_empty() {
        return Err(CliError::ValidationFailed(format!(
            "apr stamp --tokenizer: vocab.json in {} has no entries",
            dir.display()
        )));
    }
    let merges: Option<Vec<String>> = if merges_path.is_file() {
        let merges_str = fs::read_to_string(&merges_path).map_err(|e| {
            CliError::ValidationFailed(format!(
                "apr stamp --tokenizer: read merges.txt failed: {e}"
            ))
        })?;
        let m: Vec<String> = merges_str
            .lines()
            // Only the `#version` header is metadata. `#` is an ordinary
            // symbol in BPE vocabularies, so a rule such as `# #` must be
            // kept; dropping every `#`-prefixed line would silently lose
            // merges and shift the rank of every rule after them.
            .filter(|l| !l.is_empty() && !l.starts_with("#version"))
            .map(str::to_string)
            .collect();
        if m.is_empty() {
            None
        } else {
            Some(m)
        }
    } else {
        None
    };
    // Merge rules are the only signal available here for the model type.
    let model_type = merges.as_ref().map(|_| "BPE".to_string());
    Ok((Some(vocab), merges, model_type))
}
/// Reads a unified `tokenizer.json` at `path` and extracts the pieces that get
/// embedded into a stamped model.
///
/// Returns `(vocab, merges, model_type)`:
/// * `vocab` — keys of `model.vocab`, ordered by ascending id; entries whose
///   value is not an unsigned integer are skipped.
/// * `merges` — entries of `model.merges`. A string entry is taken verbatim; a
///   two-element array of strings is joined with a single space; anything else
///   is skipped. `None` when the field is missing, not an array, or yields no
///   rules.
/// * `model_type` — `model.type` when it is a string.
///
/// # Errors
///
/// `CliError::ValidationFailed` when the file cannot be read, is not valid
/// JSON, lacks `model` or an object-valued `model.vocab`, or ends up with an
/// empty vocabulary.
fn load_unified_tokenizer(
    path: &Path,
) -> Result<(Option<Vec<String>>, Option<Vec<String>>, Option<String>)> {
    let raw = match fs::read_to_string(path) {
        Ok(text) => text,
        Err(e) => {
            return Err(CliError::ValidationFailed(format!(
                "apr stamp --tokenizer: read {} failed: {e}",
                path.display()
            )));
        }
    };
    let document: serde_json::Value = match serde_json::from_str(&raw) {
        Ok(value) => value,
        Err(e) => {
            return Err(CliError::ValidationFailed(format!(
                "apr stamp --tokenizer: {} is not valid JSON: {e}",
                path.display()
            )));
        }
    };
    let model = match document.get("model") {
        Some(section) => section,
        None => {
            return Err(CliError::ValidationFailed(format!(
                "apr stamp --tokenizer: {} missing `model` field",
                path.display()
            )));
        }
    };

    // Optional: a missing or non-string `type` simply yields `None`.
    let model_type = match model.get("type").and_then(serde_json::Value::as_str) {
        Some(kind) => Some(kind.to_string()),
        None => None,
    };

    let vocab_table = match model.get("vocab").and_then(serde_json::Value::as_object) {
        Some(table) => table,
        None => {
            return Err(CliError::ValidationFailed(format!(
                "apr stamp --tokenizer: {} missing `model.vocab`",
                path.display()
            )));
        }
    };

    // Flip token -> id into an id-ordered token list.
    let mut by_id: Vec<(u64, String)> = Vec::new();
    for (token, raw_id) in vocab_table.iter() {
        if let Some(id) = raw_id.as_u64() {
            by_id.push((id, token.clone()));
        }
    }
    by_id.sort_by_key(|entry| entry.0);
    let mut vocab: Vec<String> = Vec::with_capacity(by_id.len());
    for (_, token) in by_id {
        vocab.push(token);
    }
    if vocab.is_empty() {
        return Err(CliError::ValidationFailed(format!(
            "apr stamp --tokenizer: {} has empty vocab",
            path.display()
        )));
    }

    // Merge rules appear either as "a b" strings or as ["a", "b"] pairs;
    // both are normalised to the space-separated string form.
    let mut rules: Vec<String> = Vec::new();
    if let Some(entries) = model.get("merges").and_then(serde_json::Value::as_array) {
        for entry in entries {
            match entry {
                serde_json::Value::String(rule) => rules.push(rule.clone()),
                serde_json::Value::Array(halves) if halves.len() == 2 => {
                    if let (Some(left), Some(right)) = (halves[0].as_str(), halves[1].as_str()) {
                        rules.push(format!("{left} {right}"));
                    }
                }
                _ => {}
            }
        }
    }
    let merges = if rules.is_empty() { None } else { Some(rules) };

    Ok((Some(vocab), merges, model_type))
}
#[cfg(test)]
mod tests {
    use super::*;
    use aprender::format::v2::{AprV2Metadata, AprV2Writer, TensorDType};
    use tempfile::TempDir;

    /// Writes a minimal APR v2 file (one 2x3 F32 tensor, default metadata) to
    /// `path`, i.e. a model with none of the stampable fields populated.
    fn write_unpopulated_apr(path: &Path) {
        let mut writer = AprV2Writer::new(AprV2Metadata::new("stamp-cli-test"));
        writer.add_tensor("weight", TensorDType::F32, vec![2, 3], vec![0u8; 24]);
        let image = writer.write().expect("write test apr");
        fs::write(path, &image).expect("write test apr to disk");
    }

    /// License, data source and data license all land in the output metadata.
    #[test]
    fn stamp_cli_populates_all_three_fields() {
        let tmp = TempDir::new().unwrap();
        let src = tmp.path().join("input.apr");
        let dst = tmp.path().join("output.apr");
        write_unpopulated_apr(&src);

        let result = run(
            &src,
            Some("Apache-2.0"),                                    // license
            Some("huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct"), // data_source
            Some("Apache-2.0"),                                    // data_license
            None,                                                  // hf_architecture
            None,                                                  // hf_model_type
            None,                                                  // architecture
            None,                                                  // tokenizer_dir
            &dst,
            false, // force
            true,  // json_output
        );
        assert!(result.is_ok(), "stamp run must succeed: {result:?}");

        let written = fs::read(&dst).unwrap();
        let reader = AprV2Reader::from_bytes(&written).unwrap();
        let meta = reader.metadata();
        assert_eq!(meta.license.as_deref(), Some("Apache-2.0"));
        assert_eq!(
            meta.data_source.as_deref(),
            Some("huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct")
        );
        assert_eq!(meta.data_license.as_deref(), Some("Apache-2.0"));
    }

    /// With no field requested the command errors out and creates nothing.
    #[test]
    fn stamp_cli_rejects_empty_patch() {
        let tmp = TempDir::new().unwrap();
        let src = tmp.path().join("input.apr");
        let dst = tmp.path().join("output.apr");
        write_unpopulated_apr(&src);

        let result = run(
            &src,
            None, // license
            None, // data_source
            None, // data_license
            None, // hf_architecture
            None, // hf_model_type
            None, // architecture
            None, // tokenizer_dir
            &dst,
            false, // force
            true,  // json_output
        );
        let err = result.unwrap_err();
        let msg = format!("{err:?}");
        assert!(
            msg.contains("at least one"),
            "empty-patch CLI error must be explicit: {msg}"
        );
        assert!(
            !dst.exists(),
            "rejected stamp must not create the output file"
        );
    }

    /// A non-existent input path is reported as `CliError::FileNotFound`.
    #[test]
    fn stamp_cli_rejects_missing_input() {
        let tmp = TempDir::new().unwrap();
        let src = tmp.path().join("does-not-exist.apr");
        let dst = tmp.path().join("output.apr");

        let result = run(
            &src,
            Some("Apache-2.0"), // license
            None,               // data_source
            None,               // data_license
            None,               // hf_architecture
            None,               // hf_model_type
            None,               // architecture
            None,               // tokenizer_dir
            &dst,
            false, // force
            true,  // json_output
        );
        let err = result.unwrap_err();
        assert!(
            matches!(err, CliError::FileNotFound(_)),
            "missing-input must surface FileNotFound, got: {err:?}"
        );
    }

    /// Without `--force` an existing output is left untouched and the error
    /// tells the user how to proceed.
    #[test]
    fn stamp_cli_rejects_existing_output_without_force() {
        let tmp = TempDir::new().unwrap();
        let src = tmp.path().join("input.apr");
        let dst = tmp.path().join("output.apr");
        write_unpopulated_apr(&src);
        fs::write(&dst, b"pre-existing").unwrap();

        let result = run(
            &src,
            Some("Apache-2.0"), // license
            None,               // data_source
            None,               // data_license
            None,               // hf_architecture
            None,               // hf_model_type
            None,               // architecture
            None,               // tokenizer_dir
            &dst,
            false, // force
            true,  // json_output
        );
        let err = result.unwrap_err();
        let msg = format!("{err:?}");
        assert!(
            msg.contains("already exists") && msg.contains("--force"),
            "existing-output error must mention --force: {msg}"
        );
        let still_there = fs::read(&dst).unwrap();
        assert_eq!(still_there, b"pre-existing");
    }

    /// With `--force` the existing output is replaced by a parseable model.
    #[test]
    fn stamp_cli_overwrites_existing_output_with_force() {
        let tmp = TempDir::new().unwrap();
        let src = tmp.path().join("input.apr");
        let dst = tmp.path().join("output.apr");
        write_unpopulated_apr(&src);
        fs::write(&dst, b"pre-existing").unwrap();

        let result = run(
            &src,
            Some("MIT"), // license
            None,        // data_source
            None,        // data_license
            None,        // hf_architecture
            None,        // hf_model_type
            None,        // architecture
            None,        // tokenizer_dir
            &dst,
            true, // force
            true, // json_output
        );
        assert!(
            result.is_ok(),
            "stamp with --force must succeed: {result:?}"
        );

        let written = fs::read(&dst).unwrap();
        let reader = AprV2Reader::from_bytes(&written).expect("force-overwritten file must parse");
        assert_eq!(reader.metadata().license.as_deref(), Some("MIT"));
    }

    /// An input whose `architecture` holds an HF class name and whose HF
    /// identity fields are empty gets all three identity fields rewritten.
    #[test]
    fn stamp_p0k_recovers_pre_p0k_apr_identity() {
        let tmp = TempDir::new().unwrap();
        let src = tmp.path().join("input.apr");
        let dst = tmp.path().join("output.apr");

        let stale_identity = AprV2Metadata {
            architecture: Some("LlamaForCausalLM".to_string()),
            hf_architecture: None,
            hf_model_type: None,
            ..AprV2Metadata::new("p0k-stamp-test")
        };
        let mut writer = AprV2Writer::new(stale_identity);
        writer.add_tensor(
            "model.embed_tokens.weight",
            TensorDType::F32,
            vec![128, 64],
            vec![0u8; 128 * 64 * 4],
        );
        let image = writer.write().expect("write pre-P0-K test apr");
        fs::write(&src, &image).expect("write test apr to disk");

        let result = run(
            &src,
            None,                     // license
            None,                     // data_source
            None,                     // data_license
            Some("Qwen2ForCausalLM"), // hf_architecture
            Some("qwen2"),            // hf_model_type
            Some("qwen2"),            // architecture
            None,                     // tokenizer_dir
            &dst,
            false, // force
            true,  // json_output
        );
        assert!(result.is_ok(), "stamp run must succeed: {result:?}");

        let written = fs::read(&dst).unwrap();
        let reader = AprV2Reader::from_bytes(&written).unwrap();
        let meta = reader.metadata();
        assert_eq!(
            meta.hf_architecture.as_deref(),
            Some("Qwen2ForCausalLM"),
            "hf_architecture must be patched"
        );
        assert_eq!(
            meta.hf_model_type.as_deref(),
            Some("qwen2"),
            "hf_model_type must be patched"
        );
        assert_eq!(
            meta.architecture.as_deref(),
            Some("qwen2"),
            "architecture (family slug) must be patched away from the wrong P0-H fallback"
        );
    }

    /// Stamping only `hf_architecture` leaves `hf_model_type` unset.
    #[test]
    fn stamp_p0k_partial_hf_architecture_only() {
        let tmp = TempDir::new().unwrap();
        let src = tmp.path().join("input.apr");
        let dst = tmp.path().join("output.apr");
        write_unpopulated_apr(&src);

        let result = run(
            &src,
            None,                     // license
            None,                     // data_source
            None,                     // data_license
            Some("Qwen2ForCausalLM"), // hf_architecture
            None,                     // hf_model_type
            None,                     // architecture
            None,                     // tokenizer_dir
            &dst,
            false, // force
            true,  // json_output
        );
        assert!(result.is_ok(), "partial stamp must succeed: {result:?}");

        let written = fs::read(&dst).unwrap();
        let reader = AprV2Reader::from_bytes(&written).unwrap();
        assert_eq!(
            reader.metadata().hf_architecture.as_deref(),
            Some("Qwen2ForCausalLM")
        );
        assert_eq!(
            reader.metadata().hf_model_type,
            None,
            "unpatched field must remain None"
        );
    }

    /// A `vocab.json` + `merges.txt` directory ends up as `tokenizer.*` custom
    /// metadata, and the header gains the `HAS_VOCAB` flag.
    #[test]
    fn stamp_p3c_defect1_embeds_tokenizer_from_vocab_merges() {
        use aprender::format::v2::AprV2Flags;

        let tmp = TempDir::new().unwrap();
        let src = tmp.path().join("input.apr");
        let dst = tmp.path().join("output.apr");
        write_unpopulated_apr(&src);

        let tok_dir = tmp.path().join("tokenizer");
        fs::create_dir_all(&tok_dir).unwrap();
        let vocab_json = r#"{"<unk>": 0, "Ġ": 1, "the": 2}"#;
        fs::write(tok_dir.join("vocab.json"), vocab_json).unwrap();
        // The `#version` header line must not be counted as a merge rule.
        fs::write(
            tok_dir.join("merges.txt"),
            "#version: 0.2\nĠ t\nh e\nĠt he\n",
        )
        .unwrap();

        let result = run(
            &src,
            None,           // license
            None,           // data_source
            None,           // data_license
            None,           // hf_architecture
            None,           // hf_model_type
            None,           // architecture
            Some(&tok_dir), // tokenizer_dir
            &dst,
            false, // force
            true,  // json_output
        );
        assert!(
            result.is_ok(),
            "stamp with --tokenizer must succeed: {result:?}"
        );

        let written = fs::read(&dst).unwrap();
        let reader = AprV2Reader::from_bytes(&written).unwrap();
        assert!(
            reader.header().flags.contains(AprV2Flags::HAS_VOCAB),
            "HAS_VOCAB header flag must be set after --tokenizer stamp \
             (the load-bearing check for apr run inference)"
        );

        let custom = &reader.metadata().custom;
        let vocab = custom
            .get("tokenizer.vocabulary")
            .and_then(|v| v.as_array())
            .expect("tokenizer.vocabulary must be set");
        assert_eq!(vocab.len(), 3);
        assert_eq!(vocab[0].as_str(), Some("<unk>"));
        assert_eq!(vocab[2].as_str(), Some("the"));

        let merges = custom
            .get("tokenizer.merges")
            .and_then(|v| v.as_array())
            .expect("tokenizer.merges must be set");
        assert_eq!(merges.len(), 3);

        assert_eq!(
            custom.get("tokenizer.model_type").and_then(|v| v.as_str()),
            Some("BPE")
        );
    }

    /// `--tokenizer` on its own counts as a change and is accepted.
    #[test]
    fn stamp_p3c_defect1_tokenizer_alone_passes_has_any_gate() {
        let tmp = TempDir::new().unwrap();
        let src = tmp.path().join("input.apr");
        let dst = tmp.path().join("output.apr");
        write_unpopulated_apr(&src);

        let tok_dir = tmp.path().join("tokenizer");
        fs::create_dir_all(&tok_dir).unwrap();
        fs::write(tok_dir.join("vocab.json"), r#"{"a": 0}"#).unwrap();

        let result = run(
            &src,
            None,           // license
            None,           // data_source
            None,           // data_license
            None,           // hf_architecture
            None,           // hf_model_type
            None,           // architecture
            Some(&tok_dir), // tokenizer_dir
            &dst,
            false, // force
            true,  // json_output
        );
        assert!(
            result.is_ok(),
            "stamp with --tokenizer alone must succeed: {result:?}"
        );
    }

    /// A tokenizer directory containing none of the expected files is
    /// rejected with a message naming the files that were looked for.
    #[test]
    fn stamp_p3c_defect1_tokenizer_dir_without_files_errors() {
        let tmp = TempDir::new().unwrap();
        let src = tmp.path().join("input.apr");
        let dst = tmp.path().join("output.apr");
        write_unpopulated_apr(&src);

        let empty_tok = tmp.path().join("empty-tokenizer");
        fs::create_dir_all(&empty_tok).unwrap();

        let result = run(
            &src,
            None,             // license
            None,             // data_source
            None,             // data_license
            None,             // hf_architecture
            None,             // hf_model_type
            None,             // architecture
            Some(&empty_tok), // tokenizer_dir
            &dst,
            false, // force
            true,  // json_output
        );
        let err = result.unwrap_err();
        let msg = format!("{err:?}");
        assert!(
            msg.contains("neither tokenizer.json nor vocab.json found"),
            "expected clear missing-files error, got: {msg}"
        );
    }
}