use crate::error::{CliError, Result};
use crate::output;
use colored::Colorize;
use std::path::Path;
use std::time::Instant;
use super::code_eval::{compute_pass_at_k, emit_eval_results, run_multisample_loop};
/// One HumanEval task, deserialized from a single line of the JSONL data file.
#[derive(Debug, serde::Deserialize)]
pub(super) struct HumanEvalProblem {
/// Task identifier; echoed in per-problem results and debug dumps.
pub(super) task_id: String,
/// Python source handed to the model as the prompt.
pub(super) prompt: String,
/// Optional reference solution; a missing field deserializes to `None`.
#[serde(default)]
pub(super) canonical_solution: Option<String>,
/// Python test source appended after the completion. The harness then
/// calls `check(<entry>)`, so this is expected to define `check`.
pub(super) test: String,
/// Optional name of the function under test. When absent, callers fall
/// back to `extract_function_name` on the prompt.
#[serde(default)]
pub(super) entry_point: Option<String>,
}
/// Runs the HumanEval benchmark against `model_path` and emits pass@k results.
///
/// # Arguments
/// * `model_path` - model file or checkpoint directory; must exist.
/// * `data_path` - HumanEval JSONL file; required despite being an `Option`.
/// * `k_values` - the `k`s to report pass@k for.
/// * `json_output` - suppresses the human-readable progress output when set.
/// * `device` - `"cuda"` selects the CUDA inference path; anything else uses
///   the default path.
/// * `num_samples` - samples per problem (clamped to at least 1).
/// * `temperature` - shown in the header and forwarded to the result emitter;
///   it is not passed to the inference functions called here.
///
/// # Errors
/// Returns `CliError::ValidationFailed` when `data_path` is missing, the file
/// cannot be read, a line is not valid JSON, or the file has no problems, and
/// `CliError::FileNotFound` when either path does not exist.
///
/// If no inference pass succeeds, the function does not fail: it falls back
/// to "structural" scoring, crediting problems that validate and carry a
/// non-empty canonical solution.
pub(crate) fn run_humaneval(
    model_path: &Path,
    data_path: Option<&Path>,
    k_values: &[usize],
    json_output: bool,
    device: &str,
    num_samples: usize,
    temperature: f32,
) -> Result<()> {
    let data_path = data_path.ok_or_else(|| {
        CliError::ValidationFailed(
            "--data <humaneval.jsonl> is required for HumanEval evaluation.\n\
             Format: OpenAI HumanEval JSONL with task_id, prompt, canonical_solution, test, entry_point"
                .to_string(),
        )
    })?;
    if !data_path.exists() {
        return Err(CliError::FileNotFound(data_path.to_path_buf()));
    }
    if !model_path.exists() {
        return Err(CliError::FileNotFound(model_path.to_path_buf()));
    }

    let content = std::fs::read_to_string(data_path)
        .map_err(|e| CliError::ValidationFailed(format!("Cannot read HumanEval data: {e}")))?;
    // Number the lines BEFORE dropping blank ones so the reported line number
    // matches the physical line in the file.
    let problems: Vec<HumanEvalProblem> = content
        .lines()
        .enumerate()
        .filter(|(_, line)| !line.trim().is_empty())
        .map(|(idx, line)| {
            serde_json::from_str(line).map_err(|e| {
                CliError::ValidationFailed(format!("Invalid JSON on line {}: {e}", idx + 1))
            })
        })
        .collect::<Result<Vec<_>>>()?;
    if problems.is_empty() {
        return Err(CliError::ValidationFailed(
            "HumanEval file is empty".to_string(),
        ));
    }

    let valid = problems
        .iter()
        .filter(|p| validate_humaneval_problem(p))
        .count();
    let num_samples = num_samples.max(1);

    if !json_output {
        output::section("APR HumanEval Evaluation");
        println!();
        output::kv("Model", model_path.display());
        output::kv("Benchmark", data_path.display());
        output::kv("Problems", format!("{} ({valid} valid)", problems.len()));
        output::kv("k values", format!("{k_values:?}"));
        if num_samples > 1 {
            output::kv("Samples/problem", num_samples);
            output::kv("Temperature", format!("{temperature:.2}"));
        }
        println!();
    }

    let start = Instant::now();
    // One (task_id, entry_point, correct_count) slot per problem, in file order.
    let mut per_problem_correct: Vec<(String, String, usize)> = problems
        .iter()
        .map(|p| {
            let ep = p
                .entry_point
                .as_deref()
                .or_else(|| extract_function_name(&p.prompt))
                .unwrap_or("")
                .to_string();
            (p.task_id.clone(), ep, 0usize)
        })
        .collect();

    // Remember only the first inference error; it is reported if every pass fails.
    let mut first_err: Option<String> = None;
    let any_ok = run_multisample_loop(&mut per_problem_correct, num_samples, json_output, || {
        let result = if device == "cuda" {
            run_humaneval_inference_cuda(model_path, &problems, k_values, json_output)
        } else {
            run_humaneval_inference(model_path, &problems, k_values, json_output)
        };
        if let Err(e) = &result {
            if first_err.is_none() {
                first_err = Some(e.to_string());
            }
        }
        result
    });

    if !any_ok {
        if !json_output {
            if let Some(err) = &first_err {
                println!(" Inference error: {err}");
            }
            println!(" Falling back to structural validation (no inference)");
        }
        // Structural fallback: credit a problem when it validates and ships a
        // non-empty canonical solution.
        for (slot, problem) in per_problem_correct.iter_mut().zip(&problems) {
            let has_solution = problem
                .canonical_solution
                .as_deref()
                .is_some_and(|sol| !sol.trim().is_empty());
            if has_solution && validate_humaneval_problem(problem) {
                slot.2 = 1;
            }
        }
    }

    let elapsed = start.elapsed().as_secs_f32();
    emit_eval_results(
        "humaneval",
        model_path,
        &per_problem_correct,
        num_samples,
        temperature,
        k_values,
        elapsed,
        if any_ok { "inference" } else { "structural" },
        json_output,
        None,
    );
    Ok(())
}
/// Picks the next token id from `logits`.
///
/// * `temperature <= 0.0` (or empty `logits`) selects greedily: the index of
///   the largest logit, or `0` when `logits` is empty. `rng_state` is left
///   untouched on this path.
/// * Otherwise the logits are turned into a softmax distribution at the given
///   temperature and one index is drawn using a xorshift64 step on
///   `rng_state`, which is advanced in place.
///
/// A zero `rng_state` is replaced by a fixed non-zero constant before the
/// xorshift step: zero is a fixed point of xorshift, so without the reseed
/// every draw would be `0` and the sampler would always return the first
/// token with non-zero probability.
pub(super) fn sample_token(logits: &[f32], temperature: f32, rng_state: &mut u64) -> u32 {
    contract_pre_repeat_penalty!();
    contract_pre_generation_temperature_zero!();

    if temperature <= 0.0 || logits.is_empty() {
        // Greedy path: plain argmax; incomparable values (NaN) compare as equal.
        let result = logits
            .iter()
            .enumerate()
            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
            .map_or(0, |(idx, _)| idx as u32);
        contract_post_repeat_penalty!(&result);
        contract_post_generation_temperature_zero!(&result);
        return result;
    }

    // Softmax with the max logit subtracted first for numerical stability.
    let inv_temp = 1.0 / temperature;
    let max_logit = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let mut probs: Vec<f32> = logits
        .iter()
        .map(|&l| ((l - max_logit) * inv_temp).exp())
        .collect();
    let sum: f32 = probs.iter().sum();
    if sum > 0.0 {
        for p in &mut probs {
            *p /= sum;
        }
    }

    // xorshift64 never leaves the all-zero state; reseed so sampling stays random.
    if *rng_state == 0 {
        *rng_state = 0x9E37_79B9_7F4A_7C15;
    }
    *rng_state ^= *rng_state << 13;
    *rng_state ^= *rng_state >> 7;
    *rng_state ^= *rng_state << 17;
    let r = (*rng_state as f32) / (u64::MAX as f32);

    // Inverse-CDF draw; if rounding keeps the cumulative sum at or below `r`,
    // fall back to the last index.
    let mut cumulative = 0.0f32;
    let picked = probs
        .iter()
        .position(|&p| {
            cumulative += p;
            r < cumulative
        })
        .unwrap_or(probs.len() - 1);

    let result = picked as u32;
    contract_post_repeat_penalty!(&result);
    contract_post_generation_temperature_zero!(&result);
    result
}
/// Loads the transformer used for HumanEval inference.
///
/// A path with the `.apr` extension, or a directory containing
/// `model-best.apr`, is loaded as an APR file; anything else goes through the
/// SafeTensors-to-APR converter. Failures are returned as display strings.
#[cfg(feature = "inference")]
fn load_humaneval_model(
    model_path: &Path,
) -> std::result::Result<realizar::apr_transformer::AprTransformer, String> {
    use realizar::apr_transformer::AprTransformer;
    use realizar::safetensors_infer::SafetensorsToAprConverter;

    let has_apr_extension = model_path.extension().is_some_and(|ext| ext == "apr");
    // Only probe the filesystem when the extension alone did not decide it.
    if !has_apr_extension && !model_path.join("model-best.apr").exists() {
        return SafetensorsToAprConverter::convert(model_path)
            .map_err(|e| format!("Cannot load model: {e}"))
            .map(|converted| converted.into_inner());
    }

    let apr_file = match model_path.is_dir() {
        true => model_path.join("model-best.apr"),
        false => model_path.to_path_buf(),
    };
    AprTransformer::from_apr_file(&apr_file).map_err(|e| format!("Cannot load APR model: {e}"))
}
/// Resolves the BPE tokenizer for `model_path`.
///
/// Tries the tokenizer embedded in the `.apr` file first (for a directory,
/// `model-best.apr` inside it), then falls back to
/// `AprV2Model::load_tokenizer(model_path)`. A confirmation line is printed
/// for the embedded case unless `json_output` is set.
#[cfg(feature = "inference")]
fn load_humaneval_tokenizer(
    model_path: &Path,
    json_output: bool,
) -> std::result::Result<realizar::apr::BpeTokenizer, String> {
    use realizar::apr::AprV2Model;

    let candidate = match model_path.is_dir() {
        true => model_path.join("model-best.apr"),
        false => model_path.to_path_buf(),
    };

    // The model file is only opened when the candidate really is an `.apr`.
    let embedded = if candidate.extension().is_some_and(|ext| ext == "apr") {
        AprV2Model::load(&candidate)
            .ok()
            .and_then(|model| model.load_embedded_bpe_tokenizer())
    } else {
        None
    };

    match embedded {
        Some(tokenizer) => {
            if !json_output {
                println!(" {} Loaded embedded BPE tokenizer", "✓".green());
            }
            Ok(tokenizer)
        }
        None => AprV2Model::load_tokenizer(model_path).ok_or_else(|| {
            "No tokenizer found (no embedded tokenizer and no sibling tokenizer.json)".to_string()
        }),
    }
}
/// Runs one pass over `problems` through `realizar::run_inference` and
/// executes each generated program with `python3`.
///
/// Generation is configured with temperature 0.0, top-k 1 and at most 512 new
/// tokens. Returns `(passed, results)` where each result is
/// `(task_id, entry_point, ok)`. `_k_values` is accepted but unused here.
///
/// Only a tokenizer-loading failure aborts the whole pass; a per-problem
/// inference error is recorded as a failed problem and the loop continues.
#[cfg(feature = "inference")]
fn run_humaneval_inference(
model_path: &Path,
problems: &[HumanEvalProblem],
_k_values: &[usize],
json_output: bool,
) -> std::result::Result<(usize, Vec<(String, String, bool)>), String> {
use realizar::{run_inference, InferenceConfig};
if !json_output {
println!(" {} Loading model for inference...", "→".dimmed());
}
// The tokenizer is used to reject prompts that encode to nothing and to
// decode completion tokens in the raw-continuation fallback below.
let tokenizer = load_humaneval_tokenizer(model_path, json_output)?;
if !json_output {
println!(" {} Tokenizer loaded", "✓".green());
}
let mut passed = 0usize;
let mut results = Vec::new();
for (i, problem) in problems.iter().enumerate() {
// Entry point: explicit field, else first `def` in the prompt, else "unknown".
let entry = problem
.entry_point
.as_deref()
.or_else(|| extract_function_name(&problem.prompt))
.unwrap_or("unknown");
let prompt_tokens = tokenizer.encode(&problem.prompt);
if prompt_tokens.is_empty() {
// Nothing to condition on: count as a failure without running inference.
results.push((problem.task_id.clone(), entry.to_string(), false));
continue;
}
let config_chatml = InferenceConfig::new(model_path)
.with_prompt(problem.prompt.clone())
.with_max_tokens(512)
.with_temperature(0.0)
.with_top_k(1);
let result = match run_inference(&config_chatml) {
Ok(r) => r,
Err(e) => {
if !json_output {
eprintln!(
" [FAIL] {} ({}): inference error: {e}",
problem.task_id, entry
);
}
results.push((problem.task_id.clone(), entry.to_string(), false));
continue;
}
};
// Build the candidate solution. Two shapes of model output are handled:
//  1. a fenced code block (the block defining `entry` is preferred): the
//     prompt text before `def <entry>(` is prepended when non-empty;
//  2. a raw continuation: the text after the echoed prompt, or else the
//     decoded tokens past `input_token_count`, is cut at the next
//     top-level `def`/`class`, re-indented, and appended to the prompt.
let completion =
if let Some(code) = extract_python_code_block_targeted(&result.text, Some(entry)) {
let preamble = extract_prompt_preamble(&problem.prompt, entry);
if preamble.is_empty() {
code
} else {
format!("{preamble}\n{code}")
}
} else {
let raw = if let Some(stripped) = result.text.strip_prefix(&problem.prompt) {
stripped.to_string()
} else {
// If no more tokens than the input count were returned, decode all of them.
let completion_tokens = if result.tokens.len() > result.input_token_count {
&result.tokens[result.input_token_count..]
} else {
&result.tokens[..]
};
tokenizer.decode(completion_tokens)
};
let truncated = truncate_at_function_boundary(&raw);
format!(
"{}{}",
problem.prompt,
align_continuation_indent(&problem.prompt, truncated)
)
};
// Program layout: solution, blank line, dataset test code, then `check(<entry>)`.
let full_program = format!("{completion}\n\n{}\n\ncheck({})\n", problem.test, entry);
// Each program gets a 10-second execution budget.
let exec_result = execute_python_test_with_diagnostics(&full_program, 10);
let ok = exec_result.success;
// Setting APR_EVAL_DEBUG (to any value) dumps a per-task JSON report.
if std::env::var("APR_EVAL_DEBUG").is_ok() {
write_apr_eval_debug(
&problem.task_id,
&problem.prompt,
&result.text,
&completion,
&full_program,
&exec_result,
);
}
if ok {
passed += 1;
}
results.push((problem.task_id.clone(), entry.to_string(), ok));
// Progress line every 10th problem.
if !json_output && (i + 1) % 10 == 0 {
println!(
" {} {}/{} problems evaluated ({} passed)",
"→".dimmed(),
i + 1,
problems.len(),
passed
);
}
}
Ok((passed, results))
}
/// Stand-in compiled when the `inference` feature is disabled.
///
/// Ignores all arguments and always returns an error, which lets
/// `run_humaneval` take its structural-validation fallback.
#[cfg(not(feature = "inference"))]
fn run_humaneval_inference(
_model_path: &Path,
_problems: &[HumanEvalProblem],
_k_values: &[usize],
_json_output: bool,
) -> std::result::Result<(usize, Vec<(String, String, bool)>), String> {
Err("Inference not available (compile with --features inference)".to_string())
}
/// Builds a `TransformerConfig` from `<checkpoint_dir>/config.json`.
///
/// Each numeric or boolean field is read by its config key and replaced by a
/// built-in default when the key is missing or has the wrong JSON type. The
/// remaining fields are set to fixed values and not read from the file.
///
/// # Errors
/// Returns a message when `config.json` cannot be read or is not valid JSON.
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "cuda", feature = "training"))]
fn load_transformer_config(
    checkpoint_dir: &Path,
) -> std::result::Result<entrenar::transformer::TransformerConfig, String> {
    use entrenar::transformer::TransformerConfig;

    let raw = std::fs::read_to_string(checkpoint_dir.join("config.json"))
        .map_err(|e| format!("Cannot read config.json: {e}"))?;
    let cfg: serde_json::Value =
        serde_json::from_str(&raw).map_err(|e| format!("Invalid config.json: {e}"))?;

    // Small typed accessors: look up a key, fall back to a default.
    let dim = |key: &str, fallback: u64| cfg[key].as_u64().unwrap_or(fallback) as usize;
    let real = |key: &str, fallback: f64| cfg[key].as_f64().unwrap_or(fallback) as f32;

    Ok(TransformerConfig {
        hidden_size: dim("hidden_size", 1024),
        num_attention_heads: dim("num_attention_heads", 16),
        num_kv_heads: dim("num_key_value_heads", 4),
        intermediate_size: dim("intermediate_size", 4096),
        num_hidden_layers: dim("num_hidden_layers", 24),
        vocab_size: dim("vocab_size", 32768),
        max_position_embeddings: dim("max_position_embeddings", 1024),
        rms_norm_eps: real("rms_norm_eps", 1e-5),
        rope_theta: real("rope_theta", 10000.0),
        use_bias: cfg["use_bias"].as_bool().unwrap_or(false),
        head_dim_override: None,
        architecture: Default::default(),
        hf_architecture: None,
        hf_model_type: None,
        tie_word_embeddings: false,
    })
}
/// Runs one greedy-decoding pass over `problems` on the GPU via
/// `entrenar::train::CudaTransformerTrainer` and executes each generated
/// program with `python3`.
///
/// Returns `(passed, results)` where each result is
/// `(task_id, entry_point, ok)`. `_k_values` is accepted but unused here.
///
/// Unlike the non-CUDA path, a `forward_logits` failure on any problem aborts
/// the whole pass with an error (see the `?` inside the decode loop).
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "cuda", feature = "training"))]
fn run_humaneval_inference_cuda(
model_path: &Path,
problems: &[HumanEvalProblem],
_k_values: &[usize],
json_output: bool,
) -> std::result::Result<(usize, Vec<(String, String, bool)>), String> {
// A file path is resolved to its containing directory, which is where
// config.json (and possibly tokenizer.json) are looked up.
let checkpoint_dir = if model_path.is_file() {
model_path.parent().unwrap_or(model_path)
} else {
model_path
};
let config = load_transformer_config(checkpoint_dir)?;
// Generation never grows the sequence beyond this many tokens.
let max_seq = config.max_position_embeddings;
if !json_output {
println!(
" {} Loading model onto GPU for inference (ALB-089)...",
"→".dimmed()
);
}
let mut trainer =
entrenar::train::CudaTransformerTrainer::for_inference(checkpoint_dir, config)
.map_err(|e| format!("CUDA inference init failed: {e}"))?;
// Tokenizer lookup: relative to `model_path` first, then an explicit
// `tokenizer.json` inside the checkpoint directory.
let tokenizer = realizar::apr::AprV2Model::load_tokenizer(model_path)
.or_else(|| {
let tok_path = checkpoint_dir.join("tokenizer.json");
realizar::apr::AprV2Model::load_tokenizer_from_path(&tok_path)
})
.ok_or_else(|| format!("No tokenizer found in {}", checkpoint_dir.display()))?;
if !json_output {
println!(" {} GPU inference ready", "✓".green());
}
let mut passed = 0usize;
let mut results = Vec::new();
// Passed to `sample_token`, which does not touch it at temperature 0.0.
let mut rng_state: u64 = 42;
for (i, problem) in problems.iter().enumerate() {
// Entry point: explicit field, else first `def` in the prompt, else "unknown".
let entry = problem
.entry_point
.as_deref()
.or_else(|| extract_function_name(&problem.prompt))
.unwrap_or("unknown");
let prompt_tokens = tokenizer.encode(&problem.prompt);
if prompt_tokens.is_empty() {
// Nothing to condition on: count as a failure without generating.
results.push((problem.task_id.clone(), entry.to_string(), false));
continue;
}
let mut tokens: Vec<u32> = prompt_tokens.clone();
// Hard cap on newly generated tokens per problem.
let max_new = 256;
for _ in 0..max_new {
if tokens.len() >= max_seq {
break;
}
// The full token history is passed on every step.
let logits = trainer
.forward_logits(&tokens)
.ok_or_else(|| "forward_logits failed".to_string())?;
// Temperature 0.0 selects the greedy (argmax) branch of `sample_token`.
let next = sample_token(&logits, 0.0, &mut rng_state);
tokens.push(next);
// NOTE(review): token id 0 is treated as the stop token here — confirm
// that this matches the tokenizer's end-of-sequence id. The stop token
// itself is pushed before breaking, so it is part of the decoded text.
if next == 0 {
break;
}
}
let completion_tokens = &tokens[prompt_tokens.len()..];
let completion = tokenizer.decode(completion_tokens);
let completion = truncate_at_function_boundary(&completion);
// Program layout: prompt + completion, dataset test code, then `check(<entry>)`.
let full_program = format!(
"{}{}\n\n{}\n\ncheck({})\n",
problem.prompt, completion, problem.test, entry
);
// Each program gets a 10-second execution budget.
let ok = execute_python_test(&full_program, 10);
if ok {
passed += 1;
}
results.push((problem.task_id.clone(), entry.to_string(), ok));
// Progress line every 10th problem.
if !json_output && (i + 1) % 10 == 0 {
println!(
" {} {}/{} problems evaluated ({} passed)",
"→".dimmed(),
i + 1,
problems.len(),
passed
);
}
}
Ok((passed, results))
}
/// Stand-in compiled unless BOTH the `cuda` and `training` features are
/// enabled.
///
/// Ignores all arguments and always returns an error. The message names both
/// features because enabling `cuda` alone still selects this stub.
#[cfg(not(all(feature = "cuda", feature = "training")))]
fn run_humaneval_inference_cuda(
    _model_path: &Path,
    _problems: &[HumanEvalProblem],
    _k_values: &[usize],
    _json_output: bool,
) -> std::result::Result<(usize, Vec<(String, String, bool)>), String> {
    Err("CUDA not available (compile with --features cuda,training)".to_string())
}
/// Returns the first non-empty fenced code block in `text`, or `None` when
/// there is none. Equivalent to `extract_python_code_block_targeted` with no
/// entry point to prefer.
pub(super) fn extract_python_code_block(text: &str) -> Option<String> {
extract_python_code_block_targeted(text, None)
}
/// Extracts a fenced code block from a model response.
///
/// Recognised opening fences are "```python", "```py" and a bare "```", each
/// immediately followed by a newline; a block ends at the next newline
/// followed by "```". Blocks containing only whitespace are skipped, and
/// scanning stops at the first fence that is never closed.
///
/// When `entry_point` is given, the first block containing `def <entry>(`
/// is returned; otherwise, or when no block matches, the first block is
/// returned. Returns `None` when no non-empty closed block exists.
pub(super) fn extract_python_code_block_targeted(
    text: &str,
    entry_point: Option<&str>,
) -> Option<String> {
    const OPENERS: [&str; 3] = ["```python\n", "```py\n", "```\n"];
    const CLOSER: &str = "\n```";

    let mut found: Vec<&str> = Vec::new();
    let mut pos = 0usize;
    while pos < text.len() {
        let rest = &text[pos..];
        // Earliest opening fence of any flavour; yields the offset just past it.
        let next_body = OPENERS
            .iter()
            .filter_map(|fence| rest.find(*fence).map(|at| (at, at + fence.len())))
            .min_by_key(|&(at, _)| at)
            .map(|(_, past_fence)| pos + past_fence);
        let Some(body_start) = next_body else {
            break;
        };
        let Some(body_len) = text[body_start..].find(CLOSER) else {
            break;
        };

        let body = &text[body_start..body_start + body_len];
        if !body.trim().is_empty() {
            found.push(body);
        }
        pos = body_start + body_len + CLOSER.len();
    }

    let first = *found.first()?;
    let chosen = entry_point
        .map(|ep| format!("def {ep}("))
        .and_then(|needle| found.iter().copied().find(|block| block.contains(&needle)))
        .unwrap_or(first);
    Some(chosen.to_string())
}
/// Cuts `completion` just before the first new top-level definition.
///
/// A boundary is a newline immediately followed by `def ` or `class `. The
/// cut happens at whichever boundary comes FIRST in the text, so a trailing
/// `class` that precedes a later `def` is removed as well. Returns the input
/// unchanged when no boundary is present.
pub(super) fn truncate_at_function_boundary(completion: &str) -> &str {
    let earliest_boundary = ["\ndef ", "\nclass "]
        .iter()
        .filter_map(|marker| completion.find(*marker))
        .min();
    match earliest_boundary {
        Some(cut) => &completion[..cut],
        None => completion,
    }
}
/// Returns the part of `prompt` that precedes `def <entry_point>(`, with
/// trailing whitespace removed.
///
/// Yields an empty string when `entry_point` is empty or the placeholder
/// `"unknown"`, or when the prompt does not contain that definition.
pub(super) fn extract_prompt_preamble(prompt: &str, entry_point: &str) -> String {
    if matches!(entry_point, "" | "unknown") {
        return String::new();
    }
    let signature = format!("def {entry_point}(");
    prompt
        .split_once(signature.as_str())
        .map(|(before, _)| before.trim_end().to_string())
        .unwrap_or_default()
}
/// Removes surplus leading spaces from a raw continuation so that it lines up
/// with the prompt it is appended to.
///
/// The reference indent is that of the last non-blank prompt line; the
/// continuation's indent is that of its first non-blank line. When the
/// continuation is indented deeper, the difference is stripped from every
/// line that starts with at least that many spaces, up to (not including) the
/// first non-blank line with no indentation at all. From that line on the
/// text is copied unchanged. Only space characters count as indentation.
pub(super) fn align_continuation_indent(prompt: &str, completion: &str) -> String {
    fn leading_spaces(line: &str) -> usize {
        line.bytes().take_while(|&b| b == b' ').count()
    }

    let expected = prompt
        .lines()
        .rev()
        .find(|line| !line.trim().is_empty())
        .map_or(0, leading_spaces);
    let actual = completion
        .lines()
        .find(|line| !line.trim().is_empty())
        .map_or(0, leading_spaces);
    if actual <= expected {
        return completion.to_string();
    }
    let surplus = actual - expected;

    let mut aligned = String::with_capacity(completion.len());
    let mut inside_body = true;
    for line in completion.split_inclusive('\n') {
        let indent = leading_spaces(line);
        if inside_body {
            // A non-blank line at column zero marks the end of the function body.
            let content = line[indent..].trim_end_matches('\n');
            if indent == 0 && !content.is_empty() {
                inside_body = false;
            }
        }
        if inside_body && indent >= surplus {
            aligned.push_str(&line[surplus..]);
        } else {
            aligned.push_str(line);
        }
    }
    aligned
}
// Unit tests for `extract_python_code_block_targeted`: which fenced block is
// chosen when an entry point is (or is not) supplied.
#[cfg(test)]
mod extract_python_code_block_targeted_tests {
use super::extract_python_code_block_targeted;
// A later block that defines the entry point wins over an earlier sketch.
#[test]
fn prefers_block_containing_entry_point() {
let text = "First a sketch:\n```python\n# rough idea\nx = 1\n```\nNow the actual solution:\n```python\ndef separate_paren_groups(s):\n return [s]\n```";
let got = extract_python_code_block_targeted(text, Some("separate_paren_groups"));
assert_eq!(
got.as_deref(),
Some("def separate_paren_groups(s):\n return [s]")
);
}
// The only block is returned when it defines the entry point.
#[test]
fn single_block_matching_entry() {
let text = "```python\ndef f(x):\n return x\n```";
let got = extract_python_code_block_targeted(text, Some("f"));
assert_eq!(got.as_deref(), Some("def f(x):\n return x"));
}
// No block defines the entry point: the first block is returned.
#[test]
fn no_entry_match_falls_back_to_first() {
let text = "```python\nimport os\n```\n```python\ndef other():\n pass\n```";
let got = extract_python_code_block_targeted(text, Some("missing_fn"));
assert_eq!(got.as_deref(), Some("import os"));
}
// Without an entry point the first block is returned.
#[test]
fn no_entry_point_first_block_wins() {
let text = "```python\nfirst = 1\n```\n```python\ndef target():\n pass\n```";
let got = extract_python_code_block_targeted(text, None);
assert_eq!(got.as_deref(), Some("first = 1"));
}
// Untagged, `py` and `python` fences are all scanned for the entry point.
#[test]
fn mixed_fence_tags_picks_entry_block() {
let text = "```\n# untagged junk\n```\n```py\ndef helper(): pass\n```\n```python\ndef target():\n return 42\n```";
let got = extract_python_code_block_targeted(text, Some("target"));
assert_eq!(got.as_deref(), Some("def target():\n return 42"));
}
// Text without any fence yields `None`.
#[test]
fn no_fence_returns_none() {
let text = "just text without fences";
let got = extract_python_code_block_targeted(text, Some("anything"));
assert!(got.is_none());
}
// A whitespace-only block is skipped rather than returned.
#[test]
fn skips_empty_fences_before_match() {
let text = "```python\n\n```\n```python\ndef target():\n pass\n```";
let got = extract_python_code_block_targeted(text, Some("target"));
assert_eq!(got.as_deref(), Some("def target():\n pass"));
}
}
// Unit tests for `extract_python_code_block` (the no-entry-point wrapper).
#[cfg(test)]
mod extract_python_code_block_tests {
use super::extract_python_code_block;
// Surrounding chat text is dropped; only the fenced code is returned.
#[test]
fn extracts_python_fenced_block() {
let text = "Certainly!\n```python\ndef f(x):\n return x + 1\n```\nLet me know if you need more.";
let got = extract_python_code_block(text);
assert_eq!(got.as_deref(), Some("def f(x):\n return x + 1"));
}
// The short `py` fence tag is recognised.
#[test]
fn extracts_py_short_fence() {
let text = "```py\ndef g():\n pass\n```";
let got = extract_python_code_block(text);
assert_eq!(got.as_deref(), Some("def g():\n pass"));
}
// A fence without a language tag is recognised.
#[test]
fn extracts_untagged_fence() {
let text = "```\nimport os\n```";
let got = extract_python_code_block(text);
assert_eq!(got.as_deref(), Some("import os"));
}
// Plain text yields `None`.
#[test]
fn returns_none_on_no_fence() {
let text = "Just plain text with no code block.";
let got = extract_python_code_block(text);
assert!(got.is_none());
}
// A fence whose body is blank yields `None`.
#[test]
fn returns_none_on_empty_fence() {
let text = "```python\n\n```";
let got = extract_python_code_block(text);
assert!(got.is_none());
}
// With several blocks, the first one is returned.
#[test]
fn extracts_first_of_multiple_blocks() {
let text = "```python\nfirst = 1\n```\nthen:\n```python\nsecond = 2\n```";
let got = extract_python_code_block(text);
assert_eq!(got.as_deref(), Some("first = 1"));
}
}
// Unit tests for `extract_prompt_preamble`: the prompt text that precedes the
// entry-point definition.
#[cfg(test)]
mod extract_prompt_preamble_tests {
use super::extract_prompt_preamble;
// A single import line before the function is captured, blank lines trimmed.
#[test]
fn captures_typing_import_preamble() {
let prompt = "from typing import List\n\n\ndef separate_paren_groups(s: str) -> List[str]:\n \"\"\"...\"\"\"\n";
let got = extract_prompt_preamble(prompt, "separate_paren_groups");
assert_eq!(got, "from typing import List");
}
// Several preamble lines are kept verbatim, including interior blank lines.
#[test]
fn captures_multiline_preamble() {
let prompt = "from typing import List, Tuple\nimport math\n\nPI = 3.14\n\ndef f(x: List[int]) -> Tuple[int, int]:\n pass\n";
let got = extract_prompt_preamble(prompt, "f");
assert_eq!(
got,
"from typing import List, Tuple\nimport math\n\nPI = 3.14"
);
}
// Nothing precedes the definition: empty preamble.
#[test]
fn empty_when_def_at_start() {
let prompt = "def trivial():\n pass\n";
let got = extract_prompt_preamble(prompt, "trivial");
assert_eq!(got, "");
}
// The named function is not in the prompt: empty preamble.
#[test]
fn empty_when_entry_missing() {
let prompt = "from typing import List\n\ndef other_fn():\n pass\n";
let got = extract_prompt_preamble(prompt, "expected_fn");
assert_eq!(got, "");
}
// An empty entry point short-circuits to an empty preamble.
#[test]
fn empty_when_entry_empty() {
let prompt = "from typing import List\n\ndef f():\n pass\n";
let got = extract_prompt_preamble(prompt, "");
assert_eq!(got, "");
}
// The "unknown" placeholder entry point also yields an empty preamble.
#[test]
fn empty_when_entry_unknown() {
let prompt = "from typing import List\n\ndef f():\n pass\n";
let got = extract_prompt_preamble(prompt, "unknown");
assert_eq!(got, "");
}
// Composition check: preamble + extracted code starts with the import and
// contains the function definition.
#[test]
fn rc3_falsifier_composed_program_is_valid_python() {
let prompt = "from typing import List\n\n\ndef separate_paren_groups(s: str) -> List[str]:\n pass\n";
let preamble = extract_prompt_preamble(prompt, "separate_paren_groups");
let extracted_code = "def separate_paren_groups(s: str) -> List[str]:\n return [s]";
let full = format!("{preamble}\n{extracted_code}\n");
assert!(
full.starts_with("from typing import List"),
"preamble must lead with import; got: {full}"
);
assert!(
full.contains("def separate_paren_groups"),
"must contain function: {full}"
);
}
}
// Unit tests for `align_continuation_indent`: re-indenting a raw continuation
// relative to the last non-blank line of the prompt.
#[cfg(test)]
mod align_indent_tests {
use super::align_continuation_indent;
// Compares a multi-line continuation against its expected aligned form.
#[test]
fn dedents_one_excess_space() {
let prompt = "def f(x: int) -> int:\n \"\"\" doc.\n \"\"\"\n";
let completion =
" for i in range(x):\n if i > 0:\n return i\n return 0\n";
let got = align_continuation_indent(prompt, completion);
let want =
" for i in range(x):\n if i > 0:\n return i\n return 0\n";
assert_eq!(got, want);
}
// A continuation that already matches the prompt indent is returned unchanged.
#[test]
fn passthrough_when_already_correct() {
let prompt = "def f():\n \"\"\"doc\"\"\"\n";
let completion = " return 42\n";
let got = align_continuation_indent(prompt, completion);
assert_eq!(got, completion);
}
// Lines starting at column zero (and what follows them) are left alone.
#[test]
fn leaves_zero_indent_lines_untouched() {
let prompt = "def f():\n \"\"\"doc\"\"\"\n";
let completion = " return 1\n\n\nif __name__ == \"__main__\":\n pass\n";
let got = align_continuation_indent(prompt, completion);
let want = " return 1\n\n\nif __name__ == \"__main__\":\n pass\n";
assert_eq!(got, want);
}
// Compares a two-line continuation against its expected aligned form.
#[test]
fn dedents_multi_space_excess() {
let prompt = " pass\n";
let completion = " x = 1\n nested = 2\n";
let got = align_continuation_indent(prompt, completion);
let want = " x = 1\n nested = 2\n";
assert_eq!(got, want);
}
// An empty continuation stays empty.
#[test]
fn empty_completion() {
let prompt = "def f():\n pass\n";
let completion = "";
let got = align_continuation_indent(prompt, completion);
assert_eq!(got, "");
}
// No indentation on either side: the continuation is returned unchanged.
#[test]
fn no_indent_anywhere() {
let prompt = "x = 1\n";
let completion = "y = 2\n";
let got = align_continuation_indent(prompt, completion);
assert_eq!(got, completion);
}
}
/// Writes a best-effort JSON debug report for one evaluated task into the
/// system temp directory as `apr_eval_debug_<task>.json`.
///
/// `/`, `\` and spaces in `task_id` are replaced by `_` for the file name.
/// The report contains the prompt, raw response, assembled completion, full
/// program and the execution diagnostics from `exec`. Serialization and write
/// failures are deliberately ignored: debugging output must never fail a run.
pub(super) fn write_apr_eval_debug(
    task_id: &str,
    prompt: &str,
    response: &str,
    completion: &str,
    full_program: &str,
    exec: &PythonExecResult,
) {
    let file_stem: String = task_id
        .chars()
        .map(|ch| match ch {
            '/' | '\\' | ' ' => '_',
            other => other,
        })
        .collect();
    let target = std::env::temp_dir().join(format!("apr_eval_debug_{file_stem}.json"));

    let report = serde_json::json!({
        "task_id": task_id,
        "prompt": prompt,
        "response": response,
        "response_len": response.len(),
        "completion": completion,
        "completion_len": completion.len(),
        "full_program": full_program,
        "exit_code": exec.exit_code,
        "stderr": exec.stderr_capture,
        "timed_out": exec.timed_out,
        "spawn_error": exec.spawn_error,
        "success": exec.success,
    });
    let rendered = serde_json::to_string_pretty(&report).unwrap_or_default();
    let _ = std::fs::write(&target, rendered);
}
/// Runs `program` with `python3` under a `timeout_secs` deadline and reports
/// only whether it succeeded. Use `execute_python_test_with_diagnostics` when
/// the exit code, stderr or timeout flag are needed.
pub(super) fn execute_python_test(program: &str, timeout_secs: u64) -> bool {
execute_python_test_with_diagnostics(program, timeout_secs).success
}
/// Outcome of running a Python program in a subprocess.
pub(super) struct PythonExecResult {
    /// `true` only when the interpreter exited with a success status.
    pub success: bool,
    /// Exit code when the process exited on its own; `None` on timeout, on a
    /// wait failure, when no code is available, or when it never started.
    pub exit_code: Option<i32>,
    /// Leading part (at most 64 KiB) of the child's stderr, decoded lossily
    /// as UTF-8.
    pub stderr_capture: String,
    /// `true` when the child was killed for exceeding the deadline.
    pub timed_out: bool,
    /// Set when the temp file could not be written or `python3` could not be
    /// spawned; `None` once the process has started.
    pub spawn_error: Option<String>,
}

/// Writes `program` to a temp file, runs it with `python3`, and reports the
/// outcome together with diagnostics.
///
/// The child is polled until it exits or `timeout_secs` elapse, at which
/// point it is killed and `timed_out` is set. Stdout is discarded.
///
/// Stderr is drained on a background thread WHILE the child runs. Reading it
/// only after exit would let a child that writes more than the OS pipe buffer
/// block on the write and be misreported as timed out. The first 64 KiB are
/// kept; the rest is read and dropped so the child never stalls.
///
/// The temp file is removed before returning on every path where it was
/// created.
pub(super) fn execute_python_test_with_diagnostics(
    program: &str,
    timeout_secs: u64,
) -> PythonExecResult {
    use std::io::{ErrorKind, Read};
    use std::process::{Command, Stdio};
    use std::sync::{Arc, Mutex};
    use std::time::{Duration, Instant};

    /// Maximum number of stderr bytes retained for diagnostics.
    const STDERR_CAP: usize = 65_536;
    /// How often the child is polled for exit.
    const POLL_INTERVAL: Duration = Duration::from_millis(50);
    /// How long to wait for the stderr drainer once the child is gone.
    const DRAIN_GRACE: Duration = Duration::from_millis(500);

    let launch_failure = |reason: String| PythonExecResult {
        success: false,
        exit_code: None,
        stderr_capture: String::new(),
        timed_out: false,
        spawn_error: Some(reason),
    };

    let tmp = std::env::temp_dir().join(format!(
        "apr_eval_{}_{}.py",
        std::process::id(),
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_nanos())
            .unwrap_or(0)
    ));
    if let Err(e) = std::fs::write(&tmp, program) {
        return launch_failure(format!("tmp write: {e}"));
    }

    let spawned = Command::new("python3")
        .arg(&tmp)
        .env("PYTHONDONTWRITEBYTECODE", "1")
        .stdout(Stdio::null())
        .stderr(Stdio::piped())
        .spawn();
    let mut child = match spawned {
        Ok(child) => child,
        Err(e) => {
            let _ = std::fs::remove_file(&tmp);
            return launch_failure(format!("spawn: {e}"));
        }
    };

    // Drain stderr concurrently. The buffer is shared so that whatever was
    // captured is still available if the drainer has to be abandoned.
    let captured = Arc::new(Mutex::new(Vec::<u8>::new()));
    let drainer = child.stderr.take().map(|mut pipe| {
        let sink = Arc::clone(&captured);
        std::thread::spawn(move || {
            let mut chunk = [0u8; 8192];
            loop {
                match pipe.read(&mut chunk) {
                    Ok(0) => break,
                    Ok(n) => {
                        if let Ok(mut kept) = sink.lock() {
                            // Keep only up to the cap; later bytes are dropped.
                            let room = STDERR_CAP.saturating_sub(kept.len());
                            kept.extend_from_slice(&chunk[..n.min(room)]);
                        }
                    }
                    Err(e) if e.kind() == ErrorKind::Interrupted => {}
                    Err(_) => break,
                }
            }
        })
    });

    let deadline = Instant::now() + Duration::from_secs(timeout_secs);
    let mut timed_out = false;
    let exit_status = loop {
        match child.try_wait() {
            Ok(Some(status)) => break Some(status),
            Ok(None) => {
                if Instant::now() >= deadline {
                    let _ = child.kill();
                    let _ = child.wait();
                    timed_out = true;
                    break None;
                }
                std::thread::sleep(POLL_INTERVAL);
            }
            Err(_) => break None,
        }
    };

    // The pipe normally hits EOF as soon as the child is gone. If something
    // else still holds the write end (e.g. a grandchild), stop waiting after
    // a short grace period instead of hanging the evaluation.
    if let Some(handle) = drainer {
        let give_up = Instant::now() + DRAIN_GRACE;
        while !handle.is_finished() && Instant::now() < give_up {
            std::thread::sleep(Duration::from_millis(2));
        }
        if handle.is_finished() {
            let _ = handle.join();
        }
    }
    let stderr_capture = captured
        .lock()
        .map(|bytes| String::from_utf8_lossy(&bytes).into_owned())
        .unwrap_or_default();

    let _ = std::fs::remove_file(&tmp);
    let exit_code = exit_status.and_then(|s| s.code());
    let success = exit_status.map(|s| s.success()).unwrap_or(false);
    PythonExecResult {
        success,
        exit_code,
        stderr_capture,
        timed_out,
        spawn_error: None,
    }
}
// Tests for `execute_python_test_with_diagnostics`. Every test that needs an
// interpreter returns early (and thus passes vacuously) when `python3` is
// not available on the machine.
#[cfg(test)]
mod execute_python_test_diagnostics_tests {
use super::execute_python_test_with_diagnostics;
// Probes for a working `python3` by running `python3 --version`.
fn python3_available() -> bool {
std::process::Command::new("python3")
.arg("--version")
.stdout(std::process::Stdio::null())
.stderr(std::process::Stdio::null())
.status()
.map(|s| s.success())
.unwrap_or(false)
}
// A trivial program succeeds with exit code 0 and nothing on stderr.
#[test]
fn success_program_reports_zero_exit_and_empty_stderr() {
if !python3_available() {
return;
}
let program = "print('hello')\n";
let r = execute_python_test_with_diagnostics(program, 5);
assert!(r.success, "program should succeed");
assert_eq!(r.exit_code, Some(0));
assert!(
r.stderr_capture.is_empty(),
"no stderr expected, got: {}",
r.stderr_capture
);
assert!(!r.timed_out);
assert!(r.spawn_error.is_none());
}
// A failing assertion yields exit code 1 and a traceback on stderr.
#[test]
fn assertion_failure_reports_nonzero_and_traceback() {
if !python3_available() {
return;
}
let program = "assert 1 == 2\n";
let r = execute_python_test_with_diagnostics(program, 5);
assert!(!r.success);
assert_eq!(r.exit_code, Some(1));
assert!(
r.stderr_capture.contains("AssertionError"),
"expected traceback, got: {}",
r.stderr_capture
);
assert!(!r.timed_out);
}
// Harness invariant: a program whose assertions hold is reported as success.
#[test]
fn harness_invariant_passing_program_reports_success() {
if !python3_available() {
return;
}
let program = "def f(x):\n return x + 1\n\nassert f(1) == 2\n";
let r = execute_python_test_with_diagnostics(program, 5);
assert!(r.success, "passing program must be reported as success");
assert_eq!(r.exit_code, Some(0));
}
// A passing program that prints 200 lines to stderr must neither time out
// nor be reported as a failure.
#[test]
fn verbose_stderr_does_not_deadlock_on_success() {
if !python3_available() {
return;
}
let program =
"import sys\nfor _ in range(200):\n print('x' * 50, file=sys.stderr)\nsys.exit(0)\n";
let r = execute_python_test_with_diagnostics(program, 10);
assert!(
r.success,
"10KB-stderr passing program timed_out={} exit_code={:?}",
r.timed_out, r.exit_code
);
assert!(!r.timed_out);
}
// Only meaningful on machines WITHOUT python3: the spawn failure must be
// surfaced through `spawn_error` with no exit code.
#[test]
fn missing_python3_reports_spawn_error() {
if python3_available() {
return; }
let r = execute_python_test_with_diagnostics("print('hello')\n", 5);
assert!(!r.success);
assert!(
r.spawn_error.is_some(),
"expected spawn_error when python3 absent"
);
assert_eq!(r.exit_code, None);
}
}
fn validate_humaneval_problem(problem: &HumanEvalProblem) -> bool {
if problem.prompt.trim().is_empty() || problem.test.trim().is_empty() {
return false;
}
if let Some(ref sol) = problem.canonical_solution {
if !sol.trim().is_empty() {
return true;
}
}
problem.prompt.contains("def ")
}
/// Finds the name of the first function defined in `prompt`.
///
/// Scans line by line for a line that, once trimmed, starts with `def ` and
/// contains an opening parenthesis; returns the text between the two. Lines
/// starting with `def ` but lacking a `(` are skipped. Returns `None` when no
/// line qualifies.
pub(super) fn extract_function_name(prompt: &str) -> Option<&str> {
    prompt.lines().find_map(|line| {
        let signature = line.trim().strip_prefix("def ")?;
        signature.split_once('(').map(|(name, _)| name)
    })
}
/// Prints a human-readable HumanEval report to stdout.
///
/// Emits one coloured PASS/FAIL line per entry of `results`, then a pass@k
/// line for every `k` in `k_values` (computed from `total` and `passed`),
/// the elapsed time, and a dimmed summary line naming the evaluation `mode`.
pub(super) fn print_humaneval_results(
    results: &[(String, String, bool)],
    total: usize,
    passed: usize,
    k_values: &[usize],
    elapsed: f32,
    mode: &str,
) {
    for (task_id, entry_point, ok) in results {
        let badge = if *ok { "PASS".green() } else { "FAIL".red() };
        let status = badge.to_string();
        println!(" [{status}] {task_id} ({entry_point})");
    }
    println!();

    for k in k_values.iter().copied() {
        let percent = compute_pass_at_k(total, passed, k) * 100.0;
        output::kv(&format!("pass@{k}"), format!("{:.1}%", percent));
    }
    output::kv("Time", format!("{elapsed:.2}s"));
    println!();

    let summary = format!("{passed}/{total} problems evaluated ({mode})");
    println!("{}", summary.dimmed());
}
/// One MBPP task, deserialized from a single line of the JSONL data file.
#[derive(Debug, serde::Deserialize)]
// NOTE(review): presumably present because some fields are only deserialized
// and never read in every build configuration — confirm before removing.
#[allow(dead_code)]
pub(super) struct MbppProblem {
/// Natural-language task description.
pub(super) text: String,
/// Optional reference solution; a missing field deserializes to `None`.
#[serde(default)]
pub(super) code: Option<String>,
/// Task identifier, kept as raw JSON. `run_mbpp` keeps tasks whose id is an
/// unsigned integer in 11..=510 and every task whose id is anything else.
pub(super) task_id: serde_json::Value,
/// Optional Python setup code placed before the tests.
#[serde(default)]
pub(super) test_setup_code: Option<String>,
/// Python test statements run against the generated code.
pub(super) test_list: Vec<String>,
/// Additional tests; a missing field deserializes to an empty list.
#[serde(default)]
pub(super) challenge_test_list: Vec<String>,
}
/// Runs the MBPP benchmark (sanitized subset) against `model_path` and emits
/// pass@k results.
///
/// Only problems whose numeric `task_id` lies in 11..=510 are evaluated;
/// problems with a non-numeric id are kept as they are.
///
/// # Arguments
/// * `model_path` - model file or checkpoint directory; must exist.
/// * `data_path` - MBPP JSONL file; required despite being an `Option`.
/// * `k_values` - the `k`s to report pass@k for.
/// * `json_output` - suppresses the human-readable progress output when set.
/// * `device` - `"cuda"` selects the CUDA inference path; anything else uses
///   the default path.
/// * `num_samples` - samples per problem (clamped to at least 1).
/// * `temperature` - shown in the header and forwarded to the result emitter;
///   it is not passed to the inference functions called here.
///
/// # Errors
/// Returns `CliError::ValidationFailed` when `data_path` is missing, the file
/// cannot be read, a line is not valid JSON, the file has no problems, or no
/// inference pass succeeds (unlike HumanEval there is no structural
/// fallback), and `CliError::FileNotFound` when either path does not exist.
pub(crate) fn run_mbpp(
    model_path: &Path,
    data_path: Option<&Path>,
    k_values: &[usize],
    json_output: bool,
    device: &str,
    num_samples: usize,
    temperature: f32,
) -> Result<()> {
    let data_path = data_path.ok_or_else(|| {
        CliError::ValidationFailed(
            "--data <mbpp.jsonl> is required for MBPP evaluation.\n\
             Format: Google MBPP JSONL with text, code, task_id, test_list"
                .to_string(),
        )
    })?;
    if !data_path.exists() {
        return Err(CliError::FileNotFound(data_path.to_path_buf()));
    }
    if !model_path.exists() {
        return Err(CliError::FileNotFound(model_path.to_path_buf()));
    }

    let content = std::fs::read_to_string(data_path)
        .map_err(|e| CliError::ValidationFailed(format!("Cannot read MBPP data: {e}")))?;
    // Number the lines BEFORE dropping blank ones so the reported line number
    // matches the physical line in the file.
    let problems: Vec<MbppProblem> = content
        .lines()
        .enumerate()
        .filter(|(_, line)| !line.trim().is_empty())
        .map(|(idx, line)| {
            serde_json::from_str(line).map_err(|e| {
                CliError::ValidationFailed(format!("Invalid JSON on line {}: {e}", idx + 1))
            })
        })
        .collect::<Result<Vec<_>>>()?;
    if problems.is_empty() {
        return Err(CliError::ValidationFailed("MBPP file is empty".to_string()));
    }

    // Restrict to the sanitized id range; ids that are not unsigned integers
    // cannot be range-checked and are kept.
    let problems: Vec<MbppProblem> = problems
        .into_iter()
        .filter(|p| {
            p.task_id
                .as_u64()
                .map_or(true, |id| (11..=510).contains(&id))
        })
        .collect();
    let num_samples = num_samples.max(1);

    if !json_output {
        output::section("APR MBPP Evaluation (sanitized)");
        println!();
        output::kv("Model", model_path.display());
        output::kv("Benchmark", data_path.display());
        output::kv("Problems", format!("{} (sanitized subset)", problems.len()));
        output::kv("k values", format!("{k_values:?}"));
        if num_samples > 1 {
            output::kv("Samples/problem", num_samples);
            output::kv("Temperature", format!("{temperature:.2}"));
        }
        println!();
    }

    let start = Instant::now();
    // One (task_id, entry_point, correct_count) slot per problem; MBPP has no
    // entry point, so that field stays empty.
    let mut per_problem_correct: Vec<(String, String, usize)> = problems
        .iter()
        .map(|p| (p.task_id.to_string(), String::new(), 0usize))
        .collect();

    // Remember only the first inference error; it is reported if every pass fails.
    let mut first_err: Option<String> = None;
    let any_ok = run_multisample_loop(&mut per_problem_correct, num_samples, json_output, || {
        let result = if device == "cuda" {
            run_mbpp_inference_cuda(model_path, &problems, k_values, json_output)
        } else {
            run_mbpp_inference(model_path, &problems, k_values, json_output)
        };
        if let Err(e) = &result {
            if first_err.is_none() {
                first_err = Some(e.to_string());
            }
        }
        result
    });
    if !any_ok {
        return Err(CliError::ValidationFailed(format!(
            "MBPP inference failed: {}",
            first_err.unwrap_or_else(|| "unknown error".to_string())
        )));
    }

    let elapsed = start.elapsed().as_secs_f32();
    emit_eval_results(
        "mbpp-sanitized",
        model_path,
        &per_problem_correct,
        num_samples,
        temperature,
        k_values,
        elapsed,
        "inference",
        json_output,
        Some(("subset", "sanitized (task_id 11-510)")),
    );
    Ok(())
}
/// Runs one greedy MBPP pass through the `realizar` inference entry point.
///
/// For every problem the prompt is the task text, followed by a
/// "Your code should pass these tests" section when the problem carries tests.
/// The generated text is reduced to a Python completion (fenced code block if
/// one is found, otherwise the text after the prompt, otherwise the decoded
/// completion tokens, cut by `truncate_at_function_boundary`), then concatenated
/// with the optional setup code and the tests and handed to
/// `execute_python_test_with_diagnostics` with a limit argument of `10`
/// (presumably a timeout in seconds — confirm against that helper).
///
/// # Returns
/// `(passed, results)` where `results` holds one
/// `(task_id, String::new(), passed)` tuple per problem in input order.
///
/// # Errors
/// Returns `Err` only when no tokenizer can be loaded for `model_path`. An
/// inference failure on a single problem is recorded as a failed problem and
/// the loop continues.
///
/// `_k_values` is accepted for signature parity and never read. Progress lines
/// are suppressed when `json_output` is true.
#[cfg(feature = "inference")]
fn run_mbpp_inference(
    model_path: &Path,
    problems: &[MbppProblem],
    _k_values: &[usize],
    json_output: bool,
) -> std::result::Result<(usize, Vec<(String, String, bool)>), String> {
    use realizar::{run_inference, InferenceConfig};

    let verbose = !json_output;
    if verbose {
        println!(" {} Loading model for inference...", "→".dimmed());
    }
    let tokenizer = match realizar::apr::AprV2Model::load_tokenizer(model_path) {
        Some(loaded) => loaded,
        None => return Err("No tokenizer found".to_string()),
    };
    if verbose {
        println!(" {} Tokenizer loaded", "✓".green());
    }

    let mut passed = 0usize;
    let mut outcomes = Vec::with_capacity(problems.len());
    for (idx, problem) in problems.iter().enumerate() {
        // String ids are used verbatim; anything else gets the "MBPP/" prefix.
        let task_id = match &problem.task_id {
            serde_json::Value::String(s) => s.clone(),
            serde_json::Value::Number(n) => format!("MBPP/{n}"),
            v => format!("MBPP/{v}"),
        };

        // The joined test list feeds both the prompt hint and the executed program.
        let tests = problem.test_list.join("\n");
        let test_hints = match problem.test_list.is_empty() {
            true => String::new(),
            false => format!("\nYour code should pass these tests:\n{}\n", tests),
        };
        let prompt = format!("{}{}", problem.text, test_hints);

        // Greedy decoding: temperature 0 with top-k 1.
        let config = InferenceConfig::new(model_path)
            .with_prompt(prompt.clone())
            .with_max_tokens(512)
            .with_temperature(0.0)
            .with_top_k(1);
        let generation = match run_inference(&config) {
            Ok(generation) => generation,
            Err(e) => {
                if verbose {
                    eprintln!(" [FAIL] {task_id}: inference error: {e}");
                }
                outcomes.push((task_id, String::new(), false));
                continue;
            }
        };

        let completion_owned = match extract_python_code_block_targeted(&generation.text, None) {
            Some(code) => code,
            None => {
                let raw = match generation.text.strip_prefix(prompt.as_str()) {
                    Some(rest) => rest.to_string(),
                    None => {
                        // Drop the prompt tokens only when something follows them;
                        // otherwise decode the whole sequence.
                        let skip = if generation.tokens.len() > generation.input_token_count {
                            generation.input_token_count
                        } else {
                            0
                        };
                        tokenizer.decode(&generation.tokens[skip..])
                    }
                };
                truncate_at_function_boundary(&raw).to_string()
            }
        };
        let completion = completion_owned.as_str();

        let setup = problem.test_setup_code.as_deref().unwrap_or("").trim();
        let full_program = match setup {
            "" => format!("{completion}\n{tests}\n"),
            _ => format!("{completion}\n{setup}\n{tests}\n"),
        };

        let exec_result = execute_python_test_with_diagnostics(&full_program, 10);
        let ok = exec_result.success;
        if std::env::var("APR_EVAL_DEBUG").is_ok() {
            write_apr_eval_debug(
                &task_id,
                &prompt,
                &generation.text,
                completion,
                &full_program,
                &exec_result,
            );
        }

        passed += usize::from(ok);
        outcomes.push((task_id, String::new(), ok));

        let done = idx + 1;
        if verbose && done % 50 == 0 {
            println!(
                " {} {}/{} problems evaluated ({} passed)",
                "→".dimmed(),
                done,
                problems.len(),
                passed
            );
        }
    }
    Ok((passed, outcomes))
}
/// Stand-in compiled when the `inference` feature is disabled.
///
/// Keeps the same signature as the feature-enabled variant so callers compile
/// unchanged; every argument is ignored and the call always fails with a
/// message naming the feature to enable.
#[cfg(not(feature = "inference"))]
fn run_mbpp_inference(
    _model_path: &Path,
    _problems: &[MbppProblem],
    _k_values: &[usize],
    _json_output: bool,
) -> std::result::Result<(usize, Vec<(String, String, bool)>), String> {
    let reason = "Inference not available (compile with --features inference)";
    Err(String::from(reason))
}
/// Runs one greedy MBPP pass on the GPU via `entrenar`'s `CudaTransformerTrainer`.
///
/// The checkpoint directory is `model_path` itself when it is a directory, or
/// its parent when it is a file. The tokenizer is looked up from `model_path`
/// first and then from `tokenizer.json` inside the checkpoint directory.
///
/// For each problem the prompt is the task text plus a trailing newline, and
/// tokens are generated one at a time with `sample_token` at temperature `0.0`
/// until 512 new tokens were produced, the sequence reaches
/// `max_position_embeddings`, or token id `0` is sampled. The decoded
/// completion is cut by `truncate_at_function_boundary`, concatenated with the
/// optional setup code and the tests, and executed through
/// `execute_python_test_with_diagnostics`.
///
/// # Returns
/// `(passed, results)` where `results` holds one
/// `(task_id, String::new(), passed)` tuple per problem in input order. A
/// problem whose prompt encodes to zero tokens is recorded as failed.
///
/// # Errors
/// Returns `Err` when the transformer config, the CUDA trainer, or the
/// tokenizer cannot be loaded, and also when `forward_logits` yields `None`
/// for any step (this aborts the whole pass rather than failing one problem).
///
/// `_k_values` is accepted for signature parity and never read. Progress lines
/// are suppressed when `json_output` is true.
///
/// NOTE(review): unlike the non-CUDA path visible in this file, the prompt here
/// does not include the "Your code should pass these tests" hints — confirm
/// whether that difference is intentional.
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "cuda", feature = "training"))]
fn run_mbpp_inference_cuda(
    model_path: &Path,
    problems: &[MbppProblem],
    _k_values: &[usize],
    json_output: bool,
) -> std::result::Result<(usize, Vec<(String, String, bool)>), String> {
    // `Path::parent` yields `Some("")` for a bare file name such as `model.apr`.
    // Map that to "." so the loaders and the error message below receive a
    // usable directory instead of an empty path.
    let checkpoint_dir: &Path = if model_path.is_file() {
        match model_path.parent() {
            Some(dir) if dir.as_os_str().is_empty() => Path::new("."),
            Some(dir) => dir,
            None => model_path,
        }
    } else {
        model_path
    };
    let config = load_transformer_config(checkpoint_dir)?;
    let max_seq = config.max_position_embeddings;
    if !json_output {
        println!(
            " {} Loading model onto GPU for inference (ALB-089)...",
            "→".dimmed()
        );
    }
    let mut trainer =
        entrenar::train::CudaTransformerTrainer::for_inference(checkpoint_dir, config)
            .map_err(|e| format!("CUDA inference init failed: {e}"))?;
    let tokenizer = realizar::apr::AprV2Model::load_tokenizer(model_path)
        .or_else(|| {
            let tok_path = checkpoint_dir.join("tokenizer.json");
            realizar::apr::AprV2Model::load_tokenizer_from_path(&tok_path)
        })
        .ok_or_else(|| format!("No tokenizer found in {}", checkpoint_dir.display()))?;
    if !json_output {
        println!(" {} GPU inference ready", "✓".green());
    }
    let mut passed = 0usize;
    let mut results = Vec::new();
    // Fixed seed handed to `sample_token`; shared across all problems.
    let mut rng_state: u64 = 42;
    for (i, problem) in problems.iter().enumerate() {
        // String ids are used verbatim; anything else gets the "MBPP/" prefix.
        let task_id = match &problem.task_id {
            serde_json::Value::Number(n) => format!("MBPP/{n}"),
            serde_json::Value::String(s) => s.clone(),
            v => format!("MBPP/{v}"),
        };
        let prompt = format!("{}\n", problem.text);
        let prompt_tokens = tokenizer.encode(&prompt);
        if prompt_tokens.is_empty() {
            results.push((task_id, String::new(), false));
            continue;
        }
        // Remember where the completion starts, then reuse the prompt buffer
        // as the growing sequence instead of cloning it.
        let prompt_len = prompt_tokens.len();
        let mut tokens: Vec<u32> = prompt_tokens;
        let max_new = 512;
        for _ in 0..max_new {
            if tokens.len() >= max_seq {
                break;
            }
            let logits = trainer
                .forward_logits(&tokens)
                .ok_or_else(|| "forward_logits failed".to_string())?;
            let next = sample_token(&logits, 0.0, &mut rng_state);
            tokens.push(next);
            // NOTE(review): token id 0 is treated as the stop token and is kept
            // in `tokens`, so it is also decoded into the completion — confirm
            // this matches the tokenizer's end-of-sequence id.
            if next == 0 {
                break;
            }
        }
        let completion_tokens = &tokens[prompt_len..];
        let decoded = tokenizer.decode(completion_tokens);
        let completion = truncate_at_function_boundary(&decoded);
        let setup = problem.test_setup_code.as_deref().unwrap_or("").trim();
        let tests = problem.test_list.join("\n");
        let full_program = if setup.is_empty() {
            format!("{completion}\n{tests}\n")
        } else {
            format!("{completion}\n{setup}\n{tests}\n")
        };
        let exec_result = execute_python_test_with_diagnostics(&full_program, 10);
        let ok = exec_result.success;
        if std::env::var("APR_EVAL_DEBUG").is_ok() {
            write_apr_eval_debug(
                &task_id,
                &prompt,
                &tokenizer.decode(&tokens),
                completion,
                &full_program,
                &exec_result,
            );
        }
        if ok {
            passed += 1;
        }
        results.push((task_id, String::new(), ok));
        if !json_output && (i + 1) % 50 == 0 {
            println!(
                " {} {}/{} problems evaluated ({} passed)",
                "→".dimmed(),
                i + 1,
                problems.len(),
                passed
            );
        }
    }
    Ok((passed, results))
}
/// Stand-in compiled unless both the `cuda` and `training` features are enabled.
///
/// Keeps the same signature as the GPU-backed variant so callers compile
/// unchanged; every argument is ignored and the call always fails with a
/// message naming the feature to enable.
#[cfg(not(all(feature = "cuda", feature = "training")))]
fn run_mbpp_inference_cuda(
    _model_path: &Path,
    _problems: &[MbppProblem],
    _k_values: &[usize],
    _json_output: bool,
) -> std::result::Result<(usize, Vec<(String, String, bool)>), String> {
    let reason = "CUDA not available (compile with --features cuda)";
    Err(String::from(reason))
}