#[allow(clippy::too_many_arguments)]
#[provable_contracts_macros::contract(
"apr-cli-command-safety-v1",
equation = "long_running_graceful"
)]
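/// Execute a single inference run against `source`, applying the given
/// sampling, tracing, and output options, and print the result in the
/// requested `output_format`.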
pub(crate) fn run(
source: &str,
input: Option<&Path>,
prompt: Option<&str>,
max_tokens: usize,
stream: bool,
language: Option<&str>,
task: Option<&str>,
output_format: &str,
no_gpu: bool,
offline: bool,
benchmark: bool,
verbose: bool,
trace: bool,
trace_steps: Option<&[String]>,
trace_verbose: bool,
trace_output: Option<PathBuf>,
trace_level: &str,
profile: bool,
temperature: f32,
top_k: usize,
top_p: Option<f32>,
seed: u64,
repeat_penalty: f32,
repeat_last_n: usize,
split_prompt: bool,
) -> Result<()> {
if language.is_some() {
eprintln!("Warning: --language is not yet supported for inference. Flag ignored.");
}
if task.is_some() {
eprintln!("Warning: --task is not yet supported for inference. Flag ignored.");
}
if output_format != "json" {
if offline {
println!("{}", "=== APR Run (OFFLINE MODE) ===".cyan().bold());
eprintln!(
"{}",
"Network access disabled. Only local/cached models allowed.".yellow()
);
} else {
println!("{}", "=== APR Run ===".cyan().bold());
}
println!();
println!("Source: {source}");
}
if trace {
print_trace_config(
trace_level,
trace_steps,
trace_verbose,
trace_output.as_ref(),
profile,
);
}
let options = RunOptions {
input: input.map(Path::to_path_buf),
prompt: prompt.map(String::from),
max_tokens,
output_format: output_format.to_string(),
force: false,
no_gpu,
offline,
benchmark,
verbose,
trace,
        trace_steps: trace_steps.map(<[String]>::to_vec),
trace_verbose,
trace_output,
trace_level: trace_level.to_string(),
profile,
temperature,
top_k,
top_p,
seed,
repeat_penalty,
repeat_last_n,
split_prompt,
};
let result = run_model(source, &options)?;
if trace && trace_level == "layer" {
print_layer_trace(&result, max_tokens);
}
if trace && trace_level == "payload" {
print_payload_trace(&result, max_tokens);
}
if trace && trace_level == "chrome" {
print_chrome_trace(&result, source, max_tokens, profile);
}
if profile && trace_level != "chrome" {
print_roofline_profile(&result, max_tokens);
}
print_run_output(
&result,
source,
output_format,
max_tokens,
benchmark,
stream,
)?;
Ok(())
}
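/// Write a Chrome trace-event JSON file (`trace-<timestamp>.json`) for the run.
/// Phase timings are synthesized from the total measured duration rather than
/// taken from per-phase measurements; see the comments below.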
fn print_chrome_trace(
result: &super::run::RunResult,
source: &str,
max_tokens: usize,
include_profile: bool,
) {
use std::time::{SystemTime, UNIX_EPOCH};
let timestamp = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
let filename = format!("trace-{timestamp}.json");
let mut events = Vec::new();
let load_dur = (result.duration_secs * 1_000_000.0) as u64;
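    // The timeline is synthetic: phase durations are fixed fractions of the total
    // measured duration (~10% model load, ~1% tokenize, ~1% embed), with the
    // remainder split evenly across generated tokens.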
events.push(serde_json::json!({
"name": "model_load",
"cat": "lifecycle",
"ph": "X",
"ts": 0,
"dur": load_dur / 10, "pid": 1,
"tid": 1,
"args": {"source": source, "max_tokens": max_tokens}
}));
    let mut ts_us: u64 = load_dur / 10;
    let tokenize_dur = load_dur / 100;
    events.push(serde_json::json!({
"name": "tokenize",
"cat": "tokenize",
"ph": "X",
"ts": ts_us,
"dur": tokenize_dur,
"pid": 1, "tid": 1,
"args": {"source": source}
}));
ts_us += tokenize_dur;
let embed_dur = load_dur / 100;
events.push(serde_json::json!({
"name": "embed",
"cat": "embed",
"ph": "X",
"ts": ts_us,
"dur": embed_dur,
"pid": 1, "tid": 1
}));
ts_us += embed_dur;
if let Some(count) = result.tokens_generated {
let gen_dur = load_dur - ts_us;
let per_token = if count > 0 {
gen_dur / count as u64
} else {
gen_dur
};
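        // Attribute ~90% of each token's budget to a synthetic layer event and
        // the rest to sampling; the `i % 28` layer index is only a label.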
for i in 0..count {
let token_start = ts_us + (i as u64 * per_token);
let layer_dur = per_token * 9 / 10;
events.push(serde_json::json!({
"name": format!("layer_{}", i % 28),
"cat": "layer",
"ph": "X",
"ts": token_start,
"dur": layer_dur,
"pid": 1, "tid": 1,
"args": {"token_idx": i, "layer": i % 28}
}));
events.push(serde_json::json!({
"name": "sample",
"cat": "sample",
"ph": "X",
"ts": token_start + layer_dur,
"dur": per_token - layer_dur,
"pid": 1, "tid": 1,
"args": {"token_idx": i}
}));
events.push(serde_json::json!({
"name": format!("token_{}", i),
"cat": "decode",
"ph": "X",
"ts": token_start,
"dur": per_token,
"pid": 1, "tid": 1,
"args": {"token_idx": i}
}));
}
}
let trace = serde_json::json!({
"traceEvents": events,
"displayTimeUnit": "ms",
"metadata": {
"source": source,
"tool": "apr run --trace --trace-level chrome",
"max_tokens": max_tokens,
"tok_per_sec": result.tok_per_sec,
"include_profile": include_profile
}
});
match std::fs::write(
&filename,
serde_json::to_string_pretty(&trace).unwrap_or_default(),
) {
Ok(()) => eprintln!("Chrome trace written to: {filename} (load in chrome://tracing)"),
Err(e) => eprintln!("Failed to write chrome trace: {e}"),
}
}
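/// Print the active tracing configuration to stderr before the run starts.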
fn print_trace_config(
trace_level: &str,
trace_steps: Option<&[String]>,
trace_verbose: bool,
trace_output: Option<&PathBuf>,
profile: bool,
) {
eprintln!("{}", "Inference tracing enabled (APR-TRACE-001)".cyan());
eprintln!(" Trace level: {}", trace_level);
if let Some(steps) = trace_steps {
eprintln!(" Trace steps: {}", steps.join(", "));
}
if trace_verbose {
eprintln!(" Verbose mode enabled");
}
if let Some(path) = trace_output {
eprintln!(" Output: {}", path.display());
}
if profile {
eprintln!(" Roofline profiling enabled");
}
}
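/// Route the run result to the right printer: streaming JSONL, a final JSON
/// blob, benchmark results, or plain human-readable output.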
fn print_run_output(
result: &RunResult,
source: &str,
output_format: &str,
max_tokens: usize,
benchmark: bool,
stream: bool,
) -> Result<()> {
if stream && !benchmark {
return print_stream_output(result, source, max_tokens);
}
if output_format == "json" && !benchmark {
let json = build_final_json(result, source, max_tokens);
println!(
"{}",
serde_json::to_string_pretty(&json).unwrap_or_default()
);
return Ok(());
}
if benchmark {
print_benchmark_results(result, source, output_format, max_tokens);
} else {
println!();
println!("{}", "Output:".green().bold());
println!("{}", result.text);
}
if !benchmark {
println!();
println!(
"Completed in {:.2}s {}",
result.duration_secs,
if result.cached {
"(cached)".dimmed()
} else {
"(downloaded)".dimmed()
}
);
}
Ok(())
}
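/// Build the final JSON summary for a run, deriving `tok_per_sec` from the
/// duration when the model did not report it, and rounding tok/s to one
/// decimal place and latency to two.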
fn build_final_json(result: &RunResult, source: &str, max_tokens: usize) -> serde_json::Value {
let tokens_generated = result.tokens_generated.unwrap_or(0);
let tok_per_sec = result.tok_per_sec.unwrap_or_else(|| {
if result.duration_secs > 0.0 {
tokens_generated as f64 / result.duration_secs
} else {
0.0
}
});
let tokens_json = result.generated_tokens.as_deref().unwrap_or(&[]);
serde_json::json!({
"model": source,
"text": result.text,
"tokens": tokens_json,
"tokens_generated": tokens_generated,
"max_tokens": max_tokens,
"tok_per_sec": (tok_per_sec * 10.0).round() / 10.0,
"inference_time_ms": (result.duration_secs * 1000.0 * 100.0).round() / 100.0,
"used_gpu": result.used_gpu.unwrap_or(false),
"cached": result.cached,
})
}
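/// Stream the run result to stdout as line-delimited JSON events.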
fn print_stream_output(result: &RunResult, source: &str, max_tokens: usize) -> Result<()> {
use std::io::Write;
let stdout = std::io::stdout();
let mut out = stdout.lock();
write_stream_output(&mut out, result, source, max_tokens)?;
out.flush()?;
Ok(())
}
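/// Write one `"token"` event per generated token, followed by a single
/// `"final"` event (the summary JSON with an added `event` field), one JSON
/// object per line.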
pub(crate) fn write_stream_output<W: std::io::Write>(
out: &mut W,
result: &RunResult,
source: &str,
max_tokens: usize,
) -> std::io::Result<()> {
if let Some(tokens) = result.generated_tokens.as_deref() {
for (index, token_id) in tokens.iter().copied().enumerate() {
let evt = serde_json::json!({
"event": "token",
"index": index as u32,
"token_id": token_id,
"text": "",
});
writeln!(out, "{}", serde_json::to_string(&evt).unwrap_or_default())?;
}
}
let mut final_blob = build_final_json(result, source, max_tokens);
if let Some(obj) = final_blob.as_object_mut() {
obj.insert(
"event".to_string(),
serde_json::Value::String("final".to_string()),
);
}
writeln!(
out,
"{}",
serde_json::to_string(&final_blob).unwrap_or_default()
)
}
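/// Run batch inference: prompts are read from `batch_file`, results are
/// written to stdout, and a summary is printed to stderr.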
#[cfg(feature = "inference")]
pub(crate) fn run_batch(
source: &str,
batch_file: &Path,
max_tokens: usize,
temperature: f32,
top_k: usize,
no_gpu: bool,
verbose: bool,
) -> Result<()> {
use realizar::{run_batch_inference, BatchInferenceConfig};
let model_source = ModelSource::parse(source)?;
let model_path = resolve_model(&model_source, false, false)?;
let config = BatchInferenceConfig {
model_path,
max_tokens,
temperature,
top_k,
no_gpu,
verbose,
stop_tokens: vec![],
};
let file = std::fs::File::open(batch_file)
.map_err(|_| CliError::FileNotFound(batch_file.to_path_buf()))?;
let reader = std::io::BufReader::new(file);
let stdout = std::io::stdout();
let writer = std::io::BufWriter::new(stdout.lock());
let stats = run_batch_inference(&config, reader, writer)
.map_err(|e| CliError::InferenceFailed(format!("Batch inference failed: {e}")))?;
eprintln!(
"[batch] Summary: {} prompts, {} ok, {} failed, {:.1} total tokens, {:.1}s model load",
stats.total_prompts,
stats.successful,
stats.failed,
stats.total_tokens_generated,
stats.model_load_ms / 1000.0,
);
if stats.failed > 0 {
eprintln!(
"Warning: {} of {} prompts failed",
stats.failed, stats.total_prompts
);
}
Ok(())
}
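/// Print benchmark metrics (tok/s, token count, latency) in human-readable
/// form, plus a compact JSON line when `output_format` is `"json"`.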
fn print_benchmark_results(
result: &RunResult,
source: &str,
output_format: &str,
max_tokens: usize,
) {
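    // Fall back to max_tokens when the result does not report a token count.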
let tokens_generated = result.tokens_generated.unwrap_or(max_tokens);
let tok_per_sec = if result.duration_secs > 0.0 {
tokens_generated as f64 / result.duration_secs
} else {
0.0
};
println!();
println!("{}", "=== Benchmark Results ===".cyan().bold());
println!("tok/s: {:.1}", tok_per_sec);
println!("tokens: {}", tokens_generated);
println!("latency: {:.2}ms", result.duration_secs * 1000.0);
println!("model: {}", source);
println!();
if output_format == "json" {
println!(
r#"{{"tok_s": {:.1}, "tokens": {}, "latency_ms": {:.2}}}"#,
tok_per_sec,
tokens_generated,
result.duration_secs * 1000.0
);
}
}