/// Assemble the complete `main.rs` source for a compiled model binary by
/// concatenating the generated header, CLI struct, main fn, materialize fn,
/// and server fn.
fn generate_main_rs(bin_name: &str, info: &ModelInfo) -> String {
    let arch_desc = if info.model_type.is_empty() || info.model_type == "unknown" {
        "ML model".to_string()
    } else {
        format!("{} model", info.model_type)
    };
    let param_desc = format_param_count(info.param_count);
    let mut src = String::with_capacity(8192);
    src.push_str(&generate_header(&arch_desc, &param_desc));
    src.push_str(&generate_cli_struct(bin_name, &arch_desc, &param_desc));
    src.push_str(&generate_main_fn(bin_name, &info.name, &info.model_type, &param_desc));
    src.push_str(&generate_materialize_fn(bin_name));
    src.push_str(&generate_server_fn(&info.name, &param_desc));
    src
}
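
/// Emit the file header of the generated binary: crate doc comment, imports,
/// and the `include_bytes!` constant that embeds the `.apr` model weights.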
fn generate_header(arch_desc: &str, param_desc: &str) -> String {
    format!(
        r##"//! Auto-generated by `apr compile` (APR-SPEC §4.16)
//!
//! Standalone {arch_desc} binary with embedded weights.
//! Inference powered by realizar via `run_inference()`.
use clap::Parser;
use realizar::{{run_inference, InferenceConfig}};
use std::io::Write;

/// Embedded model data ({param_desc} parameters)
const MODEL_DATA: &[u8] = include_bytes!("../model.apr");
"##
    )
}
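
/// Emit the clap-derived `Cli` struct that defines the generated binary's
/// command-line interface (prompt, token limit, GPU toggle, server mode, etc.).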
fn generate_cli_struct(bin_name: &str, arch_desc: &str, param_desc: &str) -> String {
    format!(
        r##"/// {bin_name} - compiled {arch_desc}
#[derive(Parser)]
#[command(name = "{bin_name}")]
#[command(about = "Standalone {arch_desc} ({param_desc} parameters)")]
struct Cli {{
    /// Text prompt for generation
    #[arg(short, long)]
    prompt: Option<String>,
    /// Input file (text file with prompt)
    #[arg(value_name = "INPUT")]
    input: Option<String>,
    /// Maximum tokens to generate
    #[arg(short = 'n', long, default_value = "128")]
    max_tokens: usize,
    /// Disable GPU acceleration
    #[arg(long)]
    no_gpu: bool,
    /// Show model info and exit
    #[arg(long)]
    info: bool,
    /// Output as JSON
    #[arg(long)]
    json: bool,
    /// Verbose output (show timing, tokens)
    #[arg(short, long)]
    verbose: bool,
    /// Start OpenAI-compatible HTTP server
    #[arg(long)]
    serve: bool,
    /// Server listen port (requires --serve)
    #[arg(long, default_value = "8080")]
    port: u16,
}}
"##
    )
}
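
/// Emit `main()` plus the helpers it calls: `print_info`, `resolve_prompt`,
/// and `print_result`. Handles `--info`, `--serve`, and one-shot inference.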
fn generate_main_fn(bin_name: &str, name: &str, model_type: &str, param_desc: &str) -> String {
    format!(
        r##"fn main() {{
    let cli = Cli::parse();
    if cli.info {{
        print_info(cli.json);
        return;
    }}
    let model_path = materialize_model();
    if cli.serve {{
        start_server(&model_path, cli.port, cli.no_gpu);
        return;
    }}
    let prompt = resolve_prompt(&cli.prompt, &cli.input);
    let mut config = InferenceConfig::new(&model_path)
        .with_prompt(&prompt)
        .with_max_tokens(cli.max_tokens)
        .with_verbose(cli.verbose);
    if cli.no_gpu {{
        config = config.without_gpu();
    }}
    match run_inference(&config) {{
        Ok(result) => print_result(&result, cli.json, cli.verbose),
        Err(e) => {{
            eprintln!("Inference failed: {{e}}");
            std::process::exit(1);
        }}
    }}
    let _ = std::fs::remove_file(&model_path);
}}

fn print_info(json: bool) {{
    if json {{
        println!(r#"{{{{"name":"{name}","architecture":"{model_type}","parameters":"{param_desc}","embedded_bytes":{{}}}}}}"#, MODEL_DATA.len());
    }} else {{
        println!("Model: {name}");
        println!("Architecture: {model_type}");
        println!("Parameters: {param_desc}");
        println!("Embedded size: {{}} bytes", MODEL_DATA.len());
    }}
}}

fn resolve_prompt(prompt_arg: &Option<String>, input_arg: &Option<String>) -> String {{
    if let Some(p) = prompt_arg {{
        return p.clone();
    }}
    if let Some(path) = input_arg {{
        return std::fs::read_to_string(path).unwrap_or_else(|e| {{
            eprintln!("Error reading input file: {{e}}");
            std::process::exit(1);
        }});
    }}
    eprintln!("Error: --prompt or INPUT file required");
    eprintln!("Usage: {bin_name} --prompt \"your prompt here\"");
    std::process::exit(1);
}}

fn print_result(result: &realizar::InferenceResult, json: bool, verbose: bool) {{
    if json {{
        println!(
            r#"{{{{"text":"{{}}","tokens_generated":{{}},"tok_per_sec":{{:.1}},"used_gpu":{{}},"inference_ms":{{:.1}}}}}}"#,
            result.text.replace('"', r#"\""#),
            result.generated_token_count,
            result.tok_per_sec,
            result.used_gpu,
            result.inference_ms,
        );
    }} else {{
        print!("{{}}", result.text);
        if verbose {{
            eprintln!();
            eprintln!("[{{}} tokens, {{:.1}} tok/s, {{:.0}}ms, gpu={{}}]",
                result.generated_token_count,
                result.tok_per_sec,
                result.inference_ms,
                result.used_gpu,
            );
        }}
    }}
}}
"##,
        bin_name = bin_name,
        name = name,
        model_type = model_type,
        param_desc = param_desc,
    )
}
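
/// Emit `materialize_model()`, which writes the embedded weights to a temp
/// directory (reusing an existing copy of matching size) and returns the path.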
fn generate_materialize_fn(bin_name: &str) -> String {
    format!(
        r##"fn materialize_model() -> std::path::PathBuf {{
    let dir = std::env::temp_dir().join("{bin_name}-model");
    std::fs::create_dir_all(&dir).unwrap_or_else(|e| {{
        eprintln!("Failed to create temp dir: {{e}}");
        std::process::exit(1);
    }});
    let path = dir.join("model.apr");
    // Reuse a previously materialized copy if the size matches.
    if let Ok(meta) = std::fs::metadata(&path) {{
        if meta.len() == MODEL_DATA.len() as u64 {{
            return path;
        }}
    }}
    let mut f = std::fs::File::create(&path).unwrap_or_else(|e| {{
        eprintln!("Failed to create model file: {{e}}");
        std::process::exit(1);
    }});
    f.write_all(MODEL_DATA).unwrap_or_else(|e| {{
        eprintln!("Failed to write model data: {{e}}");
        std::process::exit(1);
    }});
    path
}}
"##
    )
}
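
/// Emit `start_server()`, a minimal OpenAI-compatible HTTP server (axum) with
/// `GET /health` and `POST /v1/chat/completions` backed by `run_inference`.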
fn generate_server_fn(name: &str, param_desc: &str) -> String {
    format!(
        r##"fn start_server(model_path: &std::path::Path, port: u16, no_gpu: bool) {{
    eprintln!("Starting server on port {{port}}...");
    eprintln!("Model: {name} ({param_desc} parameters)");
    eprintln!("Endpoints: GET /health, POST /v1/chat/completions");
    let rt = tokio::runtime::Runtime::new().unwrap_or_else(|e| {{
        eprintln!("Failed to create async runtime: {{e}}");
        std::process::exit(1);
    }});
    let model_path = model_path.to_path_buf();
    rt.block_on(async move {{
        use axum::{{Router, Json, extract::State, routing::{{get, post}}}};
        use std::sync::Arc;

        #[derive(Clone)]
        struct ServerState {{
            model_path: std::path::PathBuf,
            no_gpu: bool,
        }}
        let state = Arc::new(ServerState {{ model_path, no_gpu }});

        async fn health() -> axum::Json<serde_json::Value> {{
            Json(serde_json::json!({{"status": "ok"}}))
        }}

        async fn chat_completions(
            State(state): State<Arc<ServerState>>,
            Json(body): Json<serde_json::Value>,
        ) -> Json<serde_json::Value> {{
            let max_tokens = body.get("max_tokens")
                .and_then(|v| v.as_u64())
                .unwrap_or(128) as usize;
            let prompt = body["messages"].as_array()
                .and_then(|msgs| msgs.last())
                .and_then(|m| m["content"].as_str())
                .unwrap_or("").to_string();
            let mut config = InferenceConfig::new(&state.model_path)
                .with_prompt(&prompt)
                .with_max_tokens(max_tokens);
            if state.no_gpu {{
                config = config.without_gpu();
            }}
            match run_inference(&config) {{
                Ok(r) => Json(serde_json::json!({{
                    "choices": [{{"message": {{"role":"assistant","content": r.text}},
                        "finish_reason":"stop","index":0}}],
                    "usage": {{"completion_tokens": r.generated_token_count,
                        "total_tokens": r.input_token_count + r.generated_token_count}}
                }})),
                Err(e) => Json(serde_json::json!({{
                    "error": {{"message": format!("{{e}}"), "type":"inference_error"}}
                }})),
            }}
        }}

        let app = Router::new()
            .route("/health", get(health))
            .route("/v1/chat/completions", post(chat_completions))
            .with_state(state);
        let listener = tokio::net::TcpListener::bind(format!("0.0.0.0:{{port}}"))
            .await.unwrap_or_else(|e| {{
                eprintln!("Failed to bind port {{port}}: {{e}}");
                std::process::exit(1);
            }});
        eprintln!("[ok] Listening on http://0.0.0.0:{{port}}");
        axum::serve(listener, app).await.unwrap_or_else(|e| {{
            eprintln!("Server error: {{e}}");
            std::process::exit(1);
        }});
    }});
}}
"##,
        name = name,
        param_desc = param_desc,
    )
}
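
/// Emit the Cargo.toml for the generated crate: runtime dependencies
/// (clap, realizar, tokio, axum, serde_json) and a size-optimized release profile.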
fn generate_cargo_toml(bin_name: &str) -> String {
    format!(
        r#"[package]
name = "{bin_name}"
version = "0.1.0"
edition = "2021"

[dependencies]
clap = {{ version = "4", features = ["derive"] }}
realizar = {{ version = "0.8", default-features = false, features = ["gpu"] }}
tokio = {{ version = "1", features = ["rt-multi-thread", "macros", "net"] }}
axum = {{ version = "0.7", features = ["json"] }}
serde_json = "1"

[profile.release]
opt-level = "s"
codegen-units = 1
"#
    )
}