apr-cli 0.4.17

CLI tool for APR model inspection, debugging, and operations

/// Generate the main.rs source for the compiled binary.
///
/// Wires up `realizar::run_inference()` to perform the actual model inference.
/// The embedded model bytes are written to a temp file at runtime
/// (realizar requires a file path, not a byte slice).
fn generate_main_rs(bin_name: &str, info: &ModelInfo) -> String {
    let arch_desc = if info.model_type.is_empty() || info.model_type == "unknown" {
        "ML model".to_string()
    } else {
        format!("{} model", info.model_type)
    };

    let param_desc = format_param_count(info.param_count);

    let mut src = String::with_capacity(8192);
    src.push_str(&generate_header(&arch_desc, &param_desc));
    src.push_str(&generate_cli_struct(bin_name, &arch_desc, &param_desc));
    src.push_str(&generate_main_fn(bin_name, &info.name, &info.model_type, &param_desc));
    src.push_str(&generate_materialize_fn(bin_name));
    src.push_str(&generate_server_fn(&info.name, &param_desc));
    src
}
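
// Illustrative sketch, not part of apr-cli: one way the generators in this file
// might be assembled into an on-disk crate. The output layout is an assumption
// of this sketch; the only hard constraint is that the model file ends up at
// `<out_dir>/model.apr`, because the generated main.rs embeds it via
// `include_bytes!("../model.apr")` relative to src/main.rs.
#[allow(dead_code)]
fn write_generated_crate(
    out_dir: &std::path::Path,
    bin_name: &str,
    info: &ModelInfo,
) -> std::io::Result<()> {
    std::fs::create_dir_all(out_dir.join("src"))?;
    std::fs::write(out_dir.join("Cargo.toml"), generate_cargo_toml(bin_name))?;
    std::fs::write(out_dir.join("src").join("main.rs"), generate_main_rs(bin_name, info))?;
    // The caller is expected to copy the .apr file to `<out_dir>/model.apr`
    // before invoking `cargo build --release` in `out_dir`.
    Ok(())
}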

/// Generate file header with imports.
fn generate_header(arch_desc: &str, param_desc: &str) -> String {
    format!(
        r##"//! Auto-generated by `apr compile` (APR-SPEC ยง4.16)
//!
//! Standalone {arch_desc} binary with embedded weights.
//! Inference powered by realizar via `run_inference()`.

use clap::Parser;
use realizar::{{run_inference, InferenceConfig}};
use std::io::Write;

/// Embedded model data ({param_desc} parameters)
const MODEL_DATA: &[u8] = include_bytes!("../model.apr");

"##
    )
}

/// Generate the CLI argument struct.
fn generate_cli_struct(bin_name: &str, arch_desc: &str, param_desc: &str) -> String {
    format!(
        r##"/// {bin_name} - compiled {arch_desc}
#[derive(Parser)]
#[command(name = "{bin_name}")]
#[command(about = "Standalone {arch_desc} ({param_desc} parameters)")]
struct Cli {{
    /// Text prompt for generation
    #[arg(short, long)]
    prompt: Option<String>,

    /// Input file (text file with prompt)
    #[arg(value_name = "INPUT")]
    input: Option<String>,

    /// Maximum tokens to generate
    #[arg(short = 'n', long, default_value = "128")]
    max_tokens: usize,

    /// Disable GPU acceleration
    #[arg(long)]
    no_gpu: bool,

    /// Show model info and exit
    #[arg(long)]
    info: bool,

    /// Output as JSON
    #[arg(long)]
    json: bool,

    /// Verbose output (show timing, tokens)
    #[arg(short, long)]
    verbose: bool,

    /// Start OpenAI-compatible HTTP server
    #[arg(long)]
    serve: bool,

    /// Server listen port (requires --serve)
    #[arg(long, default_value = "8080")]
    port: u16,
}}

"##
    )
}

/// Generate the main() function body.
fn generate_main_fn(bin_name: &str, name: &str, model_type: &str, param_desc: &str) -> String {
    format!(
        r##"fn main() {{
    let cli = Cli::parse();

    if cli.info {{
        print_info(cli.json);
        return;
    }}

    let model_path = materialize_model();

    if cli.serve {{
        start_server(&model_path, cli.port, cli.no_gpu);
        return;
    }}

    let prompt = resolve_prompt(&cli.prompt, &cli.input);

    let mut config = InferenceConfig::new(&model_path)
        .with_prompt(&prompt)
        .with_max_tokens(cli.max_tokens)
        .with_verbose(cli.verbose);

    if cli.no_gpu {{
        config = config.without_gpu();
    }}

    match run_inference(&config) {{
        Ok(result) => print_result(&result, cli.json, cli.verbose),
        Err(e) => {{
            eprintln!("Inference failed: {{e}}");
            std::process::exit(1);
        }}
    }}

    let _ = std::fs::remove_file(&model_path);
}}

fn print_info(json: bool) {{
    if json {{
        println!(r#"{{{{"name":"{name}","architecture":"{model_type}","parameters":"{param_desc}","embedded_bytes":{{}}}}}}"#, MODEL_DATA.len());
    }} else {{
        println!("Model: {name}");
        println!("Architecture: {model_type}");
        println!("Parameters: {param_desc}");
        println!("Embedded size: {{}} bytes", MODEL_DATA.len());
    }}
}}

fn resolve_prompt(prompt_arg: &Option<String>, input_arg: &Option<String>) -> String {{
    if let Some(p) = prompt_arg {{
        return p.clone();
    }}
    if let Some(path) = input_arg {{
        return std::fs::read_to_string(path).unwrap_or_else(|e| {{
            eprintln!("Error reading input file: {{e}}");
            std::process::exit(1);
        }});
    }}
    eprintln!("Error: --prompt or INPUT file required");
    eprintln!("Usage: {bin_name} --prompt \"your prompt here\"");
    std::process::exit(1);
}}

fn print_result(result: &realizar::InferenceResult, json: bool, verbose: bool) {{
    if json {{
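        // NOTE: only '"' is escaped below; backslashes or raw newlines in the text would produce invalid JSON.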
        println!(
            r#"{{{{"text":"{{}}","tokens_generated":{{}},"tok_per_sec":{{:.1}},"used_gpu":{{}},"inference_ms":{{:.1}}}}}}"#,
            result.text.replace('"', r#"\""#),
            result.generated_token_count,
            result.tok_per_sec,
            result.used_gpu,
            result.inference_ms,
        );
    }} else {{
        print!("{{}}", result.text);
        if verbose {{
            eprintln!();
            eprintln!("[{{}} tokens, {{:.1}} tok/s, {{:.0}}ms, gpu={{}}]",
                result.generated_token_count,
                result.tok_per_sec,
                result.inference_ms,
                result.used_gpu,
            );
        }}
    }}
}}

"##,
        bin_name = bin_name,
        name = name,
        model_type = model_type,
        param_desc = param_desc,
    )
}

/// Generate the model materialization function.
fn generate_materialize_fn(bin_name: &str) -> String {
    format!(
        r##"fn materialize_model() -> std::path::PathBuf {{
    let dir = std::env::temp_dir().join("{bin_name}-model");
    std::fs::create_dir_all(&dir).unwrap_or_else(|e| {{
        eprintln!("Failed to create temp dir: {{e}}");
        std::process::exit(1);
    }});
    let path = dir.join("model.apr");

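    // Reuse a previously materialized copy when its size matches the embedded data, so repeated runs skip the rewrite.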
    if let Ok(meta) = std::fs::metadata(&path) {{
        if meta.len() == MODEL_DATA.len() as u64 {{
            return path;
        }}
    }}

    let mut f = std::fs::File::create(&path).unwrap_or_else(|e| {{
        eprintln!("Failed to write model: {{e}}");
        std::process::exit(1);
    }});
    f.write_all(MODEL_DATA).unwrap_or_else(|e| {{
        eprintln!("Failed to write model data: {{e}}");
        std::process::exit(1);
    }});
    path
}}

"##
    )
}

/// Generate the HTTP server function.
fn generate_server_fn(name: &str, param_desc: &str) -> String {
    format!(
        r##"fn start_server(model_path: &std::path::Path, port: u16, no_gpu: bool) {{
    eprintln!("Starting server on port {{port}}...");
    eprintln!("Model: {name} ({param_desc} parameters)");
    eprintln!("Endpoints: GET /health, POST /v1/chat/completions");

    let rt = tokio::runtime::Runtime::new().unwrap_or_else(|e| {{
        eprintln!("Failed to create async runtime: {{e}}");
        std::process::exit(1);
    }});

    let model_path = model_path.to_path_buf();
    rt.block_on(async move {{
        use axum::{{Router, Json, extract::State, routing::{{get, post}}}};
        use std::sync::Arc;

        #[derive(Clone)]
        struct ServerState {{
            model_path: std::path::PathBuf,
            no_gpu: bool,
        }}

        let state = Arc::new(ServerState {{ model_path, no_gpu }});

        async fn health() -> axum::Json<serde_json::Value> {{
            Json(serde_json::json!({{"status": "ok"}}))
        }}

        async fn chat_completions(
            State(state): State<Arc<ServerState>>,
            Json(body): Json<serde_json::Value>,
        ) -> Json<serde_json::Value> {{
            let max_tokens = body.get("max_tokens")
                .and_then(|v| v.as_u64())
                .unwrap_or(128) as usize;
            let prompt = body["messages"].as_array()
                .and_then(|msgs| msgs.last())
                .and_then(|m| m["content"].as_str())
                .unwrap_or("").to_string();

            let mut config = InferenceConfig::new(&state.model_path)
                .with_prompt(&prompt)
                .with_max_tokens(max_tokens);
            if state.no_gpu {{
                config = config.without_gpu();
            }}

            match run_inference(&config) {{
                Ok(r) => Json(serde_json::json!({{
                    "choices": [{{"message": {{"role":"assistant","content": r.text}},
                        "finish_reason":"stop","index":0}}],
                    "usage": {{"completion_tokens": r.generated_token_count,
                        "total_tokens": r.input_token_count + r.generated_token_count}}
                }})),
                Err(e) => Json(serde_json::json!({{
                    "error": {{"message": format!("{{e}}"), "type":"inference_error"}}
                }})),
            }}
        }}

        let app = Router::new()
            .route("/health", get(health))
            .route("/v1/chat/completions", post(chat_completions))
            .with_state(state);

        let listener = tokio::net::TcpListener::bind(format!("0.0.0.0:{{port}}"))
            .await.unwrap_or_else(|e| {{
                eprintln!("Failed to bind port {{port}}: {{e}}");
                std::process::exit(1);
            }});
        eprintln!("[ok] Listening on http://0.0.0.0:{{port}}");
        axum::serve(listener, app).await.unwrap_or_else(|e| {{
            eprintln!("Server error: {{e}}");
            std::process::exit(1);
        }});
    }});
}}
"##,
        name = name,
        param_desc = param_desc,
    )
}
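
// Illustrative sketch: the request shape the generated /v1/chat/completions
// handler actually reads. Only `max_tokens` and the `content` of the *last*
// entry in `messages` are used; other OpenAI-style fields are ignored.
#[allow(dead_code)]
const EXAMPLE_CHAT_REQUEST: &str = r#"{
  "max_tokens": 64,
  "messages": [
    { "role": "user", "content": "Write a haiku about compilers." }
  ]
}"#;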

/// Generate Cargo.toml with realizar + server dependencies.
fn generate_cargo_toml(bin_name: &str) -> String {
    format!(
        r#"[package]
name = "{bin_name}"
version = "0.1.0"
edition = "2021"

[dependencies]
clap = {{ version = "4", features = ["derive"] }}
realizar = {{ version = "0.8", default-features = false, features = ["gpu"] }}
tokio = {{ version = "1", features = ["rt-multi-thread", "macros", "net"] }}
axum = {{ version = "0.7", features = ["json"] }}
serde_json = "1"

[profile.release]
opt-level = "s"
codegen-units = 1
"#
    )
}
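
// Illustrative sketch: a sanity test over the generators that take only &str
// arguments, so nothing about ModelInfo's shape is assumed.
#[cfg(test)]
mod codegen_sanity {
    use super::*;

    #[test]
    fn header_embeds_model_and_manifest_names_binary() {
        let header = generate_header("llama model", "1.1B");
        assert!(header.contains("include_bytes!(\"../model.apr\")"));
        assert!(header.contains("use clap::Parser;"));

        let manifest = generate_cargo_toml("tiny-bin");
        assert!(manifest.contains("name = \"tiny-bin\""));
        assert!(manifest.contains("axum"));
    }
}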