#[cfg(feature = "inference")]
pub mod handlers;
pub mod routes;
#[cfg(feature = "inference")]
pub mod safetensors;
pub mod types;
pub use types::*;
#[cfg(test)]
mod tests;
use std::path::Path;
use colored::Colorize;
use crate::error::{CliError, Result};
/// Prints the "APR Serve" startup banner for `model_path` and, when the crate
/// is built with the `inference` feature, hands control to
/// `handlers::start_realizar_server`.
///
/// Steps, in order:
/// 1. Export `REALIZR_CONTEXT_LENGTH` (always) and `REALIZR_NO_FP8_CACHE=1`
///    (only when `config.no_fp8_cache` is set) into the process environment.
/// 2. With the `inference` feature: ask realizar to configure its thread
///    pool. A failure is reported on stderr and otherwise ignored.
/// 3. Print the banner (model path, bind address, non-default options).
/// 4. Check that `model_path` exists.
/// 5. Build a [`ServerState`] and report whether it uses mmap or full loading.
/// 6. Print the endpoint list, then start the server (or, without the
///    `inference` feature, print a notice and return `Ok(())`).
///
/// # Errors
///
/// * [`CliError::FileNotFound`] if `model_path` does not exist.
/// * Any error returned by `ServerState::new`.
/// * With the `inference` feature, any error returned by
///   `handlers::start_realizar_server`.
pub(crate) fn run(model_path: &Path, config: &ServerConfig) -> Result<()> {
    // `std::env::set_var` is only sound while no other thread is accessing the
    // environment, so the variables are exported *before* the thread-pool
    // setup below.
    // NOTE(review): assumes `configure_optimal_thread_pool` is what spawns the
    // worker threads and that it does not read these variables — confirm
    // against realizar.
    // NOTE(review): the consumers of the REALIZR_* variables are not visible
    // in this file; presumably the realizar runtime reads them — confirm.
    std::env::set_var("REALIZR_CONTEXT_LENGTH", config.context_length.to_string());
    if config.no_fp8_cache {
        std::env::set_var("REALIZR_NO_FP8_CACHE", "1");
    }

    // Best effort: a failure here is only logged, never propagated.
    #[cfg(feature = "inference")]
    if let Err(e) = realizar::inference::configure_optimal_thread_pool() {
        eprintln!("[PMAT-297] Thread pool config: {e} (may already be initialized)");
    }

    // --- Banner -----------------------------------------------------------
    println!("{}", "=== APR Serve ===".cyan().bold());
    println!();
    println!("Model: {}", model_path.display());
    println!("Binding: {}", config.bind_addr());
    // NOTE(review): 4096 looks like the default context length, so this line
    // is only shown when the value was overridden — confirm against
    // `ServerConfig`.
    if config.context_length != 4096 {
        println!(
            "Context length: {} (--context-length)",
            config.context_length
        );
    }
    if config.no_fp8_cache {
        println!("FP8 cache: DISABLED (--no-fp8-cache, saves ~1.5 GB)");
    }
    println!();

    // --- Validation and state ---------------------------------------------
    if !model_path.exists() {
        return Err(CliError::FileNotFound(model_path.to_path_buf()));
    }

    // `state` stays bound until the end of the function, so it is dropped only
    // after the server call below returns.
    let state = ServerState::new(model_path.to_path_buf(), config.clone())?;
    println!(
        "{}",
        format!(
            "Model loading: {}",
            if state.uses_mmap { "mmap" } else { "full" }
        )
        .dimmed()
    );
    println!();

    // --- Endpoint overview --------------------------------------------------
    println!("{}", "Endpoints:".green().bold());
    println!(" POST /predict - Model prediction (APR)");
    println!(" POST /generate - Text generation (GGUF)");
    println!(" GET /health - Health check");
    if config.metrics {
        println!(" GET /metrics - Prometheus metrics");
    }
    println!();
    println!("{}", "Press Ctrl+C to stop".dimmed());

    // With the `inference` feature the result of the server call is the result
    // of `run`.
    #[cfg(feature = "inference")]
    {
        handlers::start_realizar_server(model_path, config)
    }

    // Without it there is no server to start; say so and succeed.
    #[cfg(not(feature = "inference"))]
    {
        println!();
        println!("{}", "[Server requires --features inference]".yellow());
        Ok(())
    }
}