#[cfg(feature = "server")]
mod server_commands {
use super::Result;
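/// Fully initialized application state plus the metadata recorded while
/// preparing it, so callers can report the detected model type and whether
/// batch mode actually took effect before binding a listener.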
pub struct PreparedServer {
pub state: crate::api::AppState,
pub batch_mode_enabled: bool,
pub model_type: ModelType,
}
impl std::fmt::Debug for PreparedServer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("PreparedServer")
.field("batch_mode_enabled", &self.batch_mode_enabled)
.field("model_type", &self.model_type)
.finish_non_exhaustive()
}
}
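/// Model container format, detected from the file extension in
/// `prepare_serve_state`.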
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModelType {
Gguf,
SafeTensors,
Apr,
}
impl std::fmt::Display for ModelType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ModelType::Gguf => write!(f, "GGUF"),
ModelType::SafeTensors => write!(f, "SafeTensors"),
ModelType::Apr => write!(f, "APR"),
}
}
}
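/// Prepares serving state for a `.gguf` model: memory-maps the file, builds
/// the fused-Q4_K quantized model, loads the embedded vocabulary (or a
/// placeholder one), and then picks the CUDA, batch, or plain CPU state.
/// Batch mode wins over the single-request CUDA path when both are requested.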
fn prepare_gguf_serve_state(
model_path: &str,
batch_mode: bool,
force_gpu: bool,
) -> Result<PreparedServer> {
use crate::gguf::MappedGGUFModel;
println!("Parsing GGUF file...");
let mapped = MappedGGUFModel::from_path(model_path).map_err(|e| {
crate::error::RealizarError::UnsupportedOperation {
operation: "load_gguf".to_string(),
reason: format!("Failed to load GGUF: {e}"),
}
})?;
println!("Successfully loaded GGUF model");
println!(" Tensors: {}", mapped.model.tensors.len());
println!(" Metadata: {} entries", mapped.model.metadata.len());
println!();
println!("Creating quantized model (fused Q4_K ops)...");
let quantized_model =
crate::gguf::OwnedQuantizedModel::from_mapped(&mapped).map_err(|e| {
crate::error::RealizarError::UnsupportedOperation {
operation: "create_quantized".to_string(),
reason: format!("Failed to create quantized model: {e}"),
}
})?;
println!("Quantized model created successfully!");
println!(" Vocab size: {}", quantized_model.config.vocab_size);
println!(" Hidden dim: {}", quantized_model.config.hidden_dim);
println!(" Layers: {}", quantized_model.layers.len());
let vocab = mapped.model.vocabulary().unwrap_or_else(|| {
eprintln!(" Warning: No vocabulary in GGUF, using placeholder tokens");
(0..quantized_model.config.vocab_size)
.map(|i| format!("token{i}"))
.collect()
});
println!(" Vocab loaded: {} tokens", vocab.len());
println!();
#[cfg(feature = "cuda")]
let use_cuda = force_gpu
|| std::env::var("REALIZAR_BACKEND")
.map(|v| v.eq_ignore_ascii_case("cuda"))
.unwrap_or(false);
#[cfg(not(feature = "cuda"))]
let use_cuda = false;
#[cfg(not(feature = "cuda"))]
if force_gpu {
eprintln!("Warning: --gpu flag requires 'cuda' feature. Falling back to CPU.");
eprintln!("Build with: cargo build --features cuda");
eprintln!();
}
// Batch mode takes precedence: the single-request CUDA path does not batch,
// so --batch with CUDA enabled still routes through the batch state.
let state = if use_cuda && !batch_mode {
prepare_gguf_cuda_state(quantized_model, vocab, force_gpu)?
} else if batch_mode {
prepare_gguf_batch_state(quantized_model, vocab)?
} else {
crate::api::AppState::with_quantized_model_and_vocab(quantized_model, vocab)?
};
Ok(PreparedServer {
state,
batch_mode_enabled: batch_mode,
model_type: ModelType::Gguf,
})
}
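/// Uploads a quantized GGUF model to the GPU and spawns the request
/// scheduler: iteration-level (PMAT-088) when `ITERATION_SCHEDULER=1`,
/// otherwise windowed continuous batching (PMAT-044). Without the `cuda`
/// feature this compiles down to the plain CPU quantized state.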
fn prepare_gguf_cuda_state(
quantized_model: crate::gguf::OwnedQuantizedModel,
vocab: Vec<String>,
force_gpu: bool,
) -> Result<crate::api::AppState> {
#[cfg(feature = "cuda")]
{
use crate::gguf::OwnedQuantizedModelCuda;
let source = if force_gpu {
"--gpu flag"
} else {
"REALIZAR_BACKEND=cuda"
};
println!("Creating CUDA model ({source})...");
// Reads the value serve_model sets from --context-length; note the
// REALIZR_ (not REALIZAR_) spelling, which must stay in sync with the setter.
let max_seq_len = std::env::var("REALIZR_CONTEXT_LENGTH")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(4096);
let cuda_model =
OwnedQuantizedModelCuda::with_max_seq_len(quantized_model, 0, max_seq_len)
.map_err(|e| e.error)?;
println!(" CUDA model created on GPU: {}", cuda_model.device_name());
println!(" Max sequence length: {} (--context-length)", max_seq_len);
println!(" TRUE STREAMING: enabled (PAR-112)");
let state = crate::api::AppState::with_cuda_model_and_vocab(cuda_model, vocab)?;
let cuda_model_arc = state.cuda_model().expect("just created").clone();
// Opt-in iteration-level scheduler via ITERATION_SCHEDULER=1; the default is
// the windowed continuous-batching scheduler below.
let use_iteration_scheduler =
std::env::var("ITERATION_SCHEDULER").as_deref() == Ok("1");
let batch_tx = if use_iteration_scheduler {
let iter_config =
crate::api::iteration_scheduler::IterationSchedulerConfig::default();
println!(
" ITERATION SCHEDULER: max_slots={}, prefill_chunk={} (PMAT-088)",
iter_config.max_slots, iter_config.prefill_chunk_size,
);
crate::api::iteration_scheduler::spawn_iteration_scheduler(
cuda_model_arc,
iter_config,
)
} else {
let batch_config = crate::api::cuda_batch_scheduler::CudaBatchConfig::default();
println!(
" CONTINUOUS BATCHING: max_batch={}, window={}ms (PMAT-044)",
batch_config.max_batch, batch_config.window_ms
);
crate::api::cuda_batch_scheduler::spawn_cuda_batch_scheduler(
cuda_model_arc,
batch_config,
)
};
println!();
Ok(state.with_cuda_batch_tx(batch_tx))
}
#[cfg(not(feature = "cuda"))]
{
let _ = force_gpu;
crate::api::AppState::with_quantized_model_and_vocab(quantized_model, vocab)
}
}
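/// Prepares GPU batch-inference state (PARITY-093/094): wraps the model in a
/// synchronized cache, warms the GPU cache, and spawns the batch processor.
/// Without the `gpu` feature it warns and falls back to single-request mode.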
fn prepare_gguf_batch_state(
quantized_model: crate::gguf::OwnedQuantizedModel,
vocab: Vec<String>,
) -> Result<crate::api::AppState> {
#[cfg(feature = "gpu")]
{
use crate::gguf::OwnedQuantizedModelCachedSync;
println!("Initializing batch inference mode (PARITY-093/094)...");
let cached_model = OwnedQuantizedModelCachedSync::new(quantized_model);
println!(" Warming up GPU cache (dequantizing FFN weights)...");
match cached_model.warmup_gpu_cache() {
Ok((memory_bytes, num_layers)) => {
println!(
" GPU cache ready: {:.2} GB ({} layers)",
memory_bytes as f64 / 1e9,
num_layers
);
},
Err(e) => {
eprintln!(
" Warning: GPU cache warmup failed: {}. Falling back to CPU batch.",
e
);
},
}
let state = crate::api::AppState::with_cached_model_and_vocab(cached_model, vocab)?;
let cached_model_arc = state
.cached_model()
.expect("cached_model should exist")
.clone();
let batch_config = crate::api::BatchConfig::default();
println!(" Batch window: {}ms", batch_config.window_ms);
println!(" Min batch size: {}", batch_config.min_batch);
println!(" Optimal batch: {}", batch_config.optimal_batch);
println!(" Max batch size: {}", batch_config.max_batch);
println!(
" GPU threshold: {} (GPU GEMM for batch >= this)",
batch_config.gpu_threshold
);
let batch_tx =
crate::api::spawn_batch_processor(cached_model_arc, batch_config.clone());
println!(" Batch processor: RUNNING");
println!();
Ok(state.with_batch_config(batch_tx, batch_config))
}
#[cfg(not(feature = "gpu"))]
{
eprintln!(
"Warning: --batch requires 'gpu' feature. Falling back to single-request mode."
);
crate::api::AppState::with_quantized_model_and_vocab(quantized_model, vocab)
}
}
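/// Prepares serving state for a `.safetensors` model. With `--gpu` and the
/// `cuda` feature it loads the model directly onto the GPU (#169); otherwise
/// it converts the weights to the APR transformer layout for CPU F32
/// inference. The vocabulary comes from a sibling `tokenizer.json` when
/// present, else placeholder tokens are generated.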
fn prepare_safetensors_serve_state(model_path: &str, force_gpu: bool) -> Result<PreparedServer> {
use std::path::Path;
println!("Loading SafeTensors model for serving...");
let model_path_obj = Path::new(model_path);
#[cfg(feature = "cuda")]
if force_gpu {
use crate::safetensors_cuda::SafeTensorsCudaModel;
println!(" Mode: GPU (SafeTensors CUDA — #169 format parity)");
let cuda_model = SafeTensorsCudaModel::load(model_path_obj, 0).map_err(|e| {
crate::error::RealizarError::UnsupportedOperation {
operation: "safetensors_cuda_load".to_string(),
reason: format!("Failed to load SafeTensors for GPU: {e}"),
}
})?;
let vocab_size = cuda_model.config().vocab_size;
#[allow(clippy::map_unwrap_or)]
let vocab = crate::apr::AprV2Model::load_tokenizer_from_sibling(model_path_obj)
.map(|(v, _, _)| v)
.unwrap_or_else(|| {
println!(" Warning: No tokenizer.json found, using simple vocabulary");
(0..vocab_size).map(|i| format!("token{i}")).collect()
});
println!(" Vocab size: {}", vocab.len());
let state = crate::api::AppState::with_safetensors_cuda_model_and_vocab(
cuda_model, vocab,
)?;
return Ok(PreparedServer {
state,
batch_mode_enabled: false,
model_type: ModelType::SafeTensors,
});
}
#[cfg(not(feature = "cuda"))]
if force_gpu {
eprintln!("Warning: --gpu flag requires 'cuda' feature for SafeTensors. Falling back to CPU.");
}
use crate::safetensors_infer::SafetensorsToAprConverter;
let transformer = SafetensorsToAprConverter::convert(model_path_obj).map_err(|e| {
crate::error::RealizarError::UnsupportedOperation {
operation: "convert_safetensors".to_string(),
reason: format!("Failed to convert SafeTensors: {e}"),
}
})?;
println!(" Architecture: {}", transformer.config.architecture);
println!(" Layers: {}", transformer.config.num_layers);
println!(" Hidden: {}", transformer.config.hidden_dim);
#[allow(clippy::map_unwrap_or)]
let vocab = crate::apr::AprV2Model::load_tokenizer_from_sibling(model_path_obj)
.map(|(v, _, _)| v)
.unwrap_or_else(|| {
println!(" Warning: No tokenizer.json found, using simple vocabulary");
(0..transformer.config.vocab_size)
.map(|i| format!("token{i}"))
.collect()
});
println!(" Vocab size: {}", vocab.len());
println!(" Mode: CPU (F32 inference)");
let state =
crate::api::AppState::with_apr_transformer_and_vocab(transformer.into_inner(), vocab)?;
Ok(PreparedServer {
state,
batch_mode_enabled: false,
model_type: ModelType::SafeTensors,
})
}
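/// Prepares serving state for an `.apr` model. With the `cuda` feature and
/// either `--gpu` or `REALIZAR_BACKEND=cuda`, inference is routed through the
/// APR → GGUF CUDA path; otherwise the APR transformer runs F32 on the CPU.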
fn prepare_apr_serve_state(model_path: &str, force_gpu: bool) -> Result<PreparedServer> {
use std::path::Path;
println!("Loading APR model for serving...");
#[cfg(feature = "cuda")]
if force_gpu
|| std::env::var("REALIZAR_BACKEND")
.map(|v| v.eq_ignore_ascii_case("cuda"))
.unwrap_or(false)
{
return prepare_gguf_cuda_state_from_apr(model_path);
}
use crate::apr_transformer::AprTransformer;
let file_data = std::fs::read(model_path).map_err(|e| {
crate::error::RealizarError::UnsupportedOperation {
operation: "read_model_file".to_string(),
reason: format!("Failed to read {model_path}: {e}"),
}
})?;
let transformer = AprTransformer::from_apr_bytes(&file_data).map_err(|e| {
crate::error::RealizarError::UnsupportedOperation {
operation: "load_apr".to_string(),
reason: format!("Failed to load APR: {e}"),
}
})?;
println!(" Architecture: {}", transformer.config.architecture);
println!(" Layers: {}", transformer.config.num_layers);
println!(" Hidden: {}", transformer.config.hidden_dim);
let model_path_obj = Path::new(model_path);
let vocab = crate::apr::AprV2Model::load_tokenizer_from_sibling(model_path_obj)
.map(|(v, _, _)| v)
.or_else(|| {
crate::apr::AprV2Model::load(model_path_obj)
.ok()
.and_then(|m| m.load_embedded_tokenizer())
.map(|t| t.id_to_token.clone())
})
.unwrap_or_else(|| {
println!(" Warning: No vocabulary found, using simple vocabulary");
(0..transformer.config.vocab_size)
.map(|i| format!("token{i}"))
.collect()
});
println!(" Vocab size: {}", vocab.len());
println!(" Mode: CPU (F32 inference)");
let state = crate::api::AppState::with_apr_transformer_and_vocab(transformer, vocab)?;
Ok(PreparedServer {
state,
batch_mode_enabled: false,
model_type: ModelType::Apr,
})
}
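/// GPU path for APR models (#170): converts the mapped APR weights into an
/// `OwnedQuantizedModel` and uploads it to CUDA, reusing the GGUF kernels.
/// The vocabulary is resolved from embedded metadata, then a sibling
/// tokenizer, then placeholder tokens.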
#[cfg(feature = "cuda")]
fn prepare_gguf_cuda_state_from_apr(model_path: &str) -> Result<PreparedServer> {
use crate::apr::MappedAprModel;
use crate::gguf::{OwnedQuantizedModel, OwnedQuantizedModelCuda};
use std::path::Path;
let path = Path::new(model_path);
println!(" Mode: GPU (APR → GGUF CUDA — #170 fix)");
let mapped = MappedAprModel::from_path(path).map_err(|e| {
crate::error::RealizarError::FormatError { reason: format!("APR load: {e}") }
})?;
let model = OwnedQuantizedModel::from_apr(&mapped).map_err(|e| {
crate::error::RealizarError::FormatError { reason: format!("APR→GGUF: {e}") }
})?;
println!(" Layers: {}, Hidden: {}, Vocab: {}",
model.config.num_layers, model.config.hidden_dim, model.config.vocab_size);
// Same --context-length plumbing as prepare_gguf_cuda_state (REALIZR_ spelling).
let max_seq_len = std::env::var("REALIZR_CONTEXT_LENGTH")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(4096);
let cuda_model = OwnedQuantizedModelCuda::with_max_seq_len(model, 0, max_seq_len)
.map_err(|e| e.error)?;
println!(" CUDA model on GPU: {}", cuda_model.device_name());
let apr_v2 = crate::apr::AprV2Model::load(path).ok();
let vocab = apr_v2.as_ref()
.and_then(|m| m.metadata().get_embedded_vocabulary())
.or_else(|| crate::apr::AprV2Model::load_tokenizer_from_sibling(path).map(|(v, _, _)| v))
.unwrap_or_else(|| {
println!(" Warning: No tokenizer found");
(0..cuda_model.model().config.vocab_size).map(|i| format!("token{i}")).collect()
});
println!(" Vocab size: {}", vocab.len());
let state = crate::api::AppState::with_cuda_model_and_vocab(cuda_model, vocab)?;
Ok(PreparedServer {
state,
batch_mode_enabled: false,
model_type: ModelType::Apr,
})
}
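/// Alternative APR GPU path (ALB-095) that serves Q4K weights from a
/// dedicated CUDA inference thread instead of converting to the GGUF model.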
#[cfg(feature = "cuda")]
fn prepare_apr_q4k_serve_state(model_path: &str) -> Result<PreparedServer> {
use crate::api::apr_q4k_scheduler;
use crate::apr::AprV2Model;
use std::path::Path;
println!(" Mode: GPU (Q4K CUDA — ALB-095)");
let model_path_obj = Path::new(model_path);
let (vocab, eos_id) = AprV2Model::load_tokenizer_from_sibling(model_path_obj)
.map(|(v, _, eos)| (v, eos))
.or_else(|| {
AprV2Model::load(model_path_obj)
.ok()
.and_then(|m| m.load_embedded_tokenizer())
.map(|t| (t.id_to_token.clone(), None))
})
.unwrap_or_else(|| {
println!(" Warning: No vocabulary found, using simple vocabulary");
// Hard-coded fallback; 151936 matches the Qwen2-family vocabulary size.
((0..151936).map(|i| format!("token{i}")).collect(), None)
});
println!(" Vocab: {} tokens", vocab.len());
if let Some(eos) = eos_id {
println!(" EOS token ID: {eos}");
}
let q4k_tx =
apr_q4k_scheduler::spawn_apr_q4k_inference_thread(model_path).map_err(|e| {
crate::error::RealizarError::GpuError {
reason: format!("Q4K inference thread failed: {e}"),
}
})?;
let state = crate::api::AppState::with_apr_q4k_and_vocab_eos(q4k_tx, vocab, eos_id)?;
Ok(PreparedServer {
state,
batch_mode_enabled: false,
model_type: ModelType::Apr,
})
}
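/// Dispatches on the model file extension (`.gguf`, `.safetensors`, `.apr`)
/// and builds the matching [`PreparedServer`]. The check is a plain
/// case-sensitive `ends_with`, so upper-case extensions are rejected.
///
/// # Example
///
/// A minimal sketch; the model path is hypothetical:
///
/// ```ignore
/// let prepared = prepare_serve_state("models/llama.gguf", false, false)?;
/// assert_eq!(prepared.model_type, ModelType::Gguf);
/// ```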
pub fn prepare_serve_state(
model_path: &str,
batch_mode: bool,
force_gpu: bool,
) -> Result<PreparedServer> {
println!("Loading model from: {model_path}");
if batch_mode {
println!("Mode: BATCH (PARITY-093 M4 parity)");
} else {
println!("Mode: SINGLE-REQUEST");
}
if force_gpu {
println!("GPU: FORCED (--gpu flag)");
}
println!();
if model_path.ends_with(".gguf") {
prepare_gguf_serve_state(model_path, batch_mode, force_gpu)
} else if model_path.ends_with(".safetensors") {
prepare_safetensors_serve_state(model_path, force_gpu)
} else if model_path.ends_with(".apr") {
prepare_apr_serve_state(model_path, force_gpu)
} else {
Err(crate::error::RealizarError::UnsupportedOperation {
operation: "detect_model_type".to_string(),
reason: "Unsupported file extension. Expected .gguf, .safetensors, or .apr"
.to_string(),
})
}
}
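/// Loads the model at `model_path` and serves HTTP on `host:port` until the
/// process exits. `context_length` is propagated to the loaders through the
/// `REALIZR_CONTEXT_LENGTH` environment variable before any state is built,
/// and `openai_api` gates the `/v1/*` routes.
///
/// # Example
///
/// A hedged sketch of a typical invocation from an async context; the path
/// and port are illustrative:
///
/// ```ignore
/// serve_model(
///     "127.0.0.1", 8080, "models/llama.gguf",
///     false, // batch_mode
///     false, // force_gpu
///     true,  // openai_api
///     false, // trace
///     4096,  // context_length
///     false, // no_fp8_cache
/// )
/// .await?;
/// ```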
#[allow(clippy::too_many_arguments, clippy::fn_params_excessive_bools)]
pub async fn serve_model(
host: &str,
port: u16,
model_path: &str,
batch_mode: bool,
force_gpu: bool,
openai_api: bool,
trace: bool,
context_length: usize,
no_fp8_cache: bool,
) -> Result<()> {
if no_fp8_cache {
std::env::set_var("REALIZR_NO_FP8_CACHE", "1");
eprintln!("[GH-286] FP8 weight cache DISABLED (--no-fp8-cache). Saves ~1.5 GB RSS.");
}
std::env::set_var("REALIZR_CONTEXT_LENGTH", context_length.to_string());
let prepared = prepare_serve_state(model_path, batch_mode, force_gpu)?;
let state = if trace {
eprintln!("Tracing: ENABLED (--trace flag)");
prepared.state.with_trace(true)
} else {
prepared.state
};
let router_config = crate::api::RouterConfig { openai_api };
let app = crate::api::create_router_with_config(state, router_config);
let addr: std::net::SocketAddr = format!("{host}:{port}").parse().map_err(|e| {
crate::error::RealizarError::InvalidShape {
reason: format!("Invalid address: {e}"),
}
})?;
eprintln!("Server listening on http://{addr}");
eprintln!();
eprintln!("Endpoints:");
eprintln!(" GET /health - Health check");
if openai_api {
eprintln!(" POST /v1/completions - OpenAI-compatible completions");
}
if prepared.batch_mode_enabled && openai_api {
eprintln!(" POST /v1/batch/completions - GPU batch completions (PARITY-022)");
eprintln!(" POST /v1/gpu/warmup - Warmup GPU cache");
eprintln!(" GET /v1/gpu/status - GPU status");
}
eprintln!(" POST /generate - Generate text (Q4_K fused)");
if !openai_api {
eprintln!(" (OpenAI-compatible /v1/* routes disabled)");
}
eprintln!();
if prepared.batch_mode_enabled {
eprintln!("M4 Parity Target: 192 tok/s at concurrency >= 4");
eprintln!("Benchmark with: wrk -t4 -c4 -d30s http://{addr}/v1/completions");
eprintln!();
}
let listener = tokio::net::TcpListener::bind(addr).await.map_err(|e| {
crate::error::RealizarError::UnsupportedOperation {
operation: "bind".to_string(),
reason: format!("Failed to bind: {e}"),
}
})?;
axum::serve(listener, app).await.map_err(|e| {
crate::error::RealizarError::UnsupportedOperation {
operation: "serve".to_string(),
reason: format!("Server error: {e}"),
}
})?;
Ok(())
}
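/// Starts the server with the built-in demo model, useful for smoke-testing
/// the HTTP surface without a model file on disk.
///
/// # Example
///
/// ```ignore
/// serve_demo("127.0.0.1", 8080, true).await?;
/// ```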
pub async fn serve_demo(host: &str, port: u16, openai_api: bool) -> Result<()> {
use std::net::SocketAddr;
eprintln!("Starting Realizar inference server (demo mode)...");
let state = crate::api::AppState::demo()?;
let router_config = crate::api::RouterConfig { openai_api };
let app = crate::api::create_router_with_config(state, router_config);
let addr: SocketAddr = format!("{host}:{port}").parse().map_err(|e| {
crate::error::RealizarError::InvalidShape {
reason: format!("Invalid address: {e}"),
}
})?;
eprintln!("Server listening on http://{addr}");
eprintln!();
eprintln!("Endpoints:");
eprintln!(" GET /health - Health check");
eprintln!(" POST /tokenize - Tokenize text");
eprintln!(" POST /generate - Generate text");
if !openai_api {
eprintln!(" (OpenAI-compatible /v1/* routes disabled)");
}
eprintln!();
eprintln!("Example:");
eprintln!(" curl http://{addr}/health");
eprintln!();
let listener = tokio::net::TcpListener::bind(addr).await.map_err(|e| {
crate::error::RealizarError::UnsupportedOperation {
operation: "bind".to_string(),
reason: format!("Failed to bind: {e}"),
}
})?;
axum::serve(listener, app).await.map_err(|e| {
crate::error::RealizarError::UnsupportedOperation {
operation: "serve".to_string(),
reason: format!("Server error: {e}"),
}
})?;
Ok(())
}
}
#[cfg(feature = "server")]
pub use server_commands::*;