use std::net::SocketAddr;
use clap::{Parser, Subcommand};
#[cfg(feature = "registry")]
use pacha::resolver::{ModelResolver, ModelSource};
#[cfg(feature = "registry")]
use pacha::uri::ModelUri;
use realizar::{
api::{create_router, AppState},
cli,
error::Result,
};
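/// Command-line interface for the Realizar inference runtime, built on
/// clap's derive API.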
#[derive(Parser)]
#[command(name = "realizar")]
#[command(version, about, long_about = None)]
struct Cli {
#[command(subcommand)]
command: Commands,
}
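/// Subcommands exposed by the binary. Commands that touch the Pacha
/// registry (`pull`, `push`, registry-backed `list`) fall back to
/// informational stubs unless built with `--features registry`.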
#[derive(Subcommand)]
enum Commands {
Run {
#[arg(value_name = "MODEL")]
model: String,
#[arg(value_name = "PROMPT")]
prompt: Option<String>,
#[arg(short = 'n', long, default_value = "256")]
max_tokens: usize,
#[arg(short, long, default_value = "0.7")]
temperature: f32,
#[arg(short, long, default_value = "text")]
format: String,
},
Chat {
#[arg(value_name = "MODEL")]
model: String,
#[arg(short, long)]
system: Option<String>,
#[arg(long)]
history: Option<String>,
},
List {
#[arg(short, long)]
remote: Option<String>,
#[arg(short, long, default_value = "table")]
format: String,
},
Pull {
#[arg(value_name = "MODEL")]
model: String,
#[arg(short, long)]
force: bool,
#[arg(short, long)]
quantize: Option<String>,
},
Push {
#[arg(value_name = "MODEL")]
model: String,
#[arg(long)]
to: Option<String>,
},
Serve {
#[arg(short = 'H', long, default_value = "127.0.0.1")]
host: String,
#[arg(short, long, default_value = "8080")]
port: u16,
#[arg(short, long)]
model: Option<String>,
#[arg(long)]
demo: bool,
#[arg(long, default_value = "true")]
openai_api: bool,
#[arg(long)]
batch: bool,
},
Bench {
#[arg(value_name = "SUITE")]
suite: Option<String>,
#[arg(short, long)]
list: bool,
#[arg(long)]
runtime: Option<String>,
#[arg(long)]
model: Option<String>,
#[arg(long)]
url: Option<String>,
#[arg(short, long)]
output: Option<String>,
},
BenchConvoy {
#[arg(long)]
runtime: Option<String>,
#[arg(long)]
model: Option<String>,
#[arg(short, long)]
output: Option<String>,
},
BenchSaturation {
#[arg(long)]
runtime: Option<String>,
#[arg(long)]
model: Option<String>,
#[arg(short, long)]
output: Option<String>,
},
BenchCompare {
#[arg(value_name = "FILE1")]
file1: String,
#[arg(value_name = "FILE2")]
file2: String,
#[arg(short, long, default_value = "5.0")]
threshold: f64,
},
BenchRegression {
#[arg(value_name = "BASELINE")]
baseline: String,
#[arg(value_name = "CURRENT")]
current: String,
#[arg(long)]
strict: bool,
},
Viz {
#[arg(short, long)]
color: bool,
#[arg(short, long, default_value = "100")]
samples: usize,
},
Info,
}
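/// Entry point: parses the CLI and dispatches each subcommand to its
/// handler, exiting non-zero on benchmark regressions or missing serve
/// flags.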
#[tokio::main]
async fn main() -> Result<()> {
let parsed = Cli::parse();
match parsed.command {
Commands::Run {
model,
prompt,
max_tokens,
temperature,
format,
} => {
run_model(&model, prompt.as_deref(), max_tokens, temperature, &format).await?;
},
Commands::Chat {
model,
system,
history,
} => {
run_chat(&model, system.as_deref(), history.as_deref()).await?;
},
Commands::List { remote, format } => {
list_models(remote.as_deref(), &format)?;
},
Commands::Pull {
model,
force,
quantize,
} => {
pull_model(&model, force, quantize.as_deref()).await?;
},
Commands::Push { model, to } => {
push_model(&model, to.as_deref()).await?;
},
Commands::Serve {
host,
port,
model,
demo,
openai_api: _,
batch,
} => {
if demo {
serve_demo(&host, port).await?;
} else if let Some(model_path) = model {
serve_model(&host, port, &model_path, batch).await?;
} else {
eprintln!("Error: Either --model or --demo must be specified");
eprintln!();
eprintln!("Usage:");
eprintln!(" realizar serve --demo # Use demo model");
eprintln!(" realizar serve --model path.gguf # Load GGUF model");
eprintln!(
" realizar serve --model path.gguf --batch # Enable M4 parity batch mode"
);
std::process::exit(1);
}
},
Commands::Bench {
suite,
list,
runtime,
model,
url,
output,
} => {
cli::run_benchmarks(suite, list, runtime, model, url, output)?;
},
Commands::BenchConvoy {
runtime,
model,
output,
} => {
cli::run_convoy_test(runtime, model, output)?;
},
Commands::BenchSaturation {
runtime,
model,
output,
} => {
cli::run_saturation_test(runtime, model, output)?;
},
Commands::BenchCompare {
file1,
file2,
threshold,
} => {
cli::run_bench_compare(&file1, &file2, threshold)?;
},
Commands::BenchRegression {
baseline,
current,
strict,
} => {
if cli::run_bench_regression(&baseline, &current, strict).is_err() {
std::process::exit(1);
}
},
Commands::Viz { color, samples } => {
cli::run_visualization(color, samples);
},
Commands::Info => {
cli::print_info();
},
}
Ok(())
}
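/// Serves the built-in demo model over HTTP with axum, exposing /health,
/// /tokenize, and /generate on `host:port`.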
async fn serve_demo(host: &str, port: u16) -> Result<()> {
println!("Starting Realizar inference server (demo mode)...");
let state = AppState::demo()?;
let app = create_router(state);
let addr: SocketAddr = format!("{host}:{port}").parse().map_err(|e| {
realizar::error::RealizarError::InvalidShape {
reason: format!("Invalid address: {e}"),
}
})?;
println!("Server listening on http://{addr}");
println!();
println!("Endpoints:");
println!(" GET /health - Health check");
println!(" POST /tokenize - Tokenize text");
println!(" POST /generate - Generate text");
println!();
println!("Example:");
println!(" curl http://{addr}/health");
println!();
let listener = tokio::net::TcpListener::bind(addr).await.map_err(|e| {
realizar::error::RealizarError::InvalidShape {
reason: format!("Failed to bind: {e}"),
}
})?;
axum::serve(listener, app)
.await
.map_err(|e| realizar::error::RealizarError::InvalidShape {
reason: format!("Server error: {e}"),
})?;
Ok(())
}
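/// Loads a model from `model_path` (dispatching on the .gguf, .safetensors,
/// or .apr extension) and serves it over HTTP. With `batch_mode` and the
/// `gpu` feature enabled, a batch processor is spawned for M4-parity
/// throughput; without the feature, `--batch` falls back to single-request
/// mode.
///
/// ```text
/// realizar serve --model path.gguf --batch
/// ```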
async fn serve_model(host: &str, port: u16, model_path: &str, batch_mode: bool) -> Result<()> {
use realizar::gguf::MappedGGUFModel;
println!("Loading model from: {model_path}");
if batch_mode {
println!("Mode: BATCH (PARITY-093 M4 parity)");
} else {
println!("Mode: SINGLE-REQUEST");
}
println!();
if model_path.ends_with(".gguf") {
println!("Parsing GGUF file...");
let mapped = MappedGGUFModel::from_path(model_path).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "load_gguf".to_string(),
reason: format!("Failed to load GGUF: {e}"),
}
})?;
println!("Successfully loaded GGUF model");
println!(" Tensors: {}", mapped.model.tensors.len());
println!(" Metadata: {} entries", mapped.model.metadata.len());
println!();
println!("Creating quantized model (fused Q4_K ops)...");
let quantized_model =
realizar::gguf::OwnedQuantizedModel::from_mapped(&mapped).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "create_quantized".to_string(),
reason: format!("Failed to create quantized model: {e}"),
}
})?;
println!("Quantized model created successfully!");
println!(" Vocab size: {}", quantized_model.config.vocab_size);
println!(" Hidden dim: {}", quantized_model.config.hidden_dim);
println!(" Layers: {}", quantized_model.layers.len());
println!();
#[cfg(feature = "cuda")]
let mut quantized_model = quantized_model;
#[cfg(feature = "cuda")]
if std::env::var("REALIZAR_BACKEND")
.map(|v| v.eq_ignore_ascii_case("cuda"))
.unwrap_or(false)
{
println!("Enabling CUDA backend (REALIZAR_BACKEND=cuda)...");
match quantized_model.enable_cuda(0) {
Ok(()) => {
println!(" CUDA enabled on GPU 0");
println!(" cuda_enabled: {}", quantized_model.cuda_enabled());
},
Err(e) => {
eprintln!(" Warning: CUDA enable failed: {}. Falling back to CPU.", e);
},
}
println!();
}
let state = {
#[cfg(feature = "gpu")]
{
if batch_mode {
use realizar::gguf::OwnedQuantizedModelCachedSync;
println!("Initializing batch inference mode (PARITY-093/094)...");
let cached_model = OwnedQuantizedModelCachedSync::new(quantized_model);
println!(" Warming up GPU cache (dequantizing FFN weights)...");
match cached_model.warmup_gpu_cache() {
Ok((memory_bytes, num_layers)) => {
println!(
" GPU cache ready: {:.2} GB ({} layers)",
memory_bytes as f64 / 1e9,
num_layers
);
},
Err(e) => {
eprintln!(
" Warning: GPU cache warmup failed: {}. Falling back to CPU batch.",
e
);
},
}
let state = realizar::api::AppState::with_cached_model(cached_model)?;
let cached_model_arc = state
.cached_model()
.expect("cached_model should exist")
.clone();
let batch_config = realizar::api::BatchConfig::default();
println!(" Batch window: {}ms", batch_config.window_ms);
println!(" Min batch size: {}", batch_config.min_batch);
println!(" Optimal batch: {}", batch_config.optimal_batch);
println!(" Max batch size: {}", batch_config.max_batch);
println!(
" GPU threshold: {} (GPU GEMM for batch >= this)",
batch_config.gpu_threshold
);
let batch_tx = realizar::api::spawn_batch_processor(
cached_model_arc,
batch_config.clone(),
);
println!(" Batch processor: RUNNING");
println!();
state.with_batch_config(batch_tx, batch_config)
} else {
realizar::api::AppState::with_quantized_model(quantized_model)?
}
}
#[cfg(not(feature = "gpu"))]
{
if batch_mode {
eprintln!(
"Warning: --batch requires 'gpu' feature. Falling back to single-request mode."
);
}
realizar::api::AppState::with_quantized_model(quantized_model)?
}
};
let app = realizar::api::create_router(state);
let addr: std::net::SocketAddr = format!("{host}:{port}").parse().map_err(|e| {
realizar::error::RealizarError::InvalidShape {
reason: format!("Invalid address: {e}"),
}
})?;
println!("Server listening on http://{addr}");
println!();
println!("Endpoints:");
println!(" GET /health - Health check");
println!(" POST /v1/completions - OpenAI-compatible completions");
if batch_mode {
println!(" POST /v1/batch/completions - GPU batch completions (PARITY-022)");
println!(" POST /v1/gpu/warmup - Warmup GPU cache");
println!(" GET /v1/gpu/status - GPU status");
}
println!(" POST /generate - Generate text (Q4_K fused)");
println!();
if batch_mode {
println!("M4 Parity Target: 192 tok/s at concurrency >= 4");
println!("Benchmark with: wrk -t4 -c4 -d30s http://{addr}/v1/completions");
println!();
}
let listener = tokio::net::TcpListener::bind(addr).await.map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "bind".to_string(),
reason: format!("Failed to bind: {e}"),
}
})?;
axum::serve(listener, app).await.map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "serve".to_string(),
reason: format!("Server error: {e}"),
}
})?;
} else if model_path.ends_with(".safetensors") {
let file_data = std::fs::read(model_path).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "read_model_file".to_string(),
reason: format!("Failed to read {model_path}: {e}"),
}
})?;
cli::load_safetensors_model(&file_data)?;
} else if model_path.ends_with(".apr") {
let file_data = std::fs::read(model_path).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "read_model_file".to_string(),
reason: format!("Failed to read {model_path}: {e}"),
}
})?;
cli::load_apr_model(&file_data)?;
} else {
return Err(realizar::error::RealizarError::UnsupportedOperation {
operation: "detect_model_type".to_string(),
reason: "Unsupported file extension. Expected .gguf, .safetensors, or .apr".to_string(),
});
}
Ok(())
}
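/// Resolves `model_ref` through the Pacha resolver (registry builds).
/// References that fail to parse or resolve as URIs fall back to plain
/// local file paths before inference is dispatched.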
#[cfg(feature = "registry")]
async fn run_model(
model_ref: &str,
prompt: Option<&str>,
max_tokens: usize,
temperature: f32,
format: &str,
) -> Result<()> {
println!("Loading model: {model_ref}");
let file_data = match ModelUri::parse(model_ref) {
Ok(uri) => {
let resolver = ModelResolver::new_default().map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "init_resolver".to_string(),
reason: format!("Failed to initialize Pacha resolver: {e}"),
}
})?;
match resolver.resolve(&uri) {
Ok(resolved) => {
match &resolved.source {
ModelSource::LocalFile(path) => {
println!(" Source: local file ({path})");
},
ModelSource::PachaLocal { name, version } => {
println!(" Source: Pacha registry ({name}:{version})");
},
ModelSource::PachaRemote {
host,
name,
version,
} => {
println!(" Source: Remote registry {host} ({name}:{version})");
},
ModelSource::HuggingFace { repo_id, revision } => {
let rev = revision.as_deref().unwrap_or("main");
println!(" Source: HuggingFace ({repo_id}@{rev})");
},
}
resolved.data
},
Err(e) => {
if std::path::Path::new(model_ref).exists() {
println!(" Source: local file");
std::fs::read(model_ref).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "read_model".to_string(),
reason: format!("Failed to read {model_ref}: {e}"),
}
})?
} else {
return Err(realizar::error::RealizarError::UnsupportedOperation {
operation: "resolve_model".to_string(),
reason: format!("Failed to resolve model: {e}"),
});
}
},
}
},
Err(_) => {
if !std::path::Path::new(model_ref).exists() {
return Err(realizar::error::RealizarError::ModelNotFound(
model_ref.to_string(),
));
}
println!(" Source: local file");
std::fs::read(model_ref).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "read_model".to_string(),
reason: format!("Failed to read {model_ref}: {e}"),
}
})?
},
};
cli::display_model_info(model_ref, &file_data)?;
println!();
if let Some(prompt_text) = prompt {
println!("Prompt: {prompt_text}");
println!("Max tokens: {max_tokens}");
println!("Temperature: {temperature}");
println!("Format: {format}");
println!();
run_gguf_inference(
model_ref,
&file_data,
prompt_text,
max_tokens,
temperature,
format,
)?;
} else {
println!("Interactive mode (Ctrl+D to exit)");
println!();
println!("Model loaded ({} bytes)", file_data.len());
println!("Use a prompt argument:");
println!(" realizar run {model_ref} \"Your prompt here\"");
}
Ok(())
}
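/// Single-prompt GGUF inference: mmap the file, build a quantized
/// transformer, prefill the KV cache with the prompt, then decode up to
/// `max_tokens` tokens (greedy when temperature <= 0.01, top-k/40 sampling
/// otherwise), stopping early on EOS.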
fn run_gguf_inference(
model_ref: &str,
_file_data: &[u8],
prompt: &str,
max_tokens: usize,
temperature: f32,
format: &str,
) -> Result<()> {
use realizar::gguf::{MappedGGUFModel, OwnedQuantizedKVCache, QuantizedGGUFTransformer};
use std::time::Instant;
let load_start = Instant::now();
let mapped = MappedGGUFModel::from_path(model_ref).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "mmap_gguf".to_string(),
reason: format!("Failed to mmap GGUF: {e}"),
}
})?;
let model = QuantizedGGUFTransformer::from_gguf(&mapped.model, mapped.data()).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "load_model".to_string(),
reason: format!("Failed to load model: {e}"),
}
})?;
let load_time = load_start.elapsed();
println!("Model loaded in {:.2}ms", load_time.as_secs_f64() * 1000.0);
let mut prompt_tokens: Vec<u32> = mapped
.model
.encode(prompt)
.unwrap_or_else(|| prompt.chars().map(|c| c as u32).collect());
if let Some(bos) = mapped.model.bos_token_id() {
prompt_tokens.insert(0, bos);
}
let prompt_len = prompt_tokens.len();
let eos_token_id = mapped.model.eos_token_id();
let config = model.config();
println!(
"Architecture: {:?}, Hidden: {}, Layers: {}, Heads: {}/{} (KV)",
mapped.model.architecture(),
config.hidden_dim,
config.num_layers,
config.num_heads,
config.num_kv_heads
);
println!(
"Prompt tokens: {} (BOS={:?}, EOS={:?})",
prompt_len,
mapped.model.bos_token_id(),
eos_token_id
);
println!("Temperature: {:.1}", temperature);
println!();
let gen_start = Instant::now();
let max_seq_len = prompt_tokens.len() + max_tokens;
let mut cache = OwnedQuantizedKVCache::from_config(config, max_seq_len);
let mut all_tokens = prompt_tokens.clone();
let mut logits = vec![];
for (pos, &token_id) in prompt_tokens.iter().enumerate() {
logits = model.forward_cached(token_id, &mut cache, pos)?;
}
for i in 0..max_tokens {
if i > 0 {
let position = prompt_tokens.len() + i - 1;
let last_token = *all_tokens.last().unwrap();
logits = model.forward_cached(last_token, &mut cache, position)?;
}
let next_token = if temperature <= 0.01 {
logits
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.map_or(0, |(idx, _)| idx as u32)
} else {
QuantizedGGUFTransformer::sample_topk(&logits, temperature, 40)
};
if let Some(eos) = eos_token_id {
if next_token == eos {
break;
}
}
all_tokens.push(next_token);
}
let generated = all_tokens;
let gen_time = gen_start.elapsed();
let tokens_generated = generated.len() - prompt_len;
let tokens_per_sec = if gen_time.as_secs_f64() > 0.0 {
tokens_generated as f64 / gen_time.as_secs_f64()
} else {
0.0
};
let output_text = mapped
.model
.decode(&generated[prompt_len..])
.replace('▁', " ");
match format {
"json" => {
let json = serde_json::json!({
"model": model_ref,
"prompt": prompt,
"generated_text": output_text,
"tokens_generated": tokens_generated,
"generation_time_ms": gen_time.as_secs_f64() * 1000.0,
"tokens_per_second": tokens_per_sec,
"temperature": temperature,
});
println!(
"{}",
serde_json::to_string_pretty(&json).unwrap_or_default()
);
},
_ => {
println!(
"Generated ({tokens_generated} tokens in {:.2}ms):",
gen_time.as_secs_f64() * 1000.0
);
println!("{prompt}{output_text}");
println!();
println!("Performance: {:.1} tok/s", tokens_per_sec);
},
}
Ok(())
}
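/// Single-prompt APR inference. Tokenization is a char-to-code-point
/// fallback (no real tokenizer) and decoding clamps tokens to ASCII, so
/// this path only round-trips ASCII-range tokens.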
#[allow(dead_code)]
fn run_apr_inference(
model_ref: &str,
file_data: &[u8],
prompt: &str,
max_tokens: usize,
temperature: f32,
format: &str,
) -> Result<()> {
use realizar::apr_transformer::AprTransformer;
use std::time::Instant;
let load_start = Instant::now();
let transformer = AprTransformer::from_apr_bytes(file_data).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "parse_apr".to_string(),
reason: format!("Failed to parse APR: {e}"),
}
})?;
let load_time = load_start.elapsed();
println!("Model loaded in {:.2}ms", load_time.as_secs_f64() * 1000.0);
let prompt_tokens: Vec<u32> = prompt.chars().map(|c| c as u32).collect();
let prompt_len = prompt_tokens.len();
println!("Prompt tokens: {}", prompt_len);
println!("Temperature: {:.1} (using greedy decoding)", temperature);
println!();
let gen_start = Instant::now();
let generated = transformer.generate(&prompt_tokens, max_tokens)?;
let gen_time = gen_start.elapsed();
let tokens_generated = generated.len().saturating_sub(prompt_len);
let tokens_per_sec = if gen_time.as_secs_f64() > 0.0 {
tokens_generated as f64 / gen_time.as_secs_f64()
} else {
0.0
};
let output_text: String = generated[prompt_len..]
.iter()
.map(|&t| char::from_u32(t.min(127)).unwrap_or('?'))
.collect();
match format {
"json" => {
let json = serde_json::json!({
"model": model_ref,
"format": "APR",
"prompt": prompt,
"generated_text": output_text,
"tokens_generated": tokens_generated,
"generation_time_ms": gen_time.as_secs_f64() * 1000.0,
"tokens_per_second": tokens_per_sec,
"temperature": temperature,
});
println!(
"{}",
serde_json::to_string_pretty(&json).unwrap_or_default()
);
},
_ => {
println!(
"Generated ({tokens_generated} tokens in {:.2}ms):",
gen_time.as_secs_f64() * 1000.0
);
println!("{prompt}{output_text}");
println!();
println!("Performance: {:.1} tok/s", tokens_per_sec);
},
}
Ok(())
}
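/// Non-registry `run`: only local files are loaded; `pacha://` and `hf://`
/// references print instructions for rebuilding with `--features registry`.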
#[cfg(not(feature = "registry"))]
async fn run_model(
model_ref: &str,
prompt: Option<&str>,
max_tokens: usize,
temperature: f32,
format: &str,
) -> Result<()> {
println!("Loading model: {model_ref}");
if cli::is_local_file_path(model_ref) {
if !std::path::Path::new(model_ref).exists() {
return Err(realizar::error::RealizarError::ModelNotFound(
model_ref.to_string(),
));
}
println!(" Source: local file");
} else if model_ref.starts_with("pacha://") || model_ref.contains(':') {
println!(" Source: Pacha registry");
println!();
println!("Enable registry support: --features registry");
println!("Or use a local file path:");
println!(" realizar run ./model.gguf \"Your prompt\"");
return Ok(());
} else if model_ref.starts_with("hf://") {
println!(" Source: HuggingFace Hub");
println!();
println!("Enable registry support: --features registry");
println!("Or download manually and use:");
println!(" realizar run ./model.gguf \"Your prompt\"");
return Ok(());
}
let file_data = std::fs::read(model_ref).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "read_model".to_string(),
reason: format!("Failed to read {model_ref}: {e}"),
}
})?;
cli::display_model_info(model_ref, &file_data)?;
println!();
if let Some(prompt_text) = prompt {
println!("Prompt: {prompt_text}");
println!("Max tokens: {max_tokens}");
println!("Temperature: {temperature}");
println!("Format: {format}");
println!();
use realizar::format::{detect_format, ModelFormat};
let detected_format = detect_format(&file_data).unwrap_or(ModelFormat::Gguf);
match detected_format {
ModelFormat::Apr => {
run_apr_inference(
model_ref,
&file_data,
prompt_text,
max_tokens,
temperature,
format,
)?;
},
ModelFormat::Gguf | ModelFormat::SafeTensors => {
run_gguf_inference(
model_ref,
&file_data,
prompt_text,
max_tokens,
temperature,
format,
)?;
},
}
} else {
println!("Interactive mode (Ctrl+D to exit)");
println!();
println!("Model loaded ({} bytes)", file_data.len());
println!("Use a prompt argument:");
println!(" realizar run {model_ref} \"Your prompt here\"");
}
Ok(())
}
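/// Interactive chat REPL (registry builds). Resolves and loads the model,
/// optionally restores and saves JSON history, and supports the /clear and
/// /history commands. Responses are currently an echo placeholder rather
/// than real inference.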
#[cfg(feature = "registry")]
async fn run_chat(
model_ref: &str,
system_prompt: Option<&str>,
history_file: Option<&str>,
) -> Result<()> {
use std::io::{BufRead, Write};
println!("Loading model: {model_ref}");
let file_data = match ModelUri::parse(model_ref) {
Ok(uri) => {
let resolver = ModelResolver::new_default().map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "init_resolver".to_string(),
reason: format!("Failed to initialize resolver: {e}"),
}
})?;
match resolver.resolve(&uri) {
Ok(resolved) => {
println!(" Source: {:?}", resolved.source);
resolved.data
},
Err(e) => {
if std::path::Path::new(model_ref).exists() {
std::fs::read(model_ref).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "read_model".to_string(),
reason: format!("Failed to read: {e}"),
}
})?
} else {
return Err(realizar::error::RealizarError::UnsupportedOperation {
operation: "resolve_model".to_string(),
reason: format!("Failed to resolve: {e}"),
});
}
},
}
},
Err(_) => {
if !std::path::Path::new(model_ref).exists() {
return Err(realizar::error::RealizarError::ModelNotFound(
model_ref.to_string(),
));
}
std::fs::read(model_ref).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "read_model".to_string(),
reason: format!("Failed to read: {e}"),
}
})?
},
};
cli::display_model_info(model_ref, &file_data)?;
println!(" Size: {} bytes", file_data.len());
println!();
let mut history: Vec<(String, String)> = if let Some(path) = history_file {
if std::path::Path::new(path).exists() {
let content = std::fs::read_to_string(path).unwrap_or_default();
serde_json::from_str(&content).unwrap_or_default()
} else {
Vec::new()
}
} else {
Vec::new()
};
if let Some(sys) = system_prompt {
println!("System: {sys}");
println!();
}
println!("Chat mode active. Type 'exit' or Ctrl+D to quit.");
println!("Commands: /clear (clear history), /history (show history)");
println!();
let stdin = std::io::stdin();
let mut stdout = std::io::stdout();
loop {
print!(">>> ");
stdout.flush().ok();
let mut input = String::new();
match stdin.lock().read_line(&mut input) {
Ok(0) => {
println!();
break;
},
Ok(_) => {
let input = input.trim();
if input.is_empty() {
continue;
}
if input == "exit" || input == "/exit" || input == "/quit" {
break;
}
if input == "/clear" {
history.clear();
println!("History cleared.");
continue;
}
if input == "/history" {
if history.is_empty() {
println!("No history.");
} else {
for (i, (user, assistant)) in history.iter().enumerate() {
println!("[{}] User: {}", i + 1, user);
println!(" Assistant: {}", assistant);
}
}
continue;
}
let response = format!("[Model loaded: {} bytes] Echo: {}", file_data.len(), input);
println!();
println!("{response}");
println!();
history.push((input.to_string(), response));
},
Err(e) => {
eprintln!("Error reading input: {e}");
break;
},
}
}
if let Some(path) = history_file {
if let Ok(json) = serde_json::to_string_pretty(&history) {
let _ = std::fs::write(path, json);
println!("History saved to {path}");
}
}
println!("Goodbye!");
Ok(())
}
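/// Non-registry chat REPL: same behavior as the registry variant except
/// that the model must be a local file.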
#[cfg(not(feature = "registry"))]
async fn run_chat(
model_ref: &str,
system_prompt: Option<&str>,
history_file: Option<&str>,
) -> Result<()> {
use std::io::{BufRead, Write};
println!("Loading model: {model_ref}");
if !std::path::Path::new(model_ref).exists()
&& !model_ref.starts_with("pacha://")
&& !model_ref.starts_with("hf://")
{
return Err(realizar::error::RealizarError::ModelNotFound(
model_ref.to_string(),
));
}
if model_ref.starts_with("pacha://") || model_ref.starts_with("hf://") {
println!("Registry URIs require --features registry");
println!("Use a local file path instead.");
return Ok(());
}
let file_data = std::fs::read(model_ref).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "read_model".to_string(),
reason: format!("Failed to read: {e}"),
}
})?;
cli::display_model_info(model_ref, &file_data)?;
println!(" Size: {} bytes", file_data.len());
println!();
let mut history: Vec<(String, String)> = if let Some(path) = history_file {
if std::path::Path::new(path).exists() {
let content = std::fs::read_to_string(path).unwrap_or_default();
serde_json::from_str(&content).unwrap_or_default()
} else {
Vec::new()
}
} else {
Vec::new()
};
if let Some(sys) = system_prompt {
println!("System: {sys}");
println!();
}
println!("Chat mode active. Type 'exit' or Ctrl+D to quit.");
println!("Commands: /clear (clear history), /history (show history)");
println!();
let stdin = std::io::stdin();
let mut stdout = std::io::stdout();
loop {
print!(">>> ");
stdout.flush().ok();
let mut input = String::new();
match stdin.lock().read_line(&mut input) {
Ok(0) => {
println!();
break;
},
Ok(_) => {
let input = input.trim();
if input.is_empty() {
continue;
}
if input == "exit" || input == "/exit" || input == "/quit" {
break;
}
if input == "/clear" {
history.clear();
println!("History cleared.");
continue;
}
if input == "/history" {
if history.is_empty() {
println!("No history.");
} else {
for (i, (user, assistant)) in history.iter().enumerate() {
println!("[{}] User: {}", i + 1, user);
println!(" Assistant: {}", assistant);
}
}
continue;
}
let response = format!("[Model loaded: {} bytes] Echo: {}", file_data.len(), input);
println!();
println!("{response}");
println!();
history.push((input.to_string(), response));
},
Err(e) => {
eprintln!("Error reading input: {e}");
break;
},
}
}
if let Some(path) = history_file {
if let Ok(json) = serde_json::to_string_pretty(&history) {
let _ = std::fs::write(path, json);
println!("History saved to {path}");
}
}
println!("Goodbye!");
Ok(())
}
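/// Lists models in the local Pacha registry (or notes that remote listing
/// needs Pacha's `remote` feature), formatted as a table or JSON.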
#[cfg(feature = "registry")]
fn list_models(remote: Option<&str>, format: &str) -> Result<()> {
println!("Available Models");
println!("================");
println!();
if let Some(remote_url) = remote {
println!("Remote registry: {remote_url}");
println!();
println!("Note: Remote registry listing requires --features remote in Pacha.");
return Ok(());
}
let resolver = match ModelResolver::new_default() {
Ok(r) => r,
Err(_) => {
println!("No Pacha registry found.");
println!();
println!("Initialize registry:");
println!(" pacha init");
println!();
println!("Or run a local file:");
println!(" realizar run ./model.gguf \"prompt\"");
return Ok(());
},
};
if !resolver.has_registry() {
println!("No Pacha registry found.");
println!();
println!("Initialize registry:");
println!(" pacha init");
return Ok(());
}
let models = match resolver.list_models() {
Ok(m) => m,
Err(e) => {
println!("Failed to list models: {e}");
return Ok(());
},
};
if models.is_empty() {
println!("No models found in local registry.");
println!();
println!("Pull a model:");
println!(" realizar pull llama3:8b");
println!();
println!("Or run a local file:");
println!(" realizar run ./model.gguf \"prompt\"");
} else {
match format {
"json" => {
let json_models: Vec<_> = models
.iter()
.map(|name| {
let versions = resolver.list_versions(name).unwrap_or_default();
serde_json::json!({
"name": name,
"versions": versions.len()
})
})
.collect();
println!(
"{}",
serde_json::to_string_pretty(&json_models).unwrap_or_default()
);
},
_ => {
println!("{:<40} {:>12}", "NAME", "VERSIONS");
println!("{}", "-".repeat(54));
for name in &models {
let versions = resolver.list_versions(name).unwrap_or_default();
println!("{:<40} {:>12}", name, versions.len());
}
},
}
}
Ok(())
}
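/// Non-registry `list`: scans ~/.pacha/models for .gguf, .safetensors, and
/// .apr files and reports their sizes.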
#[cfg(not(feature = "registry"))]
fn list_models(remote: Option<&str>, format: &str) -> Result<()> {
println!("Available Models");
println!("================");
println!();
if let Some(remote_url) = remote {
println!("Remote registry: {remote_url}");
println!();
println!("Note: Remote registry listing requires --features registry.");
return Ok(());
}
let pacha_dir = cli::home_dir()
.map(|h| h.join(".pacha").join("models"))
.unwrap_or_else(|| std::path::PathBuf::from(".pacha/models"));
if !pacha_dir.exists() {
println!("No models found in local registry.");
println!();
println!("Pull a model:");
println!(" realizar pull llama3:8b");
println!();
println!("Or run a local file:");
println!(" realizar run ./model.gguf \"prompt\"");
return Ok(());
}
let mut models_found = Vec::new();
if let Ok(entries) = std::fs::read_dir(&pacha_dir) {
for entry in entries.flatten() {
let path = entry.path();
if path.is_file() {
let name = path.file_name().unwrap_or_default().to_string_lossy();
if name.ends_with(".gguf")
|| name.ends_with(".safetensors")
|| name.ends_with(".apr")
{
let size = std::fs::metadata(&path).map(|m| m.len()).unwrap_or(0);
models_found.push((name.to_string(), size));
}
}
}
}
if models_found.is_empty() {
println!("No models found in {}", pacha_dir.display());
} else {
match format {
"json" => {
let json_models: Vec<_> = models_found
.iter()
.map(|(name, size)| {
serde_json::json!({
"name": name,
"size_bytes": size,
"size_human": cli::format_size(*size)
})
})
.collect();
println!(
"{}",
serde_json::to_string_pretty(&json_models).unwrap_or_default()
);
},
_ => {
println!("{:<40} {:>12}", "NAME", "SIZE");
println!("{}", "-".repeat(54));
for (name, size) in &models_found {
println!("{:<40} {:>12}", name, cli::format_size(*size));
}
},
}
}
Ok(())
}
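/// Pulls `model_ref` into the local cache via the Pacha resolver, skipping
/// the download when already cached unless `force` is set.
///
/// ```text
/// realizar pull llama3:8b
/// ```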
#[cfg(feature = "registry")]
async fn pull_model(model_ref: &str, force: bool, quantize: Option<&str>) -> Result<()> {
println!("Pulling model: {model_ref}");
if force {
println!(" Force: re-downloading even if cached");
}
if let Some(q) = quantize {
println!(" Quantize: {q}");
}
println!();
let uri = ModelUri::parse(model_ref).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "parse_uri".to_string(),
reason: format!("Invalid model reference: {e}"),
}
})?;
let resolver = ModelResolver::new_default().map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "init_resolver".to_string(),
reason: format!("Failed to initialize Pacha resolver: {e}"),
}
})?;
if !force && resolver.exists(&uri) {
println!("Model already cached locally.");
println!("Use --force to re-download.");
return Ok(());
}
println!("Downloading...");
let resolved = resolver.resolve(&uri).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "pull_model".to_string(),
reason: format!("Failed to pull model: {e}"),
}
})?;
println!(" Downloaded: {} bytes", resolved.data.len());
match &resolved.source {
ModelSource::LocalFile(path) => {
println!(" Source: local file ({path})");
},
ModelSource::PachaLocal { name, version } => {
println!(" Source: Pacha local ({name}:{version})");
},
ModelSource::PachaRemote {
host,
name,
version,
} => {
println!(" Source: Remote {host} ({name}:{version})");
println!(" Cached to local registry.");
},
ModelSource::HuggingFace { repo_id, revision } => {
let rev = revision.as_deref().unwrap_or("main");
println!(" Source: HuggingFace ({repo_id}@{rev})");
},
}
println!();
println!("Model ready! Run with:");
println!(" realizar run {model_ref} \"Your prompt\"");
Ok(())
}
#[cfg(not(feature = "registry"))]
async fn pull_model(model_ref: &str, force: bool, quantize: Option<&str>) -> Result<()> {
println!("Pulling model: {model_ref}");
if force {
println!(" Force: re-downloading even if cached");
}
if let Some(q) = quantize {
println!(" Quantize: {q}");
}
println!();
if let Some(hf_path) = model_ref.strip_prefix("hf://") {
println!("Source: HuggingFace Hub");
println!("Model: {hf_path}");
println!();
println!("Enable registry support: --features registry");
println!("Or manual download:");
println!(" huggingface-cli download {hf_path}");
} else if let Some(pacha_path) = model_ref.strip_prefix("pacha://") {
println!("Source: Pacha Registry");
println!("Model: {pacha_path}");
println!();
println!("Enable registry support: --features registry");
} else {
println!("Source: Default registry (Pacha)");
println!("Model: {model_ref}");
println!();
println!("Enable registry support: --features registry");
println!("Or download manually and use:");
println!(" realizar run ./downloaded-model.gguf \"prompt\"");
}
Ok(())
}
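/// Pushes a local `<name>.gguf` file into the local Pacha registry under
/// `name:version` (defaulting to `latest`). Remote targets are delegated
/// to the pacha CLI.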
#[cfg(feature = "registry")]
async fn push_model(model_ref: &str, target: Option<&str>) -> Result<()> {
use pacha::Registry;
println!("Pushing model: {model_ref}");
let (name, version_str) = if let Some(idx) = model_ref.rfind(':') {
(&model_ref[..idx], &model_ref[idx + 1..])
} else {
(model_ref, "latest")
};
println!(" Name: {name}");
println!(" Version: {version_str}");
if let Some(t) = target {
println!(" Target: {t}");
println!();
println!("Remote push requires --features remote in Pacha.");
println!("Use pacha CLI for remote operations:");
println!(" pacha push {model_ref} --to {t}");
} else {
println!(" Target: local Pacha registry");
println!();
let local_path = format!("{name}.gguf");
if !std::path::Path::new(&local_path).exists() {
println!("Local file not found: {local_path}");
println!();
println!("To push a model to registry:");
println!(" 1. Have the model file: {name}.gguf");
println!(" 2. Run: realizar push {name}:{version_str}");
return Ok(());
}
let data = std::fs::read(&local_path).map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "read_model".to_string(),
reason: format!("Failed to read {local_path}: {e}"),
}
})?;
let registry = Registry::open_default().map_err(|e| {
realizar::error::RealizarError::UnsupportedOperation {
operation: "open_registry".to_string(),
reason: format!("Failed to open Pacha registry: {e}"),
}
})?;
let version = parse_model_version(version_str)?;
let card = pacha::model::ModelCard::new(format!("Model {name} pushed via realizar"));
registry
.register_model(name, &version, &data, card)
.map_err(|e| realizar::error::RealizarError::UnsupportedOperation {
operation: "register_model".to_string(),
reason: format!("Failed to register model: {e}"),
})?;
println!("Model registered successfully!");
println!();
println!("Run with:");
println!(" realizar run pacha://{name}:{version_str} \"Your prompt\"");
}
Ok(())
}
#[cfg(not(feature = "registry"))]
async fn push_model(model_ref: &str, target: Option<&str>) -> Result<()> {
println!("Pushing model: {model_ref}");
if let Some(t) = target {
println!(" Target: {t}");
} else {
println!(" Target: default Pacha registry");
}
println!();
println!("Enable registry support: --features registry");
Ok(())
}
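/// Parses a model version string into a Pacha `ModelVersion`:
///
/// ```text
/// "1.2.3"  -> 1.2.3
/// "latest" -> 1.0.0
/// "3"      -> 3.0.0
/// ```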
#[cfg(feature = "registry")]
fn parse_model_version(s: &str) -> Result<pacha::model::ModelVersion> {
let parts: Vec<&str> = s.split('.').collect();
if parts.len() == 3 {
let major: u32 =
parts[0]
.parse()
.map_err(|_| realizar::error::RealizarError::UnsupportedOperation {
operation: "parse_version".to_string(),
reason: format!("Invalid version: {s}"),
})?;
let minor: u32 =
parts[1]
.parse()
.map_err(|_| realizar::error::RealizarError::UnsupportedOperation {
operation: "parse_version".to_string(),
reason: format!("Invalid version: {s}"),
})?;
let patch: u32 =
parts[2]
.parse()
.map_err(|_| realizar::error::RealizarError::UnsupportedOperation {
operation: "parse_version".to_string(),
reason: format!("Invalid version: {s}"),
})?;
return Ok(pacha::model::ModelVersion::new(major, minor, patch));
}
if s == "latest" {
return Ok(pacha::model::ModelVersion::new(1, 0, 0));
}
if let Ok(major) = s.parse::<u32>() {
return Ok(pacha::model::ModelVersion::new(major, 0, 0));
}
Err(realizar::error::RealizarError::UnsupportedOperation {
operation: "parse_version".to_string(),
reason: format!("Invalid version format: {s}. Expected: x.y.z"),
})
}