#![allow(clippy::missing_errors_doc)]
#![allow(clippy::needless_pass_by_value)]
#![allow(clippy::unreadable_literal)]
#![allow(clippy::case_sensitive_file_extension_comparisons)]
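//! Command-line interface: dispatches parsed subcommands to inference, chat,
//! serving, and benchmark handlers.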
use crate::error::{RealizarError, Result};
pub mod inference;
#[cfg(feature = "cuda")]
pub use inference::run_gguf_inference_gpu;
pub use inference::{run_apr_inference, run_gguf_inference, run_safetensors_inference};
pub mod handlers;
pub use handlers::{Cli, Commands, RunConfig, ServeConfig};
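/// Dispatch a parsed [`Cli`] invocation to the handler for its subcommand.
///
/// Most variants delegate to the `handlers` module or to the inference
/// helpers defined below; `BenchRegression` exits the process with status 1
/// on failure.
///
/// A minimal sketch of wiring this into `main`, assuming `Cli` derives
/// `clap::Parser` and the binary runs on a Tokio runtime (import paths are
/// illustrative):
///
/// ```ignore
/// use clap::Parser;
///
/// #[tokio::main]
/// async fn main() -> realizar::Result<()> {
///     entrypoint(Cli::parse()).await
/// }
/// ```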
pub async fn entrypoint(cli: Cli) -> Result<()> {
match cli.command {
Commands::Run {
model,
prompt,
max_tokens,
temperature,
format,
system,
raw,
gpu,
verbose,
trace,
} => {
run_model_command(
&model,
prompt.as_deref(),
max_tokens,
temperature,
&format,
system.as_deref(),
raw,
gpu,
verbose,
trace,
)
.await
},
Commands::Chat {
model,
system,
history,
} => run_chat_command(&model, system.as_deref(), history.as_deref()).await,
Commands::List { remote, format } => handlers::handle_list(remote.as_deref(), &format),
Commands::Pull {
model,
force,
quantize,
} => handlers::handle_pull(&model, force, quantize.as_deref()).await,
Commands::Push { model, to } => handlers::handle_push(&model, to.as_deref()).await,
Commands::Serve {
host,
port,
model,
demo,
openai_api,
batch,
gpu,
trace,
context_length,
no_fp8_cache,
} => {
handlers::handle_serve(ServeConfig {
host,
port,
model,
demo,
batch,
gpu,
openai_api,
trace,
context_length,
no_fp8_cache,
})
.await
},
Commands::Bench {
suite,
list,
runtime,
model,
url,
output,
} => run_benchmarks(suite, list, runtime, model, url, output),
Commands::BenchConvoy {
runtime,
model,
output,
} => run_convoy_test(runtime, model, output),
Commands::BenchSaturation {
runtime,
model,
output,
} => run_saturation_test(runtime, model, output),
Commands::BenchCompare {
file1,
file2,
threshold,
} => run_bench_compare(&file1, &file2, threshold),
Commands::BenchRegression {
baseline,
current,
strict,
} => {
            if run_bench_regression(&baseline, &current, strict).is_err() {
std::process::exit(1);
}
Ok(())
},
Commands::Viz { color, samples } => {
run_visualization(color, samples);
Ok(())
},
Commands::Info => {
print_info();
Ok(())
},
}
}
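/// Render a prompt through the chat template auto-detected for `model_ref`.
///
/// In raw mode the prompt is returned unchanged; otherwise the optional
/// system prompt and the user prompt are formatted as a conversation, falling
/// back to the plain prompt text if templating fails.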
fn format_model_prompt(
model_ref: &str,
prompt_text: &str,
system_prompt: Option<&str>,
raw_mode: bool,
) -> String {
use crate::chat_template::{auto_detect_template, ChatMessage};
if raw_mode {
return prompt_text.to_string();
}
let template = auto_detect_template(model_ref);
let mut messages = Vec::new();
if let Some(sys) = system_prompt {
messages.push(ChatMessage::system(sys));
}
messages.push(ChatMessage::user(prompt_text));
template
.format_conversation(&messages)
.unwrap_or_else(|_| prompt_text.to_string())
}
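/// Where a model reference resolves to: a local file, the Pacha registry, or
/// the HuggingFace Hub.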
enum ModelSource {
Local,
RegistryPacha,
RegistryHf,
}
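/// Classify a model reference as a local file path or a registry URI,
/// validating local paths as a side effect.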
fn detect_model_source(model_ref: &str, verbose: bool) -> Result<ModelSource> {
if is_local_file_path(model_ref) {
handlers::validate_model_path(model_ref)?;
if verbose {
println!(" Source: local file");
}
return Ok(ModelSource::Local);
}
if model_ref.starts_with("pacha://") || model_ref.contains(':') {
return Ok(ModelSource::RegistryPacha);
}
if model_ref.starts_with("hf://") {
return Ok(ModelSource::RegistryHf);
}
Ok(ModelSource::Local)
}
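/// Parse the trace option into a `TraceConfig` and, when tracing is
/// requested, set the `GPU_DEBUG` environment variable so debug output is
/// enabled.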
#[allow(clippy::option_option)]
fn setup_trace_config(
trace: Option<Option<String>>,
) -> Option<crate::inference_trace::TraceConfig> {
let trace_config = handlers::parse_trace_config(trace);
if trace_config.is_some() {
std::env::set_var("GPU_DEBUG", "1");
eprintln!("[TRACE] Inference tracing enabled - GPU_DEBUG=1");
}
trace_config
}
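/// Detect the model file format and route inference to the APR, SafeTensors,
/// or GGUF backend, defaulting to GGUF when detection fails.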
#[allow(clippy::too_many_arguments)]
fn dispatch_inference(
model_ref: &str,
file_data: &[u8],
formatted_prompt: &str,
max_tokens: usize,
temperature: f32,
format: &str,
force_gpu: bool,
verbose: bool,
trace_config: Option<crate::inference_trace::TraceConfig>,
) -> Result<()> {
use crate::format::{detect_format, ModelFormat};
match detect_format(file_data).unwrap_or(ModelFormat::Gguf) {
ModelFormat::Apr => run_apr_inference(
model_ref,
file_data,
formatted_prompt,
max_tokens,
temperature,
format,
force_gpu,
verbose,
trace_config.clone(),
),
ModelFormat::SafeTensors => run_safetensors_inference(
model_ref,
formatted_prompt,
max_tokens,
temperature,
format,
trace_config.clone(),
),
ModelFormat::Gguf => run_gguf_inference(
model_ref,
file_data,
formatted_prompt,
max_tokens,
temperature,
format,
force_gpu,
verbose,
trace_config,
),
}
}
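/// Handle the `run` subcommand: resolve the model source, load the model
/// file, optionally print its metadata, and run a single prompt through the
/// format-specific backend.
///
/// Registry-backed references (Pacha, HuggingFace) currently only print a
/// hint to rebuild with `--features registry`.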
#[allow(
clippy::too_many_arguments,
clippy::unused_async,
clippy::option_option
)]
async fn run_model_command(
model_ref: &str,
prompt: Option<&str>,
max_tokens: usize,
temperature: f32,
format: &str,
system_prompt: Option<&str>,
raw_mode: bool,
force_gpu: bool,
verbose: bool,
trace: Option<Option<String>>,
) -> Result<()> {
let trace_config = setup_trace_config(trace);
match detect_model_source(model_ref, verbose)? {
ModelSource::Local => {},
ModelSource::RegistryPacha => {
println!(" Source: Pacha registry");
println!("Enable registry support: --features registry");
return Ok(());
},
ModelSource::RegistryHf => {
println!(" Source: HuggingFace Hub");
println!("Enable registry support: --features registry");
return Ok(());
},
}
let file_data = std::fs::read(model_ref).map_err(|e| RealizarError::UnsupportedOperation {
operation: "read_model".to_string(),
reason: format!("Failed to read {model_ref}: {e}"),
})?;
if verbose {
display_model_info(model_ref, &file_data)?;
}
if let Some(prompt_text) = prompt {
let formatted_prompt = format_model_prompt(model_ref, prompt_text, system_prompt, raw_mode);
dispatch_inference(
model_ref,
&file_data,
&formatted_prompt,
max_tokens,
temperature,
format,
force_gpu,
verbose,
trace_config,
)?;
} else {
println!("Interactive mode - use a prompt argument");
}
Ok(())
}
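/// What to do with one line of chat input: keep prompting, exit the loop, or
/// respond to the given text.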
enum ChatAction {
Continue,
Exit,
Respond(String),
}
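/// Interpret one line of chat input, handling the `exit`, `/quit`, `/clear`,
/// and `/history` commands before treating anything else as a user message.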
fn process_chat_input(input: &str, history: &mut Vec<(String, String)>) -> ChatAction {
if input.is_empty() {
return ChatAction::Continue;
}
if input == "exit" || input == "/quit" {
return ChatAction::Exit;
}
if input == "/clear" {
history.clear();
println!("Cleared.");
return ChatAction::Continue;
}
if input == "/history" {
for (i, (u, a)) in history.iter().enumerate() {
println!("[{}] {}: {}", i + 1, u, a);
}
return ChatAction::Continue;
}
ChatAction::Respond(input.to_string())
}
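/// Ensure a chat model reference is usable: registry URIs only print a
/// feature hint, while local paths must exist on disk.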
fn validate_chat_model(model_ref: &str) -> Result<()> {
if model_ref.starts_with("pacha://") || model_ref.starts_with("hf://") {
println!("Registry URIs require --features registry");
return Ok(());
}
if !std::path::Path::new(model_ref).exists() {
return Err(RealizarError::ModelNotFound(model_ref.to_string()));
}
Ok(())
}
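/// Load chat history (user/assistant pairs) from a JSON file, returning an
/// empty history if the file is missing or cannot be parsed.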
fn load_chat_history(history_file: Option<&str>) -> Vec<(String, String)> {
history_file
.and_then(|p| std::fs::read_to_string(p).ok())
.and_then(|c| serde_json::from_str(&c).ok())
.unwrap_or_default()
}
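/// Persist chat history as pretty-printed JSON; write errors are ignored.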
fn save_chat_history(history_file: Option<&str>, history: &[(String, String)]) {
if let Some(path) = history_file {
if let Ok(json) = serde_json::to_string_pretty(&history) {
let _ = std::fs::write(path, json);
}
}
}
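/// Interactive read-eval-print loop over stdin. Currently echoes each message
/// back to the user and records the exchange in `history`.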
fn run_chat_loop(history: &mut Vec<(String, String)>) {
use std::io::{BufRead, Write};
let stdin = std::io::stdin();
let mut stdout = std::io::stdout();
loop {
print!(">>> ");
stdout.flush().ok();
let mut input = String::new();
match stdin.lock().read_line(&mut input) {
Ok(0) | Err(_) => break,
Ok(_) => match process_chat_input(input.trim(), history) {
ChatAction::Exit => break,
ChatAction::Respond(user_input) => {
let response = format!("[Echo] {}", user_input);
println!("{response}");
history.push((user_input, response));
},
ChatAction::Continue => {},
},
}
}
}
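/// Handle the `chat` subcommand: validate the model, load any saved history,
/// run the interactive loop, and save the history on exit.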
#[allow(clippy::unused_async)]
async fn run_chat_command(
model_ref: &str,
system_prompt: Option<&str>,
history_file: Option<&str>,
) -> Result<()> {
validate_chat_model(model_ref)?;
if model_ref.starts_with("pacha://") || model_ref.starts_with("hf://") {
return Ok(());
}
let file_data = std::fs::read(model_ref).map_err(|e| RealizarError::UnsupportedOperation {
operation: "read_model".to_string(),
reason: format!("Failed to read: {e}"),
})?;
display_model_info(model_ref, &file_data)?;
let mut history = load_chat_history(history_file);
if let Some(sys) = system_prompt {
println!("System: {sys}");
}
println!("Chat mode active. Type 'exit' to quit.");
run_chat_loop(&mut history);
save_chat_history(history_file, &history);
println!("Goodbye!");
Ok(())
}
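/// Available benchmark suites as `(name, description)` pairs.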
pub const BENCHMARK_SUITES: &[(&str, &str)] = &[
(
"tensor_ops",
"Core tensor operations (add, mul, matmul, softmax)",
),
("inference", "End-to-end inference pipeline benchmarks"),
("cache", "KV cache operations and memory management"),
("tokenizer", "BPE and SentencePiece tokenization"),
("quantize", "Quantization/dequantization (Q4_0, Q8_0)"),
("lambda", "AWS Lambda cold start and warm invocation"),
(
"comparative",
"Framework comparison (MNIST, CIFAR-10, Iris)",
),
];
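// The remaining command implementations (display helpers, benchmark printing,
// GGUF model info, and server commands) are spliced into this module from
// sibling source files at compile time.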
include!("display_utils.rs");
include!("mod_print_benchmark.rs");
include!("mod_gguf_info.rs");
include!("mod_server_commands.rs");
include!("mod_06.rs");