use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::time::Duration;
/// Top-level configuration for a showcase run (normally populated from CLI args).
#[derive(Debug, Clone)]
pub struct ShowcaseConfig {
    /// Model size tier to demo.
    pub tier: ModelTier,
    /// Model identifier; defaults to `tier.model_path()` (see `Default` impl).
    #[allow(dead_code)] pub model: String,
    /// Quantization label (default "Q4_K_M").
    #[allow(dead_code)] pub quant: String,
    /// Directory holding model files (default "./models").
    pub model_dir: PathBuf,
    /// Run verification automatically when true (default false).
    pub auto_verify: bool,
    /// Run only this single step; `None` presumably means the full pipeline — confirm at call site.
    pub step: Option<ShowcaseStep>,
    /// External engines to benchmark against (default: llama.cpp and Ollama).
    pub baselines: Vec<Baseline>,
    /// Enable the zram demo/compression path (default true).
    pub zram: bool,
    /// Number of benchmark repetitions (default 30).
    pub bench_runs: usize,
    /// How to export benchmark results (default: no export).
    pub export_format: ExportFormat,
    /// Destination for exported results, if any.
    pub export_path: Option<PathBuf>,
    /// Enable GPU paths (default false).
    pub gpu: bool,
    /// Verbose output flag (not yet read anywhere — hence the dead_code allow).
    #[allow(dead_code)] pub verbose: bool,
    /// Quiet output flag (not yet read anywhere — hence the dead_code allow).
    #[allow(dead_code)] pub quiet: bool,
}
impl Default for ShowcaseConfig {
    /// Baseline configuration: Small tier with its matching model path,
    /// Q4_K_M quant, 30 benchmark runs, zram enabled, both baselines on,
    /// and everything else off/empty.
    fn default() -> Self {
        let default_tier = ModelTier::Small;
        Self {
            model: default_tier.model_path().to_string(),
            tier: default_tier,
            quant: String::from("Q4_K_M"),
            model_dir: PathBuf::from("./models"),
            baselines: vec![Baseline::LlamaCpp, Baseline::Ollama],
            bench_runs: 30,
            zram: true,
            auto_verify: false,
            step: None,
            export_format: ExportFormat::None,
            export_path: None,
            gpu: false,
            verbose: false,
            quiet: false,
        }
    }
}
impl ShowcaseConfig {
    /// Test-only helper: the default config with `tier` overridden and the
    /// model path updated to match that tier.
    #[must_use]
    #[cfg(test)]
    pub fn with_tier(tier: ModelTier) -> Self {
        let mut config = Self::default();
        config.model = tier.model_path().to_string();
        config.tier = tier;
        config
    }
}
/// Individual stages of the showcase pipeline that can be selected to run
/// on their own via `ShowcaseConfig::step`. Variant names mirror the
/// pipeline ordering; exact semantics live in the step runner.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ShowcaseStep {
    /// Import/download the model.
    Import,
    /// Run inference on the GGUF model.
    GgufInference,
    /// Convert the model (GGUF → APR, per the surrounding step names — confirm).
    Convert,
    /// Run inference on the APR model.
    AprInference,
    /// Run the benchmark comparison.
    Benchmark,
    /// Interactive chat.
    Chat,
    /// Visualization step.
    Visualize,
    /// Zram/compression demo.
    ZramDemo,
    /// CUDA capability demo.
    CudaDemo,
    /// Per-layer "brick" timing demo.
    BrickDemo,
    /// Run every step.
    All,
}
/// External inference engines used as performance baselines in benchmarks.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Baseline {
    /// llama.cpp.
    LlamaCpp,
    /// Ollama.
    Ollama,
}
/// Model size tier for the showcase, each mapped to a specific
/// Qwen2.5-Coder GGUF release on Hugging Face.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ModelTier {
    /// 0.5B-parameter model (~0.4 GB at Q4_K_M).
    Tiny,
    /// 1.5B-parameter model (~1.1 GB); the default tier.
    #[default]
    Small,
    /// 7B-parameter model (~4.5 GB).
    Medium,
    /// 32B-parameter model (~19 GB).
    Large,
}

// Methods take `self` by value: `ModelTier` is `Copy`, so passing by
// reference was exactly what clippy::trivially_copy_pass_by_ref flags —
// this removes the need for the previous uncommented `#[allow]`.
impl ModelTier {
    /// Hugging Face repository id hosting this tier's GGUF files.
    #[must_use]
    pub fn model_path(self) -> &'static str {
        match self {
            Self::Tiny => "Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF",
            Self::Small => "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
            Self::Medium => "bartowski/Qwen2.5-Coder-7B-Instruct-GGUF",
            Self::Large => "bartowski/Qwen2.5-Coder-32B-Instruct-GGUF",
        }
    }

    /// Exact `.gguf` filename within the repository (Q4_K_M quantization).
    #[must_use]
    pub fn gguf_filename(self) -> &'static str {
        match self {
            Self::Tiny => "qwen2.5-coder-0.5b-instruct-q4_k_m.gguf",
            Self::Small => "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
            Self::Medium => "Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf",
            Self::Large => "Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf",
        }
    }

    /// Approximate on-disk size of the quantized model, in gigabytes.
    #[must_use]
    pub fn size_gb(self) -> f32 {
        match self {
            Self::Tiny => 0.4,
            Self::Small => 1.1,
            Self::Medium => 4.5,
            Self::Large => 19.0,
        }
    }

    /// Human-readable parameter count (e.g. "7B") for display.
    #[must_use]
    pub fn params(self) -> &'static str {
        match self {
            Self::Tiny => "0.5B",
            Self::Small => "1.5B",
            Self::Medium => "7B",
            Self::Large => "32B",
        }
    }
}
/// Output format for exporting benchmark results.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
#[allow(dead_code)] pub enum ExportFormat {
    /// No export (the default).
    #[default]
    None,
    /// JSON export.
    Json,
    /// CSV export.
    Csv,
}
/// Serializable benchmark summary comparing APR against optional baselines.
/// NOTE(review): field names suggest `*_tps` = tokens per second and
/// `*_ttft_ms` = time-to-first-token in milliseconds — confirm against the
/// code that populates this struct.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkComparison {
    /// APR throughput.
    pub apr_tps: f64,
    /// llama.cpp throughput, if that baseline ran.
    pub llama_cpp_tps: Option<f64>,
    /// Ollama throughput, if that baseline ran.
    pub ollama_tps: Option<f64>,
    /// APR time to first token (ms).
    pub apr_ttft_ms: f64,
    /// llama.cpp time to first token (ms), if available.
    pub llama_cpp_ttft_ms: Option<f64>,
    /// Ollama time to first token (ms), if available.
    pub ollama_ttft_ms: Option<f64>,
    /// Speedup factor vs llama.cpp — presumably apr_tps / llama_cpp_tps; confirm.
    pub speedup_vs_llama: Option<f64>,
    /// Speedup factor vs Ollama — presumably apr_tps / ollama_tps; confirm.
    pub speedup_vs_ollama: Option<f64>,
    /// Standard deviation of APR throughput across runs.
    pub apr_tps_stddev: f64,
    /// Number of benchmark repetitions measured.
    pub runs: usize,
}
/// One raw benchmark sample: how many tokens were produced and how long it took.
#[derive(Debug, Clone)]
pub struct BenchMeasurement {
    /// Number of tokens generated during `duration`.
    pub tokens_generated: usize,
    /// Total wall-clock generation time.
    pub duration: Duration,
    /// Time to first token.
    pub ttft: Duration,
}

impl BenchMeasurement {
    /// Throughput in tokens per second.
    ///
    /// Returns `0.0` for a zero-length duration so callers never divide by
    /// zero. `#[must_use]` added for consistency with the other accessors
    /// in this module.
    #[must_use]
    pub fn tokens_per_second(&self) -> f64 {
        let secs = self.duration.as_secs_f64();
        if secs > 0.0 {
            self.tokens_generated as f64 / secs
        } else {
            0.0
        }
    }
}
/// Crate-internal record of showcase outcomes: a success/ran flag per simple
/// step, plus optional payloads for the steps that produce data.
#[derive(Debug, Default)]
pub(super) struct ShowcaseResults {
    /// Import step flag.
    pub(super) import: bool,
    /// GGUF inference step flag.
    pub(super) gguf_inference: bool,
    /// Conversion step flag.
    pub(super) convert: bool,
    /// APR inference step flag.
    pub(super) apr_inference: bool,
    /// Benchmark comparison, present only if the benchmark step ran.
    pub(super) benchmark: Option<BenchmarkComparison>,
    /// Visualization step flag.
    pub(super) visualize: bool,
    /// Chat step flag.
    pub(super) chat: bool,
    /// Zram demo results, present only if that demo ran.
    pub(super) zram_demo: Option<ZramDemoResult>,
    /// CUDA demo results, present only if that demo ran.
    pub(super) cuda_demo: Option<CudaDemoResult>,
    /// Brick demo results, present only if that demo ran.
    pub(super) brick_demo: Option<BrickDemoResult>,
}
/// Results of the per-layer "brick" timing demo.
#[derive(Debug, Clone, Default)]
#[allow(dead_code)] pub struct BrickDemoResult {
    /// Number of layers that were timed.
    pub layers_measured: usize,
    /// Per-layer timings in microseconds.
    pub layer_timings_us: Vec<f64>,
    /// Slowest layer as (name, time in µs), if one was identified.
    pub bottleneck: Option<(String, f64)>,
    /// Total time in microseconds — presumably the sum of layer timings; confirm.
    pub total_us: f64,
    /// Measured throughput in tokens per second.
    pub tokens_per_sec: f64,
    /// Whether the demo's internal assertions all passed.
    pub assertions_passed: bool,
}
/// Results of the zram/compression demo.
#[derive(Debug, Clone)]
pub struct ZramDemoResult {
    /// LZ4 compression ratio.
    pub lz4_ratio: f64,
    /// Zstd compression ratio.
    pub zstd_ratio: f64,
    /// Zero-page handling throughput in GB/s.
    pub zero_page_gbps: f64,
    /// LZ4 throughput in GB/s.
    pub lz4_gbps: f64,
    /// Name of the SIMD backend in use.
    pub simd_backend: String,
    /// Context extension factor — presumably how much further the context
    /// window stretches under compression; confirm against the producer.
    pub context_extension: f64,
}
/// Results of the CUDA capability demo.
#[derive(Debug, Clone)]
#[allow(dead_code)] pub struct CudaDemoResult {
    /// Number of CUDA devices detected.
    pub device_count: usize,
    /// Device name (presumably the primary device — confirm).
    pub device_name: String,
    /// Total VRAM in gigabytes.
    pub total_vram_gb: f64,
    /// Free VRAM in gigabytes.
    pub free_vram_gb: f64,
    /// Whether CUDA is available at all.
    pub cuda_available: bool,
    /// Whether CUDA graph capture is supported.
    pub graph_capture_available: bool,
    /// Speedup factor measured with graph capture — confirm units at producer.
    pub graph_speedup: f64,
    /// Whether the DP4A instruction is available.
    pub dp4a_available: bool,
    /// Arithmetic intensity measured for the DP4A path — confirm units at producer.
    pub dp4a_arithmetic_intensity: f64,
}