use oxibonsai_core::config::Qwen3Config;
use crate::model_registry::ModelVariant;
/// Static description of one built-in model variant: its identity, its
/// architecture configuration, and its approximate memory footprint.
///
/// Instances are produced by the `*_spec()` constructor functions in this file
/// and exposed collectively through `all_specs()`.
#[derive(Debug, Clone)]
pub struct ModelSpec {
/// Display name of the variant, e.g. `"Bonsai-8B"`.
pub name: &'static str,
/// Registry variant this spec describes; used as the lookup key by
/// `spec_for_variant`.
pub variant: ModelVariant,
/// Architecture configuration obtained from a `Qwen3Config` constructor.
pub config: Qwen3Config,
/// Parameter count (a fixed figure supplied by each constructor).
pub param_count: u64,
/// Size of the model weights in bytes (a fixed figure supplied by each
/// constructor).
pub weights_size_bytes: u64,
/// KV-cache size in bytes for a 4096-token sequence, derived from `config`.
pub kv_cache_4k_bytes: u64,
/// Minimum RAM estimate in bytes: weights + 4096-token KV cache + a fixed
/// per-variant overhead allowance.
pub min_ram_bytes: u64,
/// Human-readable summary of the variant.
pub description: &'static str,
}
/// Builds the [`ModelSpec`] for the Bonsai-8B variant.
///
/// The architecture comes from `Qwen3Config::bonsai_8b()`. Parameter count and
/// weight size are fixed figures; the KV-cache size is derived from the config
/// for a 4096-token sequence, and the minimum RAM estimate is
/// weights + KV cache + 64 MiB.
pub fn bonsai_8b_spec() -> ModelSpec {
    const PARAMS: u64 = 8_030_000_000;
    const WEIGHT_BYTES: u64 = 2_200_000_000;
    // Fixed allowance added on top of weights and KV cache.
    const OVERHEAD_BYTES: u64 = 64 * 1024 * 1024;

    let config = Qwen3Config::bonsai_8b();
    let kv_cache = kv_cache_size_bytes(&config, 4096);

    ModelSpec {
        name: "Bonsai-8B",
        variant: ModelVariant::Bonsai8B,
        param_count: PARAMS,
        weights_size_bytes: WEIGHT_BYTES,
        kv_cache_4k_bytes: kv_cache,
        min_ram_bytes: WEIGHT_BYTES + kv_cache + OVERHEAD_BYTES,
        description: "Bonsai-8B is the flagship variant based on Qwen3-8B. \
            36-layer GQA transformer with 4096-dimensional hidden state, 32 query heads, \
            8 KV heads, and a 65 536-token context window. Recommended for highest quality \
            output where >= 4 GB RAM is available.",
        config,
    }
}
/// Builds the [`ModelSpec`] for the Bonsai-4B variant.
///
/// The architecture comes from `Qwen3Config::bonsai_4b()`. Parameter count and
/// weight size are fixed figures; the KV-cache size is derived from the config
/// for a 4096-token sequence, and the minimum RAM estimate is
/// weights + KV cache + 48 MiB.
pub fn bonsai_4b_spec() -> ModelSpec {
    const PARAMS: u64 = 4_020_000_000;
    const WEIGHT_BYTES: u64 = 1_300_000_000;
    // Fixed allowance added on top of weights and KV cache.
    const OVERHEAD_BYTES: u64 = 48 * 1024 * 1024;

    let config = Qwen3Config::bonsai_4b();
    let kv_cache = kv_cache_size_bytes(&config, 4096);

    ModelSpec {
        name: "Bonsai-4B",
        variant: ModelVariant::Bonsai4B,
        param_count: PARAMS,
        weights_size_bytes: WEIGHT_BYTES,
        kv_cache_4k_bytes: kv_cache,
        min_ram_bytes: WEIGHT_BYTES + kv_cache + OVERHEAD_BYTES,
        description: "Bonsai-4B provides a balanced quality/memory trade-off. \
            24-layer GQA transformer with 2560-dimensional hidden state, 20 query heads, \
            4 KV heads, and a 65 536-token context window. Recommended when 2 GB RAM \
            is available and maximum quality is not required.",
        config,
    }
}
/// Builds the [`ModelSpec`] for the Bonsai-1.7B variant.
///
/// The architecture comes from `Qwen3Config::bonsai_1_7b()`. Parameter count and
/// weight size are fixed figures; the KV-cache size is derived from the config
/// for a 4096-token sequence, and the minimum RAM estimate is
/// weights + KV cache + 32 MiB.
pub fn bonsai_1_7b_spec() -> ModelSpec {
    const PARAMS: u64 = 1_720_000_000;
    const WEIGHT_BYTES: u64 = 700_000_000;
    // Fixed allowance added on top of weights and KV cache.
    const OVERHEAD_BYTES: u64 = 32 * 1024 * 1024;

    let config = Qwen3Config::bonsai_1_7b();
    let kv_cache = kv_cache_size_bytes(&config, 4096);

    ModelSpec {
        name: "Bonsai-1.7B",
        variant: ModelVariant::Bonsai1_7B,
        param_count: PARAMS,
        weights_size_bytes: WEIGHT_BYTES,
        kv_cache_4k_bytes: kv_cache,
        min_ram_bytes: WEIGHT_BYTES + kv_cache + OVERHEAD_BYTES,
        description: "Bonsai-1.7B is the smallest and fastest variant, designed for \
            resource-constrained environments. 16-layer GQA transformer with 1536-dimensional \
            hidden state, 12 query heads, 2 KV heads, and a 65 536-token context window. \
            Runs with under 1 GB RAM.",
        config,
    }
}
/// Builds the [`ModelSpec`] for the Ternary-Bonsai-8B variant.
///
/// The architecture comes from `Qwen3Config::ternary_bonsai_8b()`. Parameter
/// count and weight size are fixed figures; the KV-cache size is derived from
/// the config for a 4096-token sequence, and the minimum RAM estimate is
/// weights + KV cache + 64 MiB.
pub fn ternary_bonsai_8b_spec() -> ModelSpec {
    const PARAMS: u64 = 8_030_000_000;
    const WEIGHT_BYTES: u64 = 1_750_000_000;
    // Fixed allowance added on top of weights and KV cache.
    const OVERHEAD_BYTES: u64 = 64 * 1024 * 1024;

    let config = Qwen3Config::ternary_bonsai_8b();
    let kv_cache = kv_cache_size_bytes(&config, 4096);

    ModelSpec {
        name: "Ternary-Bonsai-8B",
        variant: ModelVariant::TernaryBonsai8B,
        param_count: PARAMS,
        weights_size_bytes: WEIGHT_BYTES,
        kv_cache_4k_bytes: kv_cache,
        min_ram_bytes: WEIGHT_BYTES + kv_cache + OVERHEAD_BYTES,
        description: "Ternary-Bonsai-8B uses the same Qwen3-8B architecture as Bonsai-8B, \
            but stores transformer weights in TQ2_0_g128 ternary format ({-1,0,+1}). \
            Approximately 0.266 bytes/weight versus 0.14 bytes/weight for the 1-bit variant, \
            trading a small size increase for ternary expressivity.",
        config,
    }
}
/// Builds the [`ModelSpec`] for the Ternary-Bonsai-4B variant.
///
/// The architecture comes from `Qwen3Config::ternary_bonsai_4b()`. Parameter
/// count and weight size are fixed figures; the KV-cache size is derived from
/// the config for a 4096-token sequence, and the minimum RAM estimate is
/// weights + KV cache + 48 MiB.
pub fn ternary_bonsai_4b_spec() -> ModelSpec {
    const PARAMS: u64 = 4_020_000_000;
    const WEIGHT_BYTES: u64 = 900_000_000;
    // Fixed allowance added on top of weights and KV cache.
    const OVERHEAD_BYTES: u64 = 48 * 1024 * 1024;

    let config = Qwen3Config::ternary_bonsai_4b();
    let kv_cache = kv_cache_size_bytes(&config, 4096);

    ModelSpec {
        name: "Ternary-Bonsai-4B",
        variant: ModelVariant::TernaryBonsai4B,
        param_count: PARAMS,
        weights_size_bytes: WEIGHT_BYTES,
        kv_cache_4k_bytes: kv_cache,
        min_ram_bytes: WEIGHT_BYTES + kv_cache + OVERHEAD_BYTES,
        description: "Ternary-Bonsai-4B uses the same Qwen3-4B architecture as Bonsai-4B, \
            but stores transformer weights in TQ2_0_g128 ternary format ({-1,0,+1}).",
        config,
    }
}
/// Builds the [`ModelSpec`] for the Ternary-Bonsai-1.7B variant.
///
/// The architecture comes from `Qwen3Config::ternary_bonsai_1_7b()`. Parameter
/// count and weight size are fixed figures; the KV-cache size is derived from
/// the config for a 4096-token sequence, and the minimum RAM estimate is
/// weights + KV cache + 32 MiB.
pub fn ternary_bonsai_1_7b_spec() -> ModelSpec {
    const PARAMS: u64 = 1_720_000_000;
    const WEIGHT_BYTES: u64 = 390_000_000;
    // Fixed allowance added on top of weights and KV cache.
    const OVERHEAD_BYTES: u64 = 32 * 1024 * 1024;

    let config = Qwen3Config::ternary_bonsai_1_7b();
    let kv_cache = kv_cache_size_bytes(&config, 4096);

    ModelSpec {
        name: "Ternary-Bonsai-1.7B",
        variant: ModelVariant::TernaryBonsai1_7B,
        param_count: PARAMS,
        weights_size_bytes: WEIGHT_BYTES,
        kv_cache_4k_bytes: kv_cache,
        min_ram_bytes: WEIGHT_BYTES + kv_cache + OVERHEAD_BYTES,
        description: "Ternary-Bonsai-1.7B uses the same Qwen3-1.7B architecture as Bonsai-1.7B, \
            but stores transformer weights in TQ2_0_g128 ternary format ({-1,0,+1}). \
            Designed for resource-constrained environments where ternary weights are preferred.",
        config,
    }
}
/// Builds the [`ModelSpec`] for the FP8-Bonsai-8B variant.
///
/// The architecture is taken from `Qwen3Config::bonsai_8b()` (the same config
/// constructor the 1-bit 8B spec uses). Parameter count and weight size are
/// fixed figures; the KV-cache size is derived from the config for a
/// 4096-token sequence, and the minimum RAM estimate is
/// weights + KV cache + 64 MiB.
pub fn fp8_bonsai_8b_spec() -> ModelSpec {
    const PARAMS: u64 = 8_030_000_000;
    const WEIGHT_BYTES: u64 = 8_500_000_000;
    // Fixed allowance added on top of weights and KV cache.
    const OVERHEAD_BYTES: u64 = 64 * 1024 * 1024;

    let config = Qwen3Config::bonsai_8b();
    let kv_cache = kv_cache_size_bytes(&config, 4096);

    ModelSpec {
        name: "FP8-Bonsai-8B",
        variant: ModelVariant::FP8Bonsai8B,
        param_count: PARAMS,
        weights_size_bytes: WEIGHT_BYTES,
        kv_cache_4k_bytes: kv_cache,
        min_ram_bytes: WEIGHT_BYTES + kv_cache + OVERHEAD_BYTES,
        description: "FP8-Bonsai-8B uses the same Qwen3-8B architecture as Bonsai-8B, \
            but stores transformer weights in FP8 format (E4M3FN or E5M2). \
            Approximately 1.0625 bytes/weight — higher precision than 1-bit or ternary, \
            closer to FP16 quality with half the storage.",
        config,
    }
}
/// Builds the [`ModelSpec`] for the FP8-Bonsai-4B variant.
///
/// The architecture is taken from `Qwen3Config::bonsai_4b()` (the same config
/// constructor the 1-bit 4B spec uses). Parameter count and weight size are
/// fixed figures; the KV-cache size is derived from the config for a
/// 4096-token sequence, and the minimum RAM estimate is
/// weights + KV cache + 48 MiB.
pub fn fp8_bonsai_4b_spec() -> ModelSpec {
    const PARAMS: u64 = 4_020_000_000;
    const WEIGHT_BYTES: u64 = 5_000_000_000;
    // Fixed allowance added on top of weights and KV cache.
    const OVERHEAD_BYTES: u64 = 48 * 1024 * 1024;

    let config = Qwen3Config::bonsai_4b();
    let kv_cache = kv_cache_size_bytes(&config, 4096);

    ModelSpec {
        name: "FP8-Bonsai-4B",
        variant: ModelVariant::FP8Bonsai4B,
        param_count: PARAMS,
        weights_size_bytes: WEIGHT_BYTES,
        kv_cache_4k_bytes: kv_cache,
        min_ram_bytes: WEIGHT_BYTES + kv_cache + OVERHEAD_BYTES,
        description: "FP8-Bonsai-4B uses the same Qwen3-4B architecture as Bonsai-4B, \
            but stores transformer weights in FP8 format (E4M3FN or E5M2).",
        config,
    }
}
/// Builds the [`ModelSpec`] for the FP8-Bonsai-1.7B variant.
///
/// The architecture is taken from `Qwen3Config::bonsai_1_7b()` (the same config
/// constructor the 1-bit 1.7B spec uses). Parameter count and weight size are
/// fixed figures; the KV-cache size is derived from the config for a
/// 4096-token sequence, and the minimum RAM estimate is
/// weights + KV cache + 32 MiB.
pub fn fp8_bonsai_1_7b_spec() -> ModelSpec {
    const PARAMS: u64 = 1_720_000_000;
    const WEIGHT_BYTES: u64 = 2_300_000_000;
    // Fixed allowance added on top of weights and KV cache.
    const OVERHEAD_BYTES: u64 = 32 * 1024 * 1024;

    let config = Qwen3Config::bonsai_1_7b();
    let kv_cache = kv_cache_size_bytes(&config, 4096);

    ModelSpec {
        name: "FP8-Bonsai-1.7B",
        variant: ModelVariant::FP8Bonsai1_7B,
        param_count: PARAMS,
        weights_size_bytes: WEIGHT_BYTES,
        kv_cache_4k_bytes: kv_cache,
        min_ram_bytes: WEIGHT_BYTES + kv_cache + OVERHEAD_BYTES,
        description: "FP8-Bonsai-1.7B uses the same Qwen3-1.7B architecture as Bonsai-1.7B, \
            but stores transformer weights in FP8 format (E4M3FN or E5M2). \
            Designed for resource-constrained environments where FP8 precision is preferred.",
        config,
    }
}
/// Returns the specs of all nine built-in variants as a static slice.
///
/// The table is built the first time this function is called and the same
/// storage is handed out on every later call. Entries are ordered 1-bit,
/// ternary, then FP8, each family running 8B, 4B, 1.7B.
pub fn all_specs() -> &'static [ModelSpec] {
    static TABLE: std::sync::OnceLock<[ModelSpec; 9]> = std::sync::OnceLock::new();

    /// Constructs the full table; invoked at most once via `TABLE`.
    fn build_table() -> [ModelSpec; 9] {
        [
            bonsai_8b_spec(),
            bonsai_4b_spec(),
            bonsai_1_7b_spec(),
            ternary_bonsai_8b_spec(),
            ternary_bonsai_4b_spec(),
            ternary_bonsai_1_7b_spec(),
            fp8_bonsai_8b_spec(),
            fp8_bonsai_4b_spec(),
            fp8_bonsai_1_7b_spec(),
        ]
    }

    TABLE.get_or_init(build_table)
}
/// Looks up the built-in spec whose `variant` field equals `v`.
///
/// Returns the first matching entry of `all_specs()`, or `None` when no entry
/// carries that variant.
pub fn spec_for_variant(v: ModelVariant) -> Option<&'static ModelSpec> {
    for candidate in all_specs() {
        if candidate.variant == v {
            return Some(candidate);
        }
    }
    None
}
/// Computes the KV-cache footprint in bytes for `seq_len` cached positions.
///
/// The result is `num_layers * 2 * num_kv_heads * head_dim * seq_len * 4`,
/// with every factor widened to `u64` before multiplying.
fn kv_cache_size_bytes(config: &Qwen3Config, seq_len: usize) -> u64 {
    // Two cached tensors per layer (presumably one for keys, one for values).
    const TENSORS_PER_LAYER: u64 = 2;
    // NOTE(review): 4 bytes per element matches a 32-bit element type — confirm
    // against the dtype the runtime cache actually uses.
    const BYTES_PER_ELEMENT: u64 = 4;

    let elements_per_position = config.num_layers as u64
        * TENSORS_PER_LAYER
        * config.num_kv_heads as u64
        * config.head_dim as u64;

    elements_per_position * seq_len as u64 * BYTES_PER_ELEMENT
}
/// Usage-oriented description of a model variant: context limit, supported
/// features, suggested sampling settings, and intended applications.
///
/// Values are produced by `capability_profile`.
#[derive(Debug, Clone)]
pub struct CapabilityProfile {
/// Maximum context length (presumably counted in tokens).
pub max_context_len: usize,
/// Whether the variant accepts a system prompt.
pub supports_system_prompt: bool,
/// Whether the variant supports streamed output.
pub supports_streaming: bool,
/// Suggested sampling temperature.
pub recommended_temperature: f32,
/// Suggested top-p sampling value.
pub recommended_top_p: f32,
/// Short language codes (e.g. `"en"`) the variant is listed as supporting.
pub languages: &'static [&'static str],
/// Human-readable list of applications the variant is suggested for.
pub use_cases: &'static [&'static str],
}
/// Returns the [`CapabilityProfile`] for the given variant.
///
/// Context length (65536), system-prompt support, streaming support and the
/// language list are the same for every variant. The recommended sampling
/// settings depend only on the size class (with `Custom` sharing the 8B
/// values), while the use-case list is specific to each variant.
pub fn capability_profile(v: ModelVariant) -> CapabilityProfile {
    const LANGUAGES: &[&str] = &[
        "en", "zh", "ja", "ko", "de", "fr", "es", "pt", "it", "ru", "ar", "hi", "th", "vi", "id",
        "tr", "pl", "nl", "cs", "sv",
    ];

    // Sampling defaults are shared across weight formats of the same size.
    let (recommended_temperature, recommended_top_p): (f32, f32) = match v {
        ModelVariant::Bonsai8B
        | ModelVariant::TernaryBonsai8B
        | ModelVariant::FP8Bonsai8B
        | ModelVariant::Custom => (0.7, 0.9),
        ModelVariant::Bonsai4B | ModelVariant::TernaryBonsai4B | ModelVariant::FP8Bonsai4B => {
            (0.72, 0.9)
        }
        ModelVariant::Bonsai1_7B
        | ModelVariant::TernaryBonsai1_7B
        | ModelVariant::FP8Bonsai1_7B => (0.75, 0.85),
    };

    // Each variant has its own list; only the first entry differs between
    // weight formats of the same size.
    let use_cases: &'static [&'static str] = match v {
        ModelVariant::Bonsai8B => &[
            "Long-document summarisation",
            "Complex multi-turn dialogue",
            "Code generation and debugging",
            "Structured data extraction",
            "Creative writing and story-telling",
            "Multilingual translation",
            "Retrieval-augmented generation (RAG)",
        ],
        ModelVariant::Bonsai4B => &[
            "Short-to-medium document summarisation",
            "Conversational chat assistants",
            "Code completion and review",
            "Data extraction and classification",
            "On-device inference with moderate hardware",
        ],
        ModelVariant::Bonsai1_7B => &[
            "Edge / IoT on-device inference",
            "Low-latency chatbot responses",
            "Simple Q&A over short documents",
            "Keyword extraction",
            "Fast text classification",
            "WASM browser deployment",
        ],
        ModelVariant::TernaryBonsai8B => &[
            "Long-document summarisation (ternary weights)",
            "Complex multi-turn dialogue",
            "Code generation and debugging",
            "Structured data extraction",
            "Creative writing and story-telling",
            "Multilingual translation",
            "Retrieval-augmented generation (RAG)",
        ],
        ModelVariant::TernaryBonsai4B => &[
            "Short-to-medium document summarisation (ternary weights)",
            "Conversational chat assistants",
            "Code completion and review",
            "Data extraction and classification",
            "On-device inference with moderate hardware",
        ],
        ModelVariant::TernaryBonsai1_7B => &[
            "Edge / IoT on-device inference (ternary weights)",
            "Low-latency chatbot responses",
            "Simple Q&A over short documents",
            "Keyword extraction",
            "Fast text classification",
            "WASM browser deployment",
        ],
        ModelVariant::FP8Bonsai8B => &[
            "Long-document summarisation (FP8 weights)",
            "Complex multi-turn dialogue",
            "Code generation and debugging",
            "Structured data extraction",
            "Creative writing and story-telling",
            "Multilingual translation",
            "Retrieval-augmented generation (RAG)",
        ],
        ModelVariant::FP8Bonsai4B => &[
            "Short-to-medium document summarisation (FP8 weights)",
            "Conversational chat assistants",
            "Code completion and review",
            "Data extraction and classification",
            "On-device inference with moderate hardware",
        ],
        ModelVariant::FP8Bonsai1_7B => &[
            "Edge / IoT on-device inference (FP8 weights)",
            "Low-latency chatbot responses",
            "Simple Q&A over short documents",
            "Keyword extraction",
            "Fast text classification",
            "WASM browser deployment",
        ],
        ModelVariant::Custom => &["Custom architecture — use-cases depend on training data"],
    };

    CapabilityProfile {
        max_context_len: 65536,
        supports_system_prompt: true,
        supports_streaming: true,
        recommended_temperature,
        recommended_top_p,
        languages: LANGUAGES,
        use_cases,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The nine built-in variants, i.e. every variant exercised here except `Custom`.
    fn all_known_variants() -> [ModelVariant; 9] {
        [
            ModelVariant::Bonsai8B,
            ModelVariant::Bonsai4B,
            ModelVariant::Bonsai1_7B,
            ModelVariant::TernaryBonsai8B,
            ModelVariant::TernaryBonsai4B,
            ModelVariant::TernaryBonsai1_7B,
            ModelVariant::FP8Bonsai8B,
            ModelVariant::FP8Bonsai4B,
            ModelVariant::FP8Bonsai1_7B,
        ]
    }

    /// Every built-in spec carries a non-degenerate, self-consistent config.
    #[test]
    fn all_variants_produce_valid_configs() {
        for variant in all_known_variants() {
            let spec = spec_for_variant(variant).expect("known variant must have a spec");
            let (cfg, name) = (&spec.config, spec.name);

            assert!(cfg.num_layers > 0, "{}: num_layers must be > 0", name);
            assert!(cfg.hidden_size > 0, "{}: hidden_size must be > 0", name);
            assert!(
                cfg.intermediate_size > 0,
                "{}: intermediate_size must be > 0",
                name
            );
            assert!(
                cfg.num_attention_heads > 0,
                "{}: num_attention_heads must be > 0",
                name
            );
            assert!(cfg.num_kv_heads > 0, "{}: num_kv_heads must be > 0", name);
            assert!(cfg.vocab_size > 0, "{}: vocab_size must be > 0", name);

            // Query heads must split evenly across KV heads.
            assert_eq!(
                cfg.num_attention_heads % cfg.num_kv_heads,
                0,
                "{}: num_attention_heads must be divisible by num_kv_heads",
                name
            );
            assert_eq!(
                cfg.hidden_size / cfg.num_attention_heads,
                cfg.head_dim,
                "{}: head_dim inconsistency",
                name
            );
        }
    }

    /// Parameter counts of the 1-bit family sit in (1B, 10B) and are ordered by size.
    #[test]
    fn param_count_is_reasonable() {
        const ONE_BILLION: u64 = 1_000_000_000;
        const TEN_BILLION: u64 = 10_000_000_000;

        let large = bonsai_8b_spec();
        assert!(
            large.param_count > ONE_BILLION,
            "8B: param_count must exceed 1B"
        );
        assert!(
            large.param_count < TEN_BILLION,
            "8B: param_count must be under 10B"
        );

        let medium = bonsai_4b_spec();
        assert!(
            medium.param_count > ONE_BILLION,
            "4B: param_count must exceed 1B"
        );
        assert!(
            medium.param_count < TEN_BILLION,
            "4B: param_count must be under 10B"
        );

        let small = bonsai_1_7b_spec();
        assert!(
            small.param_count > ONE_BILLION,
            "1.7B: param_count must exceed 1B"
        );
        assert!(
            small.param_count < TEN_BILLION,
            "1.7B: param_count must be under 10B"
        );

        assert!(
            large.param_count > medium.param_count,
            "8B should have more params than 4B"
        );
        assert!(
            medium.param_count > small.param_count,
            "4B should have more params than 1.7B"
        );
    }

    /// Weight sizes lie between one bit per parameter and two bytes per parameter.
    #[test]
    fn weights_size_matches_q1_0_g128_expectation() {
        for spec in all_specs() {
            let floor = spec.param_count / 8;
            let ceiling = spec.param_count * 2;
            let actual = spec.weights_size_bytes;

            assert!(
                actual >= floor,
                "{}: weights_size_bytes {} is below the 1-bit lower bound {}",
                spec.name,
                actual,
                floor
            );
            assert!(
                actual <= ceiling,
                "{}: weights_size_bytes {} exceeds the FP16 upper bound {}",
                spec.name,
                actual,
                ceiling
            );
        }
    }

    /// 4096-token KV-cache sizes fall in [1 MiB, 4 GiB] and grow with model size.
    #[test]
    fn kv_cache_4k_bytes_is_reasonable() {
        const ONE_MIB: u64 = 1024 * 1024;
        const FOUR_GIB: u64 = 4 * 1024 * 1024 * 1024;

        for spec in all_specs() {
            let kv_bytes = spec.kv_cache_4k_bytes;
            assert!(
                kv_bytes >= ONE_MIB,
                "{}: kv_cache_4k_bytes {} is suspiciously small",
                spec.name,
                kv_bytes
            );
            assert!(
                kv_bytes <= FOUR_GIB,
                "{}: kv_cache_4k_bytes {} exceeds 4 GiB sanity limit",
                spec.name,
                kv_bytes
            );
        }

        let large = bonsai_8b_spec();
        let medium = bonsai_4b_spec();
        let small = bonsai_1_7b_spec();
        assert!(large.kv_cache_4k_bytes > medium.kv_cache_4k_bytes);
        assert!(medium.kv_cache_4k_bytes > small.kv_cache_4k_bytes);
    }

    /// The RAM estimate never undercuts weights plus KV cache.
    #[test]
    fn min_ram_includes_weights_and_kv_cache() {
        for spec in all_specs() {
            let required = spec.weights_size_bytes + spec.kv_cache_4k_bytes;
            assert!(
                spec.min_ram_bytes >= required,
                "{}: min_ram_bytes must be at least weights + kv_cache",
                spec.name
            );
        }
    }

    /// The spec table has exactly nine entries.
    #[test]
    fn all_specs_returns_nine_entries() {
        let specs = all_specs();
        assert_eq!(specs.len(), 9);
    }

    /// Every known variant resolves to a spec.
    #[test]
    fn spec_for_known_variants_returns_some() {
        for variant in all_known_variants() {
            let found = spec_for_variant(variant);
            assert!(
                found.is_some(),
                "spec_for_variant({:?}) should return Some",
                variant
            );
        }
    }

    /// `Custom` has no entry in the spec table.
    #[test]
    fn spec_for_custom_returns_none() {
        let found = spec_for_variant(ModelVariant::Custom);
        assert!(found.is_none());
    }

    /// Looking a spec up by its own variant yields a spec with that variant.
    #[test]
    fn spec_variant_field_matches_lookup_key() {
        for spec in all_specs() {
            let round_trip = spec_for_variant(spec.variant)
                .expect("spec_for_variant must succeed for variants in all_specs()");
            assert_eq!(
                spec.variant, round_trip.variant,
                "spec lookup returned wrong variant for {}",
                spec.name
            );
        }
    }

    /// Capability profiles of all known variants hold sane values.
    #[test]
    fn capability_profile_returns_valid_data() {
        for variant in all_known_variants() {
            let caps = capability_profile(variant);

            assert!(
                caps.max_context_len > 0,
                "{:?}: max_context_len must be > 0",
                variant
            );
            assert!(
                caps.max_context_len <= 1_000_000,
                "{:?}: max_context_len exceeds sanity ceiling",
                variant
            );

            assert!(
                caps.recommended_temperature > 0.0,
                "{:?}: temperature must be > 0",
                variant
            );
            assert!(
                caps.recommended_temperature <= 2.0,
                "{:?}: temperature must be <= 2.0",
                variant
            );

            assert!(
                caps.recommended_top_p > 0.0,
                "{:?}: top_p must be > 0",
                variant
            );
            assert!(
                caps.recommended_top_p <= 1.0,
                "{:?}: top_p must be <= 1.0",
                variant
            );

            assert!(
                !caps.languages.is_empty(),
                "{:?}: languages must not be empty",
                variant
            );
            assert!(
                !caps.use_cases.is_empty(),
                "{:?}: use_cases must not be empty",
                variant
            );
            assert!(
                caps.languages.contains(&"en"),
                "{:?}: English (\"en\") must be in languages",
                variant
            );

            assert!(
                caps.supports_streaming,
                "{:?}: all Bonsai variants support streaming",
                variant
            );
            assert!(
                caps.supports_system_prompt,
                "{:?}: all Bonsai variants support system prompts",
                variant
            );
        }
    }

    /// The `Custom` profile is populated even though it has no spec.
    #[test]
    fn capability_profile_for_custom_variant_is_valid() {
        let caps = capability_profile(ModelVariant::Custom);
        assert!(caps.max_context_len > 0);
        assert!(!caps.languages.is_empty());
        assert!(!caps.use_cases.is_empty());
    }

    /// Pins the KV-cache formula against a hand-computed value for the 8B config.
    #[test]
    fn kv_cache_helper_formula_is_correct() {
        // layers * 2 * kv_heads * head_dim * seq_len * 4
        let hand_computed: u64 = 36 * 2 * 8 * 128 * 4096 * 4;
        let cfg = Qwen3Config::bonsai_8b();
        assert_eq!(kv_cache_size_bytes(&cfg, 4096), hand_computed);
    }
}