/// Static dimensions describing a model to the runtime.
///
/// Handed out by reference from [`DecoderOnlyLLM::config`]. Only `vocab_size`
/// and `max_seq_len` are read by code visible in this file; the remaining
/// field descriptions are inferred from their names and should be confirmed
/// against the model loaders that populate them.
#[derive(Debug, Clone)]
pub struct LlmRuntimeConfig {
    /// Presumably the width of the model's hidden state — TODO confirm.
    pub hidden_size: usize,
    /// Presumably the number of decoder layers — TODO confirm.
    pub num_layers: usize,
    /// Presumably the number of key/value attention heads — TODO confirm.
    pub num_kv_heads: usize,
    /// Presumably the per-head dimension — TODO confirm.
    pub head_dim: usize,
    /// Vocabulary size; the default `DecoderOnlyLLM::forward_verify` uses it
    /// to pre-size its output buffer (`tokens.len() * vocab_size`).
    pub vocab_size: usize,
    /// Maximum sequence length; the default `DecoderOnlyLLM::kv_capacity`
    /// returns this value.
    pub max_seq_len: usize,
}
/// Runtime interface for a decoder-only language model whose per-sequence
/// state is addressed by a string `cache_id`.
///
/// Implementors must be `Send + Sync`. Only [`config`](Self::config),
/// [`prefill`](Self::prefill), [`decode`](Self::decode) and
/// [`release`](Self::release) are required; every other method ships with a
/// default that is either a no-op, a fallback built on `decode`, or an
/// explicit "unsupported" signal.
pub trait DecoderOnlyLLM: Send + Sync {
    /// Returns the runtime configuration of this model.
    fn config(&self) -> &LlmRuntimeConfig;

    /// Optional JSON snapshot of cache metrics.
    ///
    /// The default implementation reports nothing (`None`).
    fn cache_metrics_snapshot(&self) -> Option<serde_json::Value> {
        None
    }

    /// Optional JSON snapshot of LoRA metrics.
    ///
    /// The default implementation reports nothing (`None`).
    fn lora_metrics_snapshot(&self) -> Option<serde_json::Value> {
        None
    }

    /// Selects (or clears, with `None`) the LoRA adapter for `cache_id`.
    ///
    /// The default implementation ignores `cache_id` and accepts only
    /// `None`.
    ///
    /// # Errors
    ///
    /// By default, returns an "unsupported" [`ferrum_types::FerrumError`]
    /// naming the adapter and its path whenever `adapter` is `Some`.
    fn set_lora_adapter_for_cache(
        &mut self,
        cache_id: &str,
        adapter: Option<crate::lora::ActiveLoraAdapter>,
    ) -> std::result::Result<(), ferrum_types::FerrumError> {
        let _ = cache_id;
        match adapter {
            // Clearing an adapter is always fine for a backend without LoRA.
            None => Ok(()),
            Some(requested) => Err(ferrum_types::FerrumError::unsupported(format!(
                "LoRA inference is not supported by this model/backend for adapter {} at {}",
                requested.name,
                requested.path.display()
            ))),
        }
    }

    /// Hook invoked with a cache id and a token budget.
    ///
    /// The default implementation does nothing. Presumably implementors use
    /// it to pre-allocate per-sequence state — confirm against callers.
    fn prepare(&mut self, cache_id: &str, max_tokens: usize) {
        let _ = cache_id;
        let _ = max_tokens;
    }

    /// KV capacity of this model.
    ///
    /// Defaults to `max_seq_len` from [`config`](Self::config).
    fn kv_capacity(&self) -> usize {
        let cfg = self.config();
        cfg.max_seq_len
    }

    /// Runs the model over `tokens` for `cache_id` and returns a flat `f32`
    /// buffer (presumably logits — the exact layout is defined by the
    /// implementor; confirm there).
    fn prefill(&mut self, cache_id: &str, tokens: &[u32]) -> Vec<f32>;

    /// Runs a single step for `token` at position `pos` on `cache_id` and
    /// returns a flat `f32` buffer (presumably logits — confirm with the
    /// implementor).
    fn decode(&mut self, cache_id: &str, token: u32, pos: u32) -> Vec<f32>;

    /// Decodes a batch of `(cache_id, token, pos)` entries.
    ///
    /// The default implementation is a sequential fallback: it calls
    /// [`decode`](Self::decode) once per entry, in order, and returns the
    /// results in the same order.
    fn decode_batch(&mut self, batch: &[(String, u32, u32)]) -> Vec<Vec<f32>> {
        let mut rows = Vec::with_capacity(batch.len());
        for (cache_id, token, pos) in batch {
            rows.push(self.decode(cache_id, *token, *pos));
        }
        rows
    }

    /// Variant of [`decode_batch`](Self::decode_batch) that carries a
    /// "force full logits" flag.
    ///
    /// The default implementation ignores the flag and simply delegates to
    /// `decode_batch`.
    fn decode_batch_with_full_logits(
        &mut self,
        batch: &[(String, u32, u32)],
        _force_full_logits: bool,
    ) -> Vec<Vec<f32>> {
        self.decode_batch(batch)
    }

    /// Feeds `tokens` through the model one at a time and returns all
    /// per-token outputs concatenated into a single flat buffer.
    ///
    /// The default implementation calls [`decode`](Self::decode) for each
    /// token with positions `0, 1, 2, …` and appends each result in order.
    /// The buffer is pre-sized to `tokens.len() * vocab_size`.
    ///
    /// NOTE(review): positions always start at 0 here, regardless of any
    /// state already held for `cache_id`. Implementors that verify tokens
    /// on top of an existing context presumably need to override this —
    /// confirm against callers.
    fn forward_verify(&mut self, cache_id: &str, tokens: &[u32]) -> Vec<f32> {
        let vocab = self.config().vocab_size;
        let mut flat = Vec::with_capacity(tokens.len() * vocab);
        let start_pos = 0u32;
        for (offset, token) in tokens.iter().copied().enumerate() {
            let step = self.decode(cache_id, token, start_pos + offset as u32);
            flat.extend_from_slice(&step);
        }
        flat
    }

    /// Single entry point processing several items in one forward pass.
    ///
    /// Each item is a `(String, Vec<u32>, usize, bool)` tuple; the meaning
    /// of the individual fields is not visible in this trait — see the
    /// implementors. One `Option<Vec<f32>>` is returned per call result.
    ///
    /// # Errors
    ///
    /// The default implementation always returns an "unsupported"
    /// [`ferrum_types::FerrumError`].
    // The nested tuple/Option/Vec signature trips clippy's complexity lint.
    #[allow(clippy::type_complexity)]
    fn unified_forward(
        &mut self,
        _items: &[(String, Vec<u32>, usize, bool)],
    ) -> std::result::Result<Vec<Option<Vec<f32>>>, ferrum_types::FerrumError> {
        let reason = "unified_forward not implemented for this model";
        Err(ferrum_types::FerrumError::unsupported(reason))
    }

    /// Releases whatever state the implementor holds for `cache_id`.
    fn release(&mut self, cache_id: &str);

    /// Truncates the KV state of `cache_id` to `new_len`.
    ///
    /// # Panics
    ///
    /// The default implementation always panics; implementors that support
    /// truncation must override it.
    fn truncate_kv(&mut self, cache_id: &str, new_len: usize) {
        let _ = cache_id;
        let _ = new_len;
        panic!("truncate_kv not implemented for this DecoderOnlyLLM");
    }

    /// Resets the model. The default implementation does nothing.
    fn reset(&mut self) {}
}