use ferrum_kernels::backend::{Backend, BackendQuantMarlin};
use ferrum_kernels::Linear;
use ferrum_types::Result;
use std::sync::Arc;
/// A [`Linear`] layer whose actual implementation is produced by the
/// backend's `load_gptq` constructor (see [`GptqLinear::from_raw`]).
///
/// The type is a thin wrapper: every `Linear` method forwards to the boxed
/// implementation stored in `inner`.
pub struct GptqLinear<B>
where
    B: Backend + BackendQuantMarlin,
{
    /// Backend-provided layer that performs the real work.
    inner: Box<dyn Linear<B> + Send + Sync>,
}
impl<B> GptqLinear<B>
where
    B: Backend + BackendQuantMarlin,
{
    /// Builds a layer from raw host-side slices by handing them to
    /// `B::load_gptq`.
    ///
    /// Every argument is forwarded unchanged and in the same order; this
    /// function performs no validation or conversion of its own. The expected
    /// layout of `qweight`, `scales`, `qzeros`, `g_idx` and `bias` is whatever
    /// the backend's `load_gptq` requires.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `B::load_gptq`.
    // The argument list mirrors `B::load_gptq` one-to-one, hence the allow.
    #[allow(clippy::too_many_arguments)]
    pub fn from_raw(
        qweight: &[i32],
        scales: &[f32],
        qzeros: &[i32],
        g_idx: Option<&[i32]>,
        bias: Option<&[f32]>,
        bits: u32,
        group_size: usize,
        in_features: usize,
        out_features: usize,
    ) -> Result<Self> {
        Ok(Self {
            inner: B::load_gptq(
                qweight,
                scales,
                qzeros,
                g_idx,
                bias,
                bits,
                group_size,
                in_features,
                out_features,
            )?,
        })
    }
}
/// Pure delegation: each method forwards to the wrapped backend layer.
impl<B> Linear<B> for GptqLinear<B>
where
    B: Backend + BackendQuantMarlin,
{
    /// Input width as reported by the wrapped layer.
    fn in_features(&self) -> usize {
        let Self { inner } = self;
        inner.in_features()
    }

    /// Output width as reported by the wrapped layer.
    fn out_features(&self) -> usize {
        let Self { inner } = self;
        inner.out_features()
    }

    /// Runs the wrapped layer's `forward` with the arguments passed through
    /// untouched.
    fn forward(&self, ctx: &mut B::Context, input: &B::Buffer, out: &mut B::Buffer, m: usize) {
        let Self { inner } = self;
        inner.forward(ctx, input, out, m);
    }
}
/// A [`Linear`] layer obtained from a `MarlinExpertStack` via
/// `make_expert_linear`.
///
/// The feature dimensions are recorded at construction time, so
/// `in_features` / `out_features` are answered from the cached fields rather
/// than by querying the wrapped layer.
pub struct StackedExpertLinear<B>
where
    B: Backend + BackendQuantMarlin,
{
    /// Layer returned by the stack's `make_expert_linear`.
    inner: Box<dyn Linear<B> + Send + Sync>,
    /// Value of the stack's `k()` at construction; reported as `in_features`.
    k: usize,
    /// The `expert_n` given to the constructor; reported as `out_features`.
    expert_n: usize,
}
impl<B> StackedExpertLinear<B>
where
    B: Backend + BackendQuantMarlin,
{
    /// Creates a bias-free layer from `stack`.
    ///
    /// `expert_offset` and `expert_n` are passed straight to the stack's
    /// `make_expert_linear`, with `None` as the bias.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `make_expert_linear`.
    pub fn new(
        stack: Arc<dyn ferrum_kernels::MarlinExpertStack<B>>,
        expert_offset: usize,
        expert_n: usize,
    ) -> Result<Self> {
        Self::from_stack_with(stack, expert_offset, expert_n, None)
    }

    /// Same as [`StackedExpertLinear::new`], but passes `Some(bias)` to the
    /// stack's `make_expert_linear`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `make_expert_linear`.
    pub fn new_with_bias(
        stack: Arc<dyn ferrum_kernels::MarlinExpertStack<B>>,
        expert_offset: usize,
        expert_n: usize,
        bias: &[f32],
    ) -> Result<Self> {
        Self::from_stack_with(stack, expert_offset, expert_n, Some(bias))
    }

    /// Shared construction path for both public constructors.
    fn from_stack_with(
        stack: Arc<dyn ferrum_kernels::MarlinExpertStack<B>>,
        expert_offset: usize,
        expert_n: usize,
        bias: Option<&[f32]>,
    ) -> Result<Self> {
        // Read `k` before asking the stack for the layer, matching the order
        // used by the constructors.
        let k = stack.k();
        let inner = stack.make_expert_linear(expert_offset, expert_n, bias)?;
        Ok(Self { inner, k, expert_n })
    }
}
/// Dimensions come from the values cached at construction; `forward` is
/// delegated to the wrapped layer.
impl<B> Linear<B> for StackedExpertLinear<B>
where
    B: Backend + BackendQuantMarlin,
{
    /// Returns the cached `k` taken from the stack at construction.
    fn in_features(&self) -> usize {
        let Self { k, .. } = self;
        *k
    }

    /// Returns the `expert_n` supplied at construction.
    fn out_features(&self) -> usize {
        let Self { expert_n, .. } = self;
        *expert_n
    }

    /// Runs the wrapped layer's `forward` with the arguments passed through
    /// untouched.
    fn forward(&self, ctx: &mut B::Context, input: &B::Buffer, out: &mut B::Buffer, m: usize) {
        let Self { inner, .. } = self;
        inner.forward(ctx, input, out, m);
    }
}