#![allow(dead_code)]
use super::QuantizedLayer;
use candle_core::{DType, Result, Tensor};
pub struct AwqLinear {
pub qweight: Tensor,
pub qzeros: Tensor,
pub scales: Tensor,
pub g_idx: Option<Tensor>,
pub bias: Option<Tensor>,
}
impl AwqLinear {
pub fn new(qweight: Tensor, qzeros: Tensor, scales: Tensor, bias: Option<Tensor>) -> Self {
Self {
qweight,
qzeros,
scales,
g_idx: None,
bias,
}
}
}
impl QuantizedLayer for AwqLinear {
fn forward(&self, x: &Tensor) -> Result<Tensor> {
let weight_f16 = self.unpack_weights()?;
let res = x.matmul(&weight_f16)?;
if let Some(bias) = &self.bias {
res.broadcast_add(bias)
} else {
Ok(res)
}
}
fn unpack_weights(&self) -> Result<Tensor> {
self.qweight.to_dtype(DType::F16)
}
}