Trait QuantMethod

Source

pub trait QuantMethod:
    Send
    + Sync
    + Debug
    + QuantizedSerde {
    // Required methods
    fn new(method: QuantMethodConfig) -> Result<Self>
       where Self: Sized;
    fn dequantize_w(&self) -> Result<Tensor>;
    fn forward_raw(&self, a: &Tensor) -> Result<Tensor>;
    fn quantized_act_type(&self) -> Option<DType>;
    fn dtype_and_device(&self) -> (DType, Device);
    fn add_delta_w(&self, delta: &Tensor) -> Result<Arc<dyn QuantMethod>>;
    fn apply_isq(
        self: Arc<Self>,
        dtype: Option<IsqType>,
        device: Device,
        n_quantized: &AtomicUsize,
        imatrix_weight: Option<Vec<f32>>,
        guard: QuantizeOntoGuard,
    ) -> Result<Arc<dyn QuantMethod>>;

    // Provided methods
    fn forward(&self, a: &Tensor) -> Result<Tensor> { ... }
    fn gather_forward(&self, a: &Tensor, indices: &Tensor) -> Result<Tensor> { ... }
    fn gather_forward_raw(
        &self,
        _a: &Tensor,
        _indices: &Tensor,
    ) -> Result<Tensor> { ... }
    fn afq_inner(&self) -> Option<AfqInner<'_>> { ... }
    fn unquant_weight_bias(&self) -> Option<(Tensor, Option<Tensor>)> { ... }
    fn has_bias(&self) -> bool { ... }
    fn begin_track_stats(&mut self) -> Result<()> { ... }
    fn end_track_stats(&self) -> Result<Tensor> { ... }
    fn is_distributed(&self) -> Option<DistributedKind> { ... }
    fn dummy_info(&self) -> Option<&DummyLayerInfo> { ... }
}

Expand description

Quantized method for a quantized matmul.

Required Methods§

Source

fn new(method: QuantMethodConfig) -> Result<Self>
where Self: Sized,

Source

fn dequantize_w(&self) -> Result<Tensor>

Source

fn forward_raw(&self, a: &Tensor) -> Result<Tensor>

Raw matmul without dtype casting. Implementors override this method; callers should use `forward` instead.

Source

fn quantized_act_type(&self) -> Option<DType>

If this is a quantized method, return the activation dtype.

Source

fn dtype_and_device(&self) -> (DType, Device)

Return the weight dtype and device.

Source

fn add_delta_w(&self, delta: &Tensor) -> Result<Arc<dyn QuantMethod>>

Add a delta weight from LoRA to the weights. The delta should be prescaled with alpha.

Source

fn apply_isq( self: Arc<Self>, dtype: Option<IsqType>, device: Device, n_quantized: &AtomicUsize, imatrix_weight: Option<Vec<f32>>, guard: QuantizeOntoGuard, ) -> Result<Arc<dyn QuantMethod>>

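Apply ISQ to this method with the requested `dtype` on `device`, returning the resulting quantized method. Applicable if the quant is backed by a qmatmul.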

Provided Methods§

Source

fn forward(&self, a: &Tensor) -> Result<Tensor>

Compute the matmul of `self` and `a`. `self` should contain the weights. Automatically casts to the required quantization activation type and back.

Source

fn gather_forward(&self, a: &Tensor, indices: &Tensor) -> Result<Tensor>

Compute the gather matmul of `self` and `a`. `self` should contain the weights. Automatically casts to the required quantization activation type and back.

If `a` is `(n_tokens, n_experts, cols)` and the `self` weights are `(n_experts, rows, cols)`, then `indices` is `(n_tokens, n_experts)`.

Source