baracuda_kernels::quantize::quantized_linear

Struct QuantizedLinearArgs

pub struct QuantizedLinearArgs<'a, TIn: Element, TWQ: IntElement> {
    pub activation: TensorRef<'a, TIn, 2>,
    pub weight_q: TensorRef<'a, TWQ, 2>,
    pub weight_scale: TensorRef<'a, TIn, 1>,
    pub output: TensorMut<'a, TIn, 2>,
    pub act_q_scratch: TensorMut<'a, S8, 2>,
    pub act_scale_scratch: TensorMut<'a, TIn, 1>,
}

Expand description

Args bundle for a quantized_linear launch.

The caller supplies the already-quantized weight and its per-channel scale (computed offline). The activation is floating-point (FP); per-token activation quantization happens inside `QuantizedLinearPlan::run` via an internally orchestrated `super::DynamicRangeQuantizePlan` pass.

`act_q_scratch` and `act_scale_scratch` are caller-owned scratch buffers for the quantized activation and the computed per-token (per-row) activation scale. They are part of the args bundle (not the workspace) so callers can reuse them across launches without re-allocation — the Plan’s `workspace_size()` returns 0.

Fields§

§activation: TensorRef<'a, TIn, 2>

FP activation [M, K].

§weight_q: TensorRef<'a, TWQ, 2>

Already-quantized int8 weight [C_out, K].

§weight_scale: TensorRef<'a, TIn, 1>

Per-output-channel weight scale [C_out] in FP.

§output: TensorMut<'a, TIn, 2>

FP output [M, C_out].

§act_q_scratch: TensorMut<'a, S8, 2>

Scratch for the per-token quantized activation [M, K] in int8. Caller-owned; reused across launches.

§act_scale_scratch: TensorMut<'a, TIn, 1>

Scratch for the per-token activation scale [M] in FP. Caller-owned; reused across launches. Populated by the internally orchestrated dynamic-range pass.

Auto Trait Implementations§

§

impl<'a, TIn, TWQ> !UnwindSafe for QuantizedLinearArgs<'a, TIn, TWQ>

§

impl<'a, TIn, TWQ> Freeze for QuantizedLinearArgs<'a, TIn, TWQ>

§

impl<'a, TIn, TWQ> RefUnwindSafe for QuantizedLinearArgs<'a, TIn, TWQ>
where TIn: RefUnwindSafe, TWQ: RefUnwindSafe,

§

impl<'a, TIn, TWQ> Send for QuantizedLinearArgs<'a, TIn, TWQ>
where TIn: Sync + Send, TWQ: Sync,

§

impl<'a, TIn, TWQ> Sync for QuantizedLinearArgs<'a, TIn, TWQ>
where TIn: Sync, TWQ: Sync,

§

QuantizedLinearArgs

Struct QuantizedLinearArgs Copy item path

Fields§

Auto Trait Implementations§

impl<'a, TIn, TWQ> !UnwindSafe for QuantizedLinearArgs<'a, TIn, TWQ>

impl<'a, TIn, TWQ> Freeze for QuantizedLinearArgs<'a, TIn, TWQ>

impl<'a, TIn, TWQ> RefUnwindSafe for QuantizedLinearArgs<'a, TIn, TWQ> where TIn: RefUnwindSafe, TWQ: RefUnwindSafe,

impl<'a, TIn, TWQ> Send for QuantizedLinearArgs<'a, TIn, TWQ> where TIn: Sync + Send, TWQ: Sync,

impl<'a, TIn, TWQ> Sync for QuantizedLinearArgs<'a, TIn, TWQ> where TIn: Sync, TWQ: Sync,

impl<'a, TIn, TWQ> Unpin for QuantizedLinearArgs<'a, TIn, TWQ>

impl<'a, TIn, TWQ> UnsafeUnpin for QuantizedLinearArgs<'a, TIn, TWQ>

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Struct QuantizedLinearArgs

impl<'a, TIn, TWQ> RefUnwindSafe for QuantizedLinearArgs<'a, TIn, TWQ>
where TIn: RefUnwindSafe, TWQ: RefUnwindSafe,

impl<'a, TIn, TWQ> Send for QuantizedLinearArgs<'a, TIn, TWQ>
where TIn: Sync + Send, TWQ: Sync,

impl<'a, TIn, TWQ> Sync for QuantizedLinearArgs<'a, TIn, TWQ>
where TIn: Sync, TWQ: Sync,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,