cubecl_std/
quantization.rs

/// Symmetric 8-bit quantization scheme mapping `f32` values to `i8` codes.
///
/// This is a zero-sized marker type: it carries no state, and the conversions
/// are exposed as the associated functions `quantize` and `dequantize`, each of
/// which receives the scale factor explicitly.
#[derive(Copy, Clone)]
pub struct SymQ8;
5
6impl SymQ8 {
7    pub fn quantize(val: f32, scaling: f32) -> i8 {
8        let min = -scaling * (i8::MIN as f32);
9        let max = min + 255.0 * scaling;
10        if val < min {
11            i8::MIN
12        } else if val > max {
13            i8::MAX
14        } else {
15            ((val - min) / scaling).round() as i8
16        }
17    }
18
19    pub fn dequantize(val: i8, scaling: f32) -> f32 {
20        val as f32 * scaling
21    }
22}