// cubecl_std/quantization.rs
/// Symmetric 8-bit quantization of an `f32` into an `i8`.
///
/// In the symmetric scheme the zero point is fixed at `0`, so a real value `x`
/// and its quantized counterpart `q` are related by `x ≈ q * scaling`.
/// [`SymQ8::quantize`] and [`SymQ8::dequantize`] are inverses of each other
/// (up to rounding and saturation) for the same `scaling`.
#[derive(Clone, Copy)]
pub struct SymQ8;

impl SymQ8 {
    /// Quantizes `val` to the nearest representable `i8` step of size `scaling`.
    ///
    /// Computes `round(val / scaling)` (ties round away from zero) and saturates
    /// the result to `[i8::MIN, i8::MAX]`, so values outside the representable
    /// range `[-128 * scaling, 127 * scaling]` map to the nearest bound.
    ///
    /// A `scaling` of `0.0` makes the quotient infinite (saturating to a bound)
    /// or NaN when `val` is also `0.0`; NaN quantizes to `0`.
    pub fn quantize(val: f32, scaling: f32) -> i8 {
        let steps = (val / scaling).round();
        // Clamp explicitly so the narrowing cast below is provably in range;
        // a NaN passes through `clamp` unchanged and the `as` cast maps it to 0.
        steps.clamp(i8::MIN as f32, i8::MAX as f32) as i8
    }

    /// Reconstructs the real value represented by the quantized `val`.
    ///
    /// Returns `val * scaling`; the `i8` to `f32` conversion is lossless.
    pub fn dequantize(val: i8, scaling: f32) -> f32 {
        f32::from(val) * scaling
    }
}