use crate::internal::*;
use crate::ops;
use crate::ops::matmul::mir_quant::{
    combine_scales, requant, wire_matmul_quant, wire_offset_u8_as_i8, MatMulQParams, QParamKind,
};
use crate::ops::matmul::*;

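/// Quantized matrix multiplication with a constant `a` operand.
///
/// Conceptually computes
/// `c = round(((a - a0) * a_scale) · ((b - b0) * b_scale) / c_scale) + c0`,
/// saturated to `output_type`. The zero points (`a0`, `b0`, `c0`) and scales
/// live in `params` and may be static attributes or extra runtime inputs.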
#[derive(Debug, Clone, new, Hash)]
pub struct QMatMulUnary {
    pub a: Arc<Tensor>,
    pub bias: Option<Arc<Tensor>>, // must be a scalar or a rank-1 tensor of length m
    pub axes: MatMulAxes,
    pub output_type: DatumType,
    pub params: MatMulQParams,
}

impl_dyn_hash!(QMatMulUnary);

impl Op for QMatMulUnary {
    fn name(&self) -> Cow<str> {
        "QMatMulUnary".into()
    }

    op_as_typed_op!();
}

impl EvalOp for QMatMulUnary {
    fn is_stateless(&self) -> bool {
        true
    }

    fn eval(&self, inputs: TVec<TValue>) -> TractResult<TVec<TValue>> {
        ensure!(inputs[0].rank() == self.a.rank(), "Rank mismatch {:?} vs {:?}", inputs[0], self.a);

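        // Evaluation builds a throwaway TypedModel: wire every input as a
        // constant, decompose the quantized matmul inside it, then run the
        // model once and return its outputs.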
        let mut model = TypedModel::default();
        let t_a = self.a.offset_u8_as_i8();
        let a = model.add_const("source_a", self.a.clone())?;
        let b = model.add_const("source_b", inputs[0].clone().into_arc_tensor())?;
        let bias = if let Some(bias) = self.bias.clone() {
            Some(model.add_const("source_bias", bias)?)
        } else {
            None
        };

        let mut input_outlets = tvec![a];
        for (i, t) in inputs.iter().enumerate().skip(1) {
            input_outlets
                .push(model.add_const(format!("source_{i}"), t.clone().into_arc_tensor())?)
        }

        let mut params = self.params.as_outlet_ids(
            &mut model,
            "qmatmul_unary",
            &input_outlets,
            self.a.datum_type(),
            inputs[0].datum_type(),
            self.output_type,
        )?;
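        // Rewrite u8 operands as i8, offsetting values and zero points, so
        // the matmul proper only has to handle signed inputs.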
        let a = wire_offset_u8_as_i8(&mut model, "adhoc", a, "a", &mut params[0], "a0")?;
        let b = wire_offset_u8_as_i8(&mut model, "adhoc", b, "b", &mut params[2], "b0")?;

        let new_op = MatMulUnary { a: t_a, axes: self.axes };
        let result = model.wire_node("adhoc.matmul", new_op, &[b])?[0];
        let result = wire_matmul_quant(
            &mut model,
            "adhoc",
            a,
            b,
            bias,
            self.axes,
            result,
            self.output_type,
            &params,
        )?;
        model.set_output_outlets(&[result])?;
        model.into_runnable()?.run(tvec![])
    }
}

impl TypedOp for QMatMulUnary {
    fn output_facts(&self, inputs: &[&TypedFact]) -> TractResult<TVec<TypedFact>> {
        if inputs.len() != 1 + self.params.input_count() {
            bail!(
                "Inconsistent q matmul unary. expects {} inputs, got {}",
                1 + self.params.input_count(),
                inputs.len()
            );
        }
        if inputs[0].rank() != self.a.rank() {
            bail!("Inconsistent matmul between {:?} and {:?} (rank mismatch)", inputs[0], self.a);
        }
        let (_m, _k, _n, c_shape) = compute_shape(
            &self.a.shape().iter().map(|d| d.to_dim()).collect::<TVec<_>>(),
            &inputs[0].shape,
            self.axes,
        )?;

        #[allow(clippy::comparison_chain)]
        if let Some(bias) = &self.bias {
            if bias.rank() > 1 {
                anyhow::bail!("Bias must be either scalar or vector (rank 0 or 1).");
            } else if bias.rank() == 1 {
                let expected_len = c_shape[self.axes.c_m].to_usize()?;
                anyhow::ensure!(
                    bias.len() == expected_len,
                    "got: {:?} expected len: {:?}",
                    bias,
                    expected_len
                );
            };
        }

        Ok(tvec!(self.output_type.fact(c_shape)))
    }

    fn invariants(&self, inputs: &[&TypedFact], outputs: &[&TypedFact]) -> TractResult<Invariants> {
        // FIXME: why?
        if self.params.iter().any(|qp| match qp.1 {
            QParamKind::Attr(t) => t.len() > 1,
            QParamKind::FromInput(ix) => !inputs[*ix].shape.volume().is_one(),
            QParamKind::FromQType => false,
        }) {
            Ok(Invariants::none())
        } else {
            let mut invs =
                super::mir_unary::mir_unary_invariants(inputs[0], outputs[0], self.axes)?;
            for axis in &mut invs.axes {
                axis.inputs.extend(std::iter::repeat(None).take(inputs.len() - 1));
            }
            Ok(invs)
        }
    }

    fn change_axes(
        &self,
        model: &TypedModel,
        node: &TypedNode,
        io: InOut,
        change: &AxisOp,
    ) -> TractResult<Option<AxisChangeConsequence>> {
        if let Some((a, axes, wire_changes)) =
            super::mir_unary::mir_unary_change_axes(model, node, io, change, &self.axes, &self.a)?
        {
            let op = Self { axes, a: a.into_arc_tensor(), ..self.clone() };
            Ok(Some(AxisChangeConsequence { substitute_op: Some(Box::new(op)), wire_changes }))
        } else {
            Ok(None)
        }
    }

    fn declutter(
        &self,
        model: &TypedModel,
        node: &TypedNode,
    ) -> TractResult<Option<TypedModelPatch>> {
        use crate::ops::array::TypedConcat;
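        // If our dynamic input is a concatenation along the k axis, split the
        // product into one i32 matmul per concatenated slice of `a` and sum
        // the partial accumulators. Each slice still compensates the input
        // zero points (a0/b0), but scaling and the output zero point are
        // neutralized per slice and reapplied by a single `requant` at the end.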
        if let Some(concat) = model.nodes()[node.inputs[0].node].op().downcast_ref::<TypedConcat>()
        {
            let mut patch = TypedModelPatch::new("split over k-concatenated input");
            let k_axis = self.axes.a_k;
            if concat.axis == self.axes.b_k {
                let concat_node = model.node(node.inputs[0].node);
                let offsets = concat
                    .offsets(&model.node_input_facts(concat_node.id)?)?
                    .iter()
                    .map(|x| x.to_usize())
                    .collect::<TractResult<Vec<usize>>>()?;
                let mut wires = vec![];
                let mut params_for_split = self.params.clone();
                params_for_split.a_scale = tensor0(1.0f32).into();
                params_for_split.b_scale = tensor0(1.0f32).into();
                params_for_split.c_scale = tensor0(1.0f32).into();
                params_for_split.c0 = tensor0(0i32).into();
                let input_outlets = node
                    .inputs
                    .iter()
                    .skip(1)
                    .map(|o| patch.tap_model(model, *o))
                    .collect::<TractResult<TVec<_>>>()?;
                let params_outlets = self.params.as_outlet_ids(
                    &mut patch,
                    &node.name,
                    &input_outlets,
                    self.a.datum_type(),
                    model.node_input_facts(node.id)?[0].datum_type,
                    self.output_type,
                )?;

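                // `as_outlet_ids` returns the params in the order
                // a0, a_scale, b0, b_scale, c0, c_scale.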
                let scale = combine_scales(
                    &mut patch,
                    &node.name,
                    params_outlets[1],
                    params_outlets[3],
                    params_outlets[5],
                )?;
                let c0 = params_outlets[4];

                for (ix, input) in concat_node.inputs.iter().enumerate() {
                    let wire = patch.tap_model(model, *input)?;
                    let a = self.a.slice(k_axis, offsets[ix], offsets[ix + 1])?;
                    let wire = patch
                        .wire_node(
                            format!("{}.k-{}-{}", node.name, offsets[ix], offsets[ix + 1]),
                            Self {
                                a: a.into_arc_tensor(),
                                output_type: DatumType::I32,
                                bias: self.bias.clone().filter(|_| ix == 0),
                                params: params_for_split.clone(),
                                ..self.clone()
                            },
                            &[wire],
                        )
                        .context("wiring new matmulunary")?[0];
                    wires.push(wire)
                }
                let mut wire = wires[0];
                for (ix, w) in wires[1..].iter().enumerate() {
                    wire = patch.wire_node(
                        format!("{}.k-add-{}", node.name, ix),
                        crate::ops::binary::TypedBinOp(Box::new(crate::ops::math::Add)),
                        &[wire, *w],
                    )?[0];
                }
                wire = requant(&mut patch, &node.name, wire, self.output_type, scale, c0)?;
                patch.shunt_outside(model, OutletId::new(node.id, 0), wire)?;
                return Ok(Some(patch));
            }
        }
        Ok(None)
    }

    fn cost(&self, inputs: &[&TypedFact]) -> TractResult<TVec<(Cost, TDim)>> {
        cost(self.a.shape(), &inputs[0].shape.to_tvec(), inputs[0].datum_type, self.axes)
    }

    fn codegen(
        &self,
        model: &TypedModel,
        node: &TypedNode,
    ) -> TractResult<Option<TypedModelPatch>> {
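        // Same decomposition as `eval`, but wired as a patch over the host
        // model. If some quantization params are fed by constant inputs,
        // inline them as static attributes first; the simplified node is
        // then picked up again on a later codegen pass.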
        let mut patch = TypedModelPatch::default();
        let t_a = self.a.offset_u8_as_i8();

        if let Some((inputs, qp)) = self.params.inline_static(model, node)? {
            let mut patch = TypedModelPatch::new("inlining matmul quantized params");
            let inputs: Vec<OutletId> =
                inputs.iter().map(|i| patch.tap_model(model, *i)).collect::<TractResult<_>>()?;
            let op = Self {
                a: t_a,
                params: MatMulQParams { a0: qp.a0.offset_u8_as_i8(&patch, &inputs)?, ..qp },
                ..self.clone()
            };
            let wire = patch.wire_node(&node.name, op, &inputs)?;
            patch.shunt_outside(model, node.id.into(), wire[0])?;
            return Ok(Some(patch));
        }

        let a = patch.wire_node(
            format!("{}.a_const", &node.name),
            ops::konst::Const(self.a.clone()),
            &[],
        )?[0];
        let b = patch.tap_model(model, node.inputs[0])?;
        let bias = if let Some(bias) = self.bias.clone() {
            Some(patch.add_const(format!("{}.bias_const", &node.name), bias)?)
        } else {
            None
        };
        let mut input_outlets = tvec![a];
        for i in node.inputs.iter().skip(1) {
            input_outlets.push(patch.tap_model(model, *i)?)
        }
        let mut params = self.params.as_outlet_ids(
            &mut patch,
            &node.name,
            &input_outlets,
            self.a.datum_type(),
            model.node_input_facts(node.id)?[0].datum_type,
            self.output_type,
        )?;

        let a = wire_offset_u8_as_i8(&mut patch, &node.name, a, "a", &mut params[0], "a0")?;
        let b = wire_offset_u8_as_i8(&mut patch, &node.name, b, "b", &mut params[2], "b0")?;

        let new_op = MatMulUnary { a: t_a, axes: self.axes };
        let result = patch.wire_node(format!("{}.matmul", &node.name), new_op, &[b])?[0];
        let result = wire_matmul_quant(
            &mut patch,
            &node.name,
            a,
            b,
            bias,
            self.axes,
            result,
            self.output_type,
            &params,
        )?;
        patch.shunt_outside(model, node.id.into(), result)?;
        Ok(Some(patch))
    }

    as_op!();
}

#[cfg(test)]
mod test {
    use super::*;
    use proptest::collection::vec;
    use proptest::prelude::*;
    use tract_ndarray::prelude::*;
    proptest! {
        #[test]
        fn prop_i8_i8_i8(pb in any::<QMatMulUnaryProblemI8I8I8>()) {
            pb.check();
        }

        #[test]
        fn prop_i8_i8_u8(pb in any::<QMatMulUnaryProblemI8I8U8>()) {
            pb.check();
        }

        #[test]
        fn prop_i8_u8_i8(pb in any::<QMatMulUnaryProblemI8U8I8>()) {
            pb.check();
        }

        #[test]
        fn prop_u8_i8_i8(pb in any::<QMatMulUnaryProblemU8I8I8>()) {
            pb.check();
        }

        #[test]
        fn prop_i8_u8_u8(pb in any::<QMatMulUnaryProblemI8U8U8>()) {
            pb.check();
        }

        #[test]
        fn prop_u8_i8_u8(pb in any::<QMatMulUnaryProblemU8I8U8>()) {
            pb.check();
        }

        #[test]
        fn prop_u8_u8_i8(pb in any::<QMatMulUnaryProblemU8U8I8>()) {
            pb.check();
        }

        #[test]
        fn prop_u8_u8_u8(pb in any::<QMatMulUnaryProblemU8U8U8>()) {
            pb.check();
        }
    }

    #[test]
    fn c0() {
        QMatMulUnaryProblemI8I8I8 {
            a: arr2(&[[0]]),
            b: arr2(&[[0]]),
            bias: tensor0(0i32),
            a0: 0,
            b0: 0,
            c0: 1,
            a_scale: 1.0,
            b_scale: 1.0,
            c_scale: 1.0,
            opt: true,
            dyn_qp: true,
        }
        .check();
    }

    #[test]
    fn b_scale() {
        QMatMulUnaryProblemI8I8I8 {
            a: arr2(&[[0]]),
            b: arr2(&[[0]]),
            bias: tensor0(0i32),
            a0: 0,
            b0: 0,
            c0: 1,
            a_scale: 1.0,
            b_scale: 2.0,
            c_scale: 1.0,
            opt: true,
            dyn_qp: true,
        }
        .check();
    }

    #[test]
    fn sat() {
        QMatMulUnaryProblemI8I8I8 {
            a: arr2(&[[0]]),
            b: arr2(&[[34]]),
            bias: tensor0(0i32),
            a0: -17,
            b0: 1,
            c0: 0,
            a_scale: 1.0,
            b_scale: 0.05,
            c_scale: 0.25,
            opt: true,
            dyn_qp: true,
        }
        .check();
    }

    #[test]
    fn rounding() {
        QMatMulUnaryProblemI8I8I8 {
            a: arr2(&[[26]]),
            b: arr2(&[[0]]),
            bias: tensor0(0i32),
            a0: 27,
            b0: -1,
            c0: 1,
            a_scale: 1.0,
            b_scale: 0.05,
            c_scale: 1.0,
            opt: true,
            dyn_qp: true,
        }
        .check();
    }

    #[test]
    fn neg_rounding() {
        QMatMulUnaryProblemI8I8I8 {
            a: arr2(&[[-23]]),
            b: arr2(&[[-2]]),
            bias: tensor0(0i32),
            a0: -11,
            b0: -45,
            c0: 0,
            a_scale: 0.1,
            b_scale: 1.0,
            c_scale: 1.0,
            opt: true,
            dyn_qp: true,
        }
        .check();
    }

    #[test]
    fn rounding_ties_2() {
        QMatMulUnaryProblemI8I8I8 {
            a: arr2(&[[47], [0]]),
            b: arr2(&[[1, 0, 30]]),
            bias: tensor0(0i32),
            a0: 86,
            b0: 19,
            c0: 0,
            a_scale: 0.1,
            b_scale: 1.0,
            c_scale: 0.6,
            opt: true,
            dyn_qp: true,
        }
        .check();
    }

    #[test]
    fn rounding_ties_3() {
        QMatMulUnaryProblemI8I8I8 {
            a: arr2(&[[-30]]),
            b: arr2(&[[0, 107, 0]]),
            bias: tensor0(0i32),
            a0: -59,
            b0: 117,
            c0: 0,
            a_scale: 1.0,
            b_scale: 0.15,
            c_scale: 0.6,
            opt: true,
            dyn_qp: true,
        }
        .check();
    }

    #[test]
    fn onnx_test_matmulinteger() {
        QMatMulUnaryProblemI8I8I8 {
            a: arr2(&[[11, 7, 3], [10, 6, 2], [9, 5, 1], [8, 4, 0]]),
            b: arr2(&[[1, 4], [2, 5], [3, 6]]),
            bias: tensor0(0i32),
            a0: 12,
            b0: 0,
            c0: 0,
            a_scale: 1.0,
            b_scale: 1.0,
            c_scale: 1.0,
            opt: true,
            dyn_qp: true,
        }
        .check();
    }

    #[test]
    fn scale_big() {
        QMatMulUnaryProblemI8I8I8 {
            a: arr2(&[[0], [0]]),
            b: arr2(&[[0, 0]]),
            bias: tensor0(0i32),
            a0: -1,
            b0: 1,
            c0: 0,
            a_scale: 1.0,
            b_scale: 0.2,
            c_scale: 0.05,
            opt: true,
            dyn_qp: false,
        }
        .check();
    }

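    /// Round to nearest integer, with ties going towards +∞
    /// (e.g. 0.5 -> 1, -0.5 -> 0).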
    fn round_ties_to_right(x: f32) -> i32 {
        (x + 0.5).floor() as i32
    }

    fn scale() -> BoxedStrategy<f32> {
        prop_oneof![Just(1.0), (1i32..=20).prop_map(|x| x as f32 / 20.0)].boxed()
    }

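    // Generates one proptest problem type per (a, b, c) datum-type
    // combination. `reference()` computes the expected result in f32,
    // `tract()` runs the same problem through a TypedModel (optionally
    // optimized, with static or dynamic qparams), and `check()` compares
    // them, accepting an off-by-one difference from rounding.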
    macro_rules! impl_qmmup {
        ($name:ident, $a:ty, $b:ty, $c:ty $(,)?) => {
            #[derive(Debug)]
            struct $name {
                a: Array2<$a>,
                b: Array2<$b>,
                bias: Tensor,
                a0: $a,
                b0: $b,
                c0: $c,
                a_scale: f32,
                b_scale: f32,
                c_scale: f32,
                opt: bool,
                dyn_qp: bool,
            }

            impl $name {
                fn check(&self) {
                    let _ = env_logger::Builder::from_env("TRACT_LOG").try_init();
                    let r = self.reference();
                    let t = self.tract();
                    assert!(
                        r.iter().zip(t.iter()).all(|(r, t)| r.max(t) - r.min(t) <= 1),
                        "mismatch! optimized plan: {}, dynamic qparams: {}, reference: {:?}, tract: {:?}",
                        self.opt,
                        self.dyn_qp,
                        r,
                        t,
                    );
                }

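                /// Expected result, computed in f32: dequantize `a` and `b`,
                /// multiply, requantize with round-ties-up, then saturate to
                /// the output type's range.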
                fn reference(&self) -> Array2<$c> {
                    let a = self.a.map(|&x| (x as f32 - self.a0 as f32) * self.a_scale);
                    let b = self.b.map(|&x| (x as f32 - self.b0 as f32) * self.b_scale);
                    let c = a.dot(&b);
                    let c = c.map(|&x| round_ties_to_right(x / self.c_scale) + self.c0 as i32);
                    c.map(|&x| x.max(<$c>::MIN as i32).min(<$c>::MAX as i32) as $c)
                }

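                /// Run the problem through tract: `b` (and the six qparams
                /// when `dyn_qp` is set) are model sources, while `a` and the
                /// bias are baked into the op.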
                fn tract(&self) -> Array2<$c> {
                    let mut model = TypedModel::default();
                    let mut inputs = tvec![];
                    inputs.push(
                        model
                            .add_source(
                                "b",
                                <$b>::datum_type().fact(&[self.b.nrows(), self.b.ncols()]),
                            )
                            .unwrap(),
                    );
                    let qparams = if self.dyn_qp {
                        inputs.push(model.add_source("a0", TypedFact::scalar::<$a>()).unwrap());
                        inputs.push(model.add_source("a_scale", TypedFact::scalar::<f32>()).unwrap());
                        inputs.push(model.add_source("b0", TypedFact::scalar::<$b>()).unwrap());
                        inputs.push(model.add_source("b_scale", TypedFact::scalar::<f32>()).unwrap());
                        inputs.push(model.add_source("c0", TypedFact::scalar::<$c>()).unwrap());
                        inputs.push(model.add_source("c_scale", TypedFact::scalar::<f32>()).unwrap());
                        MatMulQParams::all_dynamic(1)
                    } else {
                        MatMulQParams {
                            a0: tensor0::<$a>(self.a0).into(),
                            a_scale: tensor0::<f32>(self.a_scale).into(),
                            b0: tensor0::<$b>(self.b0).into(),
                            b_scale: tensor0::<f32>(self.b_scale).into(),
                            c0: tensor0::<$c>(self.c0).into(),
                            c_scale: tensor0::<f32>(self.c_scale).into(),
                        }
                    };
                    let result = model
                        .wire_node(
                            "qmmu",
                            QMatMulUnary::new(
                                self.a.clone().into_arc_tensor(),
                                Some(self.bias.clone().into_arc_tensor()),
                                MatMulAxes::default(),
                                <$c>::datum_type(),
                                qparams,
                            ),
                            &inputs,
                        )
                        .unwrap();
                    model.set_output_outlets(&result).unwrap();

                    let inputs: TVec<TValue> = if self.dyn_qp {
                        tvec![
                            self.b.clone().into_tensor().into(),
                            tensor0(self.a0).into(),
                            tensor0(self.a_scale).into(),
                            tensor0(self.b0).into(),
                            tensor0(self.b_scale).into(),
                            tensor0(self.c0).into(),
                            tensor0(self.c_scale).into(),
                        ]
                    } else {
                        tvec![self.b.clone().into_tensor().into()]
                    };
                    let model = if self.opt { model.into_optimized().unwrap() } else { model };
                    let mut outputs = model
                        .into_runnable()
                        .unwrap()
                        .run(inputs)
                        .unwrap();
                    outputs
                        .remove(0)
                        .into_tensor()
                        .into_array::<$c>()
                        .unwrap()
                        .into_dimensionality()
                        .unwrap()
                }
            }

            impl Arbitrary for $name {
                type Parameters = ();
                type Strategy = BoxedStrategy<$name>;
                fn arbitrary_with(_args: Self::Parameters) -> Self::Strategy {
                    (1usize..=4, 1usize..=4, 1usize..=4)
                        .prop_flat_map(|(m, k, n)| {
                            (
                                Just((m, k, n)),
                                vec(any::<$a>(), m * k..=m * k),
                                vec(any::<$b>(), k * n..=k * n),
                                any::<$a>(),
                                any::<$b>(),
                                any::<$c>(),
                                scale(),
                                scale(),
                                scale(),
                                any::<bool>(),
                                any::<bool>(),
                            )
                        })
                        .prop_map(
                            |((m, k, n), a, b, a0, b0, c0, a_scale, b_scale, c_scale, opt, dyn_qp)| {
                                $name {
                                    a: Array2::from_shape_vec((m, k), a).unwrap(),
                                    b: Array2::from_shape_vec((k, n), b).unwrap(),
                                    bias: tensor0(0i32),
                                    a0,
                                    b0,
                                    c0,
                                    a_scale,
                                    b_scale,
                                    c_scale,
                                    opt,
                                    dyn_qp,
                                }
                            },
                        )
                        .boxed()
                }
            }
        };
    }

    impl_qmmup! { QMatMulUnaryProblemI8I8I8, i8, i8, i8 }
    impl_qmmup! { QMatMulUnaryProblemI8I8U8, i8, i8, u8 }
    impl_qmmup! { QMatMulUnaryProblemI8U8I8, i8, u8, i8 }
    impl_qmmup! { QMatMulUnaryProblemU8I8I8, u8, i8, i8 }
    impl_qmmup! { QMatMulUnaryProblemI8U8U8, i8, u8, u8 }
    impl_qmmup! { QMatMulUnaryProblemU8I8U8, u8, i8, u8 }
    impl_qmmup! { QMatMulUnaryProblemU8U8I8, u8, u8, i8 }
    impl_qmmup! { QMatMulUnaryProblemU8U8U8, u8, u8, u8 }

    #[test]
    fn test_qmmup_i8_i8_i8() {
        QMatMulUnaryProblemI8I8I8 {
            a: arr2(&[[76, 76, 76], [127, -127, 102]]),
            b: arr2(&[[25, 51, 76, 102, 127], [-51, -25, 0, 25, 51], [-25, -51, -76, -102, -127]]),
            bias: tensor0(0i32),
            a0: 51,
            b0: 0,
            c0: -31,
            a_scale: 0.039215688,
            b_scale: 0.039215688,
            c_scale: 0.09411765,
            opt: true,
            dyn_qp: true,
        }
        .check(); // c: [[-52, -41, -31, -21, -10], [127, 64, 0, -62, -126]]
    }

    #[test]
    fn test_qmmup_i8_i8_u8() {
        QMatMulUnaryProblemI8I8U8 {
            a: arr2(&[[76, 76, 76], [127, -127, 102]]),
            b: arr2(&[[25, 51, 76, 102, 127], [-51, -25, 0, 25, 51], [-25, -51, -76, -102, -127]]),
            bias: tensor0(0i32),
            a0: 51,
            b0: 0,
            c0: 96,
            a_scale: 0.039215688,
            b_scale: 0.039215688,
            c_scale: 0.09411765,
            opt: true,
            dyn_qp: true,
        }
        .check(); // c: [[75, 86, 96, 106, 117], [255, 191, 127, 65, 1]]
    }

    #[test]
    fn test_qmmup_i8_i8_u8_2() {
        QMatMulUnaryProblemI8I8U8 {
            a: arr2(&[[8], [8]]),
            b: arr2(&[[0, 0, 0]]),
            bias: tensor0(0),
            a0: 0,
            b0: 0,
            c0: 0,
            a_scale: 1.0,
            b_scale: 1.0,
            c_scale: 1.0,
            opt: true,
            dyn_qp: true,
        }
        .check()
    }

    #[test]
    fn test_qmmup_i8_u8_i8() {
        QMatMulUnaryProblemI8U8I8 {
            a: arr2(&[[76, 76, 76], [127, -127, 102]]),
            b: arr2(&[[152, 178, 203, 229, 254], [76, 102, 127, 152, 178], [102, 76, 51, 25, 0]]),
            bias: tensor0(0i32),
            a0: 51,
            b0: 127,
            c0: -31,
            a_scale: 0.039215688,
            b_scale: 0.039215688,
            c_scale: 0.09411765,
            opt: true,
            dyn_qp: true,
        }
        .check(); // c: [[-52, -41, -31, -21, -10], [127, 64, 0, -62, -126]]
    }

    #[test]
    fn test_qmmup_u8_i8_i8() {
        QMatMulUnaryProblemU8I8I8 {
            a: arr2(&[[204, 204, 204], [255, 1, 230]]),
            b: arr2(&[[25, 51, 76, 102, 127], [-51, -25, 0, 25, 51], [-25, -51, -76, -102, -127]]),
            bias: tensor0(0i32),
            a0: 179,
            b0: 0,
            c0: -31,
            a_scale: 0.039215688,
            b_scale: 0.039215688,
            c_scale: 0.09411765,
            opt: true,
            dyn_qp: true,
        }
        .check(); // c: [[-52, -41, -31, -21, -10], [127, 64, 0, -62, -126]]
    }

    #[test]
    fn test_qmmup_i8_u8_u8() {
        QMatMulUnaryProblemI8U8U8 {
            a: arr2(&[[76, 76, 76], [127, -127, 102]]),
            b: arr2(&[[152, 178, 203, 229, 254], [76, 102, 127, 152, 178], [102, 76, 51, 25, 0]]),
            bias: tensor0(0i32),
            a0: 51,
            b0: 127,
            c0: 96,
            a_scale: 0.039215688,
            b_scale: 0.039215688,
            c_scale: 0.09411765,
            opt: true,
            dyn_qp: true,
        }
        .check(); // c: [[75, 86, 96, 106, 117], [255, 191, 127, 65, 1]]
    }

    #[test]
    fn test_qmmup_u8_i8_u8() {
        QMatMulUnaryProblemU8I8U8 {
            a: arr2(&[[204, 204, 204], [255, 1, 230]]),
            b: arr2(&[[25, 51, 76, 102, 127], [-51, -25, 0, 25, 51], [-25, -51, -76, -102, -127]]),
            bias: tensor0(0i32),
            a0: 179,
            b0: 0,
            c0: 96,
            a_scale: 0.039215688,
            b_scale: 0.039215688,
            c_scale: 0.09411765,
            opt: true,
            dyn_qp: true,
        }
        .check(); // c: [[75, 86, 96, 106, 117], [255, 191, 127, 65, 1]]
    }

    #[test]
    fn test_qmmup_u8_u8_i8() {
        QMatMulUnaryProblemU8U8I8 {
            a: arr2(&[[204, 204, 204], [255, 1, 230]]),
            b: arr2(&[[152, 178, 203, 229, 254], [76, 102, 127, 152, 178], [102, 76, 51, 25, 0]]),
            bias: tensor0(0i32),
            a0: 179,
            b0: 127,
            c0: -31,
            a_scale: 0.039215688,
            b_scale: 0.039215688,
            c_scale: 0.09411765,
            opt: true,
            dyn_qp: true,
        }
        .check(); // c: [[-52, -41, -31, -21, -10], [127, 64, 0, -62, -126]]
    }

    #[test]
    fn test_qmmup_u8_u8_u8() {
        QMatMulUnaryProblemU8U8U8 {
            a: arr2(&[[204, 204, 204], [255, 1, 230]]),
            b: arr2(&[[152, 178, 203, 229, 254], [76, 102, 127, 152, 178], [102, 76, 51, 25, 0]]),
            bias: tensor0(0i32),
            a0: 179,
            b0: 127,
            c0: 96,
            a_scale: 0.039215688,
            b_scale: 0.039215688,
            c_scale: 0.09411765,
            opt: true,
            dyn_qp: true,
        }
        .check(); // c: [[75, 86, 96, 106, 117], [255, 191, 127, 65, 1]]
    }

    #[test]
    fn test_qmmup_u8_u8_u8_2() {
        QMatMulUnaryProblemU8U8U8 {
            a: arr2(&[[129, 129], [129, 128]]),
            b: arr2(&[[129, 0], [0, 129]]),
            bias: tensor0(0i32),
            a0: 128,
            b0: 128,
            c0: 0,
            a_scale: 1.,
            b_scale: 1.,
            c_scale: 1.,
            opt: true,
            dyn_qp: true,
        }
        .check();
    }

    #[test]
    fn test_qmmup_u8_u8_u8_3() {
        QMatMulUnaryProblemU8U8U8 {
            a: arr2(&[[60, 196], [114, 142]]),
            b: arr2(&[[0, 0, 0], [0, 0, 0]]),
            bias: tensor0(0i32),
            a0: 0,
            b0: 0,
            c0: 0,
            a_scale: 1.,
            b_scale: 1.,
            c_scale: 1.,
            opt: true,
            dyn_qp: true,
        }
        .check();
    }

    #[test]
    fn test_qmmup_u8_u8_u8_4() {
        QMatMulUnaryProblemU8U8U8 {
            a: arr2(&[[0], [0]]),
            b: arr2(&[[0]]),
            bias: tensor0(0),
            a0: 0,
            b0: 0,
            c0: 0,
            a_scale: 0.05,
            b_scale: 1.0,
            c_scale: 1.0,
            opt: true,
            dyn_qp: false,
        }
        .check()
    }
}