// vortex-tensor 0.72.0

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Inner product expression for tensor-like types.

use num_traits::Float;
use vortex_array::ArrayRef;
use vortex_array::ExecutionCtx;
use vortex_array::IntoArray;
use vortex_array::arrays::Constant;
use vortex_array::arrays::ConstantArray;
use vortex_array::arrays::Dict;
use vortex_array::arrays::Extension;
use vortex_array::arrays::ExtensionArray;
use vortex_array::arrays::FixedSizeList;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::arrays::ScalarFnArray;
use vortex_array::arrays::dict::DictArraySlotsExt;
use vortex_array::arrays::extension::ExtensionArrayExt;
use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt;
use vortex_array::arrays::scalar_fn::ExactScalarFn;
use vortex_array::arrays::scalar_fn::ScalarFnArrayView;
use vortex_array::arrays::scalar_fn::plugin::ScalarFnArrayParts;
use vortex_array::arrays::scalar_fn::plugin::ScalarFnArrayVTable;
use vortex_array::dtype::DType;
use vortex_array::dtype::NativePType;
use vortex_array::dtype::Nullability;
use vortex_array::dtype::PType;
use vortex_array::expr::Expression;
use vortex_array::expr::and;
use vortex_array::match_each_float_ptype;
use vortex_array::scalar_fn::Arity;
use vortex_array::scalar_fn::ChildName;
use vortex_array::scalar_fn::EmptyOptions;
use vortex_array::scalar_fn::ExecutionArgs;
use vortex_array::scalar_fn::ScalarFnId;
use vortex_array::scalar_fn::ScalarFnVTable;
use vortex_array::scalar_fn::TypedScalarFnInstance;
use vortex_array::serde::ArrayChildren;
use vortex_buffer::Buffer;
use vortex_buffer::BufferMut;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use vortex_session::VortexSession;

use crate::matcher::AnyTensor;
use crate::scalar_fns::l2_denorm::DenormOrientation;
use crate::scalar_fns::sorf_transform::SorfMatrix;
use crate::scalar_fns::sorf_transform::SorfTransform;
use crate::types::vector::Vector;
use crate::utils::BinaryTensorOpMetadata;
use crate::utils::extract_constant_flat_row;
use crate::utils::extract_flat_elements;
use crate::utils::extract_l2_denorm_children;
use crate::utils::validate_binary_tensor_float_inputs;

/// Inner product (dot product) between two columns.
///
/// Computes `sum(a_i * b_i)` over the flat backing buffer of each tensor or vector. For vectors
/// this is the standard dot product; for higher-rank ([`FixedShapeTensor`]) arrays this is the
/// Frobenius inner product.
///
/// Both inputs must be tensor-like extension arrays ([`FixedShapeTensor`] or [`Vector`]) with the
/// same dtype and a float element type. The output is a float column of the same float type.
///
/// The output dtype is nullable when either input dtype is nullable, and an output row is null
/// whenever the corresponding row of either input is null.
///
/// [`FixedShapeTensor`]: crate::fixed_shape_tensor::FixedShapeTensor
/// [`Vector`]: crate::vector::Vector
#[derive(Clone)]
pub struct InnerProduct;

impl InnerProduct {
    /// Creates a new [`TypedScalarFnInstance`] wrapping the inner product operation.
    pub fn new() -> TypedScalarFnInstance<InnerProduct> {
        TypedScalarFnInstance::new(InnerProduct, EmptyOptions)
    }

    /// Constructs a [`ScalarFnArray`] that lazily computes the inner product between `lhs` and
    /// `rhs`.
    ///
    /// # Errors
    ///
    /// Returns an error if the [`ScalarFnArray`] cannot be constructed (e.g. due to dtype
    /// mismatches).
    pub fn try_new_array(lhs: ArrayRef, rhs: ArrayRef, len: usize) -> VortexResult<ScalarFnArray> {
        ScalarFnArray::try_new(InnerProduct::new().erased(), vec![lhs, rhs], len)
    }
}

impl ScalarFnVTable for InnerProduct {
    type Options = EmptyOptions;

    /// Returns the identifier of this scalar function, `vortex.tensor.inner_product`.
    fn id(&self) -> ScalarFnId {
        ScalarFnId::new("vortex.tensor.inner_product")
    }

    /// The inner product is a binary operation, so exactly two children are required.
    fn arity(&self, _options: &Self::Options) -> Arity {
        Arity::Exact(2)
    }

    /// Names the operands: child `0` is `lhs` and child `1` is `rhs`.
    ///
    /// Any other index hits `unreachable!`, because the function has exactly two children.
    fn child_name(&self, _options: &Self::Options, child_idx: usize) -> ChildName {
        let name = match child_idx {
            0 => "lhs",
            1 => "rhs",
            _ => unreachable!("InnerProduct must have exactly two children"),
        };
        ChildName::from(name)
    }

    /// Derives the output dtype: a primitive column with the operands' float element type.
    ///
    /// The output is nullable as soon as either operand dtype is nullable.
    ///
    /// # Errors
    ///
    /// Propagates the error from `validate_binary_tensor_float_inputs` when the operand dtypes
    /// are rejected.
    fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult<DType> {
        let (lhs_dtype, rhs_dtype) = (&arg_dtypes[0], &arg_dtypes[1]);

        // TODO(connor): relax the float-only gate once integer tensors are supported.
        let element_ptype =
            validate_binary_tensor_float_inputs(lhs_dtype, rhs_dtype)?.element_ptype();

        let either_nullable = lhs_dtype.is_nullable() || rhs_dtype.is_nullable();
        Ok(DType::Primitive(
            element_ptype,
            Nullability::from(either_nullable),
        ))
    }

    /// Evaluates the inner product of the two operands row by row.
    ///
    /// The strategies are tried in this order, and the first one that applies produces the
    /// result:
    ///
    /// 1. `L2Denorm` fast paths, when one or both operands are classified as `L2Denorm` by
    ///    [`DenormOrientation::classify`].
    /// 2. The `SorfTransform` + constant rewrite ([`Self::try_execute_sorf_constant`]).
    /// 3. The dictionary + constant direct lookup ([`Self::try_execute_dict_constant`]).
    /// 4. The standard path: execute both operands into [`ExtensionArray`]s, extract their
    ///    flat elements, and compute `sum(a_i * b_i)` per row.
    ///
    /// # Errors
    ///
    /// Propagates any error from fetching the arguments, combining validities, executing the
    /// operands, or extracting their flat elements.
    fn execute(
        &self,
        _options: &Self::Options,
        args: &dyn ExecutionArgs,
        ctx: &mut ExecutionCtx,
    ) -> VortexResult<ArrayRef> {
        let lhs_ref = args.get(0)?;
        let rhs_ref = args.get(1)?;
        let len = args.row_count();

        // Take any L2Denorm-wrapped fast path that applies.
        match DenormOrientation::classify(&lhs_ref, &rhs_ref) {
            DenormOrientation::Both { lhs, rhs } => {
                return self.execute_both_denorm(lhs, rhs, len, ctx);
            }
            DenormOrientation::One { denorm, plain } => {
                return self.execute_one_denorm(denorm, plain, len, ctx);
            }
            DenormOrientation::Neither => {}
        }

        // Reduction case 1: `InnerProduct(SorfTransform(x), const)` rewrites to
        // `InnerProduct(x, forward_rotate(zero_pad(const)))`. Re-executes recursively so
        // case 2 can fire on the rewritten tree.
        if let Some(rewritten) = self.try_execute_sorf_constant(&lhs_ref, &rhs_ref, len, ctx)? {
            return Ok(rewritten);
        }

        // Reduction case 2: `InnerProduct(Vector[FSL(Dict(u8, f32))], const)` is computed by
        // gather-summing `q[j] * values[codes[j] as usize]` per row, reading the codebook
        // directly instead of decoding the column into dense vectors.
        if let Some(result) = self.try_execute_dict_constant(&lhs_ref, &rhs_ref, len, ctx)? {
            return Ok(result);
        }

        // Compute combined validity.
        let validity = lhs_ref.validity()?.and(rhs_ref.validity()?)?;

        // Canonicalize so we can perform the math directly.
        let lhs: ExtensionArray = lhs_ref.execute(ctx)?;
        let rhs: ExtensionArray = rhs_ref.execute(ctx)?;

        // We validated that both inputs have the same type.
        let ext = lhs.dtype().as_extension();
        let tensor_match = ext
            .metadata_opt::<AnyTensor>()
            .vortex_expect("we already validated this in `return_dtype`");
        let dimensions = tensor_match.list_size() as usize;

        // Extract the storage array from each extension input. We pass the storage (FSL) rather
        // than the extension array to avoid canonicalizing the extension wrapper.
        let lhs_storage = lhs.storage_array();
        let rhs_storage = rhs.storage_array();

        let lhs_flat = extract_flat_elements(lhs_storage, dimensions, ctx)?;
        let rhs_flat = extract_flat_elements(rhs_storage, dimensions, ctx)?;

        // Dispatch on the element float type of the lhs; both sides are read with the same `T`.
        match_each_float_ptype!(lhs_flat.ptype(), |T| {
            let buffer: Buffer<T> = (0..len)
                .map(|i| inner_product_row(lhs_flat.row::<T>(i), rhs_flat.row::<T>(i)))
                .collect();

            // SAFETY: The buffer length equals `row_count`, which matches the source validity
            // length.
            Ok(unsafe { PrimitiveArray::new_unchecked(buffer, validity) }.into_array())
        })
    }

    /// Builds the validity expression of the result: the conjunction of both operands'
    /// validity expressions.
    fn validity(
        &self,
        _options: &Self::Options,
        expression: &Expression,
    ) -> VortexResult<Option<Expression>> {
        // A result row is valid only when the matching row is valid on both sides.
        let both_valid = and(
            expression.child(0).validity()?,
            expression.child(1).validity()?,
        );
        Ok(Some(both_valid))
    }

    /// Always reports `false`: the inner product is declared as not null-sensitive.
    fn is_null_sensitive(&self, _options: &Self::Options) -> bool {
        false
    }

    /// Always reports `false`: the inner product is declared as infallible.
    fn is_fallible(&self, _options: &Self::Options) -> bool {
        false
    }
}

impl ScalarFnArrayVTable for InnerProduct {
    /// Produces the serialized metadata via [`BinaryTensorOpMetadata::encode_from_view`].
    ///
    /// Always yields `Some(..)` on success; encoding errors are propagated.
    fn serialize(
        &self,
        view: &ScalarFnArrayView<Self>,
        _session: &VortexSession,
    ) -> VortexResult<Option<Vec<u8>>> {
        let encoded = BinaryTensorOpMetadata::encode_from_view(view)?;
        Ok(Some(encoded))
    }

    /// Rebuilds the array parts from serialized `metadata`.
    ///
    /// The children are recovered with [`BinaryTensorOpMetadata::decode_children`]; the options
    /// are always [`EmptyOptions`]. The `_dtype` argument is not consulted.
    fn deserialize(
        &self,
        _dtype: &DType,
        len: usize,
        metadata: &[u8],
        children: &dyn ArrayChildren,
        session: &VortexSession,
    ) -> VortexResult<ScalarFnArrayParts<Self>> {
        let children = BinaryTensorOpMetadata::decode_children(metadata, len, children, session)?;
        Ok(ScalarFnArrayParts {
            children,
            options: EmptyOptions,
        })
    }
}

impl InnerProduct {
    /// Both sides are `L2Denorm`: `inner_product = s_l * s_r * dot(n_l, n_r)`.
    fn execute_both_denorm(
        &self,
        lhs_ref: &ArrayRef,
        rhs_ref: &ArrayRef,
        len: usize,
        ctx: &mut ExecutionCtx,
    ) -> VortexResult<ArrayRef> {
        let validity = lhs_ref.validity()?.and(rhs_ref.validity()?)?;

        let (normalized_l, norms_l) = extract_l2_denorm_children(lhs_ref);
        let (normalized_r, norms_r) = extract_l2_denorm_children(rhs_ref);

        let norms_l: PrimitiveArray = norms_l.execute(ctx)?;
        let norms_r: PrimitiveArray = norms_r.execute(ctx)?;

        let dot: PrimitiveArray = InnerProduct::try_new_array(normalized_l, normalized_r, len)?
            .into_array()
            .execute(ctx)?;

        match_each_float_ptype!(dot.ptype(), |T| {
            let dots = dot.as_slice::<T>();
            let nl = norms_l.as_slice::<T>();
            let nr = norms_r.as_slice::<T>();
            let buffer: Buffer<T> = (0..len).map(|i| nl[i] * nr[i] * dots[i]).collect();

            // SAFETY: The buffer length equals `len`, which matches the source validity length.
            Ok(unsafe { PrimitiveArray::new_unchecked(buffer, validity) }.into_array())
        })
    }

    /// One side is `L2Denorm`: `inner_product = s * dot(n, other)`.
    ///
    /// The caller must pass the denorm array as `denorm_ref` and the plain array as `plain_ref`.
    fn execute_one_denorm(
        &self,
        denorm_ref: &ArrayRef,
        plain_ref: &ArrayRef,
        len: usize,
        ctx: &mut ExecutionCtx,
    ) -> VortexResult<ArrayRef> {
        let validity = denorm_ref.validity()?.and(plain_ref.validity()?)?;

        let (normalized, norms) = extract_l2_denorm_children(denorm_ref);
        let denorm_norms: PrimitiveArray = norms.execute(ctx)?;

        let dot: PrimitiveArray = InnerProduct::try_new_array(normalized, plain_ref.clone(), len)?
            .into_array()
            .execute(ctx)?;

        match_each_float_ptype!(dot.ptype(), |T| {
            let dots = dot.as_slice::<T>();
            let ns = denorm_norms.as_slice::<T>();
            let buffer: Buffer<T> = (0..len).map(|i| ns[i] * dots[i]).collect();

            // SAFETY: The buffer length equals `len`, which matches the source validity length.
            Ok(unsafe { PrimitiveArray::new_unchecked(buffer, validity) }.into_array())
        })
    }

    /// Fast path when one side is `ExactScalarFn<SorfTransform>` and the other side is a
    /// constant-backed tensor-like extension. Rewrites to
    /// `InnerProduct(sorf_child, forward_rotate(zero_pad(const_query)))` because SORF is
    /// orthogonal, so `<T(R^{-1} x), c> = <x, R · zero_pad(c)>` where `T` is the truncation from
    /// `padded_dim` to `dim` applied by `SorfTransform` and `R` is the SORF forward matrix. See the
    /// proof in the crate-level docs and in the plan file.
    ///
    /// Returns `Ok(None)` if neither side matches, when the operand element type is not `F32`, or
    /// when the constant side is not a constant-backed tensor extension. The caller is expected to
    /// fall through to the standard path in that case.
    ///
    /// The operands are accepted in either order: whichever side is the `SorfTransform` is
    /// used as such and the other side is treated as the constant candidate. The lhs is checked
    /// first.
    ///
    /// # F32-only
    ///
    /// TODO(connor): this rewrite is only sound for `PType::F32` because `SorfTransform` applies an
    /// `f32 -> element_ptype` cast at the end of its `execute`. For `F16`/`F64` the cast changes
    /// the inner product's rounding and the rewrite would not be semantically equivalent. Until we
    /// push the cast through `InnerProduct`, both the SorfTransform output ptype and the
    /// constant-side element ptype must be `F32` here.
    ///
    /// # Errors
    ///
    /// Propagates errors from extracting the constant row, building the rotation matrix,
    /// building the new constant, and executing the rewritten inner product.
    fn try_execute_sorf_constant(
        &self,
        lhs_ref: &ArrayRef,
        rhs_ref: &ArrayRef,
        len: usize,
        ctx: &mut ExecutionCtx,
    ) -> VortexResult<Option<ArrayRef>> {
        // Identify which side is the SorfTransform, if any.
        let (sorf_view, const_ref) =
            if let Some(view) = lhs_ref.as_opt::<ExactScalarFn<SorfTransform>>() {
                (view, rhs_ref)
            } else if let Some(view) = rhs_ref.as_opt::<ExactScalarFn<SorfTransform>>() {
                (view, lhs_ref)
            } else {
                return Ok(None);
            };

        // F32-only gate on the SorfTransform output (see the `# F32-only` section above).
        if sorf_view.options.element_ptype != PType::F32 {
            return Ok(None);
        }

        // The other side must be a constant tensor.
        let Some(const_storage) = constant_tensor_storage(const_ref) else {
            return Ok(None);
        };

        let dim = sorf_view.options.dimensions as usize;
        let num_rounds = sorf_view.options.num_rounds as usize;
        let seed = sorf_view.options.seed;
        // The rotation works on the smallest power of two that is at least `dim`.
        let padded_dim = dim.next_power_of_two();

        // Extract the single stored row of the constant.
        let flat = extract_constant_flat_row(&const_storage, ctx)?;
        // F32-only gate on the constant side.
        if flat.ptype() != PType::F32 {
            return Ok(None);
        }

        // Zero-pad the query from `dim` to `padded_dim` and forward-rotate.
        let mut padded_query = vec![0.0f32; padded_dim];
        // `copy_from_slice` panics unless the constant row has exactly `dim` elements.
        // NOTE(review): presumably guaranteed because both operands were validated to share a
        // dtype — confirm.
        padded_query[..dim].copy_from_slice(flat.as_slice::<f32>());

        let rotation = SorfMatrix::try_new_padded(padded_dim, num_rounds, seed)?;
        let mut rotated_query = vec![0.0f32; padded_dim];
        rotation.rotate(&padded_query, &mut rotated_query);

        // Wrap the rotated query as a `Vector<padded_dim, f32>` constant broadcast to `len`
        // rows. The new extension dtype has `padded_dim` instead of `dim`, matching the
        // SorfTransform child we are about to dot it with.
        let new_constant = Vector::constant_array(&rotated_query, len)?;

        // Extract the SorfTransform child (the already-padded Vector<padded_dim, f32>).
        let sorf_child = sorf_view
            .nth_child(0)
            .vortex_expect("SorfTransform must have exactly one child");

        // Recursively execute the rewritten inner product. This allows case 2 to fire on
        // the rewritten tree if the sorf child is `Vector[FSL(Dict)]`. Termination is
        // guaranteed because the rewrite strictly removes a `SorfTransform` scalar-fn node
        // from the tree and SORFs cannot be nested.
        let rewritten = InnerProduct::try_new_array(sorf_child, new_constant, len)?
            .into_array()
            .execute(ctx)?;
        Ok(Some(rewritten))
    }

    /// Direct-lookup fast path for `Extension[FSL(Dict(u8, f32))]` against a constant-backed
    /// tensor extension whose element ptype is F32. The operands may come in either order.
    ///
    /// Every row is evaluated as
    ///   `out[i] = sum_{j in 0..padded_dim} q[j] * values[codes[i * padded_dim + j] as usize]`
    /// with the codebook read directly inside the hot loop. A precomputed product table
    /// `P[j, k] = q[j] * values[k]` (size `padded_dim * num_centroids * 4B`, ~1 MiB for the
    /// common 1024/256 case) was tried and measured ~10% *slower* on the
    /// `similarity_search` bench because the 1 KiB `values` table stays in L1 across all
    /// rows, while the 1 MiB product table does not.
    ///
    /// Yields `Ok(None)` when the pattern matches in neither orientation, in which case the
    /// caller should fall through to the standard path.
    fn try_execute_dict_constant(
        &self,
        lhs_ref: &ArrayRef,
        rhs_ref: &ArrayRef,
        len: usize,
        ctx: &mut ExecutionCtx,
    ) -> VortexResult<Option<ArrayRef>> {
        // Each call of the oriented helper navigates both sides once, so the only wasted work
        // is the failed navigation of the first attempt when the dict sits on the right.
        match self.try_execute_dict_constant_oriented(lhs_ref, rhs_ref, len, ctx)? {
            Some(result) => Ok(Some(result)),
            // Dict on the right: retry with the roles swapped.
            None => self.try_execute_dict_constant_oriented(rhs_ref, lhs_ref, len, ctx),
        }
    }

    /// Orientation-specific helper for [`Self::try_execute_dict_constant`]. `dict_candidate`
    /// is tried as `Extension[FSL[Dict]]`; `const_candidate` is tried as a constant-backed
    /// tensor extension. Returns `Ok(None)` if either navigation fails or any gate rejects.
    ///
    /// The gates are: the dictionary codes must be `U8`, the dictionary values must be `F32`,
    /// and the constant's element ptype must be `F32`. The constant's scalar must also be
    /// non-null (enforced by `constant_tensor_storage`).
    ///
    /// On a match, the result is an `f32` primitive array of `len` rows whose validity is the
    /// conjunction of both candidates' validities.
    ///
    /// # Errors
    ///
    /// Propagates errors from executing the codes and values, extracting the constant row, and
    /// combining the validities.
    fn try_execute_dict_constant_oriented(
        &self,
        dict_candidate: &ArrayRef,
        const_candidate: &ArrayRef,
        len: usize,
        ctx: &mut ExecutionCtx,
    ) -> VortexResult<Option<ArrayRef>> {
        // Navigate the dict side.
        let Some(dict_ext) = dict_candidate.as_opt::<Extension>() else {
            return Ok(None);
        };
        let Some(fsl) = dict_ext.storage_array().as_opt::<FixedSizeList>() else {
            return Ok(None);
        };
        let Some(dict) = fsl.elements().as_opt::<Dict>() else {
            return Ok(None);
        };

        // Navigate the constant side and require its scalar be non-null.
        let Some(const_storage) = constant_tensor_storage(const_candidate) else {
            return Ok(None);
        };

        // Canonicalize codes and values. Codes may be e.g. BitPacked; executing is cheaper
        // than falling through to the standard path (which would also canonicalize).
        let codes_prim: PrimitiveArray = dict.codes().clone().execute(ctx)?;
        let values_prim: PrimitiveArray = dict.values().clone().execute(ctx)?;

        // Gate: u8 codes and f32 centroids.
        if codes_prim.ptype() != PType::U8 {
            // TODO(connor): Should we support wider codes?
            return Ok(None);
        }
        if values_prim.ptype() != PType::F32 {
            // TODO(connor): direct-lookup path only supports f32 centroids. SorfTransform
            // forces f32 anyway, so this is the only shape we need for now.
            return Ok(None);
        }

        // The per-row element count is the FSL list size.
        let padded_dim = usize::try_from(fsl.list_size()).vortex_expect("fsl list_size fits usize");

        let flat = extract_constant_flat_row(&const_storage, ctx)?;
        if flat.ptype() != PType::F32 {
            // TODO(connor): case 2 is f32-only. For f16/f64 we fall through to the standard
            // path, which computes the inner product with the correct element type.
            return Ok(None);
        }

        // Combine the input validities up front; the per-row arithmetic may write garbage
        // into null rows but the validity mask hides it (matching the standard path).
        let validity = dict_candidate
            .validity()?
            .and(const_candidate.validity()?)?;

        // Fast path for the empty case: skip allocating and touching the codes buffer.
        if len == 0 {
            let empty = PrimitiveArray::empty::<f32>(validity.nullability());
            return Ok(Some(empty.into_array()));
        }

        // The length relations below are only checked in debug builds.
        let q: &[f32] = flat.as_slice::<f32>();
        debug_assert_eq!(q.len(), padded_dim);
        let codes: &[u8] = codes_prim.as_slice::<u8>();
        let values: &[f32] = values_prim.as_slice::<f32>();
        debug_assert_eq!(codes.len(), len * padded_dim);

        // The hot loop is extracted into [`execute_dict_constant_inner_product`] so the compiler
        // can prove the chunked indices stay in bounds and vectorize the inner gather-accumulate.
        let out = execute_dict_constant_inner_product(q, values, codes, len, padded_dim);

        // SAFETY: the buffer length equals `len`, which matches the validity length.
        let result = unsafe { PrimitiveArray::new_unchecked(out.freeze(), validity) }.into_array();
        Ok(Some(result))
    }
}

/// Return the storage constant for a canonical tensor-like constant query.
///
/// Yields `None` when `array` is not a `Constant` array, when its scalar is null, or when its
/// scalar is not an extension scalar. Otherwise the extension scalar is unwrapped to its storage
/// scalar and re-broadcast to `array.len()` rows.
fn constant_tensor_storage(array: &ArrayRef) -> Option<ArrayRef> {
    let constant = array.as_opt::<Constant>()?;
    let scalar = constant.scalar();
    if scalar.is_null() {
        return None;
    }

    let storage_scalar = scalar.as_extension_opt()?.to_storage_scalar();
    let broadcast = ConstantArray::new(storage_scalar, array.len());
    Some(broadcast.into_array())
}

/// Dot product of two float slices: `sum(a_i * b_i)`.
///
/// The slices are walked in lockstep, so the sum covers only the common prefix when the lengths
/// differ. The products are accumulated left to right, starting from zero.
fn inner_product_row<T: Float + NativePType>(a: &[T], b: &[T]) -> T {
    let mut total = T::zero();
    for (&x, &y) in a.iter().zip(b) {
        total = total + x * y;
    }
    total
}

/// Compute inner products between a constant query vector and dictionary-encoded rows.
///
/// For each row, computes `sum(q[j] * values[codes[row * dim + j]])` using the codebook `values`
/// directly instead of decoding the dictionary into dense vectors.
///
/// `codes` is read row-major in chunks of `dim`, and `q` is paired element-wise with each row.
/// At most `num_rows` results are produced, even if `codes` holds more complete rows than that.
/// When `dim == 0` every row is an empty sum, so the result is `num_rows` zeros.
///
/// The inner loop uses `PARTIAL_SUMS` independent accumulators so the CPU can pipeline FP additions
/// instead of waiting for each `fadd` to retire before starting the next.
///
/// # Panics
///
/// Panics if a visited code indexes past the end of `values`.
fn execute_dict_constant_inner_product(
    q: &[f32],
    values: &[f32],
    codes: &[u8],
    num_rows: usize,
    dim: usize,
) -> BufferMut<f32> {
    const PARTIAL_SUMS: usize = 8;

    let mut out = BufferMut::<f32>::with_capacity(num_rows);

    // `chunks_exact` panics on a chunk size of zero, so zero-dimensional rows get their own
    // path: the inner product over no elements is `0.0` for every row.
    if dim == 0 {
        for _ in 0..num_rows {
            // SAFETY: `with_capacity(num_rows)` reserved `num_rows` slots and this loop pushes
            // exactly `num_rows` times.
            unsafe { out.push_unchecked(0.0) };
        }
        return out;
    }

    // `take(num_rows)` caps the number of pushes at the reserved capacity. Without it the
    // unchecked push below would write out of bounds whenever `codes` held more than `num_rows`
    // complete rows.
    for row_codes in codes.chunks_exact(dim).take(num_rows) {
        let mut acc = [0.0f32; PARTIAL_SUMS];

        let code_chunks = row_codes.chunks_exact(PARTIAL_SUMS);
        let q_chunks = q.chunks_exact(PARTIAL_SUMS);
        let code_rem = code_chunks.remainder();
        let q_rem = q_chunks.remainder();

        // Main body: each of the `PARTIAL_SUMS` lanes accumulates independently.
        for (cc, qd) in code_chunks.zip(q_chunks) {
            for i in 0..PARTIAL_SUMS {
                acc[i] = qd[i].mul_add(values[cc[i] as usize], acc[i]);
            }
        }

        // Tail: the elements that do not fill a whole chunk are folded into lane 0.
        for (&code, &q_val) in code_rem.iter().zip(q_rem.iter()) {
            acc[0] = q_val.mul_add(values[code as usize], acc[0]);
        }

        // SAFETY: `with_capacity(num_rows)` reserved `num_rows` slots and `take(num_rows)`
        // bounds this loop to at most `num_rows` pushes.
        unsafe { out.push_unchecked(acc.iter().sum::<f32>()) };
    }

    out
}

#[cfg(test)]
mod tests {

    use rstest::rstest;
    use vortex_array::ArrayPlugin;
    use vortex_array::ArrayRef;
    use vortex_array::IntoArray;
    use vortex_array::VortexSessionExecute;
    use vortex_array::arrays::MaskedArray;
    use vortex_array::arrays::PrimitiveArray;
    use vortex_array::arrays::ScalarFnArray;
    use vortex_array::arrays::scalar_fn::plugin::ScalarFnArrayPlugin;
    use vortex_array::validity::Validity;
    use vortex_error::VortexResult;

    use crate::scalar_fns::inner_product::InnerProduct;
    use crate::scalar_fns::l2_denorm::L2Denorm;
    use crate::tests::SESSION;
    use crate::utils::test_helpers::assert_close;
    use crate::utils::test_helpers::l2_denorm_array;
    use crate::utils::test_helpers::tensor_array;
    use crate::utils::test_helpers::vector_array;

    /// Runs [`InnerProduct`] over `lhs` and `rhs` (`len` rows) and copies the `f64` results
    /// into a `Vec`.
    fn eval_inner_product(lhs: ArrayRef, rhs: ArrayRef, len: usize) -> VortexResult<Vec<f64>> {
        let mut ctx = SESSION.create_execution_ctx();
        let lazy = ScalarFnArray::try_new(InnerProduct::new().erased(), vec![lhs, rhs], len)?;
        let computed: PrimitiveArray = lazy.into_array().execute(&mut ctx)?;
        Ok(computed.as_slice::<f64>().to_vec())
    }

    /// Checks the inner product of a single row for several vector pairs.
    #[rstest]
    // Orthogonal: [1, 0] . [0, 1] = 0.
    #[case::orthogonal(&[2], &[1.0, 0.0], &[0.0, 1.0], &[0.0])]
    // Parallel: [3, 4] . [3, 4] = 9 + 16 = 25.
    #[case::parallel(&[2], &[3.0, 4.0], &[3.0, 4.0], &[25.0])]
    // Antiparallel: [1, 2] . [-1, -2] = -1 + -4 = -5.
    #[case::antiparallel(&[2], &[1.0, 2.0], &[-1.0, -2.0], &[-5.0])]
    // Scaled: [2, 0] . [3, 0] = 6.
    #[case::scaled(&[2], &[2.0, 0.0], &[3.0, 0.0], &[6.0])]
    fn single_row(
        #[case] shape: &[usize],
        #[case] lhs_elems: &[f64],
        #[case] rhs_elems: &[f64],
        #[case] expected: &[f64],
    ) -> VortexResult<()> {
        let actual = eval_inner_product(
            tensor_array(shape, lhs_elems)?,
            tensor_array(shape, rhs_elems)?,
            1,
        )?;
        assert_close(&actual, expected);
        Ok(())
    }

    /// Three rows of 3-element tensors are evaluated independently.
    #[test]
    fn multiple_rows() -> VortexResult<()> {
        let lhs_elems: [f64; 9] = [
            1.0, 0.0, 0.0, // tensor 0
            3.0, 4.0, 0.0, // tensor 1
            1.0, 1.0, 1.0, // tensor 2
        ];
        let rhs_elems: [f64; 9] = [
            0.0, 1.0, 0.0, // tensor 0: dot = 0
            3.0, 4.0, 0.0, // tensor 1: dot = 25
            2.0, 2.0, 2.0, // tensor 2: dot = 6
        ];

        let lhs = tensor_array(&[3], &lhs_elems)?;
        let rhs = tensor_array(&[3], &rhs_elems)?;

        let dots = eval_inner_product(lhs, rhs, 3)?;
        assert_close(&dots, &[0.0, 25.0, 6.0]);
        Ok(())
    }

    /// The inner product also works on `Vector` extension arrays.
    #[test]
    fn vector_inner_product() -> VortexResult<()> {
        let lhs_elems: [f64; 4] = [
            3.0, 4.0, // vector 0
            1.0, 0.0, // vector 1
        ];
        let rhs_elems: [f64; 4] = [
            3.0, 4.0, // vector 0: dot = 25
            0.0, 1.0, // vector 1: dot = 0
        ];

        let lhs = vector_array(2, &lhs_elems)?;
        let rhs = vector_array(2, &rhs_elems)?;

        let dots = eval_inner_product(lhs, rhs, 2)?;
        assert_close(&dots, &[25.0, 0.0]);
        Ok(())
    }

    /// A null input row yields a null output row; the other rows are still computed.
    #[test]
    fn null_input_row() -> VortexResult<()> {
        // Three dim-2 rows per side; the mask marks row 1 of the lhs as null.
        let lhs_values = tensor_array(&[2], &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0])?;
        let rhs = tensor_array(&[2], &[7.0, 8.0, 9.0, 10.0, 11.0, 12.0])?;
        let row_mask = Validity::from_iter([true, false, true]);
        let lhs = MaskedArray::try_new(lhs_values, row_mask)?.into_array();

        let lazy = ScalarFnArray::try_new(InnerProduct::new().erased(), vec![lhs, rhs], 3)?;
        let mut ctx = SESSION.create_execution_ctx();
        let prim: PrimitiveArray = lazy.into_array().execute(&mut ctx)?;

        // Only row 1 is null.
        assert!(prim.is_valid(0, &mut ctx)?);
        assert!(!prim.is_valid(1, &mut ctx)?);
        assert!(prim.is_valid(2, &mut ctx)?);

        // Row 0: 1*7 + 2*8 = 23, row 2: 5*11 + 6*12 = 127.
        let dots = prim.as_slice::<f64>();
        assert_close(&[dots[0]], &[23.0]);
        assert_close(&[dots[2]], &[127.0]);
        Ok(())
    }

    /// Plain primitive arrays are not tensor-like, so construction must fail.
    #[test]
    fn rejects_non_extension_dtype() {
        let lhs = PrimitiveArray::from_iter([1.0_f64, 2.0]).into_array();
        let rhs = PrimitiveArray::from_iter([3.0_f64, 4.0]).into_array();
        assert!(InnerProduct::try_new_array(lhs, rhs, 2).is_err());
    }

    /// A tensor operand and a vector operand have different dtypes, so construction must fail.
    #[test]
    fn rejects_mismatched_dtypes() -> VortexResult<()> {
        let tensor_side = tensor_array(&[2], &[1.0_f64, 2.0])?;
        let vector_side = vector_array(2, &[3.0_f64, 4.0])?;
        assert!(InnerProduct::try_new_array(tensor_side, vector_side, 1).is_err());
        Ok(())
    }

    /// Both operands are `L2Denorm`: the norms are multiplied into the normalized dot product.
    #[test]
    fn both_denorm() -> VortexResult<()> {
        // lhs = L2Denorm([0.6, 0.8], 5.0) stands for [3.0, 4.0].
        // rhs = L2Denorm([1.0, 0.0], 1.0) stands for [1.0, 0.0].
        let mut ctx = SESSION.create_execution_ctx();
        let lhs = l2_denorm_array(&[2], &[0.6, 0.8], &[5.0], &mut ctx)?;
        let rhs = l2_denorm_array(&[2], &[1.0, 0.0], &[1.0], &mut ctx)?;

        // 5.0 * 1.0 * dot([0.6, 0.8], [1.0, 0.0]) = 5.0 * 0.6 = 3.0, which equals
        // dot([3.0, 4.0], [1.0, 0.0]).
        let dots = eval_inner_product(lhs, rhs, 1)?;
        assert_close(&dots, &[3.0]);
        Ok(())
    }

    /// The two-sided `L2Denorm` path handles several rows at once.
    #[test]
    fn both_denorm_multiple_rows() -> VortexResult<()> {
        let mut ctx = SESSION.create_execution_ctx();
        let lhs = l2_denorm_array(&[2], &[0.6, 0.8, 1.0, 0.0], &[5.0, 1.0], &mut ctx)?;
        let rhs = l2_denorm_array(&[2], &[0.6, 0.8, 0.0, 1.0], &[5.0, 1.0], &mut ctx)?;

        // Row 0: [3.0, 4.0] dot [3.0, 4.0] = 25.0.
        // Row 1: [1.0, 0.0] dot [0.0, 1.0] = 0.0.
        let dots = eval_inner_product(lhs, rhs, 2)?;
        assert_close(&dots, &[25.0, 0.0]);
        Ok(())
    }

    /// Only the lhs is `L2Denorm`; the rhs is a plain tensor.
    #[test]
    fn one_side_denorm_lhs() -> VortexResult<()> {
        let mut ctx = SESSION.create_execution_ctx();
        // lhs = L2Denorm([0.6, 0.8], 5.0) stands for [3.0, 4.0].
        let lhs = l2_denorm_array(&[2], &[0.6, 0.8], &[5.0], &mut ctx)?;
        // rhs is the plain tensor [1.0, 2.0].
        let rhs = tensor_array(&[2], &[1.0, 2.0])?;

        // dot([3.0, 4.0], [1.0, 2.0]) = 3.0 + 8.0 = 11.0.
        let dots = eval_inner_product(lhs, rhs, 1)?;
        assert_close(&dots, &[11.0]);
        Ok(())
    }

    /// Only the rhs is `L2Denorm`; the lhs is a plain tensor.
    #[test]
    fn one_side_denorm_rhs() -> VortexResult<()> {
        let mut ctx = SESSION.create_execution_ctx();
        // lhs is the plain tensor [1.0, 2.0].
        let lhs = tensor_array(&[2], &[1.0, 2.0])?;
        // rhs = L2Denorm([0.6, 0.8], 5.0) stands for [3.0, 4.0].
        let rhs = l2_denorm_array(&[2], &[0.6, 0.8], &[5.0], &mut ctx)?;

        // dot([1.0, 2.0], [3.0, 4.0]) = 3.0 + 8.0 = 11.0.
        let dots = eval_inner_product(lhs, rhs, 1)?;
        assert_close(&dots, &[11.0]);
        Ok(())
    }

    /// A null norm on the lhs makes the matching output row null on the two-sided path.
    #[test]
    fn both_denorm_null_norms() -> VortexResult<()> {
        // The lhs norms are nullable: row 0 is valid, row 1 is null.
        let lhs_normalized = tensor_array(&[2], &[0.6, 0.8, 1.0, 0.0])?;
        let lhs_norms = PrimitiveArray::from_option_iter([Some(5.0f64), None]).into_array();
        let mut ctx = SESSION.create_execution_ctx();

        let lhs = L2Denorm::try_new_array(lhs_normalized, lhs_norms, 2, &mut ctx)?.into_array();
        let rhs = l2_denorm_array(&[2], &[0.6, 0.8, 1.0, 0.0], &[5.0, 1.0], &mut ctx)?;

        let lazy = ScalarFnArray::try_new(InnerProduct::new().erased(), vec![lhs, rhs], 2)?;
        let prim: PrimitiveArray = lazy.into_array().execute(&mut ctx)?;

        // Row 0: 5.0 * 5.0 * dot([0.6, 0.8], [0.6, 0.8]) = 25.0, row 1: null.
        assert!(prim.is_valid(0, &mut ctx)?);
        assert!(!prim.is_valid(1, &mut ctx)?);
        let first = prim.as_slice::<f64>()[0];
        assert_close(&[first], &[25.0]);
        Ok(())
    }

    /// Serializes an `InnerProduct` array through its plugin and deserializes it again,
    /// checking that dtype, length and encoding id survive the round trip.
    ///
    /// Only those three properties are compared; the recovered array is not executed.
    #[rstest]
    #[case::vector(inner_product_vector_lhs(), inner_product_vector_rhs(), 2)]
    #[case::fixed_shape_tensor(inner_product_tensor_lhs(), inner_product_tensor_rhs(), 2)]
    fn serde_round_trip(
        #[case] lhs: ArrayRef,
        #[case] rhs: ArrayRef,
        #[case] len: usize,
    ) -> VortexResult<()> {
        // The operands are cloned because they are supplied again as children on deserialize.
        let original = InnerProduct::try_new_array(lhs.clone(), rhs.clone(), len)?.into_array();

        let plugin = ScalarFnArrayPlugin::new(InnerProduct);
        let metadata = plugin
            .serialize(&original, &SESSION)?
            .expect("InnerProduct serialize must produce metadata");

        let children = vec![lhs, rhs];
        // NOTE(review): the empty slice is presumably the buffers argument — confirm against
        // the `ArrayPlugin::deserialize` signature.
        let recovered = plugin.deserialize(
            original.dtype(),
            original.len(),
            &metadata,
            &[],
            &children,
            &SESSION,
        )?;

        assert_eq!(recovered.dtype(), original.dtype());
        assert_eq!(recovered.len(), original.len());
        assert_eq!(recovered.encoding_id(), original.encoding_id());
        Ok(())
    }

    /// Left-hand operand for the `vector` case of `serde_round_trip`: six elements at
    /// dimension 3.
    fn inner_product_vector_lhs() -> ArrayRef {
        vector_array(3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).expect("valid vector array")
    }

    /// Right-hand operand for the `vector` case of `serde_round_trip`: six elements at
    /// dimension 3.
    fn inner_product_vector_rhs() -> ArrayRef {
        vector_array(3, &[7.0, 8.0, 9.0, 10.0, 11.0, 12.0]).expect("valid vector array")
    }

    /// Left-hand operand for the `fixed_shape_tensor` case of `serde_round_trip`: four elements
    /// with shape `[2]`.
    fn inner_product_tensor_lhs() -> ArrayRef {
        tensor_array(&[2], &[1.0, 2.0, 3.0, 4.0]).expect("valid tensor array")
    }

    /// Right-hand operand for the `fixed_shape_tensor` case of `serde_round_trip`: four elements
    /// with shape `[2]`.
    fn inner_product_tensor_rhs() -> ArrayRef {
        tensor_array(&[2], &[5.0, 6.0, 7.0, 8.0]).expect("valid tensor array")
    }

    // ---- Tests for the `SorfTransform + constant` and `Dict + constant` fast paths ----

    #[allow(
        clippy::cast_possible_truncation,
        reason = "tests build small fixtures with deterministic in-range indices"
    )]
    mod constant_query_optimizations {
        use rstest::rstest;
        use vortex_array::ArrayRef;
        use vortex_array::IntoArray;
        use vortex_array::VortexSessionExecute;
        use vortex_array::arrays::Constant;
        use vortex_array::arrays::FixedSizeListArray;
        use vortex_array::arrays::PrimitiveArray;
        use vortex_array::arrays::ScalarFnArray;
        use vortex_array::arrays::dict::DictArray;
        use vortex_array::dtype::DType;
        use vortex_array::dtype::Nullability;
        use vortex_array::dtype::PType;
        use vortex_array::validity::Validity;
        use vortex_buffer::Buffer;
        use vortex_error::VortexResult;

        use crate::scalar_fns::inner_product::InnerProduct;
        use crate::scalar_fns::inner_product::constant_tensor_storage;
        use crate::scalar_fns::sorf_transform::SorfMatrix;
        use crate::scalar_fns::sorf_transform::SorfOptions;
        use crate::scalar_fns::sorf_transform::SorfTransform;
        use crate::tests::SESSION;
        use crate::types::vector::Vector;
        use crate::utils::extract_flat_elements;
        use crate::utils::test_helpers::literal_vector_array;
        use crate::utils::test_helpers::vector_array;

        /// Wrap `codes` / `values` into a `Vector` whose storage is a fixed-size list over a
        /// dictionary array (`u8` codes, `f32` values), both children non-nullable.
        ///
        /// The row count is `codes.len() / list_size` (integer division), so `list_size`
        /// must be non-zero. Per the original note, this mirrors the shape TurboQuant
        /// produces as the SorfTransform child.
        fn dict_vector_f32(list_size: u32, codes: &[u8], values: &[f32]) -> VortexResult<ArrayRef> {
            let row_count = codes.len() / list_size as usize;

            let code_array =
                PrimitiveArray::new::<u8>(Buffer::copy_from(codes), Validity::NonNullable);
            let value_array =
                PrimitiveArray::new::<f32>(Buffer::copy_from(values), Validity::NonNullable);
            let dictionary = DictArray::try_new(code_array.into_array(), value_array.into_array())?;

            let storage = FixedSizeListArray::try_new(
                dictionary.into_array(),
                list_size,
                Validity::NonNullable,
                row_count,
            )?
            .into_array();
            Vector::try_new_vector_array(storage)
        }

        /// Run `InnerProduct(lhs, rhs)` over `len` rows with a fresh execution context from
        /// `SESSION` and copy the resulting primitive array out as a `Vec<f32>`.
        fn eval_ip_f32(lhs: ArrayRef, rhs: ArrayRef, len: usize) -> VortexResult<Vec<f32>> {
            let inner_product =
                ScalarFnArray::try_new(InnerProduct::new().erased(), vec![lhs, rhs], len)?
                    .into_array();
            let mut ctx = SESSION.create_execution_ctx();
            let executed: PrimitiveArray = inner_product.execute(&mut ctx)?;
            Ok(executed.as_slice::<f32>().to_vec())
        }

        /// Panic unless `actual` and `expected` have the same length and every pair of
        /// elements differs by strictly less than `tol` in absolute value.
        ///
        /// A NaN difference never satisfies `diff < tol`, so NaN results are reported as
        /// failures rather than silently accepted.
        fn assert_close_f32(actual: &[f32], expected: &[f32], tol: f32) {
            assert_eq!(actual.len(), expected.len(), "length mismatch");
            for (i, (a, e)) in actual.iter().zip(expected.iter()).enumerate() {
                let diff = (a - e).abs();
                if diff < tol {
                    continue;
                }
                panic!("row {i}: got {a}, expected {e} (diff = {})", diff);
            }
        }

        /// Construct a SorfTransform array over a dictionary-encoded `Vector` child.
        ///
        /// The child has `dim.next_power_of_two()` elements per row, drawn from a fixed
        /// eight-entry `f32` codebook via `u8` codes that cycle through the codebook.
        /// Returns `(sorf_array, codes, values, padded_dim)` so callers can decode the same
        /// data independently.
        fn build_sorf_with_dict_child(
            dim: u32,
            num_rows: usize,
            seed: u64,
            num_rounds: u8,
        ) -> VortexResult<(ArrayRef, Vec<u8>, Vec<f32>, usize)> {
            // Small hand-picked codebook of 8 f32 centroids.
            let values = vec![-1.5f32, -1.0, -0.5, -0.1, 0.1, 0.5, 1.0, 1.5];
            let codebook_len = values.len() as u8;

            let padded_dim = (dim as usize).next_power_of_two();
            let total_codes = num_rows * padded_dim;
            // Position-derived codes: deterministic and always inside the codebook.
            let codes = (0..total_codes)
                .map(|position| (position as u8) % codebook_len)
                .collect::<Vec<u8>>();

            let child = dict_vector_f32(padded_dim as u32, &codes, &values)?;
            let options = SorfOptions {
                dimensions: dim,
                element_ptype: PType::F32,
                num_rounds,
                seed,
            };
            let sorf = SorfTransform::try_new_array(&options, child, num_rows)?.into_array();
            Ok((sorf, codes, values, padded_dim))
        }

        /// Reference decoder used as ground truth for the fast paths.
        ///
        /// For each of the `num_rows` rows: look up the row's `padded_dim` codes in
        /// `values`, apply `SorfMatrix::inverse_rotate`, and keep the first `dim` outputs.
        /// The result is a flat vector of `num_rows * dim` values.
        fn decode_sorf_dict(
            codes: &[u8],
            values: &[f32],
            padded_dim: usize,
            dim: usize,
            num_rows: usize,
            seed: u64,
            num_rounds: u8,
        ) -> VortexResult<Vec<f32>> {
            let rotation = SorfMatrix::try_new_padded(padded_dim, num_rounds as usize, seed)?;
            // Scratch buffers reused across rows.
            let mut centroids = vec![0.0f32; padded_dim];
            let mut unrotated = vec![0.0f32; padded_dim];
            let mut decoded = Vec::with_capacity(num_rows * dim);
            for row in 0..num_rows {
                let start = row * padded_dim;
                let row_codes = &codes[start..start + padded_dim];
                for (slot, &code) in centroids.iter_mut().zip(row_codes) {
                    *slot = values[usize::from(code)];
                }
                rotation.inverse_rotate(&centroids, &mut unrotated);
                // Drop the padding lanes: only the leading `dim` values belong to the row.
                decoded.extend_from_slice(&unrotated[..dim]);
            }
            Ok(decoded)
        }

        /// Straightforward left-to-right dot product of two `f32` slices. If the lengths
        /// differ, the extra tail of the longer slice is ignored.
        fn naive_dot(a: &[f32], b: &[f32]) -> f32 {
            a.iter().zip(b).map(|(lhs, rhs)| lhs * rhs).sum()
        }

        // ---- Case 1: SorfTransform + Constant pull-through ----

        /// `constant_tensor_storage` must recognize a literal vector array and return
        /// storage that is still `Constant`-backed, has a non-nullable fixed-size-list
        /// dtype of width 3, and flattens back to the original elements.
        #[test]
        fn constant_tensor_storage_accepts_extension_scalar_literal() -> VortexResult<()> {
            let elements = [1.0f32, 2.0, 3.0];
            let literal = literal_vector_array(&elements, 5);

            let storage =
                constant_tensor_storage(&literal).expect("literal vector should be recognized");
            assert_eq!(storage.len(), 5);

            let constant = storage
                .as_opt::<Constant>()
                .expect("storage should remain constant-backed");
            let is_width_three_list = matches!(
                constant.scalar().dtype(),
                DType::FixedSizeList(_, 3, Nullability::NonNullable)
            );
            assert!(is_width_three_list);

            let mut ctx = SESSION.create_execution_ctx();
            let flat = extract_flat_elements(&storage, 3, &mut ctx)?;
            assert_eq!(flat.row::<f32>(0), &[1.0, 2.0, 3.0]);
            Ok(())
        }

        /// Case 1: SorfTransform operand on the left, constant query on the right, with
        /// `dim = 100` so that the padded dimension is strictly larger than `dim`.
        #[test]
        fn case1_sorf_lhs_constant_rhs_padded_gt_dim() -> VortexResult<()> {
            let (dim, num_rows, seed, num_rounds) = (100u32, 7usize, 42u64, 3u8);
            let row_width = dim as usize;
            let padded_dim = row_width.next_power_of_two();
            assert!(padded_dim > row_width, "test must exercise padding");

            let (sorf_lhs, codes, values, padded_dim_computed) =
                build_sorf_with_dict_child(dim, num_rows, seed, num_rounds)?;
            assert_eq!(padded_dim_computed, padded_dim);

            // The query carries exactly `dim` elements (no padding).
            let query_elems: Vec<f32> = (0..dim).map(|i| (i as f32 * 0.1).sin()).collect();
            let const_rhs = Vector::constant_array(&query_elems, num_rows)?;

            // Reference result: decode every row, then dot it with the query.
            let decoded = decode_sorf_dict(
                &codes, &values, padded_dim, row_width, num_rows, seed, num_rounds,
            )?;
            let expected: Vec<f32> = decoded
                .chunks_exact(row_width)
                .map(|row| naive_dot(row, &query_elems))
                .collect();

            let actual = eval_ip_f32(sorf_lhs, const_rhs, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-3);
            Ok(())
        }

        /// Case 1, mirrored operand order: constant query on the left, SorfTransform
        /// operand on the right.
        #[test]
        fn case1_constant_lhs_sorf_rhs_mirrored() -> VortexResult<()> {
            let (dim, num_rows, seed, num_rounds) = (100u32, 5usize, 7u64, 3u8);
            let row_width = dim as usize;

            let (sorf, codes, values, padded_dim) =
                build_sorf_with_dict_child(dim, num_rows, seed, num_rounds)?;

            let query_elems: Vec<f32> = (0..dim).map(|i| (i as f32 * 0.2).cos()).collect();
            let const_lhs = Vector::constant_array(&query_elems, num_rows)?;

            // Reference result: decode every row, then dot it with the query.
            let decoded = decode_sorf_dict(
                &codes, &values, padded_dim, row_width, num_rows, seed, num_rounds,
            )?;
            let expected: Vec<f32> = decoded
                .chunks_exact(row_width)
                .map(|row| naive_dot(row, &query_elems))
                .collect();

            let actual = eval_ip_f32(const_lhs, sorf, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-3);
            Ok(())
        }

        /// Case 1 with a power-of-two dimension (`dim = 128`), so the padded dimension
        /// equals `dim` and no padding lanes exist.
        #[test]
        fn case1_padded_equals_dim() -> VortexResult<()> {
            let (dim, num_rows, seed, num_rounds) = (128u32, 4usize, 11u64, 3u8);
            let row_width = dim as usize;

            let (sorf, codes, values, padded_dim) =
                build_sorf_with_dict_child(dim, num_rows, seed, num_rounds)?;
            assert_eq!(padded_dim, dim as usize);

            let query_elems: Vec<f32> = (0..dim).map(|i| i as f32 * 0.01 - 0.5).collect();
            let const_rhs = Vector::constant_array(&query_elems, num_rows)?;

            // Reference result: decode every row, then dot it with the query.
            let decoded = decode_sorf_dict(
                &codes, &values, padded_dim, row_width, num_rows, seed, num_rounds,
            )?;
            let expected: Vec<f32> = decoded
                .chunks_exact(row_width)
                .map(|row| naive_dot(row, &query_elems))
                .collect();

            let actual = eval_ip_f32(sorf, const_rhs, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-3);
            Ok(())
        }

        /// Case 1 with zero rows: evaluating the inner product must succeed and yield an
        /// empty result.
        #[test]
        fn case1_empty_len_zero() -> VortexResult<()> {
            let (dim, num_rows, seed, num_rounds) = (100u32, 0usize, 42u64, 3u8);

            // Only the array itself is needed; the codes/values/padded_dim are unused here.
            let (sorf, ..) = build_sorf_with_dict_child(dim, num_rows, seed, num_rounds)?;

            let zero_query: Vec<f32> = vec![0.0; dim as usize];
            let const_rhs = Vector::constant_array(&zero_query, num_rows)?;

            let actual = eval_ip_f32(sorf, const_rhs, num_rows)?;
            assert_eq!(actual.len(), 0);
            Ok(())
        }

        // ---- Case 2: Dict + Constant direct-lookup path ----

        /// Case 2: dictionary-encoded vector (`u8` codes, `f32` values) on the left,
        /// constant query on the right; the result must match a hand-rolled lookup + dot.
        #[test]
        fn case2_dict_lhs_constant_rhs_matches_naive() -> VortexResult<()> {
            let list_size: u32 = 8;
            let num_rows = 10usize;
            let row_width = list_size as usize;

            // Eight centroids; codes cycle through them deterministically.
            let values: Vec<f32> = vec![-1.0, -0.5, -0.25, -0.1, 0.1, 0.25, 0.5, 1.0];
            let codebook_len = values.len() as u8;
            let codes: Vec<u8> = (0..num_rows * row_width)
                .map(|i| (i as u8) % codebook_len)
                .collect();
            let dict_lhs = dict_vector_f32(list_size, &codes, &values)?;

            let query: Vec<f32> = (0..list_size).map(|i| (i as f32 + 1.0) * 0.3).collect();
            let const_rhs = Vector::constant_array(&query, num_rows)?;

            // Sequential accumulation starting from 0.0, one row of codes at a time.
            let expected: Vec<f32> = codes
                .chunks_exact(row_width)
                .map(|row_codes| {
                    row_codes
                        .iter()
                        .zip(&query)
                        .fold(0.0f32, |acc, (&code, &q)| acc + q * values[code as usize])
                })
                .collect();

            let actual = eval_ip_f32(dict_lhs, const_rhs, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-5);
            Ok(())
        }

        /// Case 2, mirrored operand order: constant query on the left, dictionary-encoded
        /// vector on the right.
        #[test]
        fn case2_constant_lhs_dict_rhs_mirrored() -> VortexResult<()> {
            let list_size: u32 = 4;
            let num_rows = 6usize;
            let row_width = list_size as usize;

            let values: Vec<f32> = vec![0.1, 0.4, 0.7, 1.0];
            let codebook_len = values.len() as u8;
            let codes: Vec<u8> = (0..num_rows * row_width)
                .map(|i| ((i * 3) as u8) % codebook_len)
                .collect();
            let dict_rhs = dict_vector_f32(list_size, &codes, &values)?;

            let query: Vec<f32> = vec![0.5, -1.0, 2.5, -0.25];
            let const_lhs = Vector::constant_array(&query, num_rows)?;

            // Sequential accumulation starting from 0.0, one row of codes at a time.
            let expected: Vec<f32> = codes
                .chunks_exact(row_width)
                .map(|row_codes| {
                    row_codes
                        .iter()
                        .zip(&query)
                        .fold(0.0f32, |acc, (&code, &q)| acc + q * values[code as usize])
                })
                .collect();

            let actual = eval_ip_f32(const_lhs, dict_rhs, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-5);
            Ok(())
        }

        /// Case 2: a dictionary with `u16` codes and a 300-entry codebook must still produce
        /// the correct inner product. Per the original note, the direct-lookup path only
        /// handles `u8` codes today, so this input is expected to take the standard path.
        ///
        /// The codes stride through the codebook so that some of them exceed `u8::MAX`;
        /// otherwise the test would pass even if the codes were silently narrowed to `u8`.
        #[test]
        fn case2_u16_codes_falls_through() -> VortexResult<()> {
            let list_size: u32 = 4;
            let num_rows = 3usize;
            let num_values = 300usize;
            let row_width = list_size as usize;
            let values: Vec<f32> = (0..num_values).map(|i| i as f32 * 0.01).collect();

            // Codes must be u16 because 300 > 255. dict_vector_f32 only supports u8 so we
            // build the dict by hand here. A stride of 37 spreads the codes over the whole
            // codebook (0, 37, ..., 259, 296, 33, ...).
            let codes_u16: Vec<u16> = (0..num_rows * row_width)
                .map(|i| ((i * 37) % num_values) as u16)
                .collect();
            assert!(
                codes_u16.iter().any(|&code| code > u16::from(u8::MAX)),
                "fixture must contain codes that do not fit in u8"
            );

            let codes_arr =
                PrimitiveArray::new::<u16>(Buffer::copy_from(&codes_u16), Validity::NonNullable)
                    .into_array();
            let values_arr =
                PrimitiveArray::new::<f32>(Buffer::copy_from(&values), Validity::NonNullable)
                    .into_array();
            let dict = DictArray::try_new(codes_arr, values_arr)?;
            let fsl = FixedSizeListArray::try_new(
                dict.into_array(),
                list_size,
                Validity::NonNullable,
                num_rows,
            )?;
            let dict_lhs = Vector::try_new_vector_array(fsl.into_array())?;

            let query: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
            let const_rhs = Vector::constant_array(&query, num_rows)?;

            // Expected values are decoded from the very codes that were stored, so the
            // fixture and the reference cannot drift apart.
            let expected: Vec<f32> = codes_u16
                .chunks_exact(row_width)
                .map(|row_codes| {
                    row_codes
                        .iter()
                        .zip(&query)
                        .fold(0.0f32, |acc, (&code, &q)| acc + q * values[usize::from(code)])
                })
                .collect();

            // Row sums now reach ~22, so allow for a few f32 ulps of reassociation noise
            // (same tolerance as the other random case 2 tests).
            let actual = eval_ip_f32(dict_lhs, const_rhs, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-4);
            Ok(())
        }

        /// Case 2 negative control: a plain (non-dictionary) vector with a constant query
        /// must still evaluate to the naive dot product of each row with the query.
        #[test]
        fn case2_plain_fsl_falls_through() -> VortexResult<()> {
            let dim: u32 = 4;
            let num_rows = 3usize;
            let row_width = dim as usize;

            let lhs_elems: Vec<f32> = (0..num_rows * row_width)
                .map(|i| i as f32 * 0.25)
                .collect();
            let plain_lhs = vector_array(dim, &lhs_elems)?;

            let query: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
            let const_rhs = Vector::constant_array(&query, num_rows)?;

            let expected: Vec<f32> = lhs_elems
                .chunks_exact(row_width)
                .map(|row| naive_dot(row, &query))
                .collect();

            let actual = eval_ip_f32(plain_lhs, const_rhs, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-5);
            Ok(())
        }

        /// Case 2 with zero rows: a dictionary vector built from an empty codes buffer must
        /// evaluate to an empty result.
        #[test]
        fn case2_empty_len_zero() -> VortexResult<()> {
            let list_size: u32 = 4;
            let num_rows = 0usize;

            let values = [0.0f32, 1.0, 2.0, 3.0];
            let no_codes: [u8; 0] = [];
            let dict_lhs = dict_vector_f32(list_size, &no_codes, &values)?;

            let zero_query: Vec<f32> = vec![0.0; 4];
            let const_rhs = Vector::constant_array(&zero_query, num_rows)?;

            let actual = eval_ip_f32(dict_lhs, const_rhs, num_rows)?;
            assert_eq!(actual.len(), 0);
            Ok(())
        }

        /// End-to-end check over a SorfTransform-wrapped dictionary column with a constant
        /// query. Per the original description this is meant to hit Case 1 and then Case 2
        /// through recursive execution; the test itself only verifies the numeric result
        /// against a full decode + naive dot.
        #[test]
        fn end_to_end_sorf_plus_dict_cosine_path() -> VortexResult<()> {
            let (dim, num_rows, seed, num_rounds) = (100u32, 9usize, 99u64, 3u8);
            let row_width = dim as usize;

            let (sorf, codes, values, padded_dim) =
                build_sorf_with_dict_child(dim, num_rows, seed, num_rounds)?;

            let query_elems: Vec<f32> = (0..dim).map(|i| ((i as f32) * 0.15).sin() * 0.4).collect();
            let const_rhs = Vector::constant_array(&query_elems, num_rows)?;

            // Ground truth via full decode + naive dot.
            let decoded = decode_sorf_dict(
                &codes, &values, padded_dim, row_width, num_rows, seed, num_rounds,
            )?;
            let expected: Vec<f32> = decoded
                .chunks_exact(row_width)
                .map(|row| naive_dot(row, &query_elems))
                .collect();

            let actual = eval_ip_f32(sorf, const_rhs, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-3);
            Ok(())
        }

        // ---- Additional correctness / stress tests (all with loose tolerances) ----

        /// A tiny in-place xorshift64 PRNG so these tests don't depend on `rand`. Producing
        /// deterministic pseudo-random f32 values lets the correctness checks exercise
        /// realistic data instead of smooth sin/cos patterns.
        ///
        /// The wrapped `u64` is the generator state and is never zero after construction.
        struct XorShift64(u64);

        impl XorShift64 {
            /// Constant added to the caller's seed so that small seeds (0, 1, 2, ...) still
            /// start from a well-mixed state.
            const SEED_MIX: u64 = 0x9E37_79B9_7F4A_7C15;

            /// Create a generator from `seed`.
            ///
            /// Xorshift has a fixed point at state 0 (it would emit zeros forever). Exactly
            /// one seed maps to that state after mixing; it is redirected to `SEED_MIX`.
            /// Every other seed produces the same stream as before.
            fn new(seed: u64) -> Self {
                let state = seed.wrapping_add(Self::SEED_MIX);
                Self(if state == 0 { Self::SEED_MIX } else { state })
            }

            /// Advance the state with the 13 / 7 / 17 shift triple and return the new state.
            fn next_u64(&mut self) -> u64 {
                let mut x = self.0;
                x ^= x << 13;
                x ^= x >> 7;
                x ^= x << 17;
                self.0 = x;
                x
            }

            /// Uniform f32 in `[-1.0, 1.0)`.
            fn next_f32(&mut self) -> f32 {
                // Top 24 bits -> exactly representable value in [0, 1), then map to [-1, 1).
                let bits = (self.next_u64() >> 40) as u32; // 24 bits
                (bits as f32) / (1u32 << 24) as f32 * 2.0 - 1.0
            }
        }

        /// Case 2 stress: `u8`-coded dictionary with 200 pseudo-random centroids, far more
        /// than the eight-entry codebooks of the basic tests. The original note says such
        /// codebooks were formerly blocked by a size gate and must now be handled by the
        /// direct-lookup path; the test verifies the numeric result only.
        #[test]
        fn case2_large_u8_codebook_direct_lookup() -> VortexResult<()> {
            let list_size: u32 = 16;
            let num_rows = 20usize;
            let num_centroids = 200usize;
            let row_width = list_size as usize;
            assert!(num_centroids > 8 && num_centroids <= 256);

            // Draw order matters for determinism: centroids, then codes, then the query.
            let mut rng = XorShift64::new(0xDEAD_BEEF);
            let values: Vec<f32> = (0..num_centroids).map(|_| rng.next_f32()).collect();
            let codes: Vec<u8> = (0..num_rows * row_width)
                .map(|_| (rng.next_u64() % num_centroids as u64) as u8)
                .collect();

            let dict_lhs = dict_vector_f32(list_size, &codes, &values)?;
            let query: Vec<f32> = (0..list_size).map(|_| rng.next_f32()).collect();
            let const_rhs = Vector::constant_array(&query, num_rows)?;

            // Sequential accumulation starting from 0.0, one row of codes at a time.
            let expected: Vec<f32> = codes
                .chunks_exact(row_width)
                .map(|row_codes| {
                    row_codes
                        .iter()
                        .zip(&query)
                        .fold(0.0f32, |acc, (&code, &q)| acc + q * values[code as usize])
                })
                .collect();

            let actual = eval_ip_f32(dict_lhs, const_rhs, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-4);
            Ok(())
        }

        /// Parameterized sweep over `InnerProduct(SorfTransform(dict vector), constant)`
        /// for a mix of dimensions (padded and power-of-two), row counts, seeds, and SORF
        /// round counts. Each case is compared against a full decode + naive dot with a
        /// deliberately loose tolerance, since both sides go through f32 rotations.
        #[rstest]
        #[case::small_no_pad(128, 11, 1, 1)]
        #[case::small_no_pad_rounds3(128, 23, 1_234, 3)]
        #[case::small_padded(100, 17, 42, 3)]
        #[case::mid_padded(200, 13, 2024, 3)]
        #[case::mid_power_of_two(256, 31, 7, 3)]
        #[case::larger_padded(300, 9, 99, 3)]
        #[case::max_rounds(128, 5, 31_415, 5)]
        fn case1_sorf_random_sweep(
            #[case] dim: u32,
            #[case] num_rows: usize,
            #[case] seed: u64,
            #[case] num_rounds: u8,
        ) -> VortexResult<()> {
            let row_width = dim as usize;
            let (sorf, codes, values, padded_dim) =
                build_sorf_with_dict_child(dim, num_rows, seed, num_rounds)?;

            // Pseudo-random query with mixed signs, so the dot products involve
            // cancellation rather than a monotone sum.
            let mut rng = XorShift64::new(seed ^ 0xABCD_1234);
            let query: Vec<f32> = (0..dim).map(|_| rng.next_f32()).collect();
            let const_rhs = Vector::constant_array(&query, num_rows)?;

            let decoded = decode_sorf_dict(
                &codes, &values, padded_dim, row_width, num_rows, seed, num_rounds,
            )?;
            let expected: Vec<f32> = decoded
                .chunks_exact(row_width)
                .map(|row| naive_dot(row, &query))
                .collect();

            // Loose tolerance: the evaluated path and the decoded reference round
            // differently in f32 even though they agree in exact arithmetic.
            let actual = eval_ip_f32(sorf, const_rhs, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-2);
            Ok(())
        }

        /// Parameterized sweep over a plain dictionary vector (`u8` codes, `f32` values)
        /// with a constant query and no SorfTransform involved, across several list sizes,
        /// row counts, and codebook sizes up to 250 centroids.
        #[rstest]
        #[case::small(4, 7, 8)]
        #[case::medium(16, 50, 64)]
        #[case::larger(32, 100, 150)]
        #[case::very_large_codebook(8, 25, 250)]
        fn case2_random_sweep(
            #[case] list_size: u32,
            #[case] num_rows: usize,
            #[case] num_centroids: usize,
        ) -> VortexResult<()> {
            assert!(num_centroids <= 256, "u8 codes cap at 256 centroids");
            let row_width = list_size as usize;

            // Draw order matters for determinism: centroids, then codes, then the query.
            let mut rng = XorShift64::new(u64::from(list_size) * 31 + num_rows as u64);
            let values: Vec<f32> = (0..num_centroids).map(|_| rng.next_f32()).collect();
            let codes: Vec<u8> = (0..num_rows * row_width)
                .map(|_| (rng.next_u64() % num_centroids as u64) as u8)
                .collect();

            let dict_lhs = dict_vector_f32(list_size, &codes, &values)?;
            let query: Vec<f32> = (0..list_size).map(|_| rng.next_f32()).collect();
            let const_rhs = Vector::constant_array(&query, num_rows)?;

            // Sequential accumulation starting from 0.0, one row of codes at a time.
            let expected: Vec<f32> = codes
                .chunks_exact(row_width)
                .map(|row_codes| {
                    row_codes
                        .iter()
                        .zip(&query)
                        .fold(0.0f32, |acc, (&code, &q)| acc + q * values[code as usize])
                })
                .collect();

            // No rotation is involved here, so only float reassociation can separate the
            // two results; hence the tighter tolerance than the SORF sweeps.
            let actual = eval_ip_f32(dict_lhs, const_rhs, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-4);
            Ok(())
        }

        /// End-to-end regression for a plausible vector-search configuration: 3 SORF
        /// rounds, `dim = 128`, 64 rows, `u8` codes over 64 pseudo-random centroids.
        /// The evaluated result must stay within 1e-2 absolute error of a fully naive
        /// computation, and within 1e-3 relative error (denominator clamped to >= 1).
        #[test]
        fn end_to_end_dim128_rows64_bit6_regression() -> VortexResult<()> {
            let dim: u32 = 128;
            let num_rows = 64usize;
            let seed: u64 = 0xFACE_F00D;
            let num_rounds = 3u8;
            let row_width = dim as usize;

            // 64 centroids correspond to 6-bit codes.
            let num_centroids = 64usize;
            let padded_dim = row_width.next_power_of_two();

            // Draw order matters for determinism: centroids, then codes, then the query.
            let mut rng = XorShift64::new(seed);
            let values: Vec<f32> = (0..num_centroids).map(|_| rng.next_f32()).collect();
            let codes: Vec<u8> = (0..num_rows * padded_dim)
                .map(|_| (rng.next_u64() % num_centroids as u64) as u8)
                .collect();

            let child = dict_vector_f32(padded_dim as u32, &codes, &values)?;
            let options = SorfOptions {
                dimensions: dim,
                element_ptype: PType::F32,
                num_rounds,
                seed,
            };
            let sorf = SorfTransform::try_new_array(&options, child, num_rows)?.into_array();

            let query: Vec<f32> = (0..dim).map(|_| rng.next_f32()).collect();
            let const_rhs = Vector::constant_array(&query, num_rows)?;

            let decoded = decode_sorf_dict(
                &codes, &values, padded_dim, row_width, num_rows, seed, num_rounds,
            )?;
            let expected: Vec<f32> = decoded
                .chunks_exact(row_width)
                .map(|row| naive_dot(row, &query))
                .collect();

            let actual = eval_ip_f32(sorf, const_rhs, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-2);

            // Second pass: bound the relative error as well. Clamping the denominator to
            // at least 1.0 keeps near-zero expected values from inflating the ratio.
            for (i, (a, e)) in actual.iter().zip(expected.iter()).enumerate() {
                let rel = (a - e).abs() / e.abs().max(1.0);
                if rel < 1e-3 {
                    continue;
                }
                panic!("row {i}: rel err {rel} too large (a={a}, e={e})");
            }
            Ok(())
        }

        /// SorfTransform + constant query for every round count from 1 to 5, so that a
        /// round-indexing mistake that happens to be invisible at the 3-round setting used
        /// elsewhere would still be caught.
        #[rstest]
        #[case(1)]
        #[case(2)]
        #[case(3)]
        #[case(4)]
        #[case(5)]
        fn case1_various_num_rounds(#[case] num_rounds: u8) -> VortexResult<()> {
            let dim: u32 = 128;
            let num_rows = 8usize;
            let seed: u64 = 0x1234_5678;
            let row_width = dim as usize;

            let (sorf, codes, values, padded_dim) =
                build_sorf_with_dict_child(dim, num_rows, seed, num_rounds)?;

            // A different query per round count, derived from the seed.
            let mut rng = XorShift64::new(seed ^ u64::from(num_rounds));
            let query: Vec<f32> = (0..dim).map(|_| rng.next_f32()).collect();
            let const_rhs = Vector::constant_array(&query, num_rows)?;

            let decoded = decode_sorf_dict(
                &codes, &values, padded_dim, row_width, num_rows, seed, num_rounds,
            )?;
            let expected: Vec<f32> = decoded
                .chunks_exact(row_width)
                .map(|row| naive_dot(row, &query))
                .collect();

            let actual = eval_ip_f32(sorf, const_rhs, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-2);
            Ok(())
        }

        /// Full tree with the operands swapped: constant query on the left and the
        /// SorfTransform-wrapped dictionary column on the right. The result must match the
        /// same decode + naive dot reference used for the unswapped order.
        #[test]
        fn end_to_end_constant_lhs_sorf_rhs_mirrored() -> VortexResult<()> {
            let dim: u32 = 256;
            let num_rows = 12usize;
            let seed: u64 = 0xBEEF_CAFE;
            let num_rounds = 3u8;
            let row_width = dim as usize;

            let (sorf, codes, values, padded_dim) =
                build_sorf_with_dict_child(dim, num_rows, seed, num_rounds)?;

            let mut rng = XorShift64::new(seed);
            let query: Vec<f32> = (0..dim).map(|_| rng.next_f32()).collect();
            let const_lhs = Vector::constant_array(&query, num_rows)?;

            let decoded = decode_sorf_dict(
                &codes, &values, padded_dim, row_width, num_rows, seed, num_rounds,
            )?;
            let expected: Vec<f32> = decoded
                .chunks_exact(row_width)
                .map(|row| naive_dot(row, &query))
                .collect();

            let actual = eval_ip_f32(const_lhs, sorf, num_rows)?;
            assert_close_f32(&actual, &expected, 1e-2);
            Ok(())
        }
    }
}