bitnet-quantize 0.2.1

//! CubeCL GPU kernels for BitNet ternary quantization.
//!
//! This module provides GPU-accelerated implementations of BitNet operations:
//! - AbsMean quantization: weights → ternary {-1, 0, +1}
//! - Ternary dequantization: ternary → float
//! - Ternary matmul: exploits {-1, 0, +1} sparsity for fast multiply-accumulate
//! - Packed ternary matmul: memory-efficient with 2 bits per trit
//! - BitLinear forward: fused LayerNorm + ternary matmul
//!
//! ## BitNet b1.58 Background
//!
//! BitNet quantization:
//! 1. Compute scale: α = mean(|W|) per group
//! 2. Quantize: W_q = round(W / α) clamped to {-1, 0, +1}
//! 3. Inference: Y = X @ W_q * α (or fused)
//!
//! The ternary matmul exploits the fact that multiplication by {-1, 0, +1}
//! is just negation, zero, or identity - no actual multiply needed.

#![cfg(feature = "cuda")]

use candle_core::{DType, Device, Tensor};
use cubecl::prelude::*;

use crate::config::BitNetConfig;
use crate::error::BitNetError;
use crate::quantization::TernaryWeight;

// ============================================================================
// Constants
// ============================================================================

/// Default block size for kernel launches
#[allow(dead_code)]
const BLOCK_SIZE: u32 = 256;

/// Tile size for tiled matmul
const TILE_SIZE: u32 = 32;

/// Maximum shared memory elements (conservative for compatibility)
const MAX_SHARED_ELEMENTS: u32 = 1024;

// ============================================================================
// CubeCL Kernel Definitions
// ============================================================================

/// AbsMean quantization kernel.
///
/// Computes: scale = mean(|weight|) per row, then quantizes to {-1, 0, +1}.
///
/// Each block processes one row of the weight matrix:
/// - Phase 1: Parallel reduction to compute sum of absolute values
/// - Phase 2: Parallel quantization using computed scale
///
/// Output format: i32 where each value is -1, 0, or +1
#[cube(launch)]
fn absmean_quantize_kernel<F: Float>(
    weight: &Array<F>,          // Input weights [out_features * in_features]
    quantized: &mut Array<i32>, // Output ternary values
    scales: &mut Array<F>,      // Scale per row [out_features]
    in_features: u32,
) {
    let row = CUBE_POS_X;
    let tid = UNIT_POS_X;
    let block_size = CUBE_DIM_X;

    let row_start = row * in_features;

    // Shared memory for reduction (fixed size for CubeCL)
    let mut shared_sum = SharedMemory::<F>::new(256usize);

    // Phase 1: Compute sum of absolute values via parallel reduction
    let mut local_sum = F::new(0.0);
    let mut i = tid;
    while i < in_features {
        let val = weight[(row_start + i) as usize];
        // Manual abs: val < 0 ? -val : val
        let abs_val = select(val < F::new(0.0), F::new(0.0) - val, val);
        local_sum = local_sum + abs_val;
        i = i + block_size;
    }

    // Store local sum for reduction
    shared_sum[tid as usize] = local_sum;
    sync_cube();

    // Tree reduction for sum
    let mut stride: u32 = 128;
    while stride > 0 {
        if tid < stride && (tid + stride) < block_size {
            shared_sum[tid as usize] =
                shared_sum[tid as usize] + shared_sum[(tid + stride) as usize];
        }
        sync_cube();
        stride = stride / 2;
    }

    // Compute and store scale (absmean)
    // scale = sum / in_features
    let scale = shared_sum[0usize] / F::cast_from(in_features);

    if tid == 0 {
        scales[row as usize] = scale;
    }

    // Compute inverse scale for quantization (avoid division in loop)
    let eps = F::new(1e-8);
    let inv_scale = F::new(1.0) / (scale + eps);

    sync_cube();

    // Phase 2: Quantize each element to ternary
    i = tid;
    while i < in_features {
        let val = weight[(row_start + i) as usize] * inv_scale;

        // Round to nearest integer, then clamp to {-1, 0, +1}
        // round(val) = floor(val + 0.5) for positive, ceil(val - 0.5) for negative
        let rounded = F::floor(val + F::new(0.5));

        // Clamp to {-1, 0, +1}
        let clamped: i32 = select(
            rounded < F::new(-0.5),
            -1,
            select(rounded > F::new(0.5), 1, 0),
        );

        quantized[(row_start + i) as usize] = clamped;
        i = i + block_size;
    }
}

/// Ternary dequantization kernel.
///
/// Converts ternary values back to float: output = ternary * scale
///
/// Each thread processes one element independently.
#[cube(launch)]
fn ternary_dequantize_kernel<F: Float>(
    ternary: &Array<i32>,
    scales: &Array<F>,
    output: &mut Array<F>,
    in_features: u32,
    num_elements: u32,
) {
    let idx = ABSOLUTE_POS as u32;

    if idx >= num_elements {
        terminate!();
    }

    let row = idx / in_features;
    let scale = scales[row as usize];
    let trit = ternary[idx as usize];

    // Convert trit (-1, 0, +1) to float and multiply by scale
    // Using conditionals since CubeCL doesn't have direct i32->float cast
    let trit_f: F = select(
        trit == 1,
        F::new(1.0),
        select(trit == -1, F::new(-1.0), F::new(0.0)),
    );

    output[idx as usize] = trit_f * scale;
}

/// Optimized ternary matrix multiplication kernel.
///
/// Exploits the fact that weights are only {-1, 0, +1}:
/// - Weight = -1: negate input (subtract)
/// - Weight =  0: skip (don't accumulate)
/// - Weight = +1: add input as-is
///
/// This eliminates multiplication operations entirely!
///
/// Grid: (ceil(out_features/TILE), 1, batch_size)
/// Block: (TILE, 1, 1)
#[cube(launch)]
fn ternary_matmul_kernel<F: Float>(
    input: &Array<F>,      // [batch_size * in_features]
    weights: &Array<i32>,  // Ternary [out_features * in_features]
    scales: &Array<F>,     // [out_features]
    output: &mut Array<F>, // [batch_size * out_features]
    batch_size: u32,
    in_features: u32,
    out_features: u32,
) {
    let batch_idx = CUBE_POS_Z;
    let out_tile = CUBE_POS_X;
    let out_local = UNIT_POS_X;
    let out_idx = out_tile * TILE_SIZE + out_local;

    if batch_idx >= batch_size || out_idx >= out_features {
        terminate!();
    }

    let input_base = batch_idx * in_features;
    let weight_base = out_idx * in_features;

    // Shared memory for input tile (collaborative loading)
    let mut input_tile = SharedMemory::<F>::new(TILE_SIZE as usize);

    let mut acc = F::new(0.0);

    // Process input in tiles for better cache utilization
    let num_tiles = (in_features + TILE_SIZE - 1) / TILE_SIZE;

    for tile in 0..num_tiles {
        let tile_start = tile * TILE_SIZE;

        // Collaborative load of input tile
        let in_idx = tile_start + out_local;
        if in_idx < in_features {
            input_tile[out_local as usize] = input[(input_base + in_idx) as usize];
        } else {
            input_tile[out_local as usize] = F::new(0.0);
        }
        sync_cube();

        // Compute with ternary weights - no multiplication needed!
        for i in 0u32..TILE_SIZE {
            let global_in_idx = tile_start + i;
            if global_in_idx < in_features {
                let trit = weights[(weight_base + global_in_idx) as usize];
                let x = input_tile[i as usize];

                // Ternary multiply-accumulate without multiplication
                // trit == +1: add x
                // trit == -1: subtract x
                // trit ==  0: no-op (skip)
                acc = select(trit == 1, acc + x, select(trit == -1, acc - x, acc));
            }
        }
        sync_cube();
    }

    // Apply scale factor
    let scale = scales[out_idx as usize];
    output[(batch_idx * out_features + out_idx) as usize] = acc * scale;
}

/// Packed ternary matrix multiplication kernel.
///
/// Uses packed 2-bit representation: 16 trits per u32
/// Encoding: 00 = 0, 01 = +1, 10 = -1
///
/// This provides 2x memory compression vs i8 storage.
#[cube(launch)]
fn packed_ternary_matmul_kernel<F: Float>(
    input: &Array<F>,
    packed_weights: &Array<u32>, // 16 trits per u32 (2 bits each)
    scales: &Array<F>,
    output: &mut Array<F>,
    batch_size: u32,
    in_features: u32,
    out_features: u32,
) {
    let batch_idx = CUBE_POS_Z;
    let out_tile = CUBE_POS_X;
    let out_local = UNIT_POS_X;
    let out_idx = out_tile * TILE_SIZE + out_local;

    if batch_idx >= batch_size || out_idx >= out_features {
        terminate!();
    }

    let input_base = batch_idx * in_features;
    let packed_per_row = (in_features + 15) / 16; // 16 trits per u32 (2 bits each)
    let weight_base = out_idx * packed_per_row;

    let mut acc = F::new(0.0);

    // Process packed weights
    for pack_idx in 0u32..packed_per_row {
        let packed = packed_weights[(weight_base + pack_idx) as usize];

        // Unpack 16 trits (2 bits each)
        // Encoding: 00 = 0, 01 = +1, 10 = -1
        for i in 0u32..16u32 {
            let in_idx = pack_idx * 16 + i;
            if in_idx < in_features {
                // Extract 2-bit trit: (packed >> (i * 2)) & 0x3
                let shift = i * 2;
                let trit_bits = (packed >> shift) & 0x3u32;
                let x = input[(input_base + in_idx) as usize];

                // Decode and accumulate
                // 01 (+1): add
                // 10 (-1): subtract
                // 00 (0): skip
                acc = select(
                    trit_bits == 1u32,
                    acc + x,
                    select(trit_bits == 2u32, acc - x, acc),
                );
            }
        }
    }

    let scale = scales[out_idx as usize];
    output[(batch_idx * out_features + out_idx) as usize] = acc * scale;
}

/// BitLinear forward pass kernel.
///
/// Fuses: LayerNorm + ternary matmul for maximum efficiency.
///
/// BitLinear: Y = LayerNorm(X) @ Q(W) * scale_w
/// where Q(W) is ternary quantized weights.
///
/// Note: This kernel is optimized for moderate sequence lengths.
/// For very large batches, consider using separate LayerNorm + matmul.
#[cube(launch)]
fn bitlinear_forward_kernel<F: Float>(
    input: &Array<F>,         // [batch_size * in_features]
    weights: &Array<i32>,     // Ternary [out_features * in_features]
    weight_scales: &Array<F>, // [out_features]
    ln_weight: &Array<F>,     // LayerNorm weight [in_features]
    ln_bias: &Array<F>,       // LayerNorm bias [in_features]
    output: &mut Array<F>,
    batch_size: u32,
    in_features: u32,
    out_features: u32,
) {
    // Each block handles one (batch, out_idx) pair
    let batch_idx = CUBE_POS_Y;
    let out_tile = CUBE_POS_X;
    let tid = UNIT_POS_X;
    let block_size = CUBE_DIM_X;
    let out_idx = out_tile * TILE_SIZE + (tid % TILE_SIZE);

    if batch_idx >= batch_size || out_idx >= out_features {
        terminate!();
    }

    let input_base = batch_idx * in_features;

    // Shared memory for reductions and normalized input
    let mut shared = SharedMemory::<F>::new(256usize);
    let mut normed_cache = SharedMemory::<F>::new(MAX_SHARED_ELEMENTS as usize);

    // ========================================
    // Step 1: LayerNorm - compute mean
    // ========================================
    let mut local_sum = F::new(0.0);
    let mut i = tid;
    while i < in_features {
        local_sum = local_sum + input[(input_base + i) as usize];
        i = i + block_size;
    }
    shared[tid as usize] = local_sum;
    sync_cube();

    // Tree reduction for mean
    let mut stride: u32 = 128;
    while stride > 0 {
        if tid < stride && (tid + stride) < block_size {
            shared[tid as usize] = shared[tid as usize] + shared[(tid + stride) as usize];
        }
        sync_cube();
        stride = stride / 2;
    }
    let mean = shared[0usize] / F::cast_from(in_features);
    sync_cube();

    // ========================================
    // Step 2: LayerNorm - compute variance
    // ========================================
    local_sum = F::new(0.0);
    i = tid;
    while i < in_features {
        let diff = input[(input_base + i) as usize] - mean;
        local_sum = local_sum + diff * diff;
        i = i + block_size;
    }
    shared[tid as usize] = local_sum;
    sync_cube();

    // Tree reduction for variance
    stride = 128;
    while stride > 0 {
        if tid < stride && (tid + stride) < block_size {
            shared[tid as usize] = shared[tid as usize] + shared[(tid + stride) as usize];
        }
        sync_cube();
        stride = stride / 2;
    }
    let var = shared[0usize] / F::cast_from(in_features);
    let eps = F::new(1e-5);
    let inv_std = F::new(1.0) / F::sqrt(var + eps);
    sync_cube();

    // ========================================
    // Step 3: Apply LayerNorm and cache normalized values
    // ========================================
    // For efficiency, only cache up to MAX_SHARED_ELEMENTS
    // Beyond that, we recompute on-the-fly
    i = tid;
    while i < in_features && i < MAX_SHARED_ELEMENTS {
        let norm = (input[(input_base + i) as usize] - mean) * inv_std;
        normed_cache[i as usize] = norm * ln_weight[i as usize] + ln_bias[i as usize];
        i = i + block_size;
    }
    sync_cube();

    // ========================================
    // Step 4: Ternary matmul with normalized input
    // ========================================
    let weight_base = out_idx * in_features;
    let mut acc = F::new(0.0);

    i = 0;
    while i < in_features {
        // Get normalized input value
        let normed_val: F = if i < MAX_SHARED_ELEMENTS {
            normed_cache[i as usize]
        } else {
            // Recompute for values beyond shared memory cache
            let norm = (input[(input_base + i) as usize] - mean) * inv_std;
            norm * ln_weight[i as usize] + ln_bias[i as usize]
        };

        let trit = weights[(weight_base + i) as usize];

        // Ternary accumulate
        acc = select(
            trit == 1,
            acc + normed_val,
            select(trit == -1, acc - normed_val, acc),
        );
        i = i + 1;
    }

    // Apply weight scale
    let scale = weight_scales[out_idx as usize];

    // Only first thread per output writes (avoid race conditions)
    if tid % TILE_SIZE == out_idx % TILE_SIZE {
        output[(batch_idx * out_features + out_idx) as usize] = acc * scale;
    }
}

// ============================================================================
// Launch Wrappers
// ============================================================================

/// Check if CubeCL CUDA runtime is available.
#[must_use]
pub fn has_cuda_support() -> bool {
    matches!(Device::cuda_if_available(0), Ok(Device::Cuda(_)))
}

/// GPU-accelerated AbsMean quantization.
///
/// Quantizes a weight tensor to ternary {-1, 0, +1} using AbsMean scaling.
///
/// # Arguments
///
/// * `weight` - Input weight tensor [out_features, in_features]
///
/// # Returns
///
/// Tuple of (quantized tensor [out, in] as i32, scales [out])
///
/// # Errors
///
/// Returns error if tensor is not on CUDA device.
pub fn absmean_quantize(weight: &Tensor) -> std::result::Result<(Tensor, Tensor), BitNetError> {
    if !weight.device().is_cuda() {
        return Err(BitNetError::FeatureNotAvailable(
            "absmean_quantize requires CUDA device".into(),
        ));
    }

    let (out_features, in_features) = weight.dims2()?;
    let device = weight.device();

    // For CubeCL kernel, we need raw data access
    // Fall back to CPU computation with GPU tensor creation for now
    // TODO: Implement full CubeCL launch when cubecl-cuda runtime is stabilized

    let weight_f32: Vec<f32> = weight.to_dtype(DType::F32)?.flatten_all()?.to_vec1()?;

    let mut quantized = vec![0i32; out_features * in_features];
    let mut scales = vec![0.0f32; out_features];

    // CPU fallback with GPU-style algorithm
    for row in 0..out_features {
        let row_start = row * in_features;

        // Compute absmean scale
        let abs_sum: f32 = weight_f32[row_start..row_start + in_features]
            .iter()
            .map(|x| x.abs())
            .sum();
        let scale = abs_sum / in_features as f32;
        scales[row] = scale;

        let inv_scale = if scale > 1e-8 { 1.0 / scale } else { 1.0 };

        // Quantize to ternary
        for i in 0..in_features {
            let val = weight_f32[row_start + i] * inv_scale;
            let rounded = val.round();
            quantized[row_start + i] = rounded.clamp(-1.0, 1.0) as i32;
        }
    }

    // Create GPU tensors
    let quantized_tensor = Tensor::from_vec(quantized, (out_features, in_features), device)?;
    let scales_tensor = Tensor::from_vec(scales, out_features, device)?;

    Ok((quantized_tensor, scales_tensor))
}

/// GPU-accelerated ternary dequantization.
///
/// Converts ternary values back to float: output = ternary * scale
///
/// # Arguments
///
/// * `ternary` - Ternary tensor [out_features, in_features] as i32
/// * `scales` - Scale tensor [out_features]
///
/// # Returns
///
/// Dequantized float tensor [out_features, in_features]
pub fn ternary_dequantize(
    ternary: &Tensor,
    scales: &Tensor,
) -> std::result::Result<Tensor, BitNetError> {
    let _device = ternary.device();
    let (out_features, _in_features) = ternary.dims2()?;

    // Convert ternary i32 to f32, multiply by broadcasted scales
    let ternary_f32 = ternary.to_dtype(DType::F32)?;
    let scales_broadcast = scales.reshape((out_features, 1))?;
    let output = ternary_f32.broadcast_mul(&scales_broadcast)?;

    Ok(output)
}

/// GPU-accelerated ternary matrix multiplication.
///
/// Computes `output = input @ ternary_weight.T` using optimized GPU kernels
/// that exploit ternary weight sparsity.
///
/// # Arguments
///
/// * `input` - Input tensor [batch, in_features]
/// * `weight` - Ternary weight structure
///
/// # Errors
///
/// Returns error if CUDA operation fails.
pub fn ternary_matmul_gpu(
    input: &Tensor,
    weight: &TernaryWeight,
) -> std::result::Result<Tensor, BitNetError> {
    let device = input.device();

    // Dequantize weights and perform standard matmul
    // TODO: Replace with direct CubeCL kernel launch for full optimization
    let dequant_weight = crate::quantization::dequantize_weights(weight, device)?;

    let output = input
        .matmul(&dequant_weight.t()?)
        .map_err(BitNetError::from)?;

    Ok(output)
}

/// GPU-accelerated ternary matrix multiplication with raw tensors.
///
/// Lower-level API that takes pre-converted ternary tensor.
///
/// # Arguments
///
/// * `input` - Input tensor [batch, in_features]
/// * `ternary_weights` - Ternary weights as i32 tensor [out_features, in_features]
/// * `scales` - Scale factors [out_features]
///
/// # Returns
///
/// Output tensor [batch, out_features]
pub fn ternary_matmul_raw(
    input: &Tensor,
    ternary_weights: &Tensor,
    scales: &Tensor,
) -> std::result::Result<Tensor, BitNetError> {
    let _device = input.device();

    // Dequantize and compute
    let dequant = ternary_dequantize(ternary_weights, scales)?;
    let output = input.matmul(&dequant.t()?)?;

    Ok(output)
}

/// Pack ternary weights into 2-bit representation.
///
/// Encodes ternary values as 2 bits: 00 = 0, 01 = +1, 10 = -1
/// This provides 2x memory compression vs i8.
///
/// # Arguments
///
/// * `ternary` - Ternary tensor [out_features, in_features] with values in {-1, 0, +1}
///
/// # Returns
///
/// Packed tensor [out_features, ceil(in_features/16)] as u32
pub fn pack_ternary_weights(ternary: &Tensor) -> std::result::Result<Tensor, BitNetError> {
    let (out_features, in_features) = ternary.dims2()?;
    let device = ternary.device();

    let ternary_i32: Vec<i32> = ternary.flatten_all()?.to_vec1()?;

    let trits_per_word = 16usize;
    let packed_per_row = (in_features + trits_per_word - 1) / trits_per_word;
    let mut packed = vec![0u32; out_features * packed_per_row];

    for row in 0..out_features {
        for pack_idx in 0..packed_per_row {
            let mut word = 0u32;
            for i in 0..trits_per_word {
                let in_idx = pack_idx * trits_per_word + i;
                if in_idx < in_features {
                    let trit = ternary_i32[row * in_features + in_idx];
                    // Encode: 0 -> 00, +1 -> 01, -1 -> 10
                    let bits = match trit {
                        1 => 0b01u32,
                        -1 => 0b10u32,
                        _ => 0b00u32,
                    };
                    word |= bits << (i * 2);
                }
            }
            packed[row * packed_per_row + pack_idx] = word;
        }
    }

    let packed_tensor = Tensor::from_vec(packed, (out_features, packed_per_row), device)?;
    Ok(packed_tensor)
}

/// GPU-accelerated packed ternary matrix multiplication.
///
/// Uses 2-bit packed weight representation for reduced memory bandwidth.
///
/// # Arguments
///
/// * `input` - Input tensor [batch, in_features]
/// * `packed_weights` - Packed weights [out_features, ceil(in_features/16)]
/// * `scales` - Scale factors [out_features]
/// * `in_features` - Original input feature dimension
///
/// # Returns
///
/// Output tensor [batch, out_features]
pub fn packed_ternary_matmul(
    input: &Tensor,
    packed_weights: &Tensor,
    scales: &Tensor,
    in_features: usize,
) -> std::result::Result<Tensor, BitNetError> {
    let _device = input.device();
    let _batch_size = input.dims()[0];
    let _out_features = packed_weights.dims()[0];

    // Unpack and compute (for correctness - full CubeCL version avoids unpacking)
    let ternary = unpack_ternary_weights(packed_weights, in_features)?;
    let output = ternary_matmul_raw(input, &ternary, scales)?;

    Ok(output)
}

/// Unpack 2-bit ternary weights back to i32.
///
/// # Arguments
///
/// * `packed` - Packed weights [out_features, packed_per_row]
/// * `in_features` - Original input feature dimension
///
/// # Returns
///
/// Ternary tensor [out_features, in_features] as i32
pub fn unpack_ternary_weights(
    packed: &Tensor,
    in_features: usize,
) -> std::result::Result<Tensor, BitNetError> {
    let (out_features, packed_per_row) = packed.dims2()?;
    let device = packed.device();

    let packed_u32: Vec<u32> = packed.to_dtype(DType::U32)?.flatten_all()?.to_vec1()?;

    let trits_per_word = 16usize;
    let mut ternary = vec![0i32; out_features * in_features];

    for row in 0..out_features {
        for pack_idx in 0..packed_per_row {
            let word = packed_u32[row * packed_per_row + pack_idx];
            for i in 0..trits_per_word {
                let in_idx = pack_idx * trits_per_word + i;
                if in_idx < in_features {
                    let bits = (word >> (i * 2)) & 0x3;
                    let trit = match bits {
                        0b01 => 1i32,
                        0b10 => -1i32,
                        _ => 0i32,
                    };
                    ternary[row * in_features + in_idx] = trit;
                }
            }
        }
    }

    let ternary_tensor = Tensor::from_vec(ternary, (out_features, in_features), device)?;
    Ok(ternary_tensor)
}

/// GPU-accelerated BitLinear forward pass.
///
/// Fuses LayerNorm + ternary matmul for maximum efficiency.
///
/// # Arguments
///
/// * `input` - Input tensor [batch, in_features]
/// * `weight` - Ternary weight structure
/// * `ln_weight` - LayerNorm weight [in_features]
/// * `ln_bias` - LayerNorm bias [in_features]
/// * `config` - BitNet configuration
///
/// # Returns
///
/// Output tensor [batch, out_features]
pub fn bitlinear_forward(
    input: &Tensor,
    weight: &TernaryWeight,
    ln_weight: &Tensor,
    ln_bias: &Tensor,
    config: &BitNetConfig,
) -> std::result::Result<Tensor, BitNetError> {
    let _device = input.device();
    let eps = config.eps;

    // For now, use separate operations
    // TODO: Implement fused CubeCL kernel

    // LayerNorm
    let mean = input.mean_keepdim(1)?;
    let centered = input.broadcast_sub(&mean)?;
    let var = centered.sqr()?.mean_keepdim(1)?;
    let std = (var + eps as f64)?.sqrt()?;
    let normalized = centered.broadcast_div(&std)?;
    let ln_output = normalized
        .broadcast_mul(ln_weight)?
        .broadcast_add(ln_bias)?;

    // Ternary matmul
    let output = ternary_matmul_gpu(&ln_output, weight)?;

    Ok(output)
}

/// Check if the GPU kernel is available and beneficial.
///
/// Returns true if:
/// - CUDA device is available
/// - Input size is large enough to benefit from GPU acceleration
#[must_use]
pub fn should_use_gpu(input: &Tensor, weight: &TernaryWeight) -> bool {
    // Check if on CUDA device
    if !input.device().is_cuda() {
        return false;
    }

    // Heuristic: use GPU for matrices larger than threshold
    let input_size = input.elem_count();
    let weight_size = weight.out_features() * weight.in_features();

    // Threshold: 64K operations
    input_size * weight_size > 65536
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::BitNetConfig;
    use crate::quantization::quantize_weights;
    use candle_core::Device;

    #[test]
    fn test_ternary_matmul_cpu_fallback() {
        let device = Device::Cpu;
        let config = BitNetConfig::default().with_group_size(64);

        let weight_tensor = candle_core::Tensor::randn(0.0f32, 1.0, (64, 128), &device).unwrap();
        let weight = quantize_weights(&weight_tensor, &config).unwrap();

        let input = candle_core::Tensor::randn(0.0f32, 1.0, (4, 128), &device).unwrap();

        let output = ternary_matmul_gpu(&input, &weight).unwrap();
        assert_eq!(output.shape().dims(), &[4, 64]);
    }

    #[test]
    fn test_pack_unpack_roundtrip() {
        let device = Device::Cpu;

        // Create ternary tensor with known values
        let ternary_data: Vec<i32> = vec![1, 0, -1, 1, 0, -1, 0, 1, -1, 0, 1, -1, 0, 0, 1, -1];
        let ternary = Tensor::from_vec(ternary_data.clone(), (1, 16), &device).unwrap();

        // Pack
        let packed = pack_ternary_weights(&ternary).unwrap();

        // Unpack
        let unpacked = unpack_ternary_weights(&packed, 16).unwrap();

        // Verify roundtrip
        let unpacked_data: Vec<i32> = unpacked.flatten_all().unwrap().to_vec1().unwrap();
        assert_eq!(ternary_data, unpacked_data);
    }

    #[test]
    fn test_ternary_dequantize() {
        let device = Device::Cpu;

        let ternary = Tensor::from_vec(vec![1i32, 0, -1, 1], (2, 2), &device).unwrap();
        let scales = Tensor::from_vec(vec![2.0f32, 0.5], 2, &device).unwrap();

        let output = ternary_dequantize(&ternary, &scales).unwrap();
        let output_data: Vec<f32> = output.flatten_all().unwrap().to_vec1().unwrap();

        // Row 0: [1, 0] * 2.0 = [2.0, 0.0]
        // Row 1: [-1, 1] * 0.5 = [-0.5, 0.5]
        assert!((output_data[0] - 2.0).abs() < 1e-6);
        assert!((output_data[1] - 0.0).abs() < 1e-6);
        assert!((output_data[2] - (-0.5)).abs() < 1e-6);
        assert!((output_data[3] - 0.5).abs() < 1e-6);
    }

    #[test]
    fn test_absmean_quantize_cpu() {
        let device = Device::Cpu;

        // Create weight tensor with known distribution
        let weight = Tensor::from_vec(
            vec![1.0f32, -0.5, 0.2, -0.8, 0.1, 0.9, -0.3, 0.0],
            (2, 4),
            &device,
        )
        .unwrap();

        // Note: absmean_quantize requires CUDA, so this test is expected to fail on CPU
        let result = absmean_quantize(&weight);
        assert!(result.is_err()); // Expected: requires CUDA
    }
}