use core::fmt;
use core::ops::{Add, Div, Mul, Neg, Sub};
use axonml_core::Device;
use axonml_core::backends::CpuBackend;
#[cfg(feature = "cuda")]
use axonml_core::backends::CudaBackend;
use axonml_core::dtype::{Float, Numeric, Scalar};
use axonml_core::error::{Error, Result};
use axonml_core::storage::Storage;
use num_traits::NumCast;
#[cfg(feature = "cuda")]
mod cuda_accel {
    use super::*;
    use axonml_core::backends::cuda::get_cuda_backend;

    /// Returns the process-wide CUDA backend handle, if one is available.
    pub fn get_cuda() -> Option<&'static CudaBackend> {
        get_cuda_backend()
    }

    /// Single-precision matrix multiply on the GPU: `C = A (m×k) · B (k×n)`.
    ///
    /// Returns `None` when no CUDA backend exists or when any transfer,
    /// allocation, or kernel launch fails.
    pub fn cuda_matmul(a: &[f32], b: &[f32], m: usize, n: usize, k: usize) -> Option<Vec<f32>> {
        let backend = get_cuda()?;
        let dev_a = backend.htod_copy(a).ok()?;
        let dev_b = backend.htod_copy(b).ok()?;
        let mut dev_c = backend.alloc::<f32>(m * n).ok()?;
        // Operands are passed as (B, A) with dims (n, m, k): the usual trick
        // for computing a row-major product through a column-major GEMM.
        backend
            .gemm_f32(
                false, false, n, m, k, 1.0, &dev_b, n, &dev_a, k, 0.0, &mut dev_c, n,
            )
            .ok()?;
        backend.dtoh_copy(&dev_c).ok()
    }
}
use crate::shape::{
Shape, Strides, broadcast_shape, broadcast_strides, contiguous_strides, is_contiguous,
linear_index, normalize_dim, numel, reshape, squeeze, transpose_shape, transpose_strides,
unsqueeze,
};
/// Reinterprets a `&Tensor<T>` as a `&Tensor<f32>` so it can be handed to the
/// f32-only CUDA kernels.
///
/// # Safety
/// Sound only when `T` is actually `f32`; the runtime assert enforces this, so
/// the pointer cast is a no-op reinterpretation of the same type.
#[cfg(feature = "cuda")]
unsafe fn gpu_ref<T: Scalar>(t: &Tensor<T>) -> &Tensor<f32> {
    assert!(
        is_f32::<T>(),
        "gpu_ref: only Tensor<f32> can be used for GPU operations, got {:?}",
        T::DTYPE
    );
    // SAFETY: the assert above guarantees T == f32, so Tensor<T> and
    // Tensor<f32> are the same type and the cast cannot change layout.
    unsafe { &*(t as *const Tensor<T> as *const Tensor<f32>) }
}
/// Converts an owned `Tensor<f32>` produced by a CUDA kernel into a
/// `Tensor<T>` by value, without running `Drop` twice.
///
/// # Safety
/// Sound only when `T` is actually `f32`; the runtime assert enforces this.
#[cfg(feature = "cuda")]
unsafe fn gpu_into<T: Scalar>(t: Tensor<f32>) -> Tensor<T> {
    assert!(
        is_f32::<T>(),
        "gpu_into: only Tensor<f32> can be produced from GPU operations, got {:?}",
        T::DTYPE
    );
    unsafe {
        // SAFETY: T == f32 (asserted above), so reading the bytes of `t` as a
        // Tensor<T> is a plain move; `mem::forget` prevents `t`'s destructor
        // from running so the moved-out value is not dropped twice.
        let out = std::ptr::read(&t as *const Tensor<f32> as *const Tensor<T>);
        std::mem::forget(t);
        out
    }
}
/// Compile-time-generic check for "is `T` exactly `f32`?", used to guard the
/// f32-only GPU code paths.
#[cfg(feature = "cuda")]
fn is_f32<T: 'static>() -> bool {
    use std::any::TypeId;
    TypeId::of::<T>() == TypeId::of::<f32>()
}
/// A dense n-dimensional array over scalar type `T`.
///
/// View operations (transpose, squeeze, unsqueeze, permute) clone `storage`
/// and change only `shape`/`strides`/`offset`, so several tensors may describe
/// different windows onto the same buffer.
#[derive(Clone)]
pub struct Tensor<T: Scalar> {
    // Backing buffer; may live on the CPU or (behind the "cuda" feature) a GPU.
    pub(crate) storage: Storage<T>,
    // Size of each dimension; empty for a 0-dimensional scalar tensor.
    pub(crate) shape: Shape,
    // Per-dimension step in *elements* (not bytes); signed, and not
    // necessarily contiguous for view tensors.
    pub(crate) strides: Strides,
    // Index of the first logical element within `storage`.
    pub(crate) offset: usize,
}
impl<T: Scalar> Tensor<T> {
    /// Wraps an existing storage buffer as a contiguous tensor of `shape`.
    ///
    /// # Errors
    /// Fails when `numel(shape)` does not equal the storage length.
    pub fn from_storage(storage: Storage<T>, shape: &[usize]) -> Result<Self> {
        let total = numel(shape);
        if total != storage.len() {
            return Err(Error::shape_mismatch(&[storage.len()], shape));
        }
        let shape = Shape::from_slice(shape);
        let strides = contiguous_strides(&shape);
        Ok(Self {
            storage,
            shape,
            strides,
            offset: 0,
        })
    }

    /// Builds a CPU tensor that takes ownership of `data`.
    pub fn from_vec(data: Vec<T>, shape: &[usize]) -> Result<Self> {
        let storage = Storage::from_vec(data, Device::Cpu);
        Self::from_storage(storage, shape)
    }

    /// Builds a CPU tensor by copying `data`.
    pub fn from_slice(data: &[T], shape: &[usize]) -> Result<Self> {
        let storage = Storage::from_slice(data, Device::Cpu);
        Self::from_storage(storage, shape)
    }

    /// Creates a 0-dimensional (scalar) tensor holding `value` on the CPU.
    pub fn scalar(value: T) -> Self {
        Self {
            storage: Storage::from_vec(vec![value], Device::Cpu),
            shape: Shape::new(),
            strides: Strides::new(),
            offset: 0,
        }
    }

    /// Tensor of zeros with the given shape (delegates to `crate::creation`).
    #[must_use]
    pub fn zeros(shape: &[usize]) -> Self {
        crate::creation::zeros(shape)
    }

    /// Tensor of ones with the given shape.
    #[must_use]
    pub fn ones(shape: &[usize]) -> Self
    where
        T: Numeric,
    {
        crate::creation::ones(shape)
    }

    /// Tensor where every element equals `value`.
    #[must_use]
    pub fn full(shape: &[usize], value: T) -> Self {
        crate::creation::full(shape, value)
    }

    /// Tensor sampled from the standard normal distribution.
    #[must_use]
    pub fn randn(shape: &[usize]) -> Self
    where
        T: Float,
        rand_distr::StandardNormal: rand::distributions::Distribution<T>,
    {
        crate::creation::randn(shape)
    }

    /// Tensor sampled uniformly from the `Standard` distribution.
    #[must_use]
    pub fn rand(shape: &[usize]) -> Self
    where
        T: Float,
        rand::distributions::Standard: rand::distributions::Distribution<T>,
    {
        crate::creation::rand(shape)
    }

    /// The size of each dimension.
    #[must_use]
    pub fn shape(&self) -> &[usize] {
        &self.shape
    }

    /// The per-dimension element strides.
    #[must_use]
    pub fn strides(&self) -> &[isize] {
        &self.strides
    }

    /// Number of dimensions (0 for a scalar tensor).
    #[must_use]
    pub fn ndim(&self) -> usize {
        self.shape.len()
    }

    /// Total number of logical elements.
    #[must_use]
    pub fn numel(&self) -> usize {
        numel(&self.shape)
    }

    /// True when the tensor holds no elements (some dimension is 0).
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.numel() == 0
    }

    /// Size of dimension `dim`; negative values index from the end.
    ///
    /// # Errors
    /// Fails when `dim` is out of range.
    pub fn size(&self, dim: i64) -> Result<usize> {
        let idx = normalize_dim(dim, self.ndim())?;
        Ok(self.shape[idx])
    }

    /// Device the underlying storage lives on.
    #[must_use]
    pub fn device(&self) -> Device {
        self.storage.device()
    }

    /// True when the strides describe a dense row-major layout.
    #[must_use]
    pub fn is_contiguous(&self) -> bool {
        is_contiguous(&self.shape, &self.strides)
    }

    /// True for 0-dimensional tensors.
    #[must_use]
    pub fn is_scalar(&self) -> bool {
        self.shape.is_empty()
    }

    /// Reads one element at the given multi-dimensional `indices`.
    ///
    /// # Errors
    /// Fails on a wrong number of indices or an out-of-bounds index.
    pub fn get(&self, indices: &[usize]) -> Result<T> {
        if indices.len() != self.ndim() {
            return Err(Error::invalid_operation(format!(
                "Expected {} indices, got {}",
                self.ndim(),
                indices.len()
            )));
        }
        for (&idx, &dim) in indices.iter().zip(self.shape.iter()) {
            if idx >= dim {
                return Err(Error::IndexOutOfBounds {
                    index: idx,
                    size: dim,
                });
            }
        }
        let offset = self.offset + linear_index(indices, &self.strides);
        Ok(self.storage.as_slice()[offset])
    }

    /// Writes one element at the given multi-dimensional `indices`.
    ///
    /// Takes `&self` — mutation goes through `Storage`'s interior mutability,
    /// so the write is visible to every view sharing this storage.
    /// NOTE(review): assumes CPU-resident storage (`as_slice_mut`); behavior on
    /// GPU storage is not handled here — confirm against `Storage`.
    ///
    /// # Errors
    /// Fails on a wrong number of indices or an out-of-bounds index.
    pub fn set(&self, indices: &[usize], value: T) -> Result<()> {
        if indices.len() != self.ndim() {
            return Err(Error::invalid_operation(format!(
                "Expected {} indices, got {}",
                self.ndim(),
                indices.len()
            )));
        }
        for (&idx, &dim) in indices.iter().zip(self.shape.iter()) {
            if idx >= dim {
                return Err(Error::IndexOutOfBounds {
                    index: idx,
                    size: dim,
                });
            }
        }
        let offset = self.offset + linear_index(indices, &self.strides);
        self.storage.as_slice_mut()[offset] = value;
        Ok(())
    }

    /// Extracts the single value of a one-element tensor.
    ///
    /// # Errors
    /// Fails when the tensor does not contain exactly one element.
    pub fn item(&self) -> Result<T> {
        if self.numel() != 1 {
            return Err(Error::invalid_operation(
                "item() only works on single-element tensors",
            ));
        }
        let data = self.to_vec();
        // Defensive re-check; to_vec of a 1-element tensor should never be empty.
        if data.is_empty() {
            Err(Error::invalid_operation("item() on empty tensor"))
        } else {
            Ok(data[0])
        }
    }

    /// Copies the logical contents into a row-major `Vec`, resolving strides,
    /// offset, and (with the "cuda" feature) device placement.
    #[must_use]
    pub fn to_vec(&self) -> Vec<T> {
        #[cfg(feature = "cuda")]
        if self.storage.is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            let f32_vec = self_f32.to_vec_gpu();
            unsafe {
                // SAFETY: the assert guarantees T == f32, so the Vec<f32>
                // buffer can be reinterpreted as Vec<T> with identical layout;
                // ManuallyDrop hands ownership to the rebuilt Vec exactly once.
                let mut v = std::mem::ManuallyDrop::new(f32_vec);
                return Vec::from_raw_parts(v.as_mut_ptr() as *mut T, v.len(), v.capacity());
            }
        }
        if self.is_contiguous() {
            // Contiguous: a single slice copy starting at `offset`.
            let storage = self.storage.as_slice();
            storage[self.offset..self.offset + self.numel()].to_vec()
        } else {
            // Strided view: gather element by element in logical order.
            let mut result = Vec::with_capacity(self.numel());
            self.copy_data_to(&mut result);
            result
        }
    }

    /// Gathers the elements of a (possibly strided) tensor into `dst` in
    /// row-major logical order. O(numel) with an unravel per element.
    fn copy_data_to(&self, dst: &mut Vec<T>) {
        dst.clear();
        let storage = self.storage.as_slice();
        let total = self.numel();
        for i in 0..total {
            let indices = crate::shape::unravel_index(i, &self.shape);
            let offset = self.offset + linear_index(&indices, &self.strides);
            dst.push(storage[offset]);
        }
    }

    /// Returns a tensor with `new_shape` (one dim may be -1 and is inferred).
    ///
    /// Zero-copy when `self` is contiguous; otherwise the data is materialized
    /// first via `contiguous()`.
    ///
    /// # Errors
    /// Fails when the shapes are incompatible.
    pub fn reshape(&self, new_shape: &[isize]) -> Result<Self> {
        let shape = reshape(&self.shape, new_shape)?;
        if self.is_contiguous() {
            Ok(Self {
                storage: self.storage.clone(),
                strides: contiguous_strides(&shape),
                shape,
                offset: self.offset,
            })
        } else {
            let contig = self.contiguous();
            Ok(Self {
                storage: contig.storage,
                strides: contiguous_strides(&shape),
                shape,
                offset: 0,
            })
        }
    }

    /// Flattens to one dimension.
    #[must_use]
    pub fn flatten(&self) -> Self {
        self.reshape(&[-1]).expect("Flatten should never fail")
    }

    /// Removes size-1 dimensions: the given `dim` only (if it is size 1), or
    /// all of them when `dim` is `None`. Returns a view sharing storage.
    ///
    /// # Errors
    /// Fails when `dim` is out of range.
    pub fn squeeze(&self, dim: Option<i64>) -> Result<Self> {
        let dim = match dim {
            Some(d) => Some(normalize_dim(d, self.ndim())?),
            None => None,
        };
        let new_shape = squeeze(&self.shape, dim);
        let new_strides: Strides = match dim {
            Some(d) => {
                let mut s = self.strides.clone();
                // Only drop the stride when the dim really was size 1, mirroring
                // how `squeeze` leaves the shape unchanged otherwise.
                if d < self.shape.len() && self.shape[d] == 1 {
                    s.remove(d);
                }
                s
            }
            None => self
                .shape
                .iter()
                .zip(self.strides.iter())
                .filter(|(dim, _)| **dim != 1)
                .map(|(_, stride)| *stride)
                .collect(),
        };
        Ok(Self {
            storage: self.storage.clone(),
            shape: new_shape,
            strides: new_strides,
            offset: self.offset,
        })
    }

    /// Inserts a size-1 dimension at `dim` (negative counts from the end, so
    /// -1 appends). Returns a view sharing storage.
    ///
    /// # Errors
    /// Fails when the insertion position is out of range.
    pub fn unsqueeze(&self, dim: i64) -> Result<Self> {
        let normalized = if dim < 0 {
            // +1 because the new axis may be placed *after* the last one.
            (dim + self.ndim() as i64 + 1) as usize
        } else {
            dim as usize
        };
        let new_shape = unsqueeze(&self.shape, normalized)?;
        let mut new_strides = Strides::with_capacity(new_shape.len());
        for (i, _) in new_shape.iter().enumerate() {
            if i < normalized {
                new_strides.push(self.strides.get(i).copied().unwrap_or(1));
            } else if i == normalized {
                // Any stride works for a size-1 axis: its index is always 0,
                // so it never contributes to the linear offset.
                new_strides.push(1);
            } else {
                new_strides.push(self.strides[i - 1]);
            }
        }
        Ok(Self {
            storage: self.storage.clone(),
            shape: new_shape,
            strides: new_strides,
            offset: self.offset,
        })
    }

    /// Swaps two dimensions; a zero-copy view (strides swapped, data shared).
    ///
    /// # Errors
    /// Fails when either dimension is out of range.
    pub fn transpose(&self, dim0: i64, dim1: i64) -> Result<Self> {
        let d0 = normalize_dim(dim0, self.ndim())?;
        let d1 = normalize_dim(dim1, self.ndim())?;
        let new_shape = transpose_shape(&self.shape, d0, d1)?;
        let new_strides = transpose_strides(&self.strides, d0, d1);
        Ok(Self {
            storage: self.storage.clone(),
            shape: new_shape,
            strides: new_strides,
            offset: self.offset,
        })
    }

    /// Matrix transpose; only valid for 2-D tensors.
    ///
    /// # Errors
    /// Fails when the tensor is not 2-D.
    pub fn t(&self) -> Result<Self> {
        if self.ndim() != 2 {
            return Err(Error::invalid_operation("t() only works on 2D tensors"));
        }
        self.transpose(0, 1)
    }

    /// Reorders all dimensions by `dims`; a zero-copy view.
    ///
    /// # Errors
    /// Fails when `dims` is not a permutation of `0..ndim`.
    pub fn permute(&self, dims: &[usize]) -> Result<Self> {
        if dims.len() != self.ndim() {
            return Err(Error::invalid_operation(format!(
                "Expected {} dimensions, got {}",
                self.ndim(),
                dims.len()
            )));
        }
        // Validate that `dims` is a permutation (in range, no duplicates).
        let mut seen = vec![false; self.ndim()];
        for &d in dims {
            if d >= self.ndim() {
                return Err(Error::InvalidDimension {
                    index: d as i64,
                    ndim: self.ndim(),
                });
            }
            if seen[d] {
                return Err(Error::invalid_operation("Duplicate dimension in permute"));
            }
            seen[d] = true;
        }
        let new_shape: Shape = dims.iter().map(|&d| self.shape[d]).collect();
        let new_strides: Strides = dims.iter().map(|&d| self.strides[d]).collect();
        Ok(Self {
            storage: self.storage.clone(),
            shape: new_shape,
            strides: new_strides,
            offset: self.offset,
        })
    }

    /// Returns a tensor with dense row-major layout and zero offset; cheap
    /// clone when the tensor is already in that form.
    #[must_use]
    pub fn contiguous(&self) -> Self {
        if self.is_contiguous() && self.offset == 0 {
            return self.clone();
        }
        #[cfg(feature = "cuda")]
        if self.storage.is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            let result = self_f32.contiguous_gpu();
            return unsafe { gpu_into(result) };
        }
        // CPU fallback: gather into logical order, then rebuild densely.
        let data = self.to_vec();
        Self::from_vec(data, &self.shape).expect("Contiguous should never fail")
    }

    /// Element-wise map producing a new CPU tensor with the same shape.
    #[must_use]
    pub fn map<F: Fn(T) -> T>(&self, f: F) -> Self {
        let data = self.to_vec();
        let result: Vec<T> = data.into_iter().map(f).collect();
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise binary map over `self` and `other` (no broadcasting; the
    /// result takes `self`'s shape). Length equality is only checked in debug
    /// builds.
    #[must_use]
    pub fn zip_map<F: Fn(T, T) -> T>(&self, other: &Self, f: F) -> Self {
        let a = self.to_vec();
        let b = other.to_vec();
        debug_assert_eq!(
            a.len(),
            b.len(),
            "zip_map requires same number of elements: {} vs {}",
            a.len(),
            b.len()
        );
        let result: Vec<T> = a.into_iter().zip(b).map(|(x, y)| f(x, y)).collect();
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise ternary map; same conventions as `zip_map`.
    #[must_use]
    pub fn zip_map3<F: Fn(T, T, T) -> T>(&self, b: &Self, c: &Self, f: F) -> Self {
        let a_data = self.to_vec();
        let b_data = b.to_vec();
        let c_data = c.to_vec();
        debug_assert_eq!(a_data.len(), b_data.len());
        debug_assert_eq!(a_data.len(), c_data.len());
        let result: Vec<T> = a_data
            .into_iter()
            .zip(b_data)
            .zip(c_data)
            .map(|((a, b), c)| f(a, b, c))
            .collect();
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Moves the tensor to `device`; cheap clone when already there. The data
    /// is made contiguous before a CPU-side transfer.
    ///
    /// # Errors
    /// Fails when the transfer itself fails.
    pub fn to_device(&self, device: Device) -> Result<Self> {
        if self.device() == device {
            return Ok(self.clone());
        }
        #[cfg(feature = "cuda")]
        if self.storage.is_gpu() || device.is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            let result = self_f32.to_device_f32(device)?;
            return Ok(unsafe { gpu_into(result) });
        }
        let contig = self.contiguous();
        let new_storage = contig.storage.to_device(device)?;
        Ok(Self {
            storage: new_storage,
            shape: self.shape.clone(),
            strides: self.strides.clone(),
            offset: 0,
        })
    }

    /// Convenience wrapper for `to_device(Device::Cpu)`.
    pub fn cpu(&self) -> Result<Self> {
        self.to_device(Device::Cpu)
    }

    /// Deep copy with freshly owned storage, preserving the original device
    /// (GPU tensors round-trip through the CPU).
    #[must_use]
    pub fn clone_deep(&self) -> Self {
        let data = self.to_vec();
        let cpu = Self::from_vec(data, &self.shape).expect("Deep clone should never fail");
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            return cpu.to_device(self.device()).unwrap();
        }
        cpu
    }
}
impl<T: Numeric> Tensor<T> {
    /// In-place fill through `Storage`'s interior mutability; every view
    /// sharing this storage sees the change.
    ///
    /// # Panics
    /// Panics on GPU-resident tensors.
    pub fn fill_(&self, value: T) {
        assert!(
            self.storage.is_cpu(),
            "fill_() not supported on GPU tensors — create a new tensor and transfer instead"
        );
        let mut data = self.storage.as_slice_mut();
        CpuBackend::fill(&mut data, value);
    }

    /// In-place zeroing (see `fill_` for the GPU restriction).
    pub fn zero_(&self) {
        self.fill_(T::zero());
    }

    /// Sum of all elements, returned as a single-element tensor on the same
    /// device.
    #[must_use]
    pub fn sum(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            let mut t = self_f32.clone();
            // Collapse one axis at a time until a single value remains.
            while t.ndim() > 1 {
                t = t.sum_dim_cuda(0);
            }
            if t.numel() > 1 {
                t = t.sum_dim_cuda(0);
            }
            return unsafe { gpu_into(t) };
        }
        let data = self.to_vec();
        let result = CpuBackend::sum(&data);
        Self::scalar(result)
    }

    /// Product of all elements. Computed on the CPU (GPU tensors round-trip
    /// through `to_vec`), then transferred back to the source device.
    #[must_use]
    pub fn prod(&self) -> Self {
        let data = self.to_vec();
        let result = CpuBackend::prod(&data);
        let s = Self::scalar(result);
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            return s
                .to_device(self.device())
                .expect("prod: device transfer failed");
        }
        s
    }

    /// Maximum element; computed on the CPU and moved back to the source
    /// device.
    ///
    /// # Errors
    /// Fails on an empty tensor.
    pub fn max(&self) -> Result<Self> {
        if self.is_empty() {
            return Err(Error::EmptyTensor);
        }
        let data = self.to_vec();
        let result = CpuBackend::max(&data).expect("max on non-empty tensor");
        let s = Self::scalar(result);
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            return Ok(s
                .to_device(self.device())
                .expect("max: device transfer failed"));
        }
        Ok(s)
    }

    /// Minimum element; computed on the CPU and moved back to the source
    /// device.
    ///
    /// # Errors
    /// Fails on an empty tensor.
    pub fn min(&self) -> Result<Self> {
        if self.is_empty() {
            return Err(Error::EmptyTensor);
        }
        let data = self.to_vec();
        let result = CpuBackend::min(&data).expect("min on non-empty tensor");
        let s = Self::scalar(result);
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            return Ok(s
                .to_device(self.device())
                .expect("min: device transfer failed"));
        }
        Ok(s)
    }

    /// Flat (row-major) index of the maximum element.
    ///
    /// # Errors
    /// Fails on an empty tensor.
    pub fn argmax(&self) -> Result<usize> {
        if self.is_empty() {
            return Err(Error::EmptyTensor);
        }
        let data = self.to_vec();
        Ok(CpuBackend::argmax(&data).unwrap())
    }

    /// Flat (row-major) index of the minimum element.
    ///
    /// # Errors
    /// Fails on an empty tensor.
    pub fn argmin(&self) -> Result<usize> {
        if self.is_empty() {
            return Err(Error::EmptyTensor);
        }
        let data = self.to_vec();
        Ok(CpuBackend::argmin(&data).unwrap())
    }

    /// Concatenates tensors along `dim`. All inputs must share ndim and match
    /// on every non-`dim` dimension. The copy happens on the CPU; the result
    /// is moved to the first tensor's device.
    ///
    /// # Errors
    /// Fails on an empty input list, an out-of-range `dim`, or mismatched
    /// shapes.
    pub fn cat(tensors: &[&Self], dim: usize) -> Result<Self> {
        if tensors.is_empty() {
            return Err(Error::invalid_operation("cat requires at least one tensor"));
        }
        let ndim = tensors[0].ndim();
        if dim >= ndim {
            return Err(Error::invalid_operation("cat dimension out of range"));
        }
        for t in &tensors[1..] {
            if t.ndim() != ndim {
                return Err(Error::invalid_operation(
                    "cat: all tensors must have same ndim",
                ));
            }
            for d in 0..ndim {
                if d != dim && t.shape[d] != tensors[0].shape[d] {
                    return Err(Error::invalid_operation(
                        "cat: shapes must match on non-cat dims",
                    ));
                }
            }
        }
        let total_dim_size: usize = tensors.iter().map(|t| t.shape[dim]).sum();
        let mut out_shape: Vec<usize> = tensors[0].shape.to_vec();
        out_shape[dim] = total_dim_size;
        // View the output as (outer, cat-dim, inner) and copy one inner run
        // (a contiguous slab) at a time.
        let outer_size: usize = out_shape[..dim].iter().product();
        let inner_size: usize = out_shape[dim + 1..].iter().product();
        let total_numel: usize = out_shape.iter().product();
        let mut result = vec![T::zero(); total_numel];
        let mut dim_offset = 0;
        for t in tensors {
            let t_data = t.contiguous().to_vec();
            let t_dim_size = t.shape[dim];
            for outer in 0..outer_size {
                for d in 0..t_dim_size {
                    let src_base = outer * t_dim_size * inner_size + d * inner_size;
                    let dst_base =
                        outer * total_dim_size * inner_size + (dim_offset + d) * inner_size;
                    result[dst_base..dst_base + inner_size]
                        .copy_from_slice(&t_data[src_base..src_base + inner_size]);
                }
            }
            dim_offset += t_dim_size;
        }
        let out = Self::from_vec(result, &out_shape)?;
        #[cfg(feature = "cuda")]
        if tensors[0].device().is_gpu() {
            return Ok(out.to_device(tensors[0].device()).unwrap());
        }
        Ok(out)
    }
}
impl<T: Float> Tensor<T> {
    /// Mean of all elements as a single-element tensor.
    ///
    /// NOTE(review): on the GPU path `numel` is cast to f32 before the f64
    /// divide — this loses precision once numel exceeds 2^24; casting the
    /// count to f64 directly would be exact. Confirm intent.
    ///
    /// # Errors
    /// Fails on an empty tensor.
    pub fn mean(&self) -> Result<Self> {
        if self.is_empty() {
            return Err(Error::EmptyTensor);
        }
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            let s = self.sum();
            let n = self.numel() as f32;
            return Ok(s.mul_scalar(T::from(1.0 / n as f64).unwrap_or(T::zero())));
        }
        let data = self.to_vec();
        let result = CpuBackend::mean(&data).expect("mean on non-empty tensor");
        Ok(Self::scalar(result))
    }

    /// Element-wise `max(x, 0)`.
    #[must_use]
    pub fn relu(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).relu_cuda()) };
        }
        let data = self.to_vec();
        let mut result = vec![T::zero(); data.len()];
        CpuBackend::relu(&mut result, &data);
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise logistic sigmoid.
    #[must_use]
    pub fn sigmoid(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).sigmoid_cuda()) };
        }
        let data = self.to_vec();
        let mut result = vec![T::zero(); data.len()];
        CpuBackend::sigmoid(&mut result, &data);
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise hyperbolic tangent.
    #[must_use]
    pub fn tanh(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).tanh_cuda()) };
        }
        let data = self.to_vec();
        let mut result = vec![T::zero(); data.len()];
        CpuBackend::tanh(&mut result, &data);
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise exponential.
    #[must_use]
    pub fn exp(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).exp_cuda()) };
        }
        let data = self.to_vec();
        let mut result = vec![T::zero(); data.len()];
        CpuBackend::exp(&mut result, &data);
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise natural logarithm.
    #[must_use]
    pub fn ln(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).ln_cuda()) };
        }
        let data = self.to_vec();
        let mut result = vec![T::zero(); data.len()];
        CpuBackend::ln(&mut result, &data);
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise square root.
    #[must_use]
    pub fn sqrt(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).sqrt_cuda()) };
        }
        let data = self.to_vec();
        let mut result = vec![T::zero(); data.len()];
        CpuBackend::sqrt(&mut result, &data);
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise power with a scalar exponent.
    #[must_use]
    pub fn pow(&self, exp: T) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            // SAFETY: the assert guarantees T == f32, so reading the scalar
            // through an f32 pointer is sound.
            let exp_f32: f32 = unsafe { *(&exp as *const T as *const f32) };
            return unsafe { gpu_into(gpu_ref(self).pow_cuda(exp_f32)) };
        }
        let data = self.to_vec();
        let result: Vec<T> = data.iter().map(|&x| x.pow_value(exp)).collect();
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise GELU activation (CPU path delegates to `crate::ops`).
    #[must_use]
    pub fn gelu(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).gelu_cuda()) };
        }
        crate::ops::gelu(self)
    }

    /// Element-wise SiLU activation (CPU path delegates to `crate::ops`).
    #[must_use]
    pub fn silu(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).silu_cuda()) };
        }
        crate::ops::silu(self)
    }

    /// Softmax along `dim`.
    ///
    /// NOTE(review): the CPU fallback swallows any error from
    /// `crate::ops::softmax` and returns `self` unchanged — callers get no
    /// signal that the dim was invalid. The GPU path panics instead.
    #[must_use]
    pub fn softmax(&self, dim: i32) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            return unsafe { gpu_into(self_f32.softmax_cuda(dim).expect("CUDA softmax failed")) };
        }
        crate::ops::softmax(self, dim as i64).unwrap_or_else(|_| self.clone())
    }

    /// `ln(softmax(x))` — computed literally, not via the numerically stabler
    /// fused log-softmax form.
    #[must_use]
    pub fn log_softmax(&self, dim: i32) -> Self {
        let softmax_result = self.softmax(dim);
        softmax_result.ln()
    }

    /// Mean along `dim` (negative counts from the end). An out-of-range `dim`
    /// silently returns a clone of `self`.
    #[must_use]
    pub fn mean_dim(&self, dim: i32, keepdim: bool) -> Self {
        let ndim = self.ndim();
        let dim = if dim < 0 {
            (ndim as i32 + dim) as usize
        } else {
            dim as usize
        };
        if dim >= ndim {
            return self.clone();
        }
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            let summed = if keepdim {
                self_f32.sum_dim_keepdim_cuda(dim)
            } else {
                self_f32.sum_dim_cuda(dim)
            };
            let dim_size = self.shape[dim];
            // mean = sum / dim_size, applied as a scalar multiply.
            let result = summed.mul_scalar_cuda(1.0 / dim_size as f32);
            return unsafe { gpu_into(result) };
        }
        let dim_size = self.shape[dim];
        let data = self.to_vec();
        let mut new_shape = self.shape.clone();
        if keepdim {
            new_shape[dim] = 1;
        } else {
            new_shape.remove(dim);
        }
        // Reducing the last remaining dim yields a 1-element 1-D tensor.
        if new_shape.is_empty() {
            new_shape = smallvec::smallvec![1];
        }
        let new_numel: usize = new_shape.iter().product();
        let mut result = vec![T::zero(); new_numel];
        // View the input as (outer, dim, inner) and reduce the middle axis.
        let outer_size: usize = self.shape[..dim].iter().product();
        let inner_size: usize = self.shape[dim + 1..].iter().product();
        for outer in 0..outer_size {
            for inner in 0..inner_size {
                let mut sum = T::zero();
                for d in 0..dim_size {
                    let idx = outer * dim_size * inner_size + d * inner_size + inner;
                    sum = sum + data[idx];
                }
                let mean = sum / NumCast::from(dim_size).unwrap();
                let result_idx = outer * inner_size + inner;
                result[result_idx] = mean;
            }
        }
        Self::from_vec(result, &new_shape).unwrap()
    }

    /// Sum along `dim` (negative counts from the end). An out-of-range `dim`
    /// silently returns a clone of `self`.
    #[must_use]
    pub fn sum_dim(&self, dim: i32, keepdim: bool) -> Self {
        let ndim = self.ndim();
        let dim = if dim < 0 {
            (ndim as i32 + dim) as usize
        } else {
            dim as usize
        };
        if dim >= ndim {
            return self.clone();
        }
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            let result = if keepdim {
                self_f32.sum_dim_keepdim_cuda(dim)
            } else {
                self_f32.sum_dim_cuda(dim)
            };
            return unsafe { gpu_into(result) };
        }
        let dim_size = self.shape[dim];
        let data = self.to_vec();
        let mut new_shape = self.shape.clone();
        if keepdim {
            new_shape[dim] = 1;
        } else {
            new_shape.remove(dim);
        }
        if new_shape.is_empty() {
            new_shape = smallvec::smallvec![1];
        }
        let new_numel: usize = new_shape.iter().product();
        let mut result = vec![T::zero(); new_numel];
        // Same (outer, dim, inner) decomposition as mean_dim.
        let outer_size: usize = self.shape[..dim].iter().product();
        let inner_size: usize = self.shape[dim + 1..].iter().product();
        for outer in 0..outer_size {
            for inner in 0..inner_size {
                let mut sum = T::zero();
                for d in 0..dim_size {
                    let idx = outer * dim_size * inner_size + d * inner_size + inner;
                    sum = sum + data[idx];
                }
                let result_idx = outer * inner_size + inner;
                result[result_idx] = sum;
            }
        }
        Self::from_vec(result, &new_shape).unwrap()
    }

    /// Variance along `dim` via `E[x²] − E[x]²`.
    ///
    /// NOTE(review): this formula suffers catastrophic cancellation when the
    /// mean is large relative to the spread; Welford/two-pass would be
    /// stabler. Also, the keepdim=true mean computed first is discarded when
    /// `keepdim` is false and recomputed — redundant work. Errors from the
    /// intermediate `mul`/`sub` are swallowed by falling back to clones.
    #[must_use]
    pub fn var_dim(&self, dim: i32, keepdim: bool) -> Self {
        let mean = self.mean_dim(dim, true);
        let sq = self.mul(self).unwrap_or_else(|_| self.clone());
        let mean_sq = sq.mean_dim(dim, keepdim);
        let mean_keepdim = if keepdim {
            mean.clone()
        } else {
            self.mean_dim(dim, keepdim)
        };
        let mean_squared = mean_keepdim
            .mul(&mean_keepdim)
            .unwrap_or_else(|_| mean_keepdim.clone());
        mean_sq
            .sub(&mean_squared)
            .unwrap_or_else(|_| mean_sq.clone())
    }

    /// Materializes a broadcast of `self` to `shape` (data is copied, not a
    /// stride-0 view).
    ///
    /// NOTE(review): an incompatible shape is silently ignored on the CPU
    /// path (`unwrap_or_else(|_| shape.into())`) and can then panic on an
    /// out-of-range index; the GPU path panics explicitly.
    #[must_use]
    pub fn broadcast_to(&self, shape: &[usize]) -> Self {
        if self.shape.as_slice() == shape {
            return self.clone();
        }
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            return unsafe {
                gpu_into(
                    self_f32
                        .broadcast_to_cuda(shape)
                        .expect("CUDA broadcast_to failed"),
                )
            };
        }
        let result_shape = broadcast_shape(&self.shape, shape).unwrap_or_else(|_| shape.into());
        let self_strides = broadcast_strides(&self.shape, &self.strides, &result_shape);
        let total = numel(&result_shape);
        let mut result_data = vec![T::zero(); total];
        let self_data = self.storage.as_slice();
        // Gather: broadcast dims get stride 0, so repeated elements re-read
        // the same source location.
        for i in 0..total {
            let indices = crate::shape::unravel_index(i, &result_shape);
            let self_idx = self.offset + linear_index(&indices, &self_strides);
            result_data[i] = self_data[self_idx];
        }
        Self::from_vec(result_data, &result_shape).unwrap()
    }

    /// Copies out the sub-tensor selected by `ranges` (one half-open range per
    /// leading dimension; omitted trailing dims are taken in full, extra
    /// ranges beyond ndim are ignored).
    ///
    /// NOTE(review): ranges are not validated — `end < start` or
    /// `end > shape[i]` underflows/panics rather than returning an error.
    #[must_use]
    pub fn slice(&self, ranges: &[std::ops::Range<usize>]) -> Self {
        let mut new_shape = Vec::with_capacity(self.ndim());
        for (i, range) in ranges.iter().enumerate() {
            if i < self.ndim() {
                new_shape.push(range.end - range.start);
            }
        }
        for i in ranges.len()..self.ndim() {
            new_shape.push(self.shape[i]);
        }
        let new_numel: usize = new_shape.iter().product();
        let mut result_data = vec![T::zero(); new_numel];
        let self_data = self.to_vec();
        let mut result_idx = 0;
        Self::slice_recursive(
            &self_data,
            &self.shape,
            ranges,
            0,
            0,
            &mut result_data,
            &mut result_idx,
        );
        let out = Self::from_vec(result_data, &new_shape).unwrap();
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            return out.to_device(self.device()).unwrap();
        }
        out
    }

    /// Depth-first walk over `ranges`, appending selected elements of the
    /// row-major `data` to `result` in order. `offset` is the flat index of
    /// the current prefix; `result_idx` is the write cursor.
    fn slice_recursive(
        data: &[T],
        shape: &[usize],
        ranges: &[std::ops::Range<usize>],
        dim: usize,
        offset: usize,
        result: &mut [T],
        result_idx: &mut usize,
    ) {
        if dim == shape.len() {
            result[*result_idx] = data[offset];
            *result_idx += 1;
            return;
        }
        // Flat distance between consecutive indices along `dim`.
        let stride: usize = shape[dim + 1..].iter().product();
        let (start, end) = if dim < ranges.len() {
            (ranges[dim].start, ranges[dim].end)
        } else {
            (0, shape[dim])
        };
        for i in start..end {
            Self::slice_recursive(
                data,
                shape,
                ranges,
                dim + 1,
                offset + i * stride,
                result,
                result_idx,
            );
        }
    }
}
impl<T: Numeric> Tensor<T> {
pub fn add(&self, other: &Self) -> Result<Self> {
#[cfg(feature = "cuda")]
{
let self_gpu = self.device().is_gpu();
let other_gpu = other.device().is_gpu();
if self_gpu || other_gpu {
assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
if self_gpu && other_gpu {
let (s, o) = unsafe { (gpu_ref(self), gpu_ref(other)) };
if self.shape == other.shape {
return Ok(unsafe { gpu_into(s.add_cuda(o)?) });
} else {
return Ok(unsafe { gpu_into(s.broadcast_add_cuda(o)?) });
}
}
let target_device = if self_gpu {
self.device()
} else {
other.device()
};
let a_gpu = if self_gpu {
self.clone()
} else {
self.to_device(target_device)?
};
let b_gpu = if other_gpu {
other.clone()
} else {
other.to_device(target_device)?
};
return a_gpu.add(&b_gpu);
}
}
if self.shape == other.shape && self.is_contiguous() && other.is_contiguous() {
let a = self.storage.as_slice();
let b = other.storage.as_slice();
let ao = self.offset;
let bo = other.offset;
let n = numel(&self.shape);
let mut result_data = vec![T::zero(); n];
for i in 0..n {
result_data[i] = a[ao + i] + b[bo + i];
}
return Self::from_vec(result_data, &self.shape);
}
let result_shape = broadcast_shape(&self.shape, &other.shape)?;
let self_strides = broadcast_strides(&self.shape, &self.strides, &result_shape);
let other_strides = broadcast_strides(&other.shape, &other.strides, &result_shape);
let total = numel(&result_shape);
let mut result_data = vec![T::zero(); total];
let self_data = self.storage.as_slice();
let other_data = other.storage.as_slice();
for i in 0..total {
let indices = crate::shape::unravel_index(i, &result_shape);
let self_idx = self.offset + linear_index(&indices, &self_strides);
let other_idx = other.offset + linear_index(&indices, &other_strides);
result_data[i] = self_data[self_idx] + other_data[other_idx];
}
Self::from_vec(result_data, &result_shape)
}
pub fn sub(&self, other: &Self) -> Result<Self> {
#[cfg(feature = "cuda")]
{
let self_gpu = self.device().is_gpu();
let other_gpu = other.device().is_gpu();
if self_gpu || other_gpu {
assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
if self_gpu && other_gpu {
let (s, o) = unsafe { (gpu_ref(self), gpu_ref(other)) };
if self.shape == other.shape {
return Ok(unsafe { gpu_into(s.sub_cuda(o)?) });
} else {
return Ok(unsafe { gpu_into(s.broadcast_sub_cuda(o)?) });
}
}
let target = if self_gpu {
self.device()
} else {
other.device()
};
let a_gpu = if self_gpu {
self.clone()
} else {
self.to_device(target)?
};
let b_gpu = if other_gpu {
other.clone()
} else {
other.to_device(target)?
};
return a_gpu.sub(&b_gpu);
}
}
if self.shape == other.shape && self.is_contiguous() && other.is_contiguous() {
let a = self.storage.as_slice();
let b = other.storage.as_slice();
let (ao, bo) = (self.offset, other.offset);
let n = numel(&self.shape);
let mut r = vec![T::zero(); n];
for i in 0..n {
r[i] = a[ao + i] - b[bo + i];
}
return Self::from_vec(r, &self.shape);
}
let result_shape = broadcast_shape(&self.shape, &other.shape)?;
let self_strides = broadcast_strides(&self.shape, &self.strides, &result_shape);
let other_strides = broadcast_strides(&other.shape, &other.strides, &result_shape);
let total = numel(&result_shape);
let mut result_data = vec![T::zero(); total];
let self_data = self.storage.as_slice();
let other_data = other.storage.as_slice();
for i in 0..total {
let indices = crate::shape::unravel_index(i, &result_shape);
let self_idx = self.offset + linear_index(&indices, &self_strides);
let other_idx = other.offset + linear_index(&indices, &other_strides);
result_data[i] = self_data[self_idx] - other_data[other_idx];
}
Self::from_vec(result_data, &result_shape)
}
pub fn mul(&self, other: &Self) -> Result<Self> {
#[cfg(feature = "cuda")]
{
let self_gpu = self.device().is_gpu();
let other_gpu = other.device().is_gpu();
if self_gpu || other_gpu {
assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
if self_gpu && other_gpu {
let (s, o) = unsafe { (gpu_ref(self), gpu_ref(other)) };
if self.shape == other.shape {
return Ok(unsafe { gpu_into(s.mul_cuda(o)?) });
} else {
return Ok(unsafe { gpu_into(s.broadcast_mul_cuda(o)?) });
}
}
let target = if self_gpu {
self.device()
} else {
other.device()
};
let a_gpu = if self_gpu {
self.clone()
} else {
self.to_device(target)?
};
let b_gpu = if other_gpu {
other.clone()
} else {
other.to_device(target)?
};
return a_gpu.mul(&b_gpu);
}
}
if self.shape == other.shape && self.is_contiguous() && other.is_contiguous() {
let a = self.storage.as_slice();
let b = other.storage.as_slice();
let (ao, bo) = (self.offset, other.offset);
let n = numel(&self.shape);
let mut r = vec![T::zero(); n];
for i in 0..n {
r[i] = a[ao + i] * b[bo + i];
}
return Self::from_vec(r, &self.shape);
}
let result_shape = broadcast_shape(&self.shape, &other.shape)?;
let self_strides = broadcast_strides(&self.shape, &self.strides, &result_shape);
let other_strides = broadcast_strides(&other.shape, &other.strides, &result_shape);
let total = numel(&result_shape);
let mut result_data = vec![T::zero(); total];
let self_data = self.storage.as_slice();
let other_data = other.storage.as_slice();
for i in 0..total {
let indices = crate::shape::unravel_index(i, &result_shape);
let self_idx = self.offset + linear_index(&indices, &self_strides);
let other_idx = other.offset + linear_index(&indices, &other_strides);
result_data[i] = self_data[self_idx] * other_data[other_idx];
}
Self::from_vec(result_data, &result_shape)
}
pub fn div(&self, other: &Self) -> Result<Self> {
#[cfg(feature = "cuda")]
{
let self_gpu = self.device().is_gpu();
let other_gpu = other.device().is_gpu();
if self_gpu || other_gpu {
assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
if self_gpu && other_gpu {
let (s, o) = unsafe { (gpu_ref(self), gpu_ref(other)) };
if self.shape == other.shape {
return Ok(unsafe { gpu_into(s.div_cuda(o)?) });
} else {
return Ok(unsafe { gpu_into(s.broadcast_div_cuda(o)?) });
}
}
let target = if self_gpu {
self.device()
} else {
other.device()
};
let a_gpu = if self_gpu {
self.clone()
} else {
self.to_device(target)?
};
let b_gpu = if other_gpu {
other.clone()
} else {
other.to_device(target)?
};
return a_gpu.div(&b_gpu);
}
}
if self.shape == other.shape && self.is_contiguous() && other.is_contiguous() {
let a = self.storage.as_slice();
let b = other.storage.as_slice();
let (ao, bo) = (self.offset, other.offset);
let n = numel(&self.shape);
let mut r = vec![T::zero(); n];
for i in 0..n {
r[i] = a[ao + i] / b[bo + i];
}
return Self::from_vec(r, &self.shape);
}
let result_shape = broadcast_shape(&self.shape, &other.shape)?;
let self_strides = broadcast_strides(&self.shape, &self.strides, &result_shape);
let other_strides = broadcast_strides(&other.shape, &other.strides, &result_shape);
let total = numel(&result_shape);
let mut result_data = vec![T::zero(); total];
let self_data = self.storage.as_slice();
let other_data = other.storage.as_slice();
for i in 0..total {
let indices = crate::shape::unravel_index(i, &result_shape);
let self_idx = self.offset + linear_index(&indices, &self_strides);
let other_idx = other.offset + linear_index(&indices, &other_strides);
result_data[i] = self_data[self_idx] / other_data[other_idx];
}
Self::from_vec(result_data, &result_shape)
}
/// Adds `scalar` to every element, returning a new tensor on the same device.
#[must_use]
pub fn add_scalar(&self, scalar: T) -> Self {
    #[cfg(feature = "cuda")]
    if self.device().is_gpu() {
        assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
        let as_f32 = unsafe { gpu_ref(self) };
        // SAFETY: the assert guarantees T == f32, so reading the scalar
        // through an f32 pointer is sound.
        let s: f32 = unsafe { *(&scalar as *const T as *const f32) };
        return unsafe { gpu_into(as_f32.add_scalar_cuda(s)) };
    }
    let input = self.to_vec();
    let mut out = vec![T::zero(); input.len()];
    CpuBackend::add_scalar(&mut out, &input, scalar);
    Self::from_vec(out, &self.shape).unwrap()
}
#[must_use]
pub fn mul_scalar(&self, scalar: T) -> Self {
    // GPU fast path: CUDA kernels are only implemented for f32 tensors.
    #[cfg(feature = "cuda")]
    if self.device().is_gpu() {
        assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
        // SAFETY: the assert above guarantees T == f32, so both pointer
        // reinterpretations below are sound.
        let this = unsafe { gpu_ref(self) };
        let s: f32 = unsafe { *(&scalar as *const T as *const f32) };
        return unsafe { gpu_into(this.mul_scalar_cuda(s)) };
    }
    // CPU path: materialize the elements, then scale them via the backend.
    let src = self.to_vec();
    let mut out = vec![T::zero(); src.len()];
    CpuBackend::mul_scalar(&mut out, &src, scalar);
    Self::from_vec(out, &self.shape).unwrap()
}
#[must_use]
pub fn neg(&self) -> Self {
    // GPU fast path: CUDA kernels are only implemented for f32 tensors.
    #[cfg(feature = "cuda")]
    if self.device().is_gpu() {
        assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
        // SAFETY: the assert above guarantees T == f32, so the
        // reinterpretations below are sound.
        let this = unsafe { gpu_ref(self) };
        return unsafe { gpu_into(this.neg_cuda()) };
    }
    // CPU path: negate every element via the backend.
    let src = self.to_vec();
    let mut out = vec![T::zero(); src.len()];
    CpuBackend::neg(&mut out, &src);
    Self::from_vec(out, &self.shape).unwrap()
}
/// Matrix multiplication with support for batched and broadcast inputs.
///
/// For 2-D inputs this is a plain `[m, k] x [k, n] -> [m, n]` product. For
/// higher-rank inputs the trailing two dimensions are multiplied and the
/// leading (batch) dimensions are broadcast against each other.
///
/// # Errors
/// - either input has fewer than 2 dimensions
/// - the inner (`k`) dimensions do not match
/// - the batch dimensions are not broadcastable
pub fn matmul(&self, other: &Self) -> Result<Self> {
    #[cfg(feature = "cuda")]
    if self.device().is_gpu() {
        // NOTE(review): only `self`'s device is checked here (unlike the
        // element-wise ops, which migrate mixed-device operands). Presumably
        // callers guarantee `other` lives on the same GPU — confirm.
        assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
        // SAFETY: the assert above guarantees T == f32, so reinterpreting
        // both tensors as Tensor<f32> is sound.
        let (s, o) = unsafe { (gpu_ref(self), gpu_ref(other)) };
        return Ok(unsafe { gpu_into(s.matmul_cuda(o)?) });
    }
    if self.ndim() < 2 || other.ndim() < 2 {
        return Err(Error::invalid_operation(
            "matmul requires at least 2D tensors",
        ));
    }
    // Trailing two dims are the matrix dims: self is [.., m, k1], other is [.., k2, n].
    let m = self.shape[self.ndim() - 2];
    let k1 = self.shape[self.ndim() - 1];
    let k2 = other.shape[other.ndim() - 2];
    let n = other.shape[other.ndim() - 1];
    if k1 != k2 {
        return Err(Error::invalid_operation(format!(
            "matmul inner dimensions must match: {k1} vs {k2}"
        )));
    }
    // Fast path: plain 2-D product.
    if self.ndim() == 2 && other.ndim() == 2 {
        let a_data = self.contiguous().to_vec();
        let b_data = other.contiguous().to_vec();
        #[cfg(feature = "cuda")]
        {
            // Offload large f32 products to the CUDA helper when available;
            // any failure falls through to the CPU backend below.
            let flops = m * n * k1;
            if is_f32::<T>() && flops >= 4_000_000 {
                debug_assert!(std::mem::size_of::<T>() == std::mem::size_of::<f32>());
                // SAFETY: T == f32 (checked above), so &[T] and &[f32] have
                // identical layout.
                let a_f32: &[f32] = unsafe { std::mem::transmute(a_data.as_slice()) };
                let b_f32: &[f32] = unsafe { std::mem::transmute(b_data.as_slice()) };
                if let Some(c_f32) = cuda_accel::cuda_matmul(a_f32, b_f32, m, n, k1) {
                    // SAFETY: reinterpret Vec<f32> as Vec<T>; T == f32, same
                    // layout, and ManuallyDrop prevents a double free.
                    let c_t: Vec<T> = unsafe {
                        let mut v = std::mem::ManuallyDrop::new(c_f32);
                        Vec::from_raw_parts(v.as_mut_ptr() as *mut T, v.len(), v.capacity())
                    };
                    return Self::from_vec(c_t, &[m, n]);
                }
            }
        }
        let mut c_data = vec![T::zero(); m * n];
        CpuBackend::matmul(&mut c_data, &a_data, &b_data, m, n, k1);
        return Self::from_vec(c_data, &[m, n]);
    }
    // Batched case: broadcast the leading (batch) dims of both operands.
    let batch_dims_self: Vec<usize> = self.shape[..self.ndim() - 2].to_vec();
    let batch_dims_other: Vec<usize> = other.shape[..other.ndim() - 2].to_vec();
    // `None` means the batch dims already agree; `Some((a, b, out))` carries
    // the 1-padded per-operand dims plus the broadcast output dims.
    let broadcast_batch = if batch_dims_self == batch_dims_other {
        None
    } else {
        let max_len = batch_dims_self.len().max(batch_dims_other.len());
        let pad_a = vec![1usize; max_len - batch_dims_self.len()];
        let pad_b = vec![1usize; max_len - batch_dims_other.len()];
        let a_dims: Vec<usize> = pad_a
            .iter()
            .chain(batch_dims_self.iter())
            .copied()
            .collect();
        let b_dims: Vec<usize> = pad_b
            .iter()
            .chain(batch_dims_other.iter())
            .copied()
            .collect();
        let mut out_dims = Vec::with_capacity(max_len);
        for i in 0..max_len {
            if a_dims[i] == b_dims[i] {
                out_dims.push(a_dims[i]);
            } else if a_dims[i] == 1 {
                out_dims.push(b_dims[i]);
            } else if b_dims[i] == 1 {
                out_dims.push(a_dims[i]);
            } else {
                return Err(Error::invalid_operation(format!(
                    "matmul batch dimensions not broadcastable: {:?} vs {:?}",
                    batch_dims_self, batch_dims_other
                )));
            }
        }
        Some((a_dims, b_dims, out_dims))
    };
    // For each flat output-batch index, precompute the (possibly broadcast)
    // source batch index into each operand.
    let (batch_size, a_batch_idx, b_batch_idx) =
        if let Some((a_dims, b_dims, out_dims)) = &broadcast_batch {
            let bs: usize = out_dims.iter().product();
            let mut a_idx = Vec::with_capacity(bs);
            let mut b_idx = Vec::with_capacity(bs);
            for flat in 0..bs {
                let mut remaining = flat;
                let mut ai = 0usize;
                let mut bi = 0usize;
                let mut a_stride_acc = 1usize;
                let mut b_stride_acc = 1usize;
                // Walk dims last-to-first accumulating row-major offsets;
                // `idx % dim` maps a broadcast (size-1) dim back to index 0.
                for d in (0..out_dims.len()).rev() {
                    let out_d = out_dims[d];
                    let idx = remaining % out_d;
                    remaining /= out_d;
                    let a_d = a_dims[d];
                    let b_d = b_dims[d];
                    ai += (idx % a_d) * a_stride_acc;
                    bi += (idx % b_d) * b_stride_acc;
                    a_stride_acc *= a_d;
                    b_stride_acc *= b_d;
                }
                a_idx.push(ai);
                b_idx.push(bi);
            }
            (bs, a_idx, b_idx)
        } else {
            // Identical batch dims: identity mapping for both operands.
            let bs: usize = batch_dims_self.iter().product();
            let idx: Vec<usize> = (0..bs).collect();
            (bs, idx.clone(), idx)
        };
    let a_stride = m * k1;
    let b_stride = k1 * n;
    let c_stride = m * n;
    let a_data = self.contiguous().to_vec();
    let b_data = other.contiguous().to_vec();
    let mut c_data = vec![T::zero(); batch_size * m * n];
    #[cfg(feature = "cuda")]
    {
        let flops = m * n * k1;
        if is_f32::<T>() && flops >= 4_000_000 {
            // SAFETY: T == f32 (checked above); &[T] and &[f32] share layout.
            let a_f32: &[f32] = unsafe { std::mem::transmute(a_data.as_slice()) };
            let b_f32: &[f32] = unsafe { std::mem::transmute(b_data.as_slice()) };
            let mut gpu_ok = true;
            for batch in 0..batch_size {
                let ai = a_batch_idx[batch];
                let bi = b_batch_idx[batch];
                let a_slice = &a_f32[ai * a_stride..(ai + 1) * a_stride];
                let b_slice = &b_f32[bi * b_stride..(bi + 1) * b_stride];
                if let Some(c_batch) = cuda_accel::cuda_matmul(a_slice, b_slice, m, n, k1) {
                    // SAFETY: &[f32] -> &[T] with T == f32.
                    c_data[batch * c_stride..(batch + 1) * c_stride]
                        .copy_from_slice(unsafe { std::mem::transmute(c_batch.as_slice()) });
                } else {
                    gpu_ok = false;
                    break;
                }
            }
            if gpu_ok {
                // BUG FIX: this path previously built the output shape from
                // `batch_dims_self` even when the batch dims had been
                // broadcast, so `from_vec` received a shape whose numel did
                // not match `c_data` whenever the operands' batch dims
                // differed. Use the broadcast output dims, exactly like the
                // CPU path below.
                let mut output_shape = if let Some((_, _, ref out_dims)) = broadcast_batch {
                    out_dims.clone()
                } else {
                    batch_dims_self.clone()
                };
                output_shape.push(m);
                output_shape.push(n);
                return Self::from_vec(c_data, &output_shape);
            }
            // GPU failed part-way: discard partial results and redo on CPU.
            c_data = vec![T::zero(); batch_size * m * n];
        }
    }
    for batch in 0..batch_size {
        let ai = a_batch_idx[batch];
        let bi = b_batch_idx[batch];
        let a_slice = &a_data[ai * a_stride..(ai + 1) * a_stride];
        let b_slice = &b_data[bi * b_stride..(bi + 1) * b_stride];
        let c_slice = &mut c_data[batch * c_stride..(batch + 1) * c_stride];
        CpuBackend::matmul(c_slice, a_slice, b_slice, m, n, k1);
    }
    // Result shape is the broadcast batch dims followed by [m, n].
    let mut output_shape = if let Some((_, _, ref out_dims)) = broadcast_batch {
        out_dims.clone()
    } else {
        batch_dims_self
    };
    output_shape.push(m);
    output_shape.push(n);
    Self::from_vec(c_data, &output_shape)
}
/// Dot product of two 1-D tensors of equal length, returned as a scalar
/// tensor.
///
/// # Errors
/// Returns an error if either operand is not 1-D or the lengths differ.
pub fn dot(&self, other: &Self) -> Result<Self> {
    if self.ndim() != 1 || other.ndim() != 1 {
        return Err(Error::invalid_operation("dot requires 1D tensors"));
    }
    if self.shape[0] != other.shape[0] {
        return Err(Error::shape_mismatch(&self.shape, &other.shape));
    }
    let lhs = self.to_vec();
    let rhs = other.to_vec();
    Ok(Self::scalar(CpuBackend::dot(&lhs, &rhs)))
}
}
impl<T: Numeric> Add for &Tensor<T> {
type Output = Tensor<T>;
fn add(self, other: Self) -> Self::Output {
self.add(other).expect("Addition failed")
}
}
impl<T: Numeric> Sub for &Tensor<T> {
type Output = Tensor<T>;
fn sub(self, other: Self) -> Self::Output {
self.sub(other).expect("Subtraction failed")
}
}
impl<T: Numeric> Mul for &Tensor<T> {
type Output = Tensor<T>;
fn mul(self, other: Self) -> Self::Output {
self.mul(other).expect("Multiplication failed")
}
}
impl<T: Numeric> Div for &Tensor<T> {
type Output = Tensor<T>;
fn div(self, other: Self) -> Self::Output {
self.div(other).expect("Division failed")
}
}
impl<T: Numeric> Neg for &Tensor<T> {
type Output = Tensor<T>;
fn neg(self) -> Self::Output {
self.neg()
}
}
impl<T: Numeric> Add<T> for &Tensor<T> {
type Output = Tensor<T>;
fn add(self, scalar: T) -> Self::Output {
self.add_scalar(scalar)
}
}
impl<T: Numeric> Mul<T> for &Tensor<T> {
type Output = Tensor<T>;
fn mul(self, scalar: T) -> Self::Output {
self.mul_scalar(scalar)
}
}
impl<T: Scalar + fmt::Display> fmt::Debug for Tensor<T> {
    /// Compact debug form: shape and device always; the data is included
    /// only for small tensors (at most 10 elements).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "Tensor(shape={:?}, device={}", self.shape(), self.device())?;
        let small = self.numel() <= 10;
        if small {
            write!(f, ", data={:?}", self.to_vec())?;
        }
        f.write_str(")")
    }
}
impl<T: Scalar + fmt::Display> fmt::Display for Tensor<T> {
    /// Human-readable form: bare value for scalars, `[a, b, c]` for 1-D
    /// tensors, and a shape summary for anything higher-dimensional.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.is_scalar() {
            return write!(f, "{}", self.item().unwrap());
        }
        if self.ndim() != 1 {
            return write!(f, "Tensor(shape={:?})", self.shape());
        }
        // 1-D: render each element and join with ", " — same bytes as
        // writing the elements one at a time with separators.
        let rendered: Vec<String> = self.to_vec().iter().map(ToString::to_string).collect();
        write!(f, "[{}]", rendered.join(", "))
    }
}
impl Tensor<f32> {
    /// Simulates f16 storage: round-trips every element through
    /// `half::f16`, returning an f32 tensor carrying f16 precision.
    #[must_use]
    pub fn to_f16_precision(&self) -> Self {
        let rounded: Vec<f32> = self
            .to_vec()
            .into_iter()
            .map(|v| half::f16::from_f32(v).to_f32())
            .collect();
        Self::from_vec(rounded, self.shape()).unwrap()
    }

    /// Identity at f32 precision (returns a clone of `self`).
    #[must_use]
    pub fn to_f32_precision(&self) -> Self {
        self.clone()
    }

    /// True when any element changes by more than `f32::EPSILON` after an
    /// f16 round-trip.
    #[must_use]
    pub fn has_f16_rounding_error(&self) -> bool {
        self.to_vec()
            .iter()
            .any(|&v| (half::f16::from_f32(v).to_f32() - v).abs() > f32::EPSILON)
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Construction from a flat vec: shape and element count are preserved.
#[test]
fn test_from_vec() {
let t = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3]).unwrap();
assert_eq!(t.shape(), &[2, 3]);
assert_eq!(t.numel(), 6);
}
// Indexed get/set round-trips in row-major order.
// NOTE(review): `set` succeeds through a non-mut binding, so it presumably
// mutates via interior mutability in Storage — confirm intended semantics.
#[test]
fn test_get_set() {
let t = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0, 4.0], &[2, 2]).unwrap();
assert_eq!(t.get(&[0, 0]).unwrap(), 1.0);
assert_eq!(t.get(&[0, 1]).unwrap(), 2.0);
assert_eq!(t.get(&[1, 0]).unwrap(), 3.0);
assert_eq!(t.get(&[1, 1]).unwrap(), 4.0);
t.set(&[0, 0], 99.0).unwrap();
assert_eq!(t.get(&[0, 0]).unwrap(), 99.0);
}
// Reshape to an explicit shape and via a -1 (inferred) dimension.
#[test]
fn test_reshape() {
let t = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3]).unwrap();
let r = t.reshape(&[3, 2]).expect("reshape failed");
assert_eq!(r.shape(), &[3, 2]);
let r = t.reshape(&[-1]).expect("reshape failed");
assert_eq!(r.shape(), &[6]);
}
// 2-D transpose swaps the axes and the element access pattern.
#[test]
fn test_transpose() {
let t = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3]).unwrap();
let r = t.t().unwrap();
assert_eq!(r.shape(), &[3, 2]);
assert_eq!(r.get(&[0, 0]).unwrap(), 1.0);
assert_eq!(r.get(&[0, 1]).unwrap(), 4.0);
assert_eq!(r.get(&[1, 0]).unwrap(), 2.0);
}
// Operator overloads: element-wise + and * on same-shape tensors.
#[test]
fn test_arithmetic() {
let a = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0], &[3]).unwrap();
let b = Tensor::<f32>::from_vec(vec![4.0, 5.0, 6.0], &[3]).unwrap();
let c = &a + &b;
assert_eq!(c.to_vec(), vec![5.0, 7.0, 9.0]);
let d = &a * &b;
assert_eq!(d.to_vec(), vec![4.0, 10.0, 18.0]);
}
// A [1]-shaped tensor broadcasts across a [3]-shaped one.
#[test]
fn test_broadcasting() {
let a = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0], &[3]).unwrap();
let b = Tensor::<f32>::from_vec(vec![10.0], &[1]).unwrap();
let c = &a + &b;
assert_eq!(c.to_vec(), vec![11.0, 12.0, 13.0]);
}
// Full reduction: sum over all elements yields a single-value tensor.
#[test]
fn test_sum() {
let t = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0, 4.0], &[4]).unwrap();
let s = t.sum();
assert_eq!(s.item().unwrap(), 10.0);
}
// 2x2 matrix product against a hand-computed result.
#[test]
fn test_matmul() {
let a = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0, 4.0], &[2, 2]).unwrap();
let b = Tensor::<f32>::from_vec(vec![5.0, 6.0, 7.0, 8.0], &[2, 2]).unwrap();
let c = a.matmul(&b).unwrap();
assert_eq!(c.shape(), &[2, 2]);
assert_eq!(c.to_vec(), vec![19.0, 22.0, 43.0, 50.0]);
}
// ReLU clamps negatives to zero and passes non-negatives through.
#[test]
fn test_relu() {
let t = Tensor::<f32>::from_vec(vec![-1.0, 0.0, 1.0, 2.0], &[4]).unwrap();
let r = t.relu();
assert_eq!(r.to_vec(), vec![0.0, 0.0, 1.0, 2.0]);
}
// Scalar tensors: one element, and `item` returns that value.
#[test]
fn test_scalar() {
let s = Tensor::<f32>::scalar(42.0);
assert!(s.is_scalar());
assert_eq!(s.numel(), 1);
assert_eq!(s.item().unwrap(), 42.0);
}
}