zyx 0.15.2 - Docs.rs

// Copyright (C) 2025 zk4x
// SPDX-License-Identifier: LGPL-3.0-only

//! Tensor
//!
//! Tensors are at the core of all machine learning.

#![allow(clippy::fallible_impl_from)]

use crate::dtype::DType;
use crate::error::ZyxError;
use crate::kernel::{BOp, UOp};
use crate::runtime::{TempData, apply_padding};
use crate::scalar::{Float, Scalar};
use crate::shape::{Dim, IntoShape, UAxis, into_axes, into_axis};
use crate::slab::SlabId;
use crate::{DebugMask, RT};
use core::cmp::Ordering;
use half::{bf16, f16};
use std::fmt::{Debug, Display};
use std::iter::{once, repeat_n};
use std::ops::{Bound, Mul, Neg, Not, Range, RangeBounds};
use std::path::Path;

#[cfg(feature = "py")]
pub use index_ops::DimIndex;
#[cfg(feature = "py")]
pub use reduce_ops::ReduceOp;

mod binary_ops;
mod elementwise;
mod index_ops;
mod reduce_ops;

/// Signed axis, when we need negative axes for indexing, reduces and so on...
pub type Axis = i32;

/// Identifier of a tensor node, stored as a raw `u32` index.
///
/// The value `u32::MAX` is reserved as the null sentinel, see [`TensorId::null`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TensorId(pub u32);

impl TensorId {
    /// Returns the sentinel id that does not refer to any tensor.
    pub const fn null() -> Self {
        Self(u32::MAX)
    }

    /// Returns `true` if this id is the null sentinel.
    pub const fn is_null(self) -> bool {
        self.0 == u32::MAX
    }
}

impl From<usize> for TensorId {
    /// Converts an index into a tensor id.
    ///
    /// # Panics
    ///
    /// Panics if `value` does not fit into `u32`. A truncating cast would
    /// silently map two different indices onto the same id.
    fn from(value: usize) -> Self {
        match u32::try_from(value) {
            Ok(index) => Self(index),
            Err(_) => panic!("tensor index {} does not fit into a u32 TensorId", value),
        }
    }
}

impl From<TensorId> for usize {
    /// Returns the raw index of the id, widened to `usize`.
    fn from(value: TensorId) -> usize {
        value.0 as usize
    }
}

impl SlabId for TensorId {
    /// Id with raw index `0`.
    const ZERO: Self = Self(0);
    /// Sentinel id, identical to [`TensorId::null`].
    const NULL: Self = Self::null();

    /// Advances this id to the next raw index.
    fn inc(&mut self) {
        *self = Self(self.0 + 1);
    }
}

impl Display for TensorId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("{}", self.0))
    }
}

/// A tensor represents a multi-dimensional array of values. This is the primary data structure in the library.
///
/// The `Tensor` struct only stores an internal identifier (`id`) that refers to the tensor
/// inside the global runtime. A tensor handle is therefore only 4 bytes, but it is reference
/// counted, so it is not `Copy`. Clones are cheap, but require locking a mutex.
///
/// ## Initialization
///
/// Tensors are initialized using [`Tensor::from`].
/// This works for initialization from arrays, vectors or scalars. Arrays can be nested.
///
/// For initialization from various random distributions, check the respective associated
/// functions such as [`Tensor::rand`], [`Tensor::randn`] or [`Tensor::uniform`].
#[cfg_attr(feature = "py", pyo3::pyclass(from_py_object))]
pub struct Tensor {
    // Identifier of this tensor in the global runtime (`RT`).
    pub(super) id: TensorId,
}

impl Clone for Tensor {
    /// Creates another handle to the same tensor and registers it with the runtime
    /// through `retain`.
    fn clone(&self) -> Self {
        let id = self.id;
        RT.lock().retain(id);
        Self { id }
    }
}

impl Drop for Tensor {
    /// Releases this handle's id in the runtime when the handle goes out of scope.
    ///
    /// The release is best effort: it never blocks on the runtime lock and never
    /// lets a panic escape from `drop`.
    fn drop(&mut self) {
        // Swallow any panic raised while releasing, so it cannot propagate out of `drop`.
        let _ = std::panic::catch_unwind(|| {
            let mut rt = match RT.try_lock() {
                Ok(rt) => rt,
                // NOTE(review): every `try_lock` error returns here without calling `release`.
                // If that includes a lock that is merely held elsewhere (not only a poisoned
                // one), the id is never released - confirm against the mutex implementation.
                Err(_poisoned) => return, // poisoned.into_inner(),
            };
            rt.release(self.id);
        });
    }
}

impl crate::Module for Tensor {
    /// A single tensor is a module that contains exactly itself.
    fn iter(&self) -> impl Iterator<Item = &Tensor> {
        std::iter::once(self)
    }

    /// Mutable counterpart of `iter`, yields only `self`.
    fn iter_mut(&mut self) -> impl Iterator<Item = &mut Tensor> {
        std::iter::once(self)
    }

    /// Yields `self` labelled with its id printed as a string.
    fn iter_tensors(&self) -> impl Iterator<Item = (String, &Tensor)> {
        let label = self.id.to_string();
        std::iter::once((label, self))
    }

    /// Mutable counterpart of `iter_tensors`.
    fn iter_tensors_mut(&mut self) -> impl Iterator<Item = (String, &mut Tensor)> {
        let label = self.id.to_string();
        std::iter::once((label, self))
    }
}

/// Zips a tuple of iterables into a single iterator over tuples of their items.
trait TupleZip: Sized {
    /// Tuple produced on every step.
    type Item;
    /// Concrete iterator returned by [`TupleZip::zip`].
    type IntoIter: Iterator<Item = Self::Item>;

    /// Consumes the tuple and walks all members in lockstep, stopping at the shortest one.
    fn zip(self) -> Self::IntoIter;
}

/// Pairs: `(a, b)` yields `(a_i, b_i)`.
impl<IA, IB, T> TupleZip for (IA, IB)
where
    IA: IntoIterator<Item = T>,
    IB: IntoIterator<Item = T>,
    T: Copy,
{
    type Item = (T, T);
    type IntoIter = std::iter::Zip<IA::IntoIter, IB::IntoIter>;

    fn zip(self) -> Self::IntoIter {
        let (first, second) = self;
        Iterator::zip(first.into_iter(), second)
    }
}

/// Triples: `(a, b, c)` yields flat `(a_i, b_i, c_i)` tuples.
impl<IA, IB, IC, T> TupleZip for (IA, IB, IC)
where
    IA: IntoIterator<Item = T>,
    IB: IntoIterator<Item = T>,
    IC: IntoIterator<Item = T>,
    T: Copy,
{
    type Item = (T, T, T);
    type IntoIter =
        std::iter::Map<std::iter::Zip<std::iter::Zip<IA::IntoIter, IB::IntoIter>, IC::IntoIter>, fn(((T, T), T)) -> (T, T, T)>;

    fn zip(self) -> Self::IntoIter {
        let (first, second, third) = self;
        // A plain function pointer keeps the iterator type nameable in `IntoIter`.
        let flatten: fn(((T, T), T)) -> (T, T, T) = |((a, b), c)| (a, b, c);
        first.into_iter().zip(second).zip(third).map(flatten)
    }
}

/// Quadruples: `(a, b, c, d)` yields flat `(a_i, b_i, c_i, d_i)` tuples.
impl<IA, IB, IC, ID, T> TupleZip for (IA, IB, IC, ID)
where
    IA: IntoIterator<Item = T>,
    IB: IntoIterator<Item = T>,
    IC: IntoIterator<Item = T>,
    ID: IntoIterator<Item = T>,
    T: Copy,
{
    type Item = (T, T, T, T);
    type IntoIter = std::iter::Map<
        std::iter::Zip<std::iter::Zip<std::iter::Zip<IA::IntoIter, IB::IntoIter>, IC::IntoIter>, ID::IntoIter>,
        fn((((T, T), T), T)) -> (T, T, T, T),
    >;

    fn zip(self) -> Self::IntoIter {
        let (first, second, third, fourth) = self;
        // A plain function pointer keeps the iterator type nameable in `IntoIter`.
        let flatten: fn((((T, T), T), T)) -> (T, T, T, T) = |(((a, b), c), d)| (a, b, c, d);
        first.into_iter().zip(second).zip(third).zip(fourth).map(flatten)
    }
}

impl Tensor {
    /// Returns the shape of the tensor as an owned vector.
    ///
    /// Every element of the returned vector is the size of one dimension,
    /// in order from the outermost to the innermost dimension.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use zyx::Tensor;
    ///
    /// let t = Tensor::from([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]]);
    /// assert_eq!(t.shape(), vec![2, 4]);
    /// ```
    ///
    /// # Returns
    ///
    /// A `Vec<Dim>` containing the shape of the tensor.
    #[must_use]
    pub fn shape(&self) -> Vec<Dim> {
        let rt = RT.lock();
        rt.shape(self.id).to_vec()
    }

    /// Returns whether the runtime reports this tensor as realized.
    ///
    /// The answer comes directly from the runtime's `is_realized` query for this tensor's id.
    pub fn is_realized(&self) -> bool {
        RT.lock().is_realized(self.id)
    }

    /// Returns true if the device supports the given dtype.
    ///
    /// This is an associated function; the answer comes from the global runtime,
    /// not from any particular tensor.
    #[must_use]
    pub fn supports(dtype: DType) -> bool {
        RT.lock().supports_dtype(dtype)
    }

    /// Returns the first `N` dimensions of this tensor as a fixed-size array.
    ///
    /// # Parameters
    ///
    /// * `const N: usize` - The number of leading dimensions to return.
    ///
    /// # Errors
    ///
    /// Returns a shape error if `N` is greater than the rank of this tensor.
    ///
    /// # Examples
    ///
    /// ```
    /// # use zyx::Tensor;
    /// let t = Tensor::from([[2, 3, 2], [4, 5, 1]]);
    /// let [d1, d2] = t.dims().unwrap();
    /// assert_eq!(d1, 2);
    /// assert_eq!(d2, 3);
    /// ```
    #[allow(clippy::missing_panics_doc)]
    pub fn dims<const N: usize>(&self) -> Result<[Dim; N], ZyxError> {
        let rt = RT.lock();
        let shape = rt.shape(self.id);

        // `get` yields `None` exactly when `N` exceeds the rank.
        let Some(leading) = shape.get(..N) else {
            return Err(ZyxError::shape_error(
                format!("Requested {N} dims, but tensor only has rank of {}", shape.len()).into(),
            ));
        };

        let mut first_dims = [1; N];
        first_dims.copy_from_slice(leading);
        Ok(first_dims)
    }

    /// Asks the global runtime to run its `debug_graph` routine.
    ///
    /// This is an associated function; it is not tied to any particular tensor.
    pub fn debug_graph() {
        RT.lock().debug_graph();
    }

    /// Returns the counter stored next to this tensor's node in the runtime graph.
    pub fn ref_count(&self) -> u32 {
        let rt = RT.lock();
        let node = &rt.graph.nodes[self.id];
        node.0
    }

    /// Returns the last `N` dimensions of this tensor as a fixed-size array.
    ///
    /// # Parameters
    ///
    /// * `const N: usize` - The number of trailing dimensions to return.
    ///
    /// # Errors
    ///
    /// Returns a shape error if `N` is greater than the rank of this tensor.
    ///
    /// # Examples
    ///
    /// ```
    /// # use zyx::Tensor;
    /// let t = Tensor::from([[2, 3, 2], [4, 5, 1]]);
    /// let [d2] = t.rdims().unwrap();
    /// assert_eq!(d2, 3);
    /// ```
    pub fn rdims<const N: usize>(&self) -> Result<[Dim; N], ZyxError> {
        let rt = RT.lock();
        let shape = rt.shape(self.id);

        // The subtraction fails exactly when `N` exceeds the rank.
        let Some(start) = shape.len().checked_sub(N) else {
            return Err(ZyxError::shape_error(
                format!("Requested {N} dims, but tensor only has rank of {}", shape.len()).into(),
            ));
        };

        let mut trailing = [1; N];
        trailing.copy_from_slice(&shape[start..]);
        Ok(trailing)
    }

    /// Returns the total number of elements in the tensor.
    ///
    /// The result is the product of all dimensions of the tensor's shape.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use zyx::Tensor;
    /// let t = Tensor::from([[2, 3, 2], [4, 5, 1]]);
    /// assert_eq!(t.numel(), 6);
    /// ```
    ///
    /// # Returns
    ///
    /// A `Dim` representing the total number of elements in the tensor.
    ///
    /// # Notes
    ///
    /// The method locks the runtime (`RT.lock()`) while it reads the shape.
    #[must_use]
    pub fn numel(&self) -> Dim {
        let rt = RT.lock();
        let shape = rt.shape(self.id);
        shape.iter().product()
    }

    /// Returns the number of dimensions (rank) of the tensor.
    ///
    /// The rank equals the length of the shape returned by [`Tensor::shape`].
    ///
    /// # Examples
    ///
    /// ```rust
    /// use zyx::Tensor;
    /// let t = Tensor::from([[2, 3], [4, 1]]);
    /// assert_eq!(t.rank(), 2);
    /// ```
    ///
    /// # Returns
    ///
    /// The rank of the tensor as a `Dim`.
    #[must_use]
    pub fn rank(&self) -> Dim {
        let rt = RT.lock();
        let ndim = rt.shape(self.id).len();
        ndim as Dim
    }

    /// Returns the data type of the tensor.
    ///
    /// The dtype determines the kind of data stored in the tensor (e.g. `F32`, `I64`).
    /// It is looked up in the global runtime by this tensor's id.
    /// See [`DType`](crate::DType) for available datatypes.
    #[must_use]
    pub fn dtype(&self) -> DType {
        RT.lock().dtype(self.id)
    }

    /// Returns whether the global runtime is currently in training mode.
    #[must_use]
    pub fn training() -> bool {
        let rt = RT.lock();
        rt.training
    }

    /// Switches training mode of the global runtime on or off.
    pub fn set_training(training: bool) {
        let mut rt = RT.lock();
        rt.training = training;
    }

    /// Returns whether implicit casting is enabled in the global runtime.
    /// Implicit casts are enabled by default.
    #[must_use]
    pub fn implicit_casts() -> bool {
        let rt = RT.lock();
        rt.implicit_casts
    }

    /// Enables or disables implicit casting in the global runtime.
    /// Implicit casts are enabled by default.
    pub fn set_implicit_casts(implicit_casts: bool) {
        let mut rt = RT.lock();
        rt.implicit_casts = implicit_casts;
    }

    /// Immediately evaluates the passed tensors. This will asynchronously enqueue the
    /// computational graph to the device, but it will not block (await). This is for
    /// performance reasons. Actual blocking only happens when you access a tensor by
    /// printing it, converting it to a vector, or by some other operation that requires
    /// the host to have access to the data stored in the tensor.
    ///
    /// # Errors
    /// Returns device error if the device fails to realize one or more tensors.
    pub fn realize<'a>(tensors: impl IntoIterator<Item = &'a Tensor>) -> Result<(), ZyxError> {
        let mut rt = RT.lock();
        // The collection type is dictated by the parameter of `realize_selected`.
        let ids = tensors.into_iter().map(|tensor| tensor.id).collect();
        rt.realize_selected(&ids)
    }

    /// Realizes all tensors currently held by the user.
    ///
    /// # Errors
    ///
    /// Returns error if any tensor cannot be realized.
    pub fn realize_all() -> Result<(), ZyxError> {
        let mut rt = RT.lock();
        rt.realize_all()
    }

    /// Realizes this single tensor.
    ///
    /// # Errors
    /// Returns device error if the tensor cannot be realized.
    pub fn realize_self(&self) -> Result<(), ZyxError> {
        Self::realize([self])
    }

    /// Loads a single value of type `T` from this tensor and returns it.
    ///
    /// The runtime is asked to load the tensor into a one-element buffer,
    /// and that element is returned.
    ///
    /// # Panics
    ///
    /// Panics if the runtime fails to load the tensor into the buffer.
    #[allow(clippy::missing_panics_doc)]
    pub fn item<T: Scalar>(&self) -> T {
        let mut buffer = [T::zero()];
        RT.lock().load(self.id, &mut buffer).unwrap();
        let [value] = buffer;
        value
    }

    /// Detaches tensor from graph.
    /// This function returns a new tensor with the same data as the previous one,
    /// but drops it's backpropagation graph. This is usefull for recurrent networks:
    /// ```rust no_run
    /// use zyx::{Tensor, DType};
    /// let mut x = Tensor::randn([8, 8], DType::F32)?;
    /// let z = Tensor::randn([8], DType::F32)?;
    /// for _ in 0..100 {
    ///     // Without detach the graph would grow bigger with every iteration
    ///     x = x.detach()? + &z;
    /// }
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    /// [`GradientTape`](crate::GradientTape) limits scope of backpropagation graph, therefore detach
    /// is only required in very advanced cases, not in simple RNNs.
    ///
    /// # Errors
    /// If function needs to realize tensor, it may return device error if the device
    /// fails to realize self.
    pub fn detach(self) -> Result<Tensor, ZyxError> {
        // TODO remove realization from here
        let shape = self.shape();
        let id = match self.dtype() {
            DType::BF16 => {
                let data: Vec<bf16> = self.try_into()?;
                RT.lock().new_tensor(shape, data)
            }
            DType::F16 => {
                let data: Vec<f16> = self.try_into()?;
                RT.lock().new_tensor(shape, data)
            }
            DType::F32 => {
                let data: Vec<f32> = self.try_into()?;
                RT.lock().new_tensor(shape, data)
            }
            DType::F64 => {
                let data: Vec<f64> = self.try_into()?;
                RT.lock().new_tensor(shape, data)
            }
            DType::U8 => {
                let data: Vec<u8> = self.try_into()?;
                RT.lock().new_tensor(shape, data)
            }
            DType::U16 => {
                let data: Vec<u16> = self.try_into()?;
                RT.lock().new_tensor(shape, data)
            }
            DType::U32 => {
                let data: Vec<u32> = self.try_into()?;
                RT.lock().new_tensor(shape, data)
            }
            DType::U64 => {
                let data: Vec<u64> = self.try_into()?;
                RT.lock().new_tensor(shape, data)
            }
            DType::I8 => {
                let data: Vec<i8> = self.try_into()?;
                RT.lock().new_tensor(shape, data)
            }
            DType::I16 => {
                let data: Vec<i16> = self.try_into()?;
                RT.lock().new_tensor(shape, data)
            }
            DType::I32 => {
                let data: Vec<i32> = self.try_into()?;
                RT.lock().new_tensor(shape, data)
            }
            DType::I64 => {
                let data: Vec<i64> = self.try_into()?;
                RT.lock().new_tensor(shape, data)
            }
            DType::Bool => {
                let data: Vec<bool> = self.try_into()?;
                RT.lock().new_tensor(shape, data)
            }
        }?;
        Ok(Tensor { id })
    }

    /// Create debug guard at the beginning of the block to debug that block.
    /// The guard remembers the debug mask that was active before this call, while
    /// the runtime switches to `debug`. Once the guard is dropped, debug gets reset
    /// to global state, the one set by `ZYX_DEBUG` env variable.
    /// For more look at `ENV_VARS.md`
    #[must_use]
    pub fn with_debug(debug: DebugMask) -> DebugGuard {
        let mut rt = RT.lock();
        // Install the new mask and keep the old one inside the guard.
        let previous = std::mem::replace(&mut rt.debug, debug);
        DebugGuard { debug: previous }
    }

    /// Write graph of operations between tensors as an svg image named `{name}.svg`.
    ///
    /// The graph is first written as `{name}.dot`. If the `dot` program is found in the
    /// path and finishes successfully, the svg is created and the intermediate dot file is
    /// removed. If `dot` cannot be started or exits with a failure, the dot file is kept
    /// so that it can be converted manually.
    ///
    /// # Errors
    /// Returns error if the dot graph file failed to write to disk.
    pub fn plot_graph<'a>(tensors: impl IntoIterator<Item = &'a Tensor>, name: &str) -> Result<(), std::io::Error> {
        let path = format!("{name}.dot");
        // Collect ids before taking the runtime lock, so the lock is not held
        // while the caller's iterator runs.
        let ids = tensors.into_iter().map(|t| t.id).collect();
        let graph = RT.lock().plot_dot_graph(&ids);
        std::fs::write(&path, graph)?;
        println!("Path: {path:?}");
        let output = std::process::Command::new("dot")
            .arg("-Tsvg")
            .arg(&path)
            .arg("-o")
            .arg(format!("{name}.svg"))
            .output();
        match output {
            // Only a successful conversion makes the dot file redundant.
            Ok(output) if output.status.success() => {
                let _ = std::fs::remove_file(path);
            }
            Ok(output) => {
                let stderr = String::from_utf8_lossy(&output.stderr);
                println!(
                    "Graph svg could not be created: dot exited with {}: {}",
                    output.status,
                    stderr.trim()
                );
            }
            Err(err) => println!("Graph svg could not be created: {err}"),
        }
        Ok(())
    }

    /// Manually sets the seed for the random number generator of the global runtime.
    /// This function is only available if the `rand` feature is enabled.
    // NOTE(review): no `cfg(feature = "rand")` gate is visible on this function - confirm
    // whether the sentence about the `rand` feature above still applies.
    pub fn manual_seed(seed: u64) {
        RT.lock().manual_seed(seed);
    }

    /// Create random value in range 0f..1f with float dtype
    /// or 0..`{integer}::MAX` if it is integer
    /// # Errors
    /// Returns device error if the device fails to allocate memory for tensor.
    #[allow(clippy::missing_panics_doc, reason = "all panics are checked ahead")]
    pub fn rand(shape: impl IntoShape, dtype: DType) -> Result<Tensor, ZyxError> {
        let shape: Vec<Dim> = shape.into_shape().collect();
        let n = shape.iter().product();
        if dtype.is_float() {
            // TODO later use threefry
            let mut rt = RT.lock();
            match dtype {
                DType::BF16 => {
                    let data: Vec<bf16> = (0..n).map(|_| rt.rng.rand()).collect();
                    Ok(Tensor { id: rt.new_tensor(shape, data)? })
                }
                DType::F16 => {
                    let data: Vec<f16> = (0..n).map(|_| rt.rng.rand()).collect();
                    Ok(Tensor { id: rt.new_tensor(shape, data)? })
                }
                DType::F32 => {
                    let data: Vec<f32> = (0..n).map(|_| rt.rng.rand()).collect();
                    Ok(Tensor { id: rt.new_tensor(shape, data)? })
                }
                DType::F64 => {
                    let data: Vec<f64> = (0..n).map(|_| rt.rng.rand()).collect();
                    Ok(Tensor { id: rt.new_tensor(shape, data)? })
                }
                DType::U8
                | DType::U16
                | DType::U32
                | DType::U64
                | DType::I8
                | DType::I16
                | DType::I32
                | DType::I64
                | DType::Bool => panic!(),
            }
        } else {
            let mut rt = RT.lock();
            match dtype {
                DType::U8 => {
                    let data: Vec<u8> = (0..n).map(|_| rt.rng.rand()).collect();
                    Ok(Tensor { id: rt.new_tensor(shape, data)? })
                }
                DType::U16 => {
                    let data: Vec<u16> = (0..n).map(|_| rt.rng.rand()).collect();
                    Ok(Tensor { id: rt.new_tensor(shape, data)? })
                }
                DType::U32 => {
                    let data: Vec<u32> = (0..n).map(|_| rt.rng.rand()).collect();
                    Ok(Tensor { id: rt.new_tensor(shape, data)? })
                }
                DType::U64 => {
                    let data: Vec<u64> = (0..n).map(|_| rt.rng.rand()).collect();
                    Ok(Tensor { id: rt.new_tensor(shape, data)? })
                }
                DType::I8 => {
                    let data: Vec<i8> = (0..n).map(|_| rt.rng.rand()).collect();
                    Ok(Tensor { id: rt.new_tensor(shape, data)? })
                }
                DType::I16 => {
                    let data: Vec<i16> = (0..n).map(|_| rt.rng.rand()).collect();
                    Ok(Tensor { id: rt.new_tensor(shape, data)? })
                }
                DType::I32 => {
                    let data: Vec<i32> = (0..n).map(|_| rt.rng.rand()).collect();
                    Ok(Tensor { id: rt.new_tensor(shape, data)? })
                }
                DType::I64 => {
                    let data: Vec<i64> = (0..n).map(|_| rt.rng.rand()).collect();
                    Ok(Tensor { id: rt.new_tensor(shape, data)? })
                }
                DType::Bool => Err(ZyxError::dtype_error("Uniform is not supported for bool".into())),
                DType::BF16 | DType::F16 | DType::F32 | DType::F64 => unreachable!(),
            }
        }
    }

    // Initializers
    /// Create tensor sampled from the standard normal distribution.
    ///
    /// Two sets of uniform `F32` samples are drawn and combined with the Box-Muller
    /// transform; the result is cast to `dtype`.
    ///
    /// # Errors
    /// Returns device error if device fails to allocate memory for given tensor.
    pub fn randn(shape: impl IntoShape, dtype: DType) -> Result<Tensor, ZyxError> {
        // https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
        // A leading axis of size 2 holds the two independent uniform samples.
        let stacked_shape: Vec<Dim> = once(2).chain(shape.into_shape()).collect();
        let uniform = Tensor::rand(stacked_shape, DType::F32)?;
        let angle = uniform.slice(0)?.mul(2f32 * std::f32::consts::PI).cos();
        let radius = (1f32 - uniform.slice(1)?).ln().mul(-2f32).sqrt();
        Ok((angle * radius).cast(dtype))
    }

    /// Draws `num_samples` indices per row, with `self` holding the (unnormalized) weights.
    ///
    /// `self` must have rank 1 or 2; a rank 1 input is treated as a single row. The weights
    /// are turned into a normalized cumulative distribution, uniform samples are compared
    /// against it and the resulting indices are returned with dtype `I32`.
    ///
    /// Sampling without replacement is only supported for `num_samples == 1`. Both
    /// preconditions are checked with debug assertions only.
    ///
    /// # Errors
    /// Returns device error if the device fails to allocate memory for tensor.
    #[allow(clippy::missing_panics_doc, reason = "TODO disallow panicking")]
    pub fn multinomial(&self, num_samples: Dim, replacement: bool) -> Result<Tensor, ZyxError> {
        let shape = self.shape();
        let rank = shape.len();
        debug_assert!((1..=2).contains(&rank) && num_samples > 0, "rank={rank} must be 1 or 2");
        debug_assert!(
            replacement || num_samples == 1,
            "no replacement only supports num_samples = 1"
        );

        let weight = match rank {
            1 => self.unsqueeze(0)?,
            _ => self.clone(),
        };

        // Normalized cumulative distribution along axis 1.
        let cumulative = weight.cumsum(1)?.cast(DType::F32);
        let totals = cumulative.slice((.., -1))?.unsqueeze(1)?;
        let cdf = &cumulative / totals;
        let cdf_shape = cdf.shape();

        let samples = Tensor::rand([num_samples, cdf_shape[0], 1], DType::F32)?;
        // The index of a sample is the number of cdf entries that it is not less than.
        let indices = samples
            .expand([num_samples, cdf_shape[0], cdf_shape[1]])?
            .cmplt(cdf)?
            .not()
            .sum([2])?
            .permute([1, 0])?;

        let indices = if rank == 1 { indices.squeeze([0]) } else { indices };
        Ok(indices.cast(DType::I32))
    }

    /// Create tensor sampled from uniform distribution.
    /// Start of the range must be less than the end of the range.
    ///
    /// The bounds are converted to `f32`; inclusive and exclusive bounds are treated the
    /// same way. An unbounded start or end falls back to `f32::min_value()` and
    /// `f32::max_value()` respectively. Samples are drawn as `F32`, scaled to the range
    /// and cast to the dtype of `T`.
    ///
    /// # Errors
    /// Returns device error if the device fails to allocate memory for tensor.
    pub fn uniform<T: Scalar>(shape: impl IntoShape, range: impl core::ops::RangeBounds<T>) -> Result<Tensor, ZyxError> {
        use core::ops::Bound;
        let low: f32 = match range.start_bound() {
            Bound::Unbounded => f32::min_value(),
            Bound::Included(start) | Bound::Excluded(start) => start.cast(),
        };
        let high: f32 = match range.end_bound() {
            Bound::Unbounded => f32::max_value(),
            Bound::Included(end) | Bound::Excluded(end) => end.cast(),
        };
        let width = high.sub(low);
        let unit = Tensor::rand(shape, DType::F32)?;
        Ok((unit * width + low).cast(T::dtype()))
    }

    /// Create tensor sampled from kaiming uniform distribution.
    ///
    /// Values are drawn uniformly from `-bound..bound`, where
    /// `bound = gain * sqrt(3 / fan_in)`, `gain = sqrt(2 / (1 + a^2))` and `fan_in` is
    /// the product of all dimensions of `shape` except the first one.
    ///
    /// # Errors
    /// Returns device error if the device fails to allocate memory for tensor.
    #[allow(clippy::missing_panics_doc)]
    pub fn kaiming_uniform<T: Float>(shape: impl IntoShape, a: T) -> Result<Tensor, ZyxError> {
        let fan_in = T::from_i64(shape.clone().into_shape().skip(1).product::<Dim>().try_into().unwrap());
        let one = T::one();
        let two = Scalar::add(one, one);
        let three = Scalar::add(two, one);
        // gain = sqrt(2 / (1 + a^2))
        let gain = Scalar::div(two, Scalar::add(one, Scalar::mul(a, a))).sqrt();
        // bound = sqrt(3) * gain / sqrt(fan_in); dividing by `fan_in` itself would make
        // the initial weights too small by a factor of sqrt(fan_in).
        let bound = Scalar::mul(three.sqrt(), Scalar::div(gain, fan_in.sqrt()));
        Tensor::uniform(shape, bound.neg()..bound)
    }

    /// Create tensor sampled from glorot uniform distribution.
    /// # Errors
    /// Returns device error if the device fails to allocate memory for tensor.
    #[allow(clippy::cast_precision_loss)]
    pub fn glorot_uniform(shape: impl IntoShape, dtype: DType) -> Result<Tensor, ZyxError> {
        let shape: Vec<_> = shape.into_shape().collect();
        let c = 6. / (shape[0] + shape.iter().skip(1).product::<Dim>()) as f32;
        let mut x = Tensor::uniform(shape, -1f32..1f32)?;
        x = x * c.pow(0.5);
        Ok(x.cast(dtype))
    }

    /// Creates a tensor of the given shape and dtype through the runtime's `zeros` constructor.
    #[must_use]
    pub fn zeros(shape: impl IntoShape, dtype: DType) -> Tensor {
        let id = RT.lock().zeros(shape.into_shape().collect(), dtype);
        Tensor { id }
    }

    /// Creates a tensor filled with zeros that has the same shape and dtype as `input`.
    #[must_use]
    pub fn zeros_like(input: impl Into<Tensor>) -> Tensor {
        let template: Tensor = input.into();
        let shape = template.shape();
        let dtype = template.dtype();
        Tensor::zeros(shape, dtype)
    }

    /// Creates a tensor of the given shape and dtype through the runtime's `ones` constructor.
    #[must_use]
    pub fn ones(shape: impl IntoShape, dtype: DType) -> Tensor {
        let id = RT.lock().ones(shape.into_shape().collect(), dtype);
        Tensor { id }
    }

    /// Creates a tensor filled with ones that has the same shape and dtype as `input`.
    #[must_use]
    pub fn ones_like(input: impl Into<Tensor>) -> Tensor {
        let template: Tensor = input.into();
        let shape = template.shape();
        let dtype = template.dtype();
        Tensor::ones(shape, dtype)
    }

    /// Creates a tensor of the given shape through the runtime's `full` constructor,
    /// passing `value` as the fill value.
    ///
    /// This function is infallible; it always returns a tensor.
    #[allow(clippy::missing_panics_doc)]
    pub fn full(shape: impl IntoShape, value: impl Scalar) -> Tensor {
        let id = RT.lock().full(shape.into_shape().collect(), value);
        Tensor { id }
    }

    /// Create square tensor with ones on the main diagonal and all other values set to zero.
    ///
    /// # Panics
    ///
    /// Panics if `n` does not fit into `i64` or if any of the internal pad, reshape or
    /// slice operations fails.
    #[allow(clippy::missing_panics_doc)]
    #[must_use]
    pub fn eye(n: Dim, dtype: DType) -> Tensor {
        // A column of ones, padded with `n` zeros on the right, gives rows `[1, 0, ..., 0]`
        // of length `n + 1`.
        let column = Tensor::ones(vec![n, 1], dtype);
        let right_padding = i64::try_from(n).unwrap();
        let padded = column.pad_zeros([(0i64, 0i64), (0i64, right_padding)]).unwrap();
        // Reading the same data with row length `n` shifts the one by a column in each row.
        let skewed = padded.reshape([n + 1, n]).unwrap();
        // The extra last row is dropped.
        skewed.slice((..-1, ..)).unwrap()
    }

    /// Creates a one-dimensional range tensor from `start`, `stop` and `step`.
    ///
    /// The number of elements is `(stop - start) / step`, computed in `T` and converted
    /// to an integer. The values are built as a cumulative sum of `step`, shifted so that
    /// the first element equals `start`.
    ///
    /// # Panics
    ///
    /// Panics if the computed number of elements is negative.
    ///
    /// # Errors
    /// Returns device error if the device failed to allocate memory for tensor.
    #[allow(clippy::missing_panics_doc)]
    pub fn arange<T: Scalar>(start: T, stop: T, step: T) -> Result<Tensor, ZyxError> {
        // Reference implementation:
        // if (stop-start)/step <= 0: return Tensor([], dtype=dtype, **kwargs)
        // return (Tensor.full((math.ceil((stop-start)/step),), step, dtype=dtype, **kwargs)._cumsum() + (start - step)).cast(dtype)
        // NOTE(review): the reference rounds the length up (`math.ceil`), while the cast
        // below does not apply an explicit ceil - confirm which one is intended when
        // `stop - start` is not a multiple of `step`.
        let count: i64 = stop.sub(start).div(step).cast();
        let len = Dim::try_from(count).unwrap();
        let steps = Tensor::full(len, step).cumsum(0)?;
        Ok(steps + start - step)
    }

    /// Creates a tensor from the values in `data` with the given `shape`.
    ///
    /// # Errors
    /// Returns allocation failure or backend initialization failure
    pub fn from_vec<T: Scalar>(data: Vec<T>, shape: impl IntoShape) -> Result<Tensor, ZyxError> {
        let dims = shape.into_shape().collect();
        Ok(Tensor { id: RT.lock().new_tensor(dims, data)? })
    }

    // unary
    /// Returns a new tensor with the values of `self` cast to [dtype](crate::DType).
    #[must_use]
    pub fn cast(&self, dtype: DType) -> Tensor {
        let id = RT.lock().cast(self.id, dtype);
        Tensor { id }
    }

    /// Returns a new tensor that reinterprets the bits of `self` as `dtype`,
    /// without mutating `self`.
    /// Currently this function will also realize the tensor (if it is not already realized)
    ///
    /// # Safety
    /// Not all bits of one type can be safely reinterpreted as bits of other type,
    /// therefore this function is marked as unsafe.
    ///
    /// # Errors
    /// Returns device error if the device failed to allocate memory for tensor.
    #[allow(clippy::missing_panics_doc)]
    pub unsafe fn bitcast(&self, dtype: DType) -> Result<Tensor, ZyxError> {
        // SAFETY: the caller of this function upholds the contract stated in `# Safety`.
        let id = unsafe { RT.lock().bitcast(self.id, dtype) }?;
        Ok(Tensor { id })
    }

    /// Applies dropout to the tensor with a given probability.
    ///
    /// In training mode (see [`Tensor::training`]) every element is set to zero with
    /// probability `probability` and kept with probability `1 - probability`. Kept elements
    /// are scaled by `1 / (1 - probability)`, so that the expected value of the output
    /// matches the input. Outside of training mode the input is returned unchanged.
    /// The output tensor has the same shape as the input tensor.
    ///
    /// # Panics
    ///
    /// Panics in training mode if the random mask cannot be created.
    #[allow(clippy::missing_panics_doc)]
    #[must_use]
    pub fn dropout<P: Scalar + Float>(&self, probability: P) -> Tensor {
        // Evaluation is the identity; the compensation happens during training.
        if !Tensor::training() {
            return self.clone();
        }
        // An element is kept where `probability < uniform sample`.
        let keep_mask = Tensor::from(probability)
            .cmplt(Tensor::rand(self.shape(), P::dtype()).unwrap())
            .unwrap();
        let scaled = self / P::one().sub(probability);
        keep_mask * scaled
    }

    /// Linearly interpolates between this tensor and `target`.
    ///
    /// The result is `input * (1 - weight) + target * weight`. Both tensors go through
    /// `float_cast` before the computation, and the result is cast back to the dtype
    /// of `self`.
    ///
    /// **Parameters:**
    ///
    /// * self: Input tensor
    /// * target: Target tensor to interpolate towards
    /// * weight: Interpolation weight. 0.0 returns input, 1.0 returns target.
    ///
    /// **Returns:**
    ///
    /// A new tensor containing the interpolated values.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use zyx::Tensor;
    ///
    /// let input = Tensor::from([1.0f32, 2.0, 3.0]);
    /// let target = Tensor::from([2.0, 4.0, 6.0]);
    /// let interpolated = input.interpolate(&target, 0.5);  // Midway point
    /// // Result: [1.5, 3.0, 4.5] (average of input and target)
    /// ```
    ///
    /// # Panics
    /// Panics if applied on non-float dtype while implicit casting is disabled.
    #[must_use]
    pub fn interpolate(&self, target: &Tensor, weight: f32) -> Tensor {
        let source = self.float_cast().unwrap();
        let destination = target.float_cast().unwrap();
        let output_dtype = self.dtype();

        // Share of the input that remains in the result.
        let source_share = 1.0 - weight;
        let blended = &source * source_share + &destination * weight;

        blended.cast(output_dtype)
    }

    /// Computes the Smooth L1 loss between input and target tensors.
    ///
    /// The Smooth L1 loss combines L1 and L2 loss. It is quadratic for small differences
    /// and linear for large differences, which makes it less sensitive to outliers than
    /// L2 loss while staying smooth around zero.
    ///
    /// Per-element formula:
    /// ```text
    /// smooth_l1_loss(x, y) = {
    ///     0.5 * (x - y)*(x - y),          if |x - y| < 1
    ///     |x - y| - 0.5,                   otherwise
    /// }
    /// ```
    ///
    /// The per-element values are then summed along axis 0 and the result is cast back
    /// to the dtype of `self`.
    ///
    /// **Parameters:**
    ///
    /// * self: Input tensor (predictions)
    /// * target: Target tensor (ground truth)
    ///
    /// **Returns:**
    ///
    /// A new tensor containing the loss summed along axis 0.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use zyx::Tensor;
    ///
    /// let predictions = Tensor::from([1.0, 2.0, 3.0]);
    /// let targets = Tensor::from([1.5, 2.5, 2.8]);
    /// let loss = predictions.smooth_l1_loss(&targets);
    /// // Smooth L1 loss is quadratic for differences < 1.0 and linear otherwise
    /// ```
    ///
    /// # Panics
    /// Panics if applied on non-float dtype while implicit casting is disabled.
    #[must_use]
    pub fn smooth_l1_loss(&self, target: &Tensor) -> Tensor {
        let prediction = self.float_cast().unwrap();
        let truth = target.float_cast().unwrap();
        let output_dtype = self.dtype();

        let error = &prediction - &truth;
        let magnitude = error.abs();
        // Selects the elements that fall into the quadratic region.
        let is_small = magnitude.cmplt(1.0f32).unwrap();

        // Quadratic region: 0.5 * (x - y)²
        let quadratic = 0.5f32 * &error * &error;
        // Linear region: |x - y| - 0.5
        let linear = magnitude - 0.5f32;

        // Each element takes its value from exactly one of the two regions.
        let per_element = is_small.clone() * quadratic + is_small.not() * linear;

        // NOTE(review): only axis 0 is reduced here; for inputs of rank > 1 this is not
        // a sum over all elements - confirm the intended reduction.
        let reduced = per_element.sum([0]).unwrap();

        reduced.cast(output_dtype)
    }

    /// Huber loss between `self` (predictions) and `target`, summed along axis 0.
    ///
    /// The loss is quadratic for small errors and linear for large ones, which makes it less
    /// sensitive to outliers than a squared error loss:
    ///
    /// ```text
    /// loss(x, y) = 0.5 * (x - y)*(x - y)                  if |x - y| < delta
    ///              delta * |x - y| - 0.5 * delta*delta    otherwise
    /// ```
    ///
    /// Both branches agree at `|x - y| == delta`.
    ///
    /// Both tensors are cast with `float_cast` for the computation, the element-wise losses are
    /// summed along axis 0 and the result is cast back to the dtype of `self`. For 1-D inputs
    /// this is the total loss.
    ///
    /// **Parameters:**
    ///
    /// * self: Input tensor (predictions)
    /// * target: Target tensor (ground truth)
    /// * delta: Threshold (δ) at which the loss switches from quadratic to linear. It is
    ///   converted to a tensor and cast to the dtype of `self` before use.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use zyx::Tensor;
    ///
    /// let predictions = Tensor::from([1.0, 2.0, 3.0]);
    /// let targets = Tensor::from([1.5, 2.5, 2.8]);
    /// let loss = predictions.huber_loss(&targets, 1.0);
    /// // All differences are below delta here, so every element uses the quadratic branch.
    /// ```
    #[must_use]
    #[allow(clippy::missing_panics_doc)]
    pub fn huber_loss(&self, target: &Tensor, delta: impl Scalar) -> Tensor {
        let prediction = self.float_cast().unwrap();
        let truth = target.float_cast().unwrap();
        let result_dtype = self.dtype();

        let error = prediction - truth;
        let magnitude = error.abs();

        // The threshold is carried in the dtype of `self`.
        let threshold = Tensor::from(delta).float_cast().unwrap().cast(result_dtype);

        // Mask selecting the elements that take the quadratic branch.
        let is_small = magnitude.cmplt(threshold.clone()).unwrap();

        // 0.5 * (x - y)²
        let quadratic = 0.5 * error.clone() * error;

        // δ * |x - y| - 0.5 * δ²
        let scaled_magnitude = threshold.clone() * magnitude;
        let offset = 0.5 * threshold.clone() * threshold;
        let linear = scaled_magnitude - offset;

        // Select per element by multiplying with the mask and its negation.
        let small_part = is_small.clone() * quadratic;
        let large_part = is_small.not() * linear;

        // Reduce along the first axis only, then restore the caller's dtype.
        (small_part + large_part).sum([0]).unwrap().cast(result_dtype)
    }

    // movement
    /// Expands this tensor to `shape`.
    ///
    /// The target shape may have a higher rank than the tensor; in the example below a `[2, 3]`
    /// tensor is expanded to `[4, 2, 3]`. The request is forwarded to the runtime, which decides
    /// whether the expansion is valid.
    ///
    /// # Examples
    ///
    /// ```
    /// let t = zyx::Tensor::zeros([2, 3], zyx::DType::U8);
    /// assert_eq!(t.expand((4, 2, 3))?.shape(), &[4, 2, 3]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    /// # Errors
    /// Returns error if self cannot be expanded into shape.
    pub fn expand(&self, shape: impl IntoShape) -> Result<Tensor, ZyxError> {
        let expanded = RT.lock().expand(self.id, shape.into_shape().collect())?;
        Ok(Tensor { id: expanded })
    }

    /// Expands a single axis of the tensor to a new size, leaving all other axes unchanged.
    ///
    /// # Arguments
    /// * `axis` – Index of the axis to expand.
    /// * `dim`  – Size the chosen axis should have afterwards.
    ///
    /// # Returns
    /// A new `Tensor` whose shape equals the current shape with `axis` replaced by `dim`.
    ///
    /// # Errors
    ///
    /// Returns error if the axis is out of bounds or if the runtime rejects the expansion.
    ///
    /// # Example
    /// ```
    /// let t = zyx::Tensor::from([[2], [3]]);
    /// let t2 = t.expand_axis(1, 5)?;
    /// assert_eq!(t2.shape(), [2, 5]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    pub fn expand_axis(&self, axis: Axis, dim: Dim) -> Result<Tensor, ZyxError> {
        // Start from the current shape and overwrite only the requested axis.
        let mut target = self.shape();
        let index = into_axis(axis, target.len())?;
        target[index] = dim;
        let expanded = RT.lock().expand(self.id, target)?;
        Ok(Tensor { id: expanded })
    }

    /// Reorders the dimensions of this tensor according to `axes`.
    ///
    /// `axes` is resolved with `into_axes` against the rank of the tensor and must contain
    /// exactly as many entries as the tensor has dimensions; otherwise a shape error is returned.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use zyx::{Tensor, DType};
    /// let t = Tensor::rand([3, 4], DType::I64).unwrap();
    /// let p = [1, 0];
    /// let permuted_t = t.permute(p); // Results in a tensor with axes (4, 3)
    /// ```
    ///
    /// # Errors
    /// Returns error if the axes cannot be resolved for this tensor or if their count differs
    /// from the rank of the tensor.
    pub fn permute(&self, axes: impl IntoIterator<Item = Axis>) -> Result<Tensor, ZyxError> {
        let rank = self.rank();
        let order = into_axes(axes, rank as usize)?;
        if order.len() as u64 == rank {
            let id = RT.lock().permute(self.id, &order);
            return Ok(Tensor { id });
        }
        Err(ZyxError::shape_error(
            format!(
                "Axes has rank {}, but tensor has rank {}. It must be the same for permute.",
                order.len(),
                rank
            )
            .into(),
        ))
    }

    /// Pads this tensor with zeros. Negative amounts remove elements instead.
    ///
    /// Padding is given in dimension order: the first `(left, right)` tuple applies to the first
    /// dimension, the second to the second dimension, and so on. Dimensions without a tuple are
    /// left unchanged.
    ///
    /// # Examples
    ///
    /// ```
    /// use zyx::Tensor;
    ///
    /// let t = Tensor::from([1, 2, 3]);
    /// let padded = t.pad_zeros([(1, 1)])?.reshape([5])?;
    /// assert_eq!(padded, [0, 1, 2, 3, 0]);
    ///
    /// let padded = t.pad_zeros([(1, 2)])?;
    /// assert_eq!(padded.shape(), &[6]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Errors
    /// Returns error if more tuples than dimensions are given, or if the negative padding of a
    /// dimension would remove all of its elements (or more).
    #[allow(clippy::missing_panics_doc)]
    #[track_caller]
    pub fn pad_zeros(&self, padding: impl IntoIterator<Item = (i64, i64)>) -> Result<Tensor, ZyxError> {
        let mut padding: Vec<(i64, i64)> = padding.into_iter().collect();
        let shape = self.shape();
        let rank = shape.len();

        if padding.len() > rank {
            return Err(ZyxError::shape_error(
                format!("Padding with {} dimensions, but tensor only has rank {rank}", padding.len()).into(),
            ));
        }

        // Dimensions without an explicit entry are not padded.
        padding.resize(rank, (0i64, 0i64));

        for (i, (&(l, r), &dim)) in padding.iter().zip(shape.iter()).enumerate() {
            // Number of elements removed from this dimension by negative padding.
            let removed = [l, r].into_iter().filter(|&p| p < 0).fold(0i64, |acc, p| acc - p);
            let negative_size = dim as i64 + l + r < 0;
            if negative_size || Dim::try_from(removed).unwrap() >= dim {
                return Err(ZyxError::shape_error(
                    format!("Invalid padding {padding:?} on shape {shape:?}, on dim {i}").into(),
                ));
            }
        }

        let id = RT.lock().pad_zeros(self.id, padding);
        Ok(Tensor { id })
    }

    /// Pads this tensor with zeros, counting dimensions from the back. Negative amounts remove
    /// elements instead.
    ///
    /// This is reverse padding: the first `(left, right)` tuple applies to the last dimension,
    /// the second to the second last dimension, and so on. Dimensions without a tuple are left
    /// unchanged.
    ///
    /// # Examples
    ///
    /// ```
    /// use zyx::Tensor;
    ///
    /// let t = Tensor::from([1, 2, 3]);
    /// let padded = t.rpad_zeros([(1, 1)])?.reshape([5])?;
    /// assert_eq!(padded, [0, 1, 2, 3, 0]);
    ///
    /// let padded = t.rpad_zeros([(1, 2)])?;
    /// assert_eq!(padded.shape(), &[6]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Errors
    /// Returns error if more tuples than dimensions are given, or if the negative padding of a
    /// dimension would remove all of its elements (or more).
    #[allow(clippy::missing_panics_doc)]
    #[track_caller]
    pub fn rpad_zeros(&self, padding: impl IntoIterator<Item = (i64, i64)>) -> Result<Tensor, ZyxError> {
        let mut padding: Vec<(i64, i64)> = padding.into_iter().collect();
        let shape = self.shape();
        let rank = shape.len();

        if padding.len() > rank {
            return Err(ZyxError::shape_error(
                format!("Padding with {} dimensions, but tensor only has rank {rank}", padding.len()).into(),
            ));
        }

        // Fill up to full rank, then flip so that entry `i` belongs to dimension `i`.
        padding.resize(rank, (0i64, 0i64));
        padding.reverse();

        for (&(l, r), &dim) in padding.iter().zip(shape.iter()) {
            // Number of elements removed from this dimension by negative padding.
            let removed = [l, r].into_iter().filter(|&p| p < 0).fold(0i64, |acc, p| acc - p);
            if dim as i64 + l + r < 0 {
                return Err(ZyxError::shape_error(
                    format!("Invalid padding left={l}, right={r} on dimension size {}", dim).into(),
                ));
            }
            if Dim::try_from(removed).unwrap() >= dim {
                return Err(ZyxError::shape_error(
                    format!("Invalid padding {padding:?} on shape {shape:?}").into(),
                ));
            }
        }

        let id = RT.lock().pad_zeros(self.id, padding);
        Ok(Tensor { id })
    }

    /// Constant padding
    ///
    /// This can both add and remove values from tensor. Negative padding removes values, positive padding
    /// adds values.
    ///
    /// Pad last dimension by (1, 2)
    /// ```rust
    /// use zyx::Tensor;
    /// let x = Tensor::from([[2i32, 3],
    ///                       [4, 1]]);
    /// println!("{:?}\n{x}", x.shape());
    /// let z = x.pad([(0, 0), (1, 2)], 0i32)?;
    /// println!("{:?}\n{z}", z.shape());
    /// assert_eq!(z, [[0i32, 2, 3, 0, 0],
    ///                [0, 4, 1, 0, 0]]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    /// Pad last dimension by (2, -1) and second last dimension by (1, 1)
    /// ```rust
    /// # use zyx::Tensor;
    /// # let x = Tensor::from([[2i32, 3],
    /// #                       [4, 1]]);
    /// let z = x.pad([(1, 1), (2, -1)], 0i32)?;
    /// assert_eq!(z, [[0i32, 0, 0],
    ///                [0, 0, 2],
    ///                [0, 0, 4],
    ///                [0, 0, 0]]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Errors
    /// Returns error if self cannot be padded by padding.
    #[allow(clippy::missing_panics_doc)]
    pub fn pad(&self, padding: impl IntoIterator<Item = (i64, i64)>, value: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        let dtype = self.dtype();
        let value: Tensor = value.into();
        let padding: Vec<(i64, i64)> = padding.into_iter().collect();
        let mut sh = self.shape();
        if value.dtype() != dtype {
            return Err(ZyxError::dtype_error(
                format!("Cannot pad tensor with dtype {} with value of dtype {}", dtype, value.dtype()).into(),
            ));
        }
        if !padding.len() as UAxis <= sh.rank() && padding.iter().zip(sh.iter().rev()).all(|(&(lp, rp), &d)| if lp < 0 { Dim::try_from(-lp).unwrap() <= d } else { true } && if rp < 0 { Dim::try_from(-rp).unwrap() <= d } else { true }) {
            return Err(ZyxError::shape_error(format!("Cannot pad tensor with shape {sh:?} with padding {padding:?}").into()));
        }
        let t0 = self.pad_zeros(padding.clone())?;
        let ones = Tensor::ones(sh.clone(), dtype);
        apply_padding(&mut sh, &padding);
        let zeros = Tensor::zeros(sh, dtype);
        Ok(t0 + ones.pad_zeros(padding)?.where_(zeros, value)?)
    }

    /// Narrows the tensor along `axis` to the `length` elements starting at `start`.
    ///
    /// This is implemented as negative padding on the chosen axis: `start` elements are removed
    /// from the front and `dim - start - length` from the back. If `start + length` exceeds the
    /// size of the axis, the right padding becomes positive and zeros are appended instead.
    /// ```
    /// # use zyx::Tensor;
    /// let x = Tensor::from([[1, 2, 3], [4, 5, 6], [7, 8, 9]]);
    /// assert_eq!(x.narrow(0, 0, 2)?, [[1, 2, 3], [4, 5, 6]]);
    /// assert_eq!(x.narrow(1, 1, 2)?, [[2, 3], [5, 6], [8, 9]]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    /// # Errors
    /// Returns error if the axis is out of bounds or if the resulting padding is rejected, e.g.
    /// because it would remove every element of the axis.
    #[allow(clippy::missing_panics_doc)]
    pub fn narrow(&self, axis: Axis, start: Dim, length: Dim) -> Result<Tensor, ZyxError> {
        let shape = self.shape();
        let rank = shape.len() as UAxis;
        let axis = into_axis(axis, rank)?;
        let dim = i64::try_from(shape[axis as usize]).unwrap();
        let start = i64::try_from(start).unwrap();
        let length = i64::try_from(length).unwrap();
        // `rpad_zeros` counts dimensions from the back: leave the dimensions after `axis`
        // untouched, then shrink `axis` itself.
        let padding = repeat_n((0i64, 0i64), (rank - axis - 1) as usize)
            .chain(once((-start, -dim + length + start)));
        // Propagate padding errors to the caller instead of panicking.
        self.rpad_zeros(padding)
    }

    /// Gives this tensor a new shape with the same total number of elements.
    ///
    /// At most one dimension may be given as `0`; it is then inferred from the number of elements
    /// and the remaining dimensions.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use zyx::Tensor;
    /// let t = Tensor::from([1, 2, 3, 4]);
    /// assert_eq!(t.reshape((2, 2))?, [[1, 2], [3, 4]]);
    ///
    /// // Infer dimension automatically
    /// let t = Tensor::from([1, 2, 3, 4]);
    /// assert_eq!(t.reshape((2, 0))?, [[1, 2], [3, 4]]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Errors
    /// Returns error if more than one dimension is `0`, if the inferred dimension does not divide
    /// the number of elements evenly, or if the requested shape has a different number of
    /// elements than the tensor.
    pub fn reshape(&self, shape: impl IntoShape) -> Result<Tensor, ZyxError> {
        let mut shape: Vec<Dim> = shape.into_shape().collect();
        let numel = self.numel();

        // `0` marks the dimension to infer.
        match shape.iter().filter(|&&d| d == 0).count() {
            0 => {}
            1 => {
                let product_other: Dim = shape.iter().filter(|&&d| d != 0).product();
                let inferred_dim = numel / product_other;
                if inferred_dim * product_other != numel {
                    return Err(ZyxError::shape_error(
                        format!(
                            "Cannot infer dimension: total elements {numel} not divisible by product of specified dims {product_other}"
                        )
                        .into(),
                    ));
                }
                for d in &mut shape {
                    if *d == 0 {
                        *d = inferred_dim;
                    }
                }
            }
            _ => return Err(ZyxError::shape_error("Can only infer one dimension (0).".into())),
        }

        // The fully specified shape must account for every element.
        let final_product: Dim = shape.iter().product();
        if final_product == numel {
            let id = RT.lock().reshape(self.id, shape);
            return Ok(Tensor { id });
        }
        Err(ZyxError::shape_error(
            format!(
                "Invalid reshape: tensor has {numel} elements, but requested shape {shape:?} has {final_product} elements"
            )
            .into(),
        ))
    }

    /// Swaps the last two dimensions of this tensor.
    ///
    /// A rank 1 tensor with `n` elements is reshaped to `[n, 1]` instead.
    ///
    /// # Returns
    ///
    /// A new `Tensor` with the last two dimensions exchanged.
    ///
    /// # Examples
    ///
    /// ```
    /// use zyx::Tensor;
    ///
    /// let t = Tensor::from([1.0, 2.0, 3.0]);
    /// assert_eq!(t.t().shape(), &[3, 1]);
    ///
    /// let t = Tensor::from([[1.0, 2.0], [3.0, 4.0]]);
    /// assert_eq!(t.t().shape(), &[2, 2]);
    /// ```
    #[must_use]
    #[allow(clippy::missing_panics_doc)]
    pub fn t(&self) -> Tensor {
        let rank = self.rank();
        if rank == 1 {
            // A vector becomes a column: [n] -> [n, 1].
            return self.reshape([self.numel(), 1]).unwrap();
        }
        // Identity permutation with the two trailing axes exchanged.
        let mut order: Vec<Axis> = (0..Axis::try_from(rank).unwrap()).collect();
        let last = (rank - 1) as usize;
        let second_last = (rank - 2) as usize;
        order.swap(last, second_last);
        self.permute(order).unwrap()
    }

    /// Swaps two dimensions of this tensor. Negative axes count from the back.
    /// ```rust
    /// use zyx::Tensor;
    /// let t = Tensor::from([[[1, 2]], [[3, 4]]]);
    /// assert_eq!(t.transpose(0, -1)?, [[[1, 3]], [[2, 4]]]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Errors
    /// Returns error if `dim0` or `dim1` lies outside `-rank..rank`.
    #[allow(clippy::missing_panics_doc)]
    pub fn transpose(&self, dim0: Axis, dim1: Axis) -> Result<Tensor, ZyxError> {
        let rank = self.rank();
        // Valid axes lie in `-rank..rank`.
        let out_of_range = |axis: Axis| {
            if axis < 0 {
                Dim::try_from(-axis).unwrap() > rank
            } else {
                Dim::try_from(axis).unwrap() >= rank
            }
        };
        for axis in [dim0, dim1] {
            if out_of_range(axis) {
                return Err(ZyxError::shape_error(
                    format!("Cannot transpose dimensions {dim0} and {dim1}, {axis} is greater than rank {rank}").into(),
                ));
            }
        }
        // Identity permutation with the two requested axes exchanged.
        let mut order: Vec<Axis> = (0..Axis::try_from(rank).unwrap()).collect();
        let first = into_axis(dim0, rank as usize)? as usize;
        let second = into_axis(dim1, rank as usize)? as usize;
        order.swap(first, second);
        self.permute(order)
    }

    // reduce
    /// Natural logarithm of the softmax of this tensor along `axes`.
    ///
    /// The maximum along `axes` is subtracted from the input first. The result is that shifted
    /// tensor minus the logarithm of the sum of its exponentials along `axes`.
    ///
    /// # Arguments
    ///
    /// * `self` - The input tensor.
    /// * `axes` - Axes along which the softmax is computed.
    ///
    /// # Examples
    ///
    /// ```
    /// use zyx::Tensor;
    /// let x = Tensor::from([2f32, 3., 4.]);
    /// let y = x.ln_softmax([]);
    /// ```
    ///
    /// # Returns
    ///
    /// A tensor holding `ln(softmax(self))`.
    ///
    /// # Errors
    ///
    /// Returns error if the max or sum reduction over `axes` fails, e.g. for out-of-bounds axes.
    #[allow(clippy::missing_panics_doc)]
    pub fn ln_softmax(&self, axes: impl IntoIterator<Item = Axis>) -> Result<Tensor, ZyxError> {
        let axes: Vec<Axis> = axes.into_iter().collect();
        let peak = self.max_keepdim(axes.clone())?;
        let shifted = self - peak;
        // ln(sum(exp(shifted))) along the requested axes
        let log_norm = shifted.exp().sum_keepdim(axes)?.ln();
        Ok(&shifted - log_norm)
    }

    /// Cumulative sum along `axis`.
    ///
    /// Delegates to `cum_reduce` with `BOp::Add`.
    ///
    /// # Errors
    ///
    /// Returns error if axis is out of range.
    #[allow(clippy::missing_panics_doc)]
    pub fn cumsum(&self, axis: Axis) -> Result<Tensor, ZyxError> {
        self.cum_reduce(axis, BOp::Add)
    }

    /// Cumulative maximum along `axis`.
    ///
    /// Delegates to `cum_reduce` with `BOp::Max`.
    ///
    /// # Errors
    ///
    /// Returns error if axis is out of range.
    #[allow(clippy::missing_panics_doc)]
    pub fn cummax(&self, axis: Axis) -> Result<Tensor, ZyxError> {
        self.cum_reduce(axis, BOp::Max)
    }

    /// Cumulative product along `axis`.
    ///
    /// Delegates to `cum_reduce` with `BOp::Mul`.
    ///
    /// # Errors
    ///
    /// Returns error if axis is out of range.
    #[allow(clippy::missing_panics_doc)]
    pub fn cumprod(&self, axis: Axis) -> Result<Tensor, ZyxError> {
        self.cum_reduce(axis, BOp::Mul)
    }

    /// Cumulative reduce along axis
    fn cum_reduce(&self, axis: Axis, rop: BOp) -> Result<Tensor, ZyxError> {
        let shape = self.shape();
        let uaxis = into_axis(axis, shape.len())?;
        let pl_sz = i64::try_from(shape[uaxis] - 1).unwrap();
        let mut x = self.transpose(axis, -1)?;
        x = x.rpad_zeros([(pl_sz, 0i64)])?;
        x = x.pool(shape[uaxis], 1, 1)?;
        x = match rop {
            BOp::Add => x.sum([-1])?,
            BOp::Max => x.max([-1])?,
            BOp::Mul => x.prod([-1])?,
            _ => unreachable!(),
        };
        x = x.transpose(axis, -1)?;
        Ok(x)
    }

    /// Softmax of this tensor along `axes`.
    ///
    /// The maximum along `axes` is subtracted before exponentiation; the exponentials are then
    /// divided by their sum along `axes`.
    ///
    /// # Arguments
    ///
    /// * `axes`: The axes along which to calculate the softmax.
    ///
    /// # Returns
    ///
    /// * A new tensor containing the result of the softmax operation.
    ///
    /// # Examples
    ///
    /// ```
    /// use zyx::Tensor;
    ///
    /// let t = Tensor::from(vec![1f32, 2.0, 3.0]);
    /// let sm = t.softmax([])?;
    /// assert_eq!(sm, [0.0900305748f32, 0.2447281546, 0.6652412706]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Errors
    ///
    /// Returns error if self cannot be reduced by axes.
    pub fn softmax(&self, axes: impl IntoIterator<Item = Axis>) -> Result<Tensor, ZyxError> {
        let axes: Vec<Axis> = axes.into_iter().collect();
        let peak = self.max_keepdim(axes.clone())?;
        let exps = (self - peak).exp();
        let norm = exps.sum_keepdim(axes)?;
        Ok(&exps / norm)
    }

    // binary
    /// Matrix multiplication / dot product of `self` with `rhs`.
    ///
    /// `rhs` is transposed with `t`, both operands receive an extra singleton axis so that they
    /// broadcast against each other, and the element-wise product is summed over the shared last
    /// axis.
    ///
    /// # Errors
    ///
    /// Returns error if the last dimension of `self` differs from the last dimension of the
    /// transposed `rhs`, or if one of the reshapes or the reduction fails.
    pub fn dot(&self, rhs: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        let rhs: Tensor = rhs.into();
        let org_y_shape = rhs.shape();
        let y = rhs.t();
        let xshape = self.shape();
        let yshape = y.shape();
        let xrank = xshape.len();
        let yrank = yshape.len();

        // Size of the contracted axis.
        let inner = xshape[xrank - 1];
        if inner != yshape[yrank - 1] {
            return Err(ZyxError::ShapeError(
                format!("Cannot dot tensors with shapes {xshape:?} and {org_y_shape:?}").into(),
            ));
        }

        let lead = &xshape[..xrank - 1];

        // self: [.., m, k] -> [.., m, 1, k]
        let mut lhs_view: Vec<u64> = lead.to_vec();
        lhs_view.extend([1, inner]);

        // transposed rhs: [.., n, k] -> [.., 1, n, k]
        let mut rhs_view: Vec<u64> = yshape[..yrank - 2].to_vec();
        rhs_view.push(1);
        rhs_view.extend_from_slice(&yshape[yrank - yrank.min(2)..]);

        // Result: leading dimensions of self followed by n.
        let mut out_shape: Vec<u64> = lead.to_vec();
        out_shape.push(yshape[yrank - 2]);

        let product = self.reshape(lhs_view)? * y.reshape(rhs_view)?;
        product.sum([-1])?.reshape(out_shape)
    }

    /// Matrix multiplication of `self` with `rhs`, computed in `out_dtype`.
    ///
    /// `rhs` is transposed with `t`, both operands receive an extra singleton axis so that they
    /// broadcast against each other and are cast to `out_dtype`; the element-wise product is then
    /// summed over the shared last axis.
    ///
    /// # Errors
    ///
    /// Returns error if the tensors have incompatible shapes for matmul.
    pub fn dot_dtype(&self, rhs: impl Into<Tensor>, out_dtype: DType) -> Result<Tensor, ZyxError> {
        let rhs: Tensor = rhs.into();
        let org_y_shape = rhs.shape();
        let y = rhs.t();
        let xshape = self.shape();
        let yshape = y.shape();
        let xrank = xshape.len();
        let yrank = yshape.len();

        // Size of the contracted axis.
        let inner = xshape[xrank - 1];
        if inner != yshape[yrank - 1] {
            return Err(ZyxError::ShapeError(
                format!("Cannot dot tensors with shapes {xshape:?} and {org_y_shape:?}").into(),
            ));
        }

        let lead = &xshape[..xrank - 1];

        // self: [.., m, k] -> [.., m, 1, k]
        let mut lhs_view: Vec<u64> = lead.to_vec();
        lhs_view.extend([1, inner]);

        // transposed rhs: [.., n, k] -> [.., 1, n, k]
        let mut rhs_view: Vec<u64> = yshape[..yrank - 2].to_vec();
        rhs_view.push(1);
        rhs_view.extend_from_slice(&yshape[yrank - yrank.min(2)..]);

        // Result: leading dimensions of self followed by n.
        let mut out_shape: Vec<u64> = lead.to_vec();
        out_shape.push(yshape[yrank - 2]);

        let lhs = self.reshape(lhs_view)?.cast(out_dtype);
        let rhs = y.reshape(rhs_view)?.cast(out_dtype);
        (lhs * rhs).sum([-1])?.reshape(out_shape)
    }

    /// Matrix multiplication; an alias for [`Tensor::dot`].
    ///
    /// # Errors
    ///
    /// Returns the same errors as [`Tensor::dot`], i.e. when the tensors have incompatible shapes.
    pub fn matmul(&self, rhs: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        self.dot(rhs)
    }

    /// Raises every element of `self` to the power of the corresponding element of `exponent`.
    ///
    /// Both operands are broadcast against each other first.
    ///
    /// # Examples
    ///
    /// ```
    /// use zyx::Tensor;
    ///
    /// let arr = Tensor::from([1.0f32, 2.0]);
    /// assert_eq!(arr.pow(2.0f32)?, [1.0f32, 4.0]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Returns
    ///
    /// A new tensor holding `self` raised element-wise to `exponent`.
    ///
    /// # Errors
    ///
    /// Returns error if the tensors have non broadcasteable shapes.
    pub fn pow(&self, exponent: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        let (base, power) = Tensor::broadcast(self.clone(), exponent)?;
        let id = RT.lock().binary(base.id, power.id, BOp::Pow);
        Ok(Tensor { id })
    }

    /// Element-wise logical and of `self` and `rhs`.
    ///
    /// Both operands are broadcast against each other first.
    ///
    /// # Errors
    ///
    /// Returns error if the tensors have non broadcasteable shapes.
    pub fn logical_and(&self, rhs: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        let (lhs, rhs) = Tensor::broadcast(self.clone(), rhs)?;
        let id = RT.lock().binary(lhs.id, rhs.id, BOp::And);
        Ok(Tensor { id })
    }

    /// Element-wise logical or of `self` and `rhs`.
    ///
    /// Both operands are broadcast against each other first.
    ///
    /// # Errors
    ///
    /// Returns error if the tensors have non broadcasteable shapes.
    pub fn logical_or(&self, rhs: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        let (lhs, rhs) = Tensor::broadcast(self.clone(), rhs)?;
        let id = RT.lock().binary(lhs.id, rhs.id, BOp::Or);
        Ok(Tensor { id })
    }

    /// Element-wise equality: the result is true where `self == rhs`.
    ///
    /// Both operands are broadcast against each other first.
    ///
    /// # Errors
    ///
    /// Returns error if the tensors have non broadcasteable shapes.
    pub fn equal(&self, rhs: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        let (lhs, rhs) = Tensor::broadcast(self.clone(), rhs)?;
        let id = RT.lock().binary(lhs.id, rhs.id, BOp::Eq);
        Ok(Tensor { id })
    }

    /// Element-wise test against zero: true where `self` differs from zero, false otherwise.
    ///
    /// The comparison is made against a zero of the same dtype, expanded to the shape of `self`.
    #[allow(clippy::missing_panics_doc)]
    #[must_use]
    pub fn nonzero(&self) -> Tensor {
        let zeros = Tensor::from(0).cast(self.dtype()).expand(self.shape()).unwrap();
        let id = RT.lock().binary(self.id, zeros.id, BOp::NotEq);
        Tensor { id }
    }

    // ternary
    /// Element-wise selection: takes `if_true` where `self` is true and `if_false` elsewhere.
    ///
    /// `self` is cast to the dtype of `if_true` and used as a 0/1 mask; the result is computed as
    /// `mask * if_true + (1 - mask) * if_false` after broadcasting the two branches against each
    /// other.
    ///
    /// # Errors
    ///
    /// Returns error if the tensors have non broadcasteable shapes.
    #[allow(clippy::missing_panics_doc)]
    pub fn where_(&self, if_true: impl Into<Tensor>, if_false: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        let on_true: Tensor = if_true.into();
        let on_false: Tensor = if_false.into();
        // The dtype of the true branch determines the dtype of the mask.
        let dtype = on_true.dtype();
        let mask = self.cast(dtype);
        let (on_true, on_false) = Tensor::broadcast(on_true, on_false)?;
        let kept = mask.clone() * on_true;
        let inverse = Tensor::ones(on_false.shape(), dtype) - mask;
        Ok(kept + inverse * on_false)
    }

    // loss functions
    /// Cross-entropy loss of logits `self` against `target`.
    ///
    /// Computes the negative natural-log softmax of `self` along `axes`, multiplies it by
    /// `target` and sums the product over the last axis.
    ///
    /// Self is logits, target is one-hot.
    ///
    /// # Examples
    ///
    /// ```
    /// use zyx::Tensor;
    /// let input = Tensor::from([5f32, 2., -3.]);
    /// let target = Tensor::from([1f32, 0., 0.]);
    /// let loss = input.cross_entropy(target, [])?.mean_all();
    /// assert_eq!(loss, 0.048907f32);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Errors
    ///
    /// Returns error if the tensors have non broadcasteable shapes or axes cannot reduce self.
    pub fn cross_entropy(&self, target: impl Into<Tensor>, axes: impl IntoIterator<Item = Axis>) -> Result<Tensor, ZyxError> {
        let axes: Vec<Axis> = axes.into_iter().collect();
        let shifted = self - self.max_keepdim(axes.clone())?;
        // -ln(softmax) = ln(sum(exp(shifted))) - shifted
        let log_norm = shifted.exp().sum_keepdim(axes)?.ln();
        let neg_ln_softmax = log_norm - shifted;
        (neg_ln_softmax * target).sum([-1])
    }

    /*
    /// Cross entropy loss with class indices
    pub fn cross_entropy_loss(
        &self,
        target: impl Into<Tensor>,            // Class indices (shape: [batch_size])
        axes: impl IntoIterator<Item = Axis>, // Axis over which to apply softmax (typically the last axis)
    ) -> Result<Tensor, ZyxError> {
        // Step 1: Apply softmax to the logits along the class axis (usually the last axis)
        let ln_softmax = self.ln_softmax([-1])?;

        // Step 3: Gather the log-softmax values for the target class indices
        let selected_log_softmax = ln_softmax.gather(1, target)?; // Gather log-softmax values for each class index

        // Step 4: Calculate the cross-entropy loss (mean of the negative log-probabilities)
        let loss = selected_log_softmax.neg().sum(); // Sum of negative log-softmax values
        let mean_loss = loss / self.shape()[0] as f32; // Average across the batch size

        Ok(mean_loss) // Return the mean loss
    }*/

    /// Gather
    ///
    /// Gathers values along `axis` at the positions given by `indices`.
    ///
    /// `indices` must have the same rank as `self`, and on every dimension other than `axis` it
    /// must not be larger than `self`.
    ///
    /// Negative indices are wrapped (e.g., -1 → last element).
    /// Out-of-bounds indices return 0 (zyx doesn't check bounds, returns 0 for OOB).
    ///
    /// # Errors
    ///
    /// Returns error if the shapes are incompatible.
    pub fn gather(&self, axis: Axis, indices: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        let indices: Tensor = indices.into();
        let shape = self.shape();
        let index_shape = indices.shape();
        let dim = into_axis(axis, shape.len())?;

        if shape.len() != index_shape.len() {
            return Err(ZyxError::shape_error(
                format!("self.rank({}) != indices.rank({})", shape.len(), index_shape.len()).into(),
            ));
        }

        // Away from the gather axis, `indices` may not exceed `self`.
        let mismatch = shape
            .iter()
            .zip(index_shape.iter())
            .enumerate()
            .find(|&(d, (&s, &i))| d != dim && s < i);
        if let Some((d, (&s, &i))) = mismatch {
            return Err(ZyxError::shape_error(
                format!("Shape mismatch at dimension {d}: self.shape[{d}] = {s} < indices.shape[{d}] = {i}").into(),
            ));
        }

        // Wrap negative indices by adding the size of the gather axis to them.
        let dim_size = shape[dim];
        let is_negative = indices.cmplt(0)?;
        let wrapped = indices + is_negative.mul(dim_size as i32);

        // One-hot encode the indices along a new trailing axis.
        let one_hot = wrapped.unsqueeze(-1)?.one_hot_along_dim(dim_size, -1)?;

        // Negative right padding (last dimension first, as `rpad_zeros` expects) that shrinks
        // `self` to the shape of `indices` on every axis except the gather axis.
        let padding: Vec<(i64, i64)> = (0..index_shape.len())
            .rev()
            .map(|d| {
                if d == dim {
                    (0i64, 0i64)
                } else {
                    (0i64, -(shape[d] as i64 - index_shape[d] as i64))
                }
            })
            .collect();

        // Move the gather axis to the new trailing position so it lines up with the one-hot
        // axis, then pick the selected element by multiplying and summing.
        let source = self.rpad_zeros(padding)?.unsqueeze(-1)?.transpose(-1, dim as i32)?;
        let gathered = one_hot.mul(&source).sum_dtype([-1], self.dtype())?;

        Ok(gathered)
    }

    /// Index select
    ///
    /// Selects entries of `self` along `dim` using `index`. The index is broadcast
    /// to the shape of `self` (with `dim` resized to the index length) and handed to
    /// [`gather`](Tensor::gather).
    ///
    /// # Errors
    ///
    /// Returns error if the dimension is out of bounds, or if reshaping, expanding or
    /// gathering with the index fails.
    pub fn index_select(&self, dim: Axis, index: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        let index = index.into();
        let mut target_shape = self.shape();
        let rank = target_shape.len();
        let dim = into_axis(dim, rank)?;

        // Along the selected axis the output is as long as the index tensor.
        target_shape[dim] = index.shape()[0];

        // Ones everywhere except the selected axis.
        // NOTE(review): the 0 entry presumably asks `reshape` to infer that dimension - confirm.
        let view_shape: Vec<Dim> = (0..rank).map(|d| if d == dim { 0 } else { 1 }).collect();
        let index_expanded = index.reshape(view_shape)?.expand(target_shape)?;

        self.gather(dim as Axis, index_expanded)
    }

    /// Shrink
    ///
    /// # Errors
    ///
    /// Returns error if the dimensions are invalid.
    pub fn shrink<I>(&self, dims: I) -> Result<Tensor, ZyxError>
    where
        I: IntoIterator<Item = (Dim, Dim)>,
        I::IntoIter: DoubleEndedIterator,
    {
        self.rpad_zeros(
            self.shape()
                .into_iter()
                .rev()
                .zip(dims.into_iter().rev())
                .map(|(d, (s, e))| (-(s as i64), -((d - e) as i64))),
        )
    }

    /// One hot
    ///
    /// Appends a trailing axis of length `num_classes` that is one at the position
    /// given by each value of `self` and zero elsewhere. The result keeps the dtype of `self`.
    ///
    /// If `num_classes` is 0, the number of classes is taken as the largest value in `self` plus one.
    /// Values of `self` outside of `0..num_classes` match no class and give an all-zero row.
    ///
    /// # Panics
    ///
    /// Panics if any of the underlying operations fails, e.g. when `self` is not an integer tensor.
    #[allow(clippy::missing_panics_doc)]
    #[must_use]
    pub fn one_hot(&self, num_classes: Dim) -> Tensor {
        let num_classes = if num_classes == 0 {
            (self.max_all() + 1).item::<i64>() as u64
        } else {
            num_classes
        };

        let dtype = self.dtype();
        let with_class_axis = self.unsqueeze(-1).unwrap();
        let selector = with_class_axis.one_hot_along_dim(num_classes, -1).unwrap();

        // Turn the boolean selector into ones and zeros of the original dtype.
        selector
            .where_(Tensor::ones([1], dtype), Tensor::zeros([1], dtype))
            .unwrap()
    }

    /// One hot along dim
    ///
    /// # Errors
    ///
    /// Returns error if the tensor dtype is not integer.
    pub fn one_hot_along_dim(&self, num_classes: Dim, dim: Axis) -> Result<Tensor, ZyxError> {
        if !self.dtype().is_int() {
            return Err(ZyxError::dtype_error(
                format!("_one_hot_along_dim expects integer index tensor, got {:?}", self.dtype()).into(),
            ));
        }

        let rank = self.rank();
        let dim = if dim < 0 { rank as Axis + dim } else { dim };
        let offset = rank as Axis - dim - 1;

        let dt = if num_classes > i32::MAX as u64 {
            DType::I64
        } else {
            DType::I32
        };

        let arange = Tensor::arange(0, num_classes as i64, 1)?.cast(dt);

        // Reshape to [num_classes, 1, 1, ..., 1] with `offset` ones
        let mut new_shape: Vec<Dim> = vec![num_classes];
        new_shape.extend(vec![1; offset as usize]);
        let arange = arange.reshape(&new_shape)?;

        // Broadcast and compare
        self.equal(&arange)
    }

    /// Calculates the element-wise L1 loss between `self` and the target tensor.
    ///
    /// No reduction is applied, the result holds one value per element.
    ///
    /// # Arguments
    ///
    /// * `target`: The target tensor to compare against. It will be converted into a `Tensor`.
    ///
    /// # Returns
    ///
    /// A new `Tensor` containing the absolute difference between `self` and the target tensor.
    ///
    /// # Examples
    ///
    /// ```
    /// use zyx::Tensor;
    ///
    /// let self_tensor = Tensor::from([1.0f32, 2.0, 3.0]);
    /// let target_tensor = Tensor::from([2.0f32, 3.0, 4.0]);
    ///
    /// assert_eq!(self_tensor.l1_loss(target_tensor), [1.0f32, 1.0, 1.0]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    #[must_use]
    pub fn l1_loss(&self, target: impl Into<Tensor>) -> Tensor {
        let difference = self - target;
        difference.abs()
    }

    /// Calculates the Mean Squared Error (MSE) loss.
    ///
    /// Both tensors are broadcast to a common shape, subtracted, squared and then
    /// averaged over all elements.
    ///
    /// # Arguments
    ///
    /// * `target`: The target tensor to compare against the input tensor (`self`).
    ///
    /// # Returns
    ///
    /// * A new tensor containing the mean of the squared differences.
    ///
    /// # Example
    ///
    /// ```
    /// use zyx::Tensor;
    ///
    /// let input = Tensor::from([2.0f32, 3.0]);
    /// let target = Tensor::from([4.0f32, 5.0]);
    ///
    /// assert_eq!(input.mse_loss(target).unwrap(), 4.0f32);
    /// ```
    ///
    /// # Errors
    ///
    /// Returns error if the tensors have non-broadcastable shapes.
    #[track_caller]
    pub fn mse_loss(&self, target: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        let (lhs, rhs) = Tensor::broadcast(self, target)?;
        // The subtraction node is created directly on the runtime from the broadcast operands.
        let diff = Tensor { id: RT.lock().binary(lhs.id, rhs.id, BOp::Sub) };
        Ok((diff.clone() * diff).mean_all())
    }

    /// BCE Loss
    ///
    /// Binary cross entropy between the predicted probabilities in `self` and `target`,
    /// averaged over all elements:
    ///
    /// `mean(-(target * ln(x) + (1 - target) * ln(1 - x)))`
    ///
    /// where `x` is `self` clamped into `[eps, 1 - eps]` so that neither logarithm
    /// is evaluated at zero.
    ///
    /// # Arguments
    ///
    /// * `target`: The expected values. It will be converted into a `Tensor`.
    /// * `eps`: Clamping margin applied to `self` before taking logarithms.
    ///
    /// # Errors
    ///
    /// Returns error if clamping `self` fails.
    #[track_caller]
    pub fn bce_loss(&self, target: impl Into<Tensor>, eps: f32) -> Result<Tensor, ZyxError> {
        let target: Tensor = target.into();
        let x: Tensor = self.clamp(eps, 1.0 - eps)?;
        let one_minus_x: Tensor = 1 - &x;
        // -(1 - t) * ln(1 - x) - t * ln(x); `(t - 1)` carries the sign of the first term.
        // Both terms must be negated, otherwise the loss becomes negative for t = 0.
        let loss: Tensor = (&target - 1) * one_minus_x.ln() - (target * x.ln());
        Ok(loss.mean_all())
    }

    /// Calculates the cosine similarity between this tensor and another.
    ///
    /// The computation is element-wise: `self * rhs / max(sqrt(self²) * sqrt(rhs²), eps)`.
    ///
    /// NOTE(review): no reduction over an axis is performed here, unlike the usual
    /// vector definition of cosine similarity - confirm this is intended.
    ///
    /// # Arguments
    ///
    /// * `rhs`: The other tensor to compare against. It will be converted into a `Tensor`.
    /// * `eps`: Lower bound for the denominator, used for numerical stability. It will also be converted into a `Tensor`.
    ///
    /// # Returns
    ///
    /// A new `Tensor` containing the cosine similarity values.
    ///
    /// # Example
    ///
    /// ```
    /// use zyx::Tensor;
    ///
    /// let tensor1 = Tensor::from([1.0, 2.0, 3.0]);
    /// let tensor2 = Tensor::from([4.0, 5.0, 6.0]);
    /// let eps = Tensor::from([1e-9]);
    ///
    /// let similarity = tensor1.cosine_similarity(tensor2, eps);
    /// ```
    ///
    /// # Errors
    ///
    /// Returns error if the tensors have non-broadcastable shapes.
    pub fn cosine_similarity(&self, rhs: impl Into<Tensor>, eps: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        let rhs: Tensor = rhs.into();
        let eps: Tensor = eps.into();

        // Product of the element-wise magnitudes of both operands
        let magnitude = (self * self).sqrt() * (&rhs * &rhs).sqrt();
        let numerator = self * rhs;

        // Replace magnitudes smaller than eps by eps to avoid dividing by (almost) zero.
        let too_small = magnitude.cmplt(eps.clone())?;
        let denominator = too_small.where_(eps, magnitude)?;

        Ok(numerator / denominator)
    }

    // misc
    /// Flatten. Joins axes into one dimension,
    ///
    /// # Errors
    ///
    /// Returns error if self cannot be flattened by axes.
    pub fn flatten(&self, axes: impl RangeBounds<Axis>) -> Result<Tensor, ZyxError> {
        let shape = self.shape();
        let rank = shape.len();
        let start_dim = into_axis(
            match axes.start_bound() {
                Bound::Included(dim) => *dim,
                Bound::Excluded(dim) => *dim + 1,
                Bound::Unbounded => 0,
            },
            rank,
        )?;
        let end_dim = into_axis(
            match axes.end_bound() {
                Bound::Included(dim) => *dim,
                Bound::Excluded(dim) => *dim - 1,
                Bound::Unbounded => -1,
            },
            rank,
        )? + 1;
        let dim = shape[start_dim..end_dim].iter().product();
        let new_shape: Vec<Dim> = shape[..start_dim]
            .iter()
            .copied()
            .chain([dim])
            .chain(shape[end_dim..].iter().copied())
            .collect();
        self.reshape(new_shape)
    }

    /// Concatenates a list of tensors along a specified dimension.
    ///
    /// # Arguments
    ///
    /// * `tensors`: An iterator of tensor references to concatenate.
    /// * `dim`: The dimension along which to concatenate. If negative, it is interpreted as counting from the end.
    ///
    /// # Returns
    ///
    /// A new tensor containing the concatenated input tensors.
    ///
    /// # Panics
    ///
    /// This function panics if any two tensors have different shapes except at the specified dimension.
    ///
    /// # Examples
    ///
    /// ```
    /// use zyx::Tensor;
    ///
    /// let a = Tensor::from([[1, 2], [3, 4]]);
    /// let b = Tensor::from([[5, 6], [7, 8]]);
    /// let c = Tensor::cat([&a, &b], 0)?;
    /// assert_eq!(c, [[1, 2], [3, 4], [5, 6], [7, 8]]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Errors
    ///
    /// Returns error if tensors cannot be concattenated along axis.
    pub fn cat<'a>(tensors: impl IntoIterator<Item = &'a Tensor>, axis: Axis) -> Result<Tensor, ZyxError> {
        let tensors: Vec<&Tensor> = tensors.into_iter().collect();
        if tensors.len() < 2 {
            return Err(ZyxError::shape_error("Cat requires two or more tensors.".into()));
        }
        let shape = tensors[0].shape();
        let rank = shape.rank();
        let dim: usize = (if axis < 0 {
            axis + Axis::try_from(rank).unwrap()
        } else {
            axis
        })
        .try_into()
        .unwrap();
        // Dimension check
        for tensor in &tensors {
            for (i, (d1, d2)) in shape.iter().zip(tensor.shape().iter()).enumerate() {
                if i != dim && *d1 != *d2 {
                    return Err(ZyxError::shape_error("Cannot concatenate these tensors.".into()));
                }
            }
        }
        let mut offset = 0i64;
        let mut offset2 = tensors
            .iter()
            .fold(0i64, |acc, t| acc + i64::try_from(t.shape()[dim]).unwrap());
        let mut shape = tensors[0].shape();
        shape[dim] = Dim::try_from(offset2).unwrap();
        let mut res = None;
        for tensor in tensors {
            let d = i64::try_from(tensor.shape()[dim]).unwrap();
            offset2 -= d;
            let padding: Vec<(i64, i64)> = repeat_n((0i64, 0i64), rank - dim - 1).chain([(offset, offset2)]).collect();
            let t = tensor.rpad_zeros(padding)?;
            if let Some(r) = res {
                res = Some(r + t);
            } else {
                res = Some(t);
            }
            offset += d;
        }
        Ok(res.unwrap())
    }

    /// Squeeze
    ///
    /// # Errors
    ///
    /// Returns error if self cannot be squeezed along axis.
    #[allow(clippy::missing_panics_doc)]
    #[must_use]
    pub fn squeeze(&self, axes: impl IntoIterator<Item = Axis>) -> Tensor {
        let shape = self.shape();
        let mut naxes = Vec::new();
        for axis in axes.into_iter().take(shape.len()) {
            if let Ok(axis) = into_axis(axis, shape.len()) {
                naxes.push(axis);
            }
        }
        let mut new_shape = Vec::new();
        for (a, d) in shape.into_iter().enumerate() {
            if d != 1 || !naxes.contains(&a) {
                new_shape.push(d);
            }
        }
        if new_shape.is_empty() {
            new_shape = vec![1];
        }
        self.reshape(new_shape).unwrap()
    }

    /// Expands the dimensionality of a tensor by inserting singleton dimensions.
    ///
    /// # Arguments
    ///
    /// * `dim`: The dimension to insert the singleton dimension at. If negative, it is counted from the end.
    ///
    /// # Returns
    ///
    /// A new tensor with expanded dimensionality.
    ///
    /// # Examples
    ///
    /// ```
    /// use zyx::{Tensor, DType};
    ///
    /// let t = Tensor::zeros([2, 3], DType::I8);
    /// assert_eq!(t.unsqueeze(1)?.shape(), &[2, 1, 3]);
    /// assert_eq!(t.unsqueeze(-1)?.shape(), &[2, 3, 1]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Errors
    ///
    /// Returns error if self cannot be unsqueezed along axis.
    #[allow(clippy::missing_panics_doc)]
    pub fn unsqueeze(&self, dim: Axis) -> Result<Tensor, ZyxError> {
        let shape = self.shape();
        let rank = shape.len();
        if dim < 0 {
            if -dim > (rank + 1) as Axis {
                return Err(ZyxError::shape_error(
                    format!("Unsqueeze dim {dim} is not possible on rank {rank} tensor.").into(),
                ));
            }
            let dim = usize::try_from(-dim).unwrap();
            let dim = rank - dim + 1;
            self.reshape(
                shape[..dim]
                    .iter()
                    .copied()
                    .chain([1])
                    .chain(shape[dim..].iter().copied())
                    .collect::<Vec<u64>>(),
            )
        } else {
            let dim = usize::try_from(dim).unwrap();
            if dim > rank {
                return Err(ZyxError::shape_error(
                    format!("Unsqueeze dim {dim} is not possible on rank {rank} tensor.").into(),
                ));
            }
            self.reshape(
                shape[..dim]
                    .iter()
                    .copied()
                    .chain([1])
                    .chain(shape[dim..].iter().copied())
                    .collect::<Vec<u64>>(),
            )
        }
    }

    /// Argmax over all elements
    ///
    /// Flattens the tensor and returns the position of a maximum within the flattened data.
    ///
    /// # Panics
    ///
    /// Panics if flattening or the reduction fails.
    #[allow(clippy::missing_panics_doc)]
    #[must_use]
    pub fn argmax(&self) -> Tensor {
        let flat = self.flatten(..).unwrap();
        flat.argmax_impl(0, false).unwrap()
    }

    /// Argmax along an axis
    ///
    /// Returns the positions of the maxima along `axis`; the reduced axis is not kept.
    ///
    /// # Errors
    ///
    /// Returns error if the axis is out of bounds.
    pub fn argmax_axis(&self, axis: Axis) -> Result<Tensor, ZyxError> {
        // Only used to reject an out-of-range axis up front, the resolved value is not needed.
        into_axis(axis, self.rank() as usize)?;
        self.argmax_impl(axis, false)
    }

    /* // Argmax
    fn argmax_impl(&self, axis: Axis, keepdim: bool) -> Result<Tensor, ZyxError> {
        // Find the maximum values along the specified axis
        let max_vals = self.max_keepdim([axis]).unwrap();

        // Create a mask where each element is `true` if it equals the max value
        let mask = self.equal(max_vals)?;
        let shape = self.shape();
        let uaxis = into_axis(axis, shape.len())?;
        println!("shape={shape:?}, uaxis={uaxis}");
        let range = Tensor::arange(shape[uaxis] as i32, 0, -1)?;

        let shape_value = shape[uaxis];
        let repeat_count = shape.len() - uaxis;
        let mut shape = vec![shape_value];
        shape.extend(vec![1; repeat_count]);

        let reshaped_range = range.reshape(&shape)?;
        let idx = mask * reshaped_range;
        let res = Tensor::from(shape[uaxis] as i64) - if keepdim { idx.max_keepdim([axis])? } else { idx.max([axis])? };
        Ok(res.cast(DType::I32))
    }*/
    /// Argmax
    fn argmax_impl(&self, axis: Axis, keepdim: bool) -> Result<Tensor, ZyxError> {
        // max values along the axis
        let max_vals = self.max_keepdim([axis])?;

        // mask where values equal the max
        let mask = self.equal(max_vals)?;

        // correct axis
        let shape = self.shape();
        let uaxis = into_axis(axis, shape.len())?;

        // create a range tensor [0, 1, 2, ...] along the axis
        let range = Tensor::arange(0, shape[uaxis] as i32, 1)?;
        let mut reshape_shape = vec![1; shape.len()];
        reshape_shape[uaxis] = shape[uaxis];
        let reshaped_range = range.reshape(&reshape_shape)?;

        // mask * range -> positions of max values
        let idx = mask * reshaped_range;

        // max along axis gives argmax
        let res = if keepdim { idx.max_keepdim([axis])? } else { idx.max([axis])? };

        Ok(res.cast(DType::I32))
    }

    /// Creates a new tensor by stacking the input tensors along the specified dimension.
    ///
    /// Every tensor gets a new size-1 axis at `dim` and the results are concatenated
    /// along that axis.
    ///
    /// # Arguments
    ///
    /// * `tensors`: An iterator of tensor references to stack.
    /// * `dim`: The position of the new dimension along which the tensors are stacked.
    ///
    /// # Returns
    ///
    /// A new tensor containing the stacked tensors.
    ///
    /// # Examples
    ///
    /// ```
    /// use zyx::Tensor;
    /// let a = Tensor::from([[1, 2], [3, 4]]);
    /// let b = Tensor::from([[5, 6], [7, 8]]);
    /// assert_eq!(Tensor::stack([&a, &b], 0)?, [[[1, 2],
    ///                                           [3, 4]],
    ///                                          [[5, 6],
    ///                                           [7, 8]]]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Errors
    ///
    /// Returns error if `dim` is not a valid position for the new axis, if fewer than
    /// two tensors are given, or if the tensors do not have matching shapes.
    ///
    /// # See also
    ///
    /// [`unsqueeze`](Tensor::unsqueeze), [`cat`](Tensor::cat)
    #[allow(clippy::missing_panics_doc)]
    pub fn stack<'a>(tensors: impl IntoIterator<Item = &'a Tensor>, dim: Axis) -> Result<Tensor, ZyxError> {
        // Propagate an invalid `dim` as an error instead of panicking.
        let tensors = tensors
            .into_iter()
            .map(|t| t.unsqueeze(dim))
            .collect::<Result<Vec<Tensor>, ZyxError>>()?;
        Tensor::cat(&tensors, dim)
    }

    /// Split tensor into multiple tensors at given dim/axis
    ///
    /// # Errors
    ///
    /// Returns error if self cannot be split along axis.
    #[allow(clippy::missing_panics_doc)]
    pub fn split(&self, sizes: impl IntoShape, axis: isize) -> Result<Vec<Tensor>, ZyxError> {
        // assert all_int(self.shape), f"does not support symbolic shape {self.shape}"
        // dim = self._resolve_dim(dim)
        // if isinstance(sizes, int): sizes = [min(sizes, self.shape[dim]-i) for i in range(0, max(1, self.shape[dim]), max(1, sizes))]
        // assert sum(sizes) == self.shape[dim], f"expect sizes to sum exactly to {self.shape[dim]}, but got {sum(sizes)}"
        // return tuple(self[sl] for sl in [tuple([slice(None)]*dim + [slice(sum(sizes[:i]), sum(sizes[:i + 1]))]) for i in range(len(sizes))])
        let sizes: Vec<Dim> = sizes.into_shape().collect();
        let shape = self.shape();
        let rank = shape.rank();
        let dim: usize = usize::try_from(if axis < 0 {
            axis + isize::try_from(rank).unwrap()
        } else {
            axis
        })
        .unwrap();
        if sizes.iter().sum::<Dim>() != shape[dim] {
            return Err(ZyxError::shape_error(
                format!(
                    "Sizes must sum exactly to {}, but got {:?}, which sums to {}",
                    shape[dim],
                    sizes,
                    sizes.iter().sum::<Dim>()
                )
                .into(),
            ));
        }

        let mut res = Vec::new();
        let mut acc_size: i64 = 0;
        for size in sizes {
            let size = size as i64;
            let mut index = Vec::new();
            for &d in shape.iter().take(dim) {
                index.push(0..d as i64);
            }
            index.push(acc_size..acc_size + size);
            //println!("Index {index:?}");
            res.push(self.slice(index)?);
            acc_size += size;
        }
        Ok(res)
    }

    /// Masked fill
    ///
    /// Returns a tensor that takes `value` wherever `mask` is set and the
    /// corresponding element of `self` everywhere else.
    ///
    /// # Errors
    ///
    /// Returns error if self cannot be masked with mask.
    pub fn masked_fill(&self, mask: impl Into<Tensor>, value: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        let mask: Tensor = mask.into();
        mask.where_(value, self.clone())
    }

    /// Tri
    ///
    /// Builds an `r` x `c` mask of the given `dtype` out of ones and zeros.
    /// It is used by [`triu`](Tensor::triu) and [`tril`](Tensor::tril) as the
    /// selector of the upper triangle, shifted by `diagonal`.
    ///
    /// Special cases handled up front:
    /// * `r == 0`, `c == 0` or `diagonal >= c` returns all zeros,
    /// * `r + diagonal <= 0` returns all ones.
    ///
    /// # Panics
    ///
    /// Panics if any of the internal pad, reshape or slice operations fails.
    #[must_use]
    #[track_caller]
    #[allow(clippy::missing_panics_doc)]
    pub fn tri(r: Dim, c: Dim, diagonal: i64, dtype: DType) -> Tensor {
        if r == 0 || c == 0 || diagonal >= c as i64 {
            return Tensor::zeros([r, c], dtype);
        }
        if r as i64 + diagonal <= 0 {
            return Tensor::ones([r, c], dtype);
        }
        // Side of a square that is large enough to cut the requested window from
        let s = r + c - 1;
        // [s, s] ones followed by s zeros in every row -> [s, 2 * s]
        let t = Tensor::ones([s, s], dtype).rpad_zeros([(0i64, s as i64)]).unwrap();
        // Flatten, drop the last s elements and view the rest as rows of length 2 * s - 1.
        // Rows are now one element shorter than before, so each row starts one element
        // earlier in the flat data than the row before it.
        let t = t.reshape([2 * s * s]).unwrap();
        let t = t.rpad_zeros([(0i64, -(s as i64))]).unwrap();
        let t = t.reshape([s, 2 * s - 1]).unwrap();
        // Keep only the first s columns -> [s, s]
        let t = t.rpad_zeros([(0i64, -((2 * s - 1 - s) as i64))]).unwrap();
        // Cut the r x c window, moved along columns or rows depending on the sign of `diagonal`
        if diagonal <= 0 {
            t.slice((0..r as i64, (-diagonal)..(c as i64 - diagonal))).unwrap()
        } else {
            t.slice((diagonal..(r as i64 + diagonal), 0..c as i64)).unwrap()
        }
    }

    /// Returns upper triangular part of the input tensor, other elements are set to zero
    ///
    /// `diagonal` is passed to [`tri`](Tensor::tri) to build the mask over the last two dimensions.
    ///
    /// # Errors
    ///
    /// Returns error if the tensor rank is less than 2.
    pub fn triu(&self, diagonal: i64) -> Result<Tensor, ZyxError> {
        let [rows, cols] = self.rdims::<2>()?;
        // Keep self where the mask is set, zero elsewhere.
        let keep = Tensor::tri(rows, cols, diagonal, DType::Bool);
        keep.where_(self, Tensor::zeros_like(self))
    }

    /// Returns lower triangular part of the input tensor, other elements are set to zero
    ///
    /// The mask is built by [`tri`](Tensor::tri) with `diagonal + 1` over the last two dimensions
    /// and marks the elements that get zeroed.
    ///
    /// # Errors
    ///
    /// Returns error if self's rank < 2
    pub fn tril(&self, diagonal: i64) -> Result<Tensor, ZyxError> {
        let [rows, cols] = self.rdims::<2>()?;
        // Zero where the mask is set, keep self elsewhere.
        let discard = Tensor::tri(rows, cols, diagonal + 1, DType::Bool);
        discard.where_(Tensor::zeros_like(self), self)
    }

    /// Pooling function with kernel size, stride and dilation
    ///
    /// Rearranges the last `kernel_size.len()` dimensions of `self` into sliding windows.
    /// No reduction is applied. For leading dimensions `b..`, output sizes `o..` and
    /// kernel sizes `k..`, the result has the shape `[b.., o.., k..]`, where each
    /// output size is `ceil((i - d * (k - 1)) / s)` for input size `i`, dilation `d`,
    /// kernel size `k` and stride `s`.
    ///
    /// `stride` and `dilation` may hold a single value, which is then used for every
    /// pooled dimension.
    ///
    /// NOTE(review): a kernel with more dimensions than `self`, a kernel size of zero
    /// or a dilated kernel larger than the input make the unsigned arithmetic below
    /// underflow instead of returning an error - confirm callers never pass these.
    ///
    /// # Errors
    ///
    /// Returns error if self cannot be pooled with stride and dilation.
    #[allow(clippy::missing_panics_doc)]
    pub fn pool(
        &self,
        kernel_size: impl IntoShape,
        stride: impl IntoShape,
        dilation: impl IntoShape,
    ) -> Result<Tensor, ZyxError> {
        // What a complex function ...
        let k_: Vec<Dim> = kernel_size.into_shape().collect();
        let stride: Vec<Dim> = stride.into_shape().collect();
        let dilation: Vec<Dim> = dilation.into_shape().collect();

        let shape = self.shape();
        let rank = shape.len();

        // A single stride/dilation value is used for every pooled dimension
        let s_: Vec<Dim> = if stride.len() == 1 {
            vec![stride[0]; k_.len()]
        } else {
            stride
        };
        let d_: Vec<Dim> = if dilation.len() == 1 {
            vec![dilation[0]; k_.len()]
        } else {
            dilation
        };
        // Sizes of the pooled (trailing) input dimensions
        let i_ = &shape[rank - k_.len()..];
        // Output size of every pooled dimension
        let o_: Vec<Dim> = (i_, d_.iter(), k_.iter(), s_.iter())
            .zip()
            .map(|(i, d, k, s)| (*i - *d * (*k - 1)).div_ceil(*s))
            .collect();
        //println!("s_ {s_:?}, d_ {d_:?}, i_ {i_:?} o_ {o_:?}");
        // Repeat every pooled dimension often enough to hold k * (i + d) elements,
        // leading dimensions are not repeated
        let repeats: Vec<Dim> = repeat_n(1, rank - k_.len())
            .chain(
                k_.iter()
                    .copied()
                    .zip(i_.iter().copied())
                    .zip(d_.iter().copied())
                    .map(|((k, i), d)| (k * (i + d)).div_ceil(i)),
            )
            .collect();
        //println!("repeats {repeats:?}");
        // Full ranges and sizes of the leading dimensions, reused by every slice/reshape below
        let pad_b: Vec<Range<i64>> = shape[..rank - k_.len()].iter().map(|&d| 0..d as i64).collect();
        let sh_b: Vec<Dim> = shape[..rank - k_.len()].into();
        let mut xup = self.repeat(repeats)?;

        // dilation
        //println!("{xup:?} before padding");
        // Cut every pooled dimension to exactly k * (i + d) elements ...
        let padding: Vec<Range<i64>> = pad_b
            .iter()
            .cloned()
            .chain(
                k_.iter()
                    .copied()
                    .zip(i_.iter().copied())
                    .zip(d_.iter().copied())
                    .map(|((k, i), d)| 0..(k * (i + d)) as i64),
            )
            .collect();
        //println!("Padding {padding:?}");
        xup = xup.slice(padding)?;
        //println!("{xup} padded");
        // ... and split it into [k, i + d]
        let sh: Vec<Dim> = sh_b
            .iter()
            .copied()
            .chain(
                k_.iter()
                    .copied()
                    .zip(i_.iter().copied())
                    .zip(d_.iter().copied())
                    .flat_map(|((k, i), d)| [k, i + d]),
            )
            .collect();
        //println!("Reshape {sh:?}");
        xup = xup.reshape(sh)?;

        // stride
        // padding = noop_ + flatten(((0,k), (0,o*s)) for k,o,s in zip(k_, o_, s_))
        // xup = xup.shrink(padding)
        // Keep o * s elements per kernel position ...
        let padding: Vec<Range<i64>> = pad_b
            .iter()
            .cloned()
            .chain(
                k_.iter()
                    .copied()
                    .zip(o_.iter().copied())
                    .zip(s_.iter().copied())
                    .flat_map(|((k, o), s)| [(0..k as i64), (0..(o * s) as i64)]),
            )
            .collect();
        xup = xup.slice(padding)?;
        // sh = noop_ + flatten((k,o,s) for k,o,s in zip(k_, o_, s_))
        // xup = xup.reshape(sh)
        // ... split them into [k, o, s] ...
        let sh: Vec<Dim> = sh_b
            .iter()
            .copied()
            .chain(
                k_.iter()
                    .copied()
                    .zip(o_.iter().copied())
                    .zip(s_.iter().copied())
                    .flat_map(|((k, o), s)| [k, o, s]),
            )
            .collect();
        xup = xup.reshape(sh)?;
        // padding = noop_ + flatten(((0,k), (0,o), (0,1)) for k,o in zip(k_, o_))
        // xup = xup.shrink(padding)
        // ... and keep only the first element of every stride group
        let padding: Vec<Range<i64>> = pad_b
            .iter()
            .cloned()
            .chain(
                k_.iter()
                    .copied()
                    .zip(o_.iter().copied())
                    .flat_map(|(k, o)| [(0..k as i64), (0..o as i64), (0..1)]),
            )
            .collect();
        xup = xup.slice(padding)?;
        // sh = noop_ + flatten((k,o) for k,o in zip(k_, o_))
        // xup = xup.reshape(sh)
        // Drop the stride axes that are now of size one -> [k, o] per pooled dimension
        let sh: Vec<Dim> = sh_b
            .iter()
            .copied()
            .chain(k_.iter().copied().zip(o_.iter().copied()).flat_map(Into::<[Dim; 2]>::into))
            .collect();
        xup = xup.reshape(sh)?;

        // xup.permute(*range(len(noop_)), *[len(noop_)+i*2+1 for i in range(len(i_))], *[len(noop_)+i*2 for i in range(len(i_))])
        // Leading axes first, then all output axes, then all kernel axes
        let axes: Vec<Axis> = (0..rank - k_.len())
            .chain((0..i_.len()).map(|i| rank - k_.len() + i * 2 + 1))
            .chain((0..i_.len()).map(|i| rank - k_.len() + i * 2))
            .map(|i| Axis::try_from(i).unwrap())
            .collect();
        xup = xup.permute(axes)?;

        Ok(xup)
    }

    /// Performs an *N*-dimensional convolution on the tensor.
    ///
    /// This method supports arbitrary dimensionality (1D, 2D, 3D, etc.) and
    /// optional grouping, stride, dilation, and padding parameters.
    ///
    /// # Parameters
    /// - `weight`: Convolution kernel tensor of shape `[out_channels, in_channels / groups, ...]`.
    /// - `bias`: Optional bias tensor added to the output. Use `None` for no bias.
    /// - `groups`: Number of groups to divide the input and output channels into.
    /// - `stride`: Stride (step size) of the convolution, given per spatial dimension.
    /// - `dilation`: Spacing between kernel elements, given per spatial dimension.
    /// - `padding`: Number of padding elements added to each side per spatial dimension.
    ///
    /// # Returns
    /// A new [`Tensor`] containing the result of the convolution.
    ///
    /// # Example
    /// ```
    /// # use zyx::{Tensor, DType};
    ///
    /// // Input tensor: shape [1, 1, 3, 3]
    /// let t = Tensor::arange(0, 9, 1)?
    ///     .reshape([1, 1, 3, 3])?;
    ///
    /// // Kernel tensor: shape [1, 1, 2, 2]
    /// let w = Tensor::ones([1, 1, 2, 2], DType::F32);
    ///
    /// // Perform convolution (no bias, 1 group, stride=1, dilation=1, padding=0)
    /// let out = t.conv(&w, None, 1, [1, 1], [1, 1], [0, 0])?;
    ///
    /// println!("{out}");
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Errors
    /// Returns an error if the tensor shapes are incompatible for convolution.
    #[allow(clippy::missing_panics_doc)]
    pub fn conv(
        &self,
        weight: &Tensor,
        bias: Option<&Tensor>,
        groups: u64,
        stride: impl IntoShape,
        dilation: impl IntoShape,
        padding: impl IntoShape,
    ) -> Result<Tensor, ZyxError> {
        fn resolve_pool_pads(padding: &[Dim], dims: usize) -> Vec<i64> {
            if padding.len() == 1 {
                vec![padding[0] as i64; 2 * dims]
            } else if padding.len() == 2 * dims {
                padding.iter().map(|&p| p as i64).collect()
            } else {
                let mut npadding: Vec<i64> = Vec::new();
                for _ in 0..2 {
                    for &p in padding {
                        npadding.push(p as i64);
                    }
                }
                npadding.reverse();
                npadding
            }
        }

        let [bs, cin_] = self.shape()[..2] else {
            return Err(ZyxError::shape_error(
                format!("conv requires self rank >= 2, but rank = {}", self.rank()).into(),
            ));
        };
        let [cout, cin] = weight.shape()[..2] else {
            return Err(ZyxError::shape_error(
                format!("conv requires weight rank >= 2, but rank = {}", weight.rank()).into(),
            ));
        };
        if let Some(bias) = bias {
            if bias.shape().iter().product::<Dim>() != cout {
                return Err(ZyxError::shape_error(
                    format!(
                        "Bias length {} does not match output channels {}",
                        bias.shape().iter().product::<Dim>(),
                        cout
                    )
                    .into(),
                ));
            }
        }

        let hw = &weight.shape()[2..];

        let stride: Vec<Dim> = stride.into_shape().collect();
        let dilation: Vec<Dim> = dilation.into_shape().collect();
        /*if stride.len() != hw.len() || dilation.len() != hw.len() {
            return Err(ZyxError::shape_error("Stride/dilation length must match kernel spatial dimensions".into()));
        }*/

        let padding_: Vec<i64> = resolve_pool_pads(&padding.into_shape().collect::<Box<[Dim]>>(), hw.len());

        if (groups as Dim * cin != cin_) || (self.shape().len() != weight.shape().len()) {
            return Err(ZyxError::shape_error(
                format!(
                    "Input Tensor shape {:?} does not match the shape of the weights {:?}. ({} vs. {cin_})",
                    self.shape(),
                    weight.shape(),
                    groups as Dim * cin
                )
                .into(),
            ));
        }

        let x = self
            .rpad_zeros(padding_.chunks(2).map(|x| (x[0], x[1])))
            .unwrap()
            .pool(hw, stride, dilation)
            .unwrap();
        let rcout = cout / groups as Dim;
        let oyx = &x.shape()[2..x.shape().len() - hw.len()];

        // for now without winograd
        let shape: Vec<Dim> = [bs, groups as Dim, cin, 1].iter().chain(oyx).chain(hw).copied().collect();
        let x = x.reshape(shape).unwrap();
        let shape: Vec<Dim> = [bs, groups as Dim, cin, rcout].iter().chain(oyx).chain(hw).copied().collect();
        let x = x.expand(shape).unwrap();
        let mut axes = vec![0, 1, 3];
        for i in 0..oyx.len() {
            axes.push(4 + i);
        }
        axes.push(2);
        for i in 0..hw.len() {
            axes.push(4 + oyx.len() + i);
        }
        let x = x.permute(axes.iter().map(|&a| Axis::try_from(a).unwrap())).unwrap();

        let shape: Vec<Dim> = [1, groups as Dim, rcout]
            .iter()
            .chain(&vec![1; oyx.len()])
            .chain(&[cin])
            .chain(hw)
            .copied()
            .collect();
        let weight = weight.reshape(shape).unwrap();
        let mut axes: Vec<Axis> = Vec::new();
        for i in 0..=oyx.len() {
            axes.push(-1 - Axis::try_from(i).unwrap());
        }
        let shape: Vec<Dim> = [bs, cout].iter().chain(oyx).copied().collect();
        let mut ret = (x * weight).sum_keepdim(axes).unwrap().reshape(shape).unwrap();

        if let Some(bias) = bias {
            let shape: Vec<Dim> = once(1)
                .chain([bias.shape().iter().product::<Dim>()])
                .chain(repeat_n(1, hw.len()))
                .collect();
            ret = ret + bias.reshape(shape).unwrap();
        }

        Ok(ret)
    }

    // TODO we also need these two functions for pooling
    /*def _resolve_pool_pads(self, padding:int|Sequence[int], dims:int) -> Sequence[int]:
      if not isinstance(padding, int) and not (len(padding) == 2*dims or len(padding) == dims):
        raise ValueError(f"Padding must be an int or a sequence of length {dims} or {2*dims}, but got {padding=} for {self.shape=} with {dims=}.")
      return [padding]*2*dims if isinstance(padding, int) else (padding if len(padding) == 2*dims else [p for p in padding for _ in range(2)][::-1])

    def _apply_ceil_mode(self, pads:Sequence[int], k_:tuple[sint, ...], s_:int|tuple[int, ...], d_:int|tuple[int, ...]) -> list[int]:
      (d_,s_), i_ = (make_tuple(x, len(k_)) for x in (d_,s_)), self.shape[-len(k_):]
      pads, grouped_pads = list(pads), _flat_to_grouped(pads)
      # https://arxiv.org/pdf/1603.07285 section 5.1, relationship 15.
      o_ = [ceildiv(i+pB+pA - (d*(k-1)+1), s) + 1 for i,d,k,s,(pB,pA) in zip(i_,d_,k_,s_,grouped_pads)]
      for dim,(o,i,s,k,d,(pB,pA)) in enumerate(zip(o_,i_,s_,k_,d_,grouped_pads)):
        # we have to do additional padding before `_pool` so that `o_` in `_pool` is calculated correctly
        # `s*(o-1) + (d*(k-1)+1) - (i+pB+pA)` -> last_sliding_window_start + full_kernel_size - padded_input_shape
        # we decrease padding in the case that a sliding window starts in the end padded region, thereby decreasing `o_` in `_pool`
        # `smax(s*(o-1) - (pB+i-1), 0)` -> last_sliding_window_start - (pad_before + input_size - zero_offset)
        pads[-1-dim*2] += s*(o-1) + (d*(k-1)+1) - (i+pB+pA) - smax(s*(o-1) - (pB+i-1), 0)
      return pads*/

    /// Max pooling over sliding windows.
    ///
    /// The input is padded with the dtype's `min_constant()`, split into windows by `pool`
    /// and reduced with `max` over the last `kernel_size.len()` axes of the pooled tensor.
    ///
    /// # Errors
    ///
    /// Returns error if the kernel size, stride, or padding is invalid.
    ///
    /// # Panics
    ///
    /// Panics if `ceil_mode` or `return_indices` is `true`; neither is implemented yet.
    pub fn max_pool(
        &self,
        kernel_size: impl IntoShape,
        stride: impl IntoShape,
        dilation: impl IntoShape,
        padding: impl IntoIterator<Item = (i64, i64)>,
        ceil_mode: bool,
        return_indices: bool,
    ) -> Result<Tensor, ZyxError> {
        let kernel: Vec<Dim> = kernel_size.into_shape().collect();
        // Window axes are the trailing ones: -k, ..., -1.
        let window_axes: Vec<Axis> = (1..=kernel.len() as Axis).rev().map(|a| -a).collect();
        let pads: Vec<(i64, i64)> = padding.into_iter().collect();

        // TODO reference: if ceil_mode: pads = self._apply_ceil_mode(pads, k_, stride if stride is not None else k_, dilation)
        if ceil_mode {
            todo!("ceil mode is not implemented yet")
        }

        let fill = Tensor { id: RT.lock().new_constant(self.dtype().min_constant()) };
        let windows = self.pad(pads, fill)?.pool(kernel, stride, dilation)?;

        if return_indices {
            // TODO reference implementation for the index output:
            //spatial_sz = int(math.prod(spatial_shape := self.shape[-len(k_):]))
            //idx = Tensor.arange(spatial_sz,0,-1, requires_grad=False, device=self.device).reshape(spatial_shape)
            //m = pooled == pooled.max(axis, keepdim=True)
            //idx = m * idx.pad(pads, value=dtypes.min(idx.dtype))._pool(k_, stride if stride is not None else k_, dilation)
            //return pooled.max(axis), spatial_sz - idx.max(axis)
            todo!()
        }

        windows.max(window_axes)
    }

    /// Creates a new tensor by repeating the input tensor along its dimensions.
    ///
    /// The `repeats` parameter specifies how many times to repeat each dimension of the tensor.
    /// `repeats` must have at least as many entries as the tensor has dimensions. If it has more,
    /// the tensor's shape is padded with ones at the beginning before repeating.
    ///
    /// # Examples
    ///
    /// ```
    /// use zyx::Tensor;
    ///
    /// let arr = Tensor::from(vec![1, 2, 3]);
    /// assert_eq!(arr.repeat([2])?, [1, 2, 3, 1, 2, 3]);
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Returns
    ///
    /// Returns a new tensor with the repeated values.
    ///
    /// # Errors
    ///
    /// Returns a shape error if `repeats` has fewer entries than the rank of the tensor,
    /// and propagates any error from the underlying reshape/expand operations.
    pub fn repeat(&self, repeats: impl IntoShape) -> Result<Tensor, ZyxError> {
        let repeats: Vec<Dim> = repeats.into_shape().collect();
        let shape = self.shape();
        let rank = shape.len();
        if repeats.len() < rank {
            return Err(ZyxError::shape_error(
                "Repeats must be greater or equal to rank of the tensor.".into(),
            ));
        }
        // Tensor shape left-padded with ones so it lines up with `repeats`.
        let base_shape: Vec<Dim> = repeat_n(1, repeats.len() - rank).chain(shape).collect();
        // Insert a unit axis in front of every dimension: [1, d0, 1, d1, ...].
        let unsqueezed_shape: Vec<Dim> = base_shape.iter().flat_map(|&d| [1, d]).collect();
        // Expand every unit axis to its repeat count: [r0, d0, r1, d1, ...].
        let expand_shape: Vec<Dim> = repeats.iter().zip(&base_shape).flat_map(|(&r, &d)| [r, d]).collect();
        // Merge each (repeat, dim) pair back into one dimension.
        let final_shape: Vec<Dim> = repeats.iter().zip(&base_shape).map(|(&r, &d)| r * d).collect();
        self.reshape(unsqueezed_shape)?.expand(expand_shape)?.reshape(final_shape)
    }

    /// Applies Rotary Positional Encoding (`RoPE`) to a tensor.
    ///
    /// The last dimension of `self` is split into two halves which are combined with the
    /// sine and cosine frequency tensors and concatenated back along the last dimension.
    ///
    /// # Arguments
    ///
    /// * `sine_frequencies` - A tensor containing the sine frequency components for the `RoPE` computation.
    /// * `cosine_frequencies` - A tensor containing the cosine frequency components for the `RoPE` computation.
    ///
    /// # Returns
    ///
    /// * `Result<Tensor, ZyxError>` - A `Result` containing either the computed tensor with positional encodings
    ///   or an error describing the issue (e.g., shape mismatch, dtype mismatch, etc.).
    ///
    /// # Errors
    ///
    /// This function will return a `ZyxError` if:
    ///
    /// - Implicit casts are disabled and the dtypes of the three tensors differ (dtype error).
    /// - The tensor is not at least 2D (requiring at least [`seq_len`, `embed_dim`]) (shape error).
    /// - `embed_dim` is odd, so it cannot be split into two equal halves (shape error).
    /// - The frequency tensors do not have shape `[seq_len, embed_dim / 2]` (shape error).
    ///
    /// # Example
    ///
    /// ```rust
    /// use zyx::{Tensor, DType};
    ///
    /// let input_tensor = Tensor::rand([10, 16], DType::F32)?;  // Example 2D tensor of shape [seq_len=10, embed_dim=16]
    /// let sine_frequencies = Tensor::rand([10, 8], DType::F32)?; // Shape [seq_len=10, embed_dim / 2 = 8]
    /// let cosine_frequencies = Tensor::rand([10, 8], DType::F32)?; // Shape [seq_len=10, embed_dim / 2 = 8]
    ///
    /// // Call rope to compute positional encodings
    /// let result = input_tensor.rope(sine_frequencies, cosine_frequencies)?;
    /// # Ok::<(), zyx::ZyxError>(())
    /// ```
    ///
    /// # Notes
    ///
    /// - The input tensor must be at least 2D: the second to last dimension is the sequence length (`seq_len`),
    ///   and the last is the embedding dimension (`embed_dim`).
    /// - The sine and cosine frequency tensors should have the shape `[seq_len, embed_dim / 2]`.
    /// - Unless implicit casts are enabled, the input tensor and the frequency tensors must have the same dtype.
    ///
    /// # Panics
    ///
    /// May panic on internal logic errors in the library (e.g., unexpected failure when reshaping,
    /// slicing or concatenating tensors whose shapes were already validated).
    pub fn rope(&self, sine_frequencies: impl Into<Tensor>, cosine_frequencies: impl Into<Tensor>) -> Result<Tensor, ZyxError> {
        let sin_freqs: Tensor = sine_frequencies.into();
        let cos_freqs: Tensor = cosine_frequencies.into();
        if !RT.lock().implicit_casts {
            let dtype = self.dtype();
            let sdtype = sin_freqs.dtype();
            let cdtype = cos_freqs.dtype();
            if dtype != sdtype || dtype != cdtype {
                return Err(ZyxError::dtype_error(
                    format!(
                        "ROPE all inputs must have the same dtype self dtype {dtype}, sin_freqs {sdtype}, cos_freqs {cdtype}"
                    )
                    .into(),
                ));
            }
        }

        let sh: Vec<Dim> = self.shape();
        if sh.len() < 2 {
            return Err(ZyxError::shape_error(
                format!("RoPE requires input >= 2d, but current input is {}d", sh.len()).into(),
            ));
        }

        let seq_len = sh[sh.len() - 2];
        let embed_dim = sh[sh.len() - 1];

        // An odd embedding dimension would produce halves of different sizes that can not be
        // combined with the frequency tensors.
        if embed_dim % 2 != 0 {
            return Err(ZyxError::shape_error(
                format!("RoPE requires an even embedding dimension, but got {embed_dim}").into(),
            ));
        }

        // This is a shape problem, so it is reported as a shape error.
        if sin_freqs.shape() != [seq_len, embed_dim / 2] || cos_freqs.shape() != [seq_len, embed_dim / 2] {
            return Err(ZyxError::shape_error(
                format!(
                    "sin_freqs and cos_freqs must have shape [seq_len, embed_dim / 2] after squeezing. \
                     However, after squeezing, sin_freqs has shape {:?} and cos_freqs has shape {:?}. \
                     Expected shapes: [{seq_len}, {}]",
                    sin_freqs.shape(),
                    cos_freqs.shape(),
                    embed_dim / 2
                )
                .into(),
            ));
        }

        // Shapes were validated above, so these reshapes keep the number of elements.
        let sin_freqs = sin_freqs.reshape([1u64, 1u64, seq_len, embed_dim / 2]).unwrap();
        let cos_freqs = cos_freqs.reshape([1u64, 1u64, seq_len, embed_dim / 2]).unwrap();

        let half = (embed_dim / 2) as usize;
        let a = self.rslice(..half).unwrap();
        let b = -self.rslice(half..).unwrap();
        let ro = a.clone() * cos_freqs.clone() - b.clone() * sin_freqs.clone();
        let co = a * sin_freqs + b * cos_freqs;
        let r = Tensor::cat([&co, &ro], -1).unwrap(); // Concatenate along the last dimension

        Ok(r)
    }

    /*#[must_use]
    pub fn conv(&self) -> Tensor {
        todo!()
    }*/

    /// Creates a tensor handle for data stored in a file.
    ///
    /// Forwards `shape`, `dtype`, `path` and `offset` to the runtime's `tensor_from_path`
    /// and wraps the returned id.
    pub(crate) fn from_path(shape: Vec<Dim>, dtype: DType, path: impl AsRef<Path>, offset: u64) -> Result<Tensor, ZyxError> {
        let file = path.as_ref();
        let id = RT.lock().tensor_from_path(shape, dtype, file, offset)?;
        Ok(Tensor { id })
    }

    /// All tensor elements as contiguous `le_bytes` vector in row major order
    ///
    /// # Errors
    ///
    /// Returns error if self failed to realize.
    pub fn to_le_bytes(&self) -> Result<Vec<u8>, ZyxError> {
        Ok(match self.dtype() {
            DType::BF16 => {
                let data: Vec<bf16> = self.clone().try_into()?;
                data.into_iter().flat_map(bf16::to_le_bytes).collect()
            }
            DType::F16 => {
                let data: Vec<f16> = self.clone().try_into()?;
                data.into_iter().flat_map(f16::to_le_bytes).collect()
            }
            DType::F32 => {
                let data: Vec<f32> = self.clone().try_into()?;
                data.into_iter().flat_map(f32::to_le_bytes).collect()
            }
            DType::F64 => {
                let data: Vec<f64> = self.clone().try_into()?;
                data.into_iter().flat_map(f64::to_le_bytes).collect()
            }
            DType::U8 => {
                let data: Vec<u8> = self.clone().try_into()?;
                data.into_iter().flat_map(u8::to_le_bytes).collect()
            }
            DType::U16 => {
                let data: Vec<u16> = self.clone().try_into()?;
                data.into_iter().flat_map(u16::to_le_bytes).collect()
            }
            DType::U32 => {
                let data: Vec<u32> = self.clone().try_into()?;
                data.into_iter().flat_map(u32::to_le_bytes).collect()
            }
            DType::U64 => {
                let data: Vec<u64> = self.clone().try_into()?;
                data.into_iter().flat_map(u64::to_le_bytes).collect()
            }
            DType::I8 => {
                let data: Vec<i8> = self.clone().try_into()?;
                data.into_iter().flat_map(i8::to_le_bytes).collect()
            }
            DType::I16 => {
                let data: Vec<i16> = self.clone().try_into()?;
                data.into_iter().flat_map(i16::to_le_bytes).collect()
            }
            DType::I32 => {
                let data: Vec<i32> = self.clone().try_into()?;
                data.into_iter().flat_map(i32::to_le_bytes).collect()
            }
            DType::I64 => {
                let data: Vec<i64> = self.clone().try_into()?;
                data.into_iter().flat_map(i64::to_le_bytes).collect()
            }
            DType::Bool => {
                let data: Vec<bool> = self.clone().try_into()?;
                #[allow(clippy::transmute_undefined_repr)]
                unsafe {
                    std::mem::transmute::<Vec<bool>, Vec<u8>>(data)
                }
            }
        })
    }

    // Load tensor from `le_bytes` in row major order
    /*fn from_le_bytes(bytes: &[u8]) -> Result<Tensor, ZyxError> {
        let _ = bytes;
        todo!()
    }*/
}

/// Guard holding a [`DebugMask`] that is written into the runtime's `debug` field when dropped.
// NOTE(review): presumably this stores the mask that was active before a temporary override,
// so the previous setting is restored at scope exit - confirm against where the guard is created.
#[cfg_attr(feature = "py", pyo3::pyclass)]
pub struct DebugGuard {
    /// Mask assigned to the runtime's `debug` field on drop.
    debug: DebugMask,
}

impl Drop for DebugGuard {
    /// Writes the stored debug mask back into the runtime.
    ///
    /// The runtime is accessed with `try_lock`; if the lock can not be taken,
    /// the mask is left untouched and a warning is printed instead.
    fn drop(&mut self) {
        match RT.try_lock() {
            Ok(mut runtime) => runtime.debug = self.debug,
            Err(_) => println!("Warning: Unable to drop DebugGuard due to runtime mutex lock."),
        }
    }
}

impl Tensor {
    /// If self is not float, then cast it to float
    ///
    /// Float tensors are returned unchanged (as a clone). With implicit casts enabled,
    /// other tensors are cast to a float dtype chosen by byte size: 1 and 2 byte
    /// dtypes become `F16`, 4 byte dtypes `F32` and 8 byte dtypes `F64`.
    ///
    /// # Errors
    ///
    /// Returns a dtype error if the tensor is not float and implicit casts are disabled.
    #[track_caller]
    fn float_cast(&self) -> Result<Tensor, ZyxError> {
        let dtype = self.dtype();
        if dtype.is_float() {
            return Ok(self.clone());
        }
        if !RT.lock().implicit_casts {
            return Err(ZyxError::dtype_error(format!("Called function that only supports float on a tensor that is of dtype = {dtype} while implitic casts were disabled.").into()));
        }
        Ok(match (dtype.bit_size() / 8) as usize {
            // One byte dtypes have no float of equal size, so they use the smallest float
            // instead of panicking.
            1 | 2 => self.cast(DType::F16),
            4 => self.cast(DType::F32),
            8 => self.cast(DType::F64),
            bytes => unreachable!("dtype {dtype} has unsupported byte size {bytes}"),
        })
    }

    /// Braodcasts to synchronize shapes and casts to synchronize dtypss
    /// This does both automatic expand AND automatic casting between dtypes.
    // TODO Broadcasting can be disable by changing a setting in the backend.
    #[track_caller]
    fn broadcast(x: impl Into<Tensor>, y: impl Into<Tensor>) -> Result<(Tensor, Tensor), ZyxError> {
        let mut x = x.into();
        let mut y = y.into();
        /*assert_eq!(
            graph.dtype(xid),
            graph.dtype(yid),
            "{op} parameters {xid} and {yid} have different dtypes: {} and {}",
            graph.dtype(xid),
            graph.dtype(yid)
        );*/
        // Now we just do implicit conversions. Not exactly rust style, but it's convenient.
        // We can later add option for backend to disable these implicit conversions.
        let x_dtype = x.dtype();
        let y_dtype = y.dtype();
        if x_dtype != y_dtype && RT.lock().implicit_casts {
            let common_dtype = x_dtype.least_upper_dtype(y_dtype);
            if x_dtype != common_dtype {
                x = x.cast(common_dtype);
            }
            if y_dtype != common_dtype {
                y = y.cast(common_dtype);
            }
        } else if x_dtype != y_dtype {
            return Err(ZyxError::dtype_error(
                format!("Binary inputs have different dtypes: {x_dtype} and {y_dtype}").into(),
            ));
        }

        let x_shape = x.shape();
        let y_shape = y.shape();

        for (&x, &y) in x_shape.iter().rev().zip(y_shape.iter().rev()) {
            if x != y && x != 1 && y != 1 {
                return Err(ZyxError::shape_error(
                    format!("Tensor shapes can not be broadcasted: {x_shape:?} and {y_shape:?}").into(),
                ));
            }
        }

        let rx = x_shape.rank();
        let ry = y_shape.rank();
        let mut nx_shape = x_shape.clone();
        let mut ny_shape = y_shape.clone();
        match rx.cmp(&ry) {
            Ordering::Less => {
                nx_shape = repeat_n(1, ry - rx).chain(nx_shape).collect();
            }
            Ordering::Greater => {
                ny_shape = repeat_n(1, rx - ry).chain(ny_shape).collect();
            }
            Ordering::Equal => {}
        }
        let mut eshape = Vec::new();
        for (x, y) in nx_shape.iter().zip(ny_shape.iter()) {
            eshape.push(*x.max(y));
        }
        if x_shape != eshape {
            x = x.expand(&eshape)?;
        }
        //println!("Second broadcast operand {y}");
        //println!("{x_shape:?}, {y_shape:?}, {eshape:?}");
        //println!("After reshape second broadcast operand {y}");
        //Tensor::plot_graph([], "graph");
        if y_shape != eshape {
            y = y.expand(&eshape)?;
        }
        //println!("Second broadcast operand {y}");
        //println!("Broadcasted to {eshape:?}");
        //println!("y shape {:?}", y.shape());
        Ok((x, y))
    }

    /// Returns the [`TensorId`] stored in this tensor's `id` field.
    #[must_use]
    pub const fn id(&self) -> TensorId {
        self.id
    }
}

impl TryFrom<Tensor> for bf16 {
    type Error = ZyxError;

    /// Loads the tensor's data into a one-element `bf16` buffer and returns that element.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut buffer = [bf16::ZERO];
        RT.lock().load(value.id, &mut buffer)?;
        let [scalar] = buffer;
        Ok(scalar)
    }
}

impl TryFrom<Tensor> for f16 {
    type Error = ZyxError;

    /// Loads the tensor's data into a one-element `f16` buffer and returns that element.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut buffer = [f16::ZERO];
        RT.lock().load(value.id, &mut buffer)?;
        let [scalar] = buffer;
        Ok(scalar)
    }
}

impl TryFrom<Tensor> for f32 {
    type Error = ZyxError;

    /// Loads the tensor's data into a one-element `f32` buffer and returns that element.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut buffer = [0.0_f32];
        RT.lock().load(value.id, &mut buffer)?;
        let [scalar] = buffer;
        Ok(scalar)
    }
}

impl TryFrom<Tensor> for f64 {
    type Error = ZyxError;

    /// Loads the tensor's data into a one-element `f64` buffer and returns that element.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut buffer = [0.0_f64];
        RT.lock().load(value.id, &mut buffer)?;
        let [scalar] = buffer;
        Ok(scalar)
    }
}

impl TryFrom<Tensor> for u8 {
    type Error = ZyxError;

    /// Loads the tensor's data into a one-element `u8` buffer and returns that element.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut buffer = [0_u8];
        RT.lock().load(value.id, &mut buffer)?;
        let [scalar] = buffer;
        Ok(scalar)
    }
}

impl TryFrom<Tensor> for u32 {
    type Error = ZyxError;

    /// Loads the tensor's data into a one-element `u32` buffer and returns that element.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut buffer = [0_u32];
        RT.lock().load(value.id, &mut buffer)?;
        let [scalar] = buffer;
        Ok(scalar)
    }
}

impl TryFrom<Tensor> for i8 {
    type Error = ZyxError;

    /// Loads the tensor's data into a one-element `i8` buffer and returns that element.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut buffer = [0_i8];
        RT.lock().load(value.id, &mut buffer)?;
        let [scalar] = buffer;
        Ok(scalar)
    }
}

impl TryFrom<Tensor> for i16 {
    type Error = ZyxError;

    /// Loads the tensor's data into a one-element `i16` buffer and returns that element.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut buffer = [0_i16];
        RT.lock().load(value.id, &mut buffer)?;
        let [scalar] = buffer;
        Ok(scalar)
    }
}

impl TryFrom<Tensor> for i32 {
    type Error = ZyxError;

    /// Loads the tensor's data into a one-element `i32` buffer and returns that element.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut buffer = [0_i32];
        RT.lock().load(value.id, &mut buffer)?;
        let [scalar] = buffer;
        Ok(scalar)
    }
}

impl TryFrom<Tensor> for i64 {
    type Error = ZyxError;

    /// Loads the tensor's data into a one-element `i64` buffer and returns that element.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut buffer = [0_i64];
        RT.lock().load(value.id, &mut buffer)?;
        let [scalar] = buffer;
        Ok(scalar)
    }
}

impl TryFrom<Tensor> for bool {
    type Error = ZyxError;

    /// Loads the tensor's data into a one-element `bool` buffer and returns that element.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut buffer = [false];
        RT.lock().load(value.id, &mut buffer)?;
        let [flag] = buffer;
        Ok(flag)
    }
}

impl<T: Scalar> TryFrom<Tensor> for Vec<T> {
    type Error = ZyxError;

    /// Loads all elements of the tensor into a freshly allocated vector of `numel()` items.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let len = value.numel() as usize;
        let mut buffer = Vec::with_capacity(len);
        buffer.resize(len, T::zero());
        RT.lock().load(value.id, &mut buffer)?;
        Ok(buffer)
    }
}

impl<T: Scalar, const D0: usize> TryFrom<Tensor> for [T; D0] {
    type Error = ZyxError;

    /// Loads the tensor's data into a zero-initialized 1D array.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut out = [T::zero(); D0];
        RT.lock().load(value.id, out.as_mut_slice())?;
        Ok(out)
    }
}

impl<T: Scalar, const D0: usize, const D1: usize> TryFrom<Tensor> for [[T; D1]; D0] {
    type Error = ZyxError;

    /// Loads the tensor's data into a zero-initialized 2D array, viewed as one flat slice.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut out = [[T::zero(); D1]; D0];
        let flat: &mut [T] = out.as_flattened_mut();
        RT.lock().load(value.id, flat)?;
        Ok(out)
    }
}

impl<T: Scalar, const D0: usize, const D1: usize, const D2: usize> TryFrom<Tensor> for [[[T; D2]; D1]; D0] {
    type Error = ZyxError;

    /// Loads the tensor's data into a zero-initialized 3D array, viewed as one flat slice.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut out = [[[T::zero(); D2]; D1]; D0];
        let flat: &mut [T] = out.as_flattened_mut().as_flattened_mut();
        RT.lock().load(value.id, flat)?;
        Ok(out)
    }
}

impl<T: Scalar, const D0: usize, const D1: usize, const D2: usize, const D3: usize> TryFrom<Tensor>
    for [[[[T; D3]; D2]; D1]; D0]
{
    type Error = ZyxError;

    /// Loads the tensor's data into a zero-initialized 4D array, viewed as one flat slice.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut out = [[[[T::zero(); D3]; D2]; D1]; D0];
        let flat: &mut [T] = out.as_flattened_mut().as_flattened_mut().as_flattened_mut();
        RT.lock().load(value.id, flat)?;
        Ok(out)
    }
}

impl<T: Scalar, const D0: usize, const D1: usize, const D2: usize, const D3: usize, const D4: usize> TryFrom<Tensor>
    for [[[[[T; D4]; D3]; D2]; D1]; D0]
{
    type Error = ZyxError;

    /// Loads the tensor's data into a zero-initialized 5D array, viewed as one flat slice.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the runtime's `load`.
    fn try_from(value: Tensor) -> Result<Self, Self::Error> {
        let mut out = [[[[[T::zero(); D4]; D3]; D2]; D1]; D0];
        let flat: &mut [T] = out
            .as_flattened_mut()
            .as_flattened_mut()
            .as_flattened_mut()
            .as_flattened_mut();
        RT.lock().load(value.id, flat)?;
        Ok(out)
    }
}

impl Debug for Tensor {
    /// Debug output is the same as the `Display` output rendered with default format options.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Alternative kept for reference: write!(f, "Tensor {{ id = {:?} }}", self.id)
        write!(f, "{self}")
    }
}

impl Display for Tensor {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // TODO don't print the whole tensor if it is too big
        let precision = f.precision().unwrap_or(3);
        let x = self.clone();
        let res = match self.dtype() {
            DType::BF16 => {
                let data: Result<Vec<bf16>, _> = x.try_into();
                match data {
                    Ok(data) => tensor_to_string(&data, &self.shape(), precision, f.width()),
                    Err(e) => format!("f16 tensor failed to realize {e:?}"),
                }
            }
            DType::F16 => {
                let data: Result<Vec<f16>, _> = x.try_into();
                match data {
                    Ok(data) => tensor_to_string(&data, &self.shape(), precision, f.width()),
                    Err(e) => format!("f16 tensor failed to realize {e:?}"),
                }
            }
            DType::F32 => {
                let data: Result<Vec<f32>, _> = x.try_into();
                match data {
                    Ok(data) => tensor_to_string(&data, &self.shape(), precision, f.width()),
                    Err(e) => format!("f32 tensor failed to realize {e:?}"),
                }
            }
            DType::F64 => {
                let data: Result<Vec<f64>, _> = x.try_into();
                match data {
                    Ok(data) => tensor_to_string(&data, &self.shape(), precision, f.width()),
                    Err(e) => format!("f64 tensor failed to realize {e:?}"),
                }
            }
            DType::U8 => {
                let data: Result<Vec<u8>, _> = x.try_into();
                match data {
                    Ok(data) => tensor_to_string(&data, &self.shape(), 0, f.width()),
                    Err(e) => format!("u8 tensor failed to realize {e:?}"),
                }
            }
            DType::U16 => {
                let data: Result<Vec<u16>, _> = x.try_into();
                match data {
                    Ok(data) => tensor_to_string(&data, &self.shape(), 0, f.width()),
                    Err(e) => format!("u16 tensor failed to realize {e:?}"),
                }
            }
            DType::U32 => {
                let data: Result<Vec<u32>, _> = x.try_into();
                match data {
                    Ok(data) => tensor_to_string(&data, &self.shape(), 0, f.width()),
                    Err(e) => format!("u32 tensor failed to realize {e:?}"),
                }
            }
            DType::U64 => {
                let data: Result<Vec<u64>, _> = x.try_into();
                match data {
                    Ok(data) => tensor_to_string(&data, &self.shape(), 0, f.width()),
                    Err(e) => format!("u64 tensor failed to realize {e:?}"),
                }
            }
            DType::I8 => {
                let data: Result<Vec<i8>, _> = x.try_into();
                match data {
                    Ok(data) => tensor_to_string(&data, &self.shape(), 0, f.width()),
                    Err(e) => format!("i32 tensor failed to realize {e:?}"),
                }
            }
            DType::I16 => {
                let data: Result<Vec<i16>, _> = x.try_into();
                match data {
                    Ok(data) => tensor_to_string(&data, &self.shape(), 0, f.width()),
                    Err(e) => format!("i32 tensor failed to realize {e:?}"),
                }
            }
            DType::I32 => {
                let data: Result<Vec<i32>, _> = x.try_into();
                match data {
                    Ok(data) => tensor_to_string(&data, &self.shape(), 0, f.width()),
                    Err(e) => format!("i32 tensor failed to realize {e:?}"),
                }
            }
            DType::I64 => {
                let data: Result<Vec<i64>, _> = x.try_into();
                match data {
                    Ok(data) => tensor_to_string(&data, &self.shape(), 0, f.width()),
                    Err(e) => format!("i32 tensor failed to realize {e:?}"),
                }
            }
            DType::Bool => {
                let data: Result<Vec<bool>, _> = x.try_into();
                match data {
                    Ok(data) => tensor_to_string(&data, &self.shape(), 5, f.width()),
                    Err(e) => format!("i32 tensor failed to realize {e:?}"),
                }
            }
        };
        f.write_fmt(format_args!("{res}\ntensor {} {:?}", self.dtype(), self.shape()))
    }
}

/// Renders row-major `data` of the given `shape` as bracketed, newline-separated text.
///
/// Every element is right-aligned to `width` (or, when `None`, to the widest formatted
/// element) using `precision`. Elements in a row are separated by two spaces, rows end with
/// a newline, and brackets are opened/closed where an element starts/ends an outer dimension.
/// Empty `data` yields `"[]"`.
fn tensor_to_string<T: core::fmt::Display>(data: &[T], shape: &[Dim], precision: usize, width: Option<usize>) -> String {
    use core::fmt::Write;
    let numel: Dim = shape.iter().product();
    let rank = shape.len();
    let mut out = String::new();
    if data.is_empty() {
        return "[]".into();
    }
    // Width of a single cell: explicit, or the widest formatted value.
    let w = match width {
        Some(explicit) => explicit,
        None => data.iter().map(|x| format!("{x:>.precision$}").len()).max().unwrap_or(0),
    };
    let row_len = shape[rank - 1];

    // Finds the outermost level (counted from `rank` down to 1) whose block size divides
    // `pos`; block sizes shrink from `numel` by the leading dimensions one at a time.
    let boundary_level = |pos: Dim| -> Option<usize> {
        let mut divisor: Dim = 1;
        for level in (1..=rank).rev() {
            if pos.is_multiple_of(numel / divisor) {
                return Some(level);
            }
            divisor *= shape[rank - level];
        }
        None
    };

    for (idx, value) in data.iter().enumerate() {
        let pos = idx as Dim;
        if let Some(level) = boundary_level(pos) {
            // Indent by the depth already open, then open the remaining brackets.
            out.push_str(&" ".repeat(rank - level));
            out.push_str(&"[".repeat(level - 1));
        }
        let _ = write!(out, "{value:>w$.precision$}");
        let row_end = (pos + 1).is_multiple_of(row_len);
        if !row_end {
            out.push_str("  ");
        }
        if let Some(level) = boundary_level(pos + 1) {
            out.push_str(&"]".repeat(level - 1));
        }
        if row_end && pos != numel - 1 {
            out.push('\n');
        }
    }
    out
}

impl From<&Tensor> for Tensor {
    fn from(value: &Tensor) -> Self {
        value.clone()
    }
}

impl<T: Scalar> From<T> for Tensor {
    /// Creates a tensor of shape `[1]` from a single scalar.
    ///
    /// Panics if the runtime's `new_tensor` returns an error.
    fn from(value: T) -> Self {
        let id = RT.lock().new_tensor(vec![1], value).unwrap();
        Tensor { id }
    }
}

impl<T: Scalar> TempData for T {
    /// Size of the scalar in bytes, derived from `T::bit_size()`.
    fn bytes(&self) -> Dim {
        let bits = T::bit_size();
        Dim::from(bits / 8)
    }

    /// Dtype of the scalar type.
    fn dtype(&self) -> DType {
        <T as Scalar>::dtype()
    }

    /// Native-endian bytes of the scalar, copied into a boxed slice.
    fn read(&self) -> Box<[u8]> {
        let raw = self.to_ne_bytes();
        raw.iter().copied().collect()
    }
}

impl<T: Scalar> From<Vec<T>> for Tensor {
    /// Creates a 1D tensor of shape `[data.len()]` from a vector.
    ///
    /// Panics if the runtime's `new_tensor` returns an error.
    fn from(data: Vec<T>) -> Self {
        let shape = vec![data.len() as Dim];
        let id = RT.lock().new_tensor(shape, data).unwrap();
        Tensor { id }
    }
}

impl<T: Scalar> TempData for Vec<T> {
    /// Total size of all elements in bytes.
    fn bytes(&self) -> Dim {
        let scalar_bytes = (T::bit_size() / 8) as usize;
        (self.len() * scalar_bytes) as Dim
    }

    /// Dtype of the element type.
    fn dtype(&self) -> DType {
        <T as Scalar>::dtype()
    }

    /// Native-endian bytes of all elements, in order.
    fn read(&self) -> Box<[u8]> {
        let mut raw: Vec<u8> = Vec::new();
        for value in self.iter() {
            raw.extend(value.to_ne_bytes());
        }
        raw.into_boxed_slice()
    }
}

impl<T: Scalar> From<Vec<Vec<T>>> for Tensor {
    /// Creates a 2D tensor of shape `[data.len(), data[0].len()]` from nested vectors.
    ///
    /// Panics if `data` is empty or if the runtime's `new_tensor` returns an error.
    fn from(data: Vec<Vec<T>>) -> Self {
        let rows = data.len() as Dim;
        let cols = data[0].len() as Dim;
        let id = RT.lock().new_tensor(vec![rows, cols], data).unwrap();
        Tensor { id }
    }
}

impl<T: Scalar> TempData for Vec<Vec<T>> {
    /// Total size of all elements in bytes.
    ///
    /// Counts the elements actually stored, so the result matches the length of `read()`
    /// even for empty or ragged input instead of indexing the first row.
    fn bytes(&self) -> Dim {
        let scalar_bytes = (T::bit_size() / 8) as usize;
        let count: usize = self.iter().map(Vec::len).sum();
        (count * scalar_bytes) as Dim
    }

    /// Dtype of the element type.
    fn dtype(&self) -> DType {
        T::dtype()
    }

    /// Native-endian bytes of all elements in row major order.
    fn read(&self) -> Box<[u8]> {
        self.iter().flatten().flat_map(Scalar::to_ne_bytes).copied().collect()
    }
}

impl<T: Scalar> From<Vec<Vec<Vec<T>>>> for Tensor {
    /// Creates a 3D tensor whose shape is taken from the lengths of `data`, `data[0]`
    /// and `data[0][0]`.
    ///
    /// Panics if `data` or `data[0]` is empty, or if the runtime's `new_tensor` returns an error.
    fn from(data: Vec<Vec<Vec<T>>>) -> Self {
        let d0 = data.len() as Dim;
        let d1 = data[0].len() as Dim;
        let d2 = data[0][0].len() as Dim;
        let id = RT.lock().new_tensor(vec![d0, d1, d2], data).unwrap();
        Tensor { id }
    }
}

impl<T: Scalar> TempData for Vec<Vec<Vec<T>>> {
    /// Total size of all elements in bytes.
    ///
    /// Counts the elements actually stored, so the result matches the length of `read()`
    /// even for empty or ragged input instead of indexing the first entries.
    fn bytes(&self) -> Dim {
        let scalar_bytes = (T::bit_size() / 8) as usize;
        let count: usize = self.iter().flatten().map(Vec::len).sum();
        (count * scalar_bytes) as Dim
    }

    /// Dtype of the element type.
    fn dtype(&self) -> DType {
        T::dtype()
    }

    /// Native-endian bytes of all elements in row major order.
    fn read(&self) -> Box<[u8]> {
        self.iter()
            .flatten()
            .flatten()
            .flat_map(Scalar::to_ne_bytes)
            .copied()
            .collect()
    }
}

impl<T: Scalar> From<&'static [T]> for Tensor {
    /// Creates a 1D tensor of shape `[data.len()]` from a static slice.
    ///
    /// Panics if the runtime's `new_tensor` returns an error.
    fn from(data: &'static [T]) -> Self {
        let shape = vec![data.len() as Dim];
        let id = RT.lock().new_tensor(shape, data).unwrap();
        Tensor { id }
    }
}

impl<T: Scalar> TempData for &'static [T] {
    /// Total size of all elements in bytes.
    fn bytes(&self) -> Dim {
        let scalar_bytes = (T::bit_size() / 8) as usize;
        (self.len() * scalar_bytes) as Dim
    }

    /// Dtype of the element type.
    fn dtype(&self) -> DType {
        <T as Scalar>::dtype()
    }

    /// Native-endian bytes of all elements, in order.
    fn read(&self) -> Box<[u8]> {
        let mut raw: Vec<u8> = Vec::new();
        for value in self.iter() {
            raw.extend(value.to_ne_bytes());
        }
        raw.into_boxed_slice()
    }
}

impl<T: Scalar, const D0: usize> From<[T; D0]> for Tensor {
    /// Creates a 1D tensor of shape `[D0]` from an array.
    ///
    /// Panics if the runtime's `new_tensor` returns an error.
    fn from(data: [T; D0]) -> Self {
        let shape = vec![D0 as Dim];
        let id = RT.lock().new_tensor(shape, data).unwrap();
        Tensor { id }
    }
}

impl<T: Scalar, const D0: usize> TempData for [T; D0] {
    /// Total size of all elements in bytes.
    fn bytes(&self) -> Dim {
        let scalar_bytes = (T::bit_size() / 8) as usize;
        (D0 * scalar_bytes) as Dim
    }

    /// Dtype of the element type.
    fn dtype(&self) -> DType {
        <T as Scalar>::dtype()
    }

    /// Native-endian bytes of all elements, in order.
    fn read(&self) -> Box<[u8]> {
        let mut raw: Vec<u8> = Vec::new();
        for value in self.iter() {
            raw.extend(value.to_ne_bytes());
        }
        raw.into_boxed_slice()
    }
}

impl<T: Scalar, const D0: usize, const D1: usize> From<[[T; D1]; D0]> for Tensor {
    /// Creates a 2D tensor of shape `[D0, D1]` from a nested array.
    ///
    /// The array is handed to the runtime by value through its `TempData` implementation.
    /// This avoids building a raw slice over the stack array with an unbounded lifetime and
    /// avoids indexing `data[0]`, which panics for `D0 == 0`.
    ///
    /// Panics if the runtime's `new_tensor` returns an error.
    fn from(data: [[T; D1]; D0]) -> Self {
        let shape = vec![D0 as Dim, D1 as Dim];
        let id = RT.lock().new_tensor(shape, data).unwrap();
        Tensor { id }
    }
}

impl<T: Scalar, const D0: usize, const D1: usize> TempData for [[T; D1]; D0] {
    /// Total size of all elements in bytes.
    fn bytes(&self) -> Dim {
        let scalar_bytes = (T::bit_size() / 8) as usize;
        (D0 * D1 * scalar_bytes) as Dim
    }

    /// Dtype of the element type.
    fn dtype(&self) -> DType {
        <T as Scalar>::dtype()
    }

    /// Native-endian bytes of all elements in row major order.
    fn read(&self) -> Box<[u8]> {
        let flat: &[T] = self.as_flattened();
        flat.iter().flat_map(Scalar::to_ne_bytes).copied().collect()
    }
}

impl<T: Scalar, const D0: usize, const D1: usize, const D2: usize> From<[[[T; D2]; D1]; D0]> for Tensor {
    /// Creates a 3D tensor of shape `[D0, D1, D2]` from a nested array.
    ///
    /// The array is handed to the runtime by value through its `TempData` implementation.
    /// This avoids building a raw slice over the stack array with an unbounded lifetime and
    /// avoids indexing `data[0][0]`, which panics for zero-length leading dimensions.
    ///
    /// Panics if the runtime's `new_tensor` returns an error.
    fn from(data: [[[T; D2]; D1]; D0]) -> Self {
        let shape = vec![D0 as Dim, D1 as Dim, D2 as Dim];
        let id = RT.lock().new_tensor(shape, data).unwrap();
        Tensor { id }
    }
}

impl<T: Scalar, const D0: usize, const D1: usize, const D2: usize> TempData for [[[T; D2]; D1]; D0] {
    /// Total size of all elements in bytes.
    fn bytes(&self) -> Dim {
        let scalar_bytes = (T::bit_size() / 8) as usize;
        (D0 * D1 * D2 * scalar_bytes) as Dim
    }

    /// Dtype of the element type.
    fn dtype(&self) -> DType {
        <T as Scalar>::dtype()
    }

    /// Native-endian bytes of all elements in row major order.
    fn read(&self) -> Box<[u8]> {
        let flat: &[T] = self.as_flattened().as_flattened();
        flat.iter().flat_map(Scalar::to_ne_bytes).copied().collect()
    }
}

impl<T: Scalar, const D0: usize, const D1: usize, const D2: usize, const D3: usize> From<[[[[T; D3]; D2]; D1]; D0]> for Tensor {
    /// Creates a 4D tensor of shape `[D0, D1, D2, D3]` from a nested array.
    ///
    /// The array is handed to the runtime by value through its `TempData` implementation.
    /// This avoids building a raw slice over the stack array with an unbounded lifetime and
    /// avoids indexing `data[0][0][0]`, which panics for zero-length leading dimensions.
    ///
    /// Panics if the runtime's `new_tensor` returns an error.
    fn from(data: [[[[T; D3]; D2]; D1]; D0]) -> Self {
        let shape = vec![D0 as Dim, D1 as Dim, D2 as Dim, D3 as Dim];
        let id = RT.lock().new_tensor(shape, data).unwrap();
        Tensor { id }
    }
}

impl<T: Scalar, const D0: usize, const D1: usize, const D2: usize, const D3: usize> TempData for [[[[T; D3]; D2]; D1]; D0] {
    /// Total size of all elements in bytes.
    fn bytes(&self) -> Dim {
        let scalar_bytes = (T::bit_size() / 8) as usize;
        (D0 * D1 * D2 * D3 * scalar_bytes) as Dim
    }

    /// Dtype of the element type.
    fn dtype(&self) -> DType {
        <T as Scalar>::dtype()
    }

    /// Native-endian bytes of all elements in row major order.
    fn read(&self) -> Box<[u8]> {
        let flat: &[T] = self.as_flattened().as_flattened().as_flattened();
        flat.iter().flat_map(Scalar::to_ne_bytes).copied().collect()
    }
}

impl PartialEq<f32> for Tensor {
    /// True when the tensor converts to an `f32` via `TryInto` and that value
    /// satisfies `Scalar::is_equal` with `other`; a failed conversion yields `false`.
    fn eq(&self, other: &f32) -> bool {
        let converted: Result<f32, _> = self.clone().try_into();
        match converted {
            Ok(value) => Scalar::is_equal(value, *other),
            Err(_) => false,
        }
    }
}

impl PartialEq<f64> for Tensor {
    /// True when the tensor converts to an `f64` via `TryInto` and that value
    /// satisfies `Scalar::is_equal` with `other`; a failed conversion yields `false`.
    fn eq(&self, other: &f64) -> bool {
        let converted: Result<f64, _> = self.clone().try_into();
        match converted {
            Ok(value) => Scalar::is_equal(value, *other),
            Err(_) => false,
        }
    }
}

impl PartialEq<i32> for Tensor {
    /// True when the tensor converts to an `i32` via `TryInto` and that value
    /// satisfies `Scalar::is_equal` with `other`; a failed conversion yields `false`.
    fn eq(&self, other: &i32) -> bool {
        let converted: Result<i32, _> = self.clone().try_into();
        match converted {
            Ok(value) => Scalar::is_equal(value, *other),
            Err(_) => false,
        }
    }
}

impl<T: Scalar> PartialEq<Vec<T>> for Tensor {
    /// True when the tensor's shape is `[other.len()]`, it converts to `Vec<T>`,
    /// and every element pair satisfies `Scalar::is_equal`.
    fn eq(&self, other: &Vec<T>) -> bool {
        if self.shape() != [other.len() as Dim] {
            return false;
        }
        // A failed conversion counts as "not equal".
        let Ok(values): Result<Vec<T>, _> = self.clone().try_into() else {
            return false;
        };
        values.into_iter().zip(other).all(|(lhs, rhs)| Scalar::is_equal(lhs, *rhs))
    }
}

impl<T: Scalar> PartialEq<Vec<Vec<T>>> for Tensor {
    /// True when `other` is a rectangular `rows x cols` nested vector, the tensor's
    /// shape is `[rows, cols]`, it converts to `Vec<T>`, and every element pair
    /// satisfies `Scalar::is_equal`.
    ///
    /// Returns `false` (instead of panicking) for an empty outer vector, and for
    /// ragged input whose rows differ in length.
    fn eq(&self, other: &Vec<Vec<T>>) -> bool {
        // An empty outer vector carries no column count, so it cannot describe a 2-D shape.
        let Some(first_row) = other.first() else {
            return false;
        };
        let cols = first_row.len();
        // Every row must match the first one; otherwise the flattened comparison below
        // could silently stop early (zip ends at the shorter side) and report equality.
        if other.iter().any(|row| row.len() != cols) {
            return false;
        }
        if self.shape() != [other.len() as Dim, cols as Dim] {
            return false;
        }
        // A failed conversion counts as "not equal".
        let Ok(data): Result<Vec<T>, _> = self.clone().try_into() else {
            return false;
        };
        data.into_iter().zip(other.iter().flatten()).all(|(x, y)| Scalar::is_equal(x, *y))
    }
}

impl<T: Scalar> PartialEq<Vec<Vec<Vec<T>>>> for Tensor {
    /// True when `other` is a regular `d0 x d1 x d2` nested vector, the tensor's
    /// shape is `[d0, d1, d2]`, it converts to `Vec<T>`, and every element pair
    /// satisfies `Scalar::is_equal`.
    ///
    /// Returns `false` (instead of panicking) when the outer vector or its first
    /// entry is empty, and for ragged input whose nested lengths differ.
    fn eq(&self, other: &Vec<Vec<Vec<T>>>) -> bool {
        // Without a first plane and a first row the inner dimensions are unknown.
        let Some(first_plane) = other.first() else {
            return false;
        };
        let Some(first_row) = first_plane.first() else {
            return false;
        };
        let (d1, d2) = (first_plane.len(), first_row.len());
        // All planes and rows must share the same lengths; otherwise the flattened
        // comparison below could stop early (zip ends at the shorter side) and
        // report equality for mismatched data.
        let regular = other
            .iter()
            .all(|plane| plane.len() == d1 && plane.iter().all(|row| row.len() == d2));
        if !regular {
            return false;
        }
        if self.shape() != [other.len() as Dim, d1 as Dim, d2 as Dim] {
            return false;
        }
        // A failed conversion counts as "not equal".
        let Ok(data): Result<Vec<T>, _> = self.clone().try_into() else {
            return false;
        };
        data.into_iter()
            .zip(other.iter().flatten().flatten())
            .all(|(x, y)| Scalar::is_equal(x, *y))
    }
}

impl<T: Scalar, const D0: usize> PartialEq<[T; D0]> for Tensor {
    /// True when the tensor's shape is `[D0]`, it converts to `[T; D0]`, and every
    /// element pair satisfies `Scalar::is_equal`.
    fn eq(&self, other: &[T; D0]) -> bool {
        if self.shape() != [D0 as Dim] {
            return false;
        }
        // A failed conversion counts as "not equal".
        let Ok(values): Result<[T; D0], _> = self.clone().try_into() else {
            return false;
        };
        values.into_iter().zip(other).all(|(lhs, rhs)| Scalar::is_equal(lhs, *rhs))
    }
}

impl<T: Scalar, const D0: usize, const D1: usize> PartialEq<[[T; D1]; D0]> for Tensor {
    /// True when the tensor's shape is `[D0, D1]`, it converts to the same nested
    /// array type, and every element pair satisfies `Scalar::is_equal`.
    fn eq(&self, other: &[[T; D1]; D0]) -> bool {
        if self.shape() != [D0 as Dim, D1 as Dim] {
            return false;
        }
        // A failed conversion counts as "not equal".
        let Ok(values): Result<[[T; D1]; D0], _> = self.clone().try_into() else {
            return false;
        };
        let lhs = values.into_iter().flatten();
        let rhs = other.iter().flatten();
        lhs.zip(rhs).all(|(x, y)| Scalar::is_equal(x, *y))
    }
}

impl<T: Scalar, const D0: usize, const D1: usize, const D2: usize> PartialEq<[[[T; D2]; D1]; D0]> for Tensor {
    /// True when the tensor's shape is `[D0, D1, D2]`, it converts to the same nested
    /// array type, and every element pair satisfies `Scalar::is_equal`.
    fn eq(&self, other: &[[[T; D2]; D1]; D0]) -> bool {
        if self.shape() != [D0 as Dim, D1 as Dim, D2 as Dim] {
            return false;
        }
        // A failed conversion counts as "not equal".
        let Ok(values): Result<[[[T; D2]; D1]; D0], _> = self.clone().try_into() else {
            return false;
        };
        let lhs = values.into_iter().flatten().flatten();
        let rhs = other.iter().flatten().flatten();
        lhs.zip(rhs).all(|(x, y)| Scalar::is_equal(x, *y))
    }
}

impl<T: Scalar, const D0: usize, const D1: usize, const D2: usize, const D3: usize> PartialEq<[[[[T; D3]; D2]; D1]; D0]>
    for Tensor
{
    /// True when the tensor's shape is `[D0, D1, D2, D3]`, it converts to the same
    /// nested array type, and every element pair satisfies `Scalar::is_equal`.
    fn eq(&self, other: &[[[[T; D3]; D2]; D1]; D0]) -> bool {
        if self.shape() != [D0 as Dim, D1 as Dim, D2 as Dim, D3 as Dim] {
            return false;
        }
        // A failed conversion counts as "not equal".
        let Ok(values): Result<[[[[T; D3]; D2]; D1]; D0], _> = self.clone().try_into() else {
            return false;
        };
        let lhs = values.into_iter().flatten().flatten().flatten();
        let rhs = other.iter().flatten().flatten().flatten();
        lhs.zip(rhs).all(|(x, y)| Scalar::is_equal(x, *y))
    }
}

impl<T: Scalar, const D0: usize, const D1: usize, const D2: usize, const D3: usize, const D4: usize>
    PartialEq<[[[[[T; D4]; D3]; D2]; D1]; D0]> for Tensor
{
    /// True when the tensor's shape is `[D0, D1, D2, D3, D4]`, it converts to the same
    /// nested array type, and every element pair satisfies `Scalar::is_equal`.
    fn eq(&self, other: &[[[[[T; D4]; D3]; D2]; D1]; D0]) -> bool {
        if self.shape() != [D0 as Dim, D1 as Dim, D2 as Dim, D3 as Dim, D4 as Dim] {
            return false;
        }
        // A failed conversion counts as "not equal".
        let Ok(values): Result<[[[[[T; D4]; D3]; D2]; D1]; D0], _> = self.clone().try_into() else {
            return false;
        };
        let lhs = values.into_iter().flatten().flatten().flatten().flatten();
        let rhs = other.iter().flatten().flatten().flatten().flatten();
        lhs.zip(rhs).all(|(x, y)| Scalar::is_equal(x, *y))
    }
}

impl Neg for Tensor {
    type Output = Tensor;

    /// Unary minus: asks the runtime for a `UOp::Neg` unary op on this tensor's id
    /// and wraps the returned id in a new `Tensor`.
    fn neg(self) -> Self::Output {
        // The runtime lock is only held for the duration of this statement.
        let id = RT.lock().unary(self.id, UOp::Neg);
        Tensor { id }
    }
}

impl Neg for &Tensor {
    type Output = Tensor;

    /// Unary minus on a borrowed tensor: asks the runtime for a `UOp::Neg` unary op
    /// on this tensor's id and wraps the returned id in a new `Tensor`.
    fn neg(self) -> Self::Output {
        // The runtime lock is only held for the duration of this statement.
        let id = RT.lock().unary(self.id, UOp::Neg);
        Tensor { id }
    }
}

impl Not for Tensor {
    type Output = Tensor;

    /// `!tensor` is implemented as `tensor.equal(0)`.
    ///
    /// # Panics
    ///
    /// Panics if `equal` returns an error.
    fn not(self) -> Self::Output {
        let equals_zero = self.equal(0);
        equals_zero.unwrap()
    }
}

impl Not for &Tensor {
    type Output = Tensor;

    /// `!&tensor` is implemented as `tensor.equal(0)`.
    ///
    /// # Panics
    ///
    /// Panics if `equal` returns an error.
    fn not(self) -> Self::Output {
        let equals_zero = self.equal(0);
        equals_zero.unwrap()
    }
}