burn-flex 0.0.1

//! Bool tensor operations for the Flex backend.

use alloc::vec;
use alloc::vec::Vec;
use burn_backend::{
    DType, ExecutionError, TensorData,
    ops::BoolTensorOps,
    tensor::{BoolTensor, Device, FloatTensor, IntTensor},
};
use burn_std::{Bytes, Shape, Slice};

use crate::{Flex, FlexTensor, Layout};

impl BoolTensorOps<Flex> for Flex {
    /// Builds a bool tensor from `data` by delegating to `FlexTensor::from_data`.
    ///
    /// The `_device` argument is ignored.
    fn bool_from_data(data: TensorData, _device: &Device<Flex>) -> BoolTensor<Flex> {
        FlexTensor::from_data(data)
    }

    /// Consumes the tensor and returns its contents as `TensorData`.
    ///
    /// Always resolves to `Ok`; the conversion itself is `tensor.into_data()`
    /// and nothing in this body awaits.
    async fn bool_into_data(tensor: BoolTensor<Flex>) -> Result<TensorData, ExecutionError> {
        Ok(tensor.into_data())
    }

    /// Returns the default device; the tensor argument is not inspected.
    fn bool_device(_tensor: &BoolTensor<Flex>) -> Device<Flex> {
        Default::default()
    }

    /// Returns `tensor` unchanged; the requested `_device` is ignored.
    fn bool_to_device(tensor: BoolTensor<Flex>, _device: &Device<Flex>) -> BoolTensor<Flex> {
        tensor
    }

    /// Concatenates `tensors` along `dim` by delegating to `crate::ops::cat::cat`.
    fn bool_cat(tensors: Vec<BoolTensor<Flex>>, dim: usize) -> BoolTensor<Flex> {
        crate::ops::cat::cat(tensors, dim)
    }

    /// Reshapes the tensor to `shape` via `FlexTensor::reshape`.
    fn bool_reshape(tensor: BoolTensor<Flex>, shape: Shape) -> BoolTensor<Flex> {
        tensor.reshape(shape)
    }

    /// Extracts the region described by `slices` by delegating to
    /// `crate::ops::slice::slice`.
    fn bool_slice(tensor: BoolTensor<Flex>, slices: &[Slice]) -> BoolTensor<Flex> {
        crate::ops::slice::slice(tensor, slices)
    }

    /// Allocates a bool tensor of `shape` through `FlexTensor::empty`.
    ///
    /// `_device` and `_dtype` are ignored; the dtype is always
    /// `DType::Bool(BoolStore::Native)`. The element values are whatever
    /// `FlexTensor::empty` produces, so callers should not assume a particular
    /// fill unless that function guarantees one.
    fn bool_empty(
        shape: Shape,
        _device: &Device<Flex>,
        _dtype: burn_std::BoolDType,
    ) -> BoolTensor<Flex> {
        FlexTensor::empty(shape, DType::Bool(burn_std::BoolStore::Native))
    }

    /// Writes `value` into the region of `tensor` described by `slices` by
    /// delegating to `crate::ops::slice::slice_assign`.
    fn bool_slice_assign(
        tensor: BoolTensor<Flex>,
        slices: &[Slice],
        value: BoolTensor<Flex>,
    ) -> BoolTensor<Flex> {
        crate::ops::slice::slice_assign(tensor, slices, value)
    }

    fn bool_into_int(tensor: BoolTensor<Flex>, _out_dtype: burn_std::IntDType) -> IntTensor<Flex> {
        let tensor = tensor.to_contiguous();
        let shape = tensor.layout().shape().clone();
        // Bool is stored as u8 internally (0 = false, non-zero = true)
        // Use bytes() directly since storage() checks dtype
        let int_data: Vec<i64> = tensor
            .bytes()
            .iter()
            .map(|&x| if x != 0 { 1i64 } else { 0i64 })
            .collect();

        FlexTensor::new(
            Bytes::from_elems(int_data),
            Layout::contiguous(shape),
            DType::I64,
        )
    }

    fn bool_into_float(
        tensor: BoolTensor<Flex>,
        _out_dtype: burn_std::FloatDType,
    ) -> FloatTensor<Flex> {
        let tensor = tensor.to_contiguous();
        let shape = tensor.layout().shape().clone();
        // Bool is stored as u8 internally (0 = false, non-zero = true)
        // Use bytes() directly since storage() checks dtype
        let float_data: Vec<f32> = tensor
            .bytes()
            .iter()
            .map(|&x| if x != 0 { 1.0f32 } else { 0.0f32 })
            .collect();

        FlexTensor::new(
            Bytes::from_elems(float_data),
            Layout::contiguous(shape),
            DType::F32,
        )
    }

    /// Swaps dimensions `dim1` and `dim2` via `FlexTensor::transpose`.
    fn bool_swap_dims(tensor: BoolTensor<Flex>, dim1: usize, dim2: usize) -> BoolTensor<Flex> {
        tensor.transpose(dim1, dim2)
    }

    /// Reorders the tensor's dimensions according to `axes` via
    /// `FlexTensor::permute`.
    fn bool_permute(tensor: BoolTensor<Flex>, axes: &[usize]) -> BoolTensor<Flex> {
        tensor.permute(axes)
    }

    /// Reverses the tensor along each of `axes` by delegating to
    /// `crate::ops::flip::flip`.
    fn bool_flip(tensor: BoolTensor<Flex>, axes: &[usize]) -> BoolTensor<Flex> {
        crate::ops::flip::flip(tensor, axes)
    }

    fn bool_equal(lhs: BoolTensor<Flex>, rhs: BoolTensor<Flex>) -> BoolTensor<Flex> {
        use crate::Layout;
        use crate::strided_index::StridedIter;
        use burn_backend::DType;
        use burn_std::Bytes;

        debug_assert_eq!(
            lhs.layout().shape(),
            rhs.layout().shape(),
            "bool_equal: shape mismatch"
        );

        let shape = lhs.layout().shape().clone();
        let lhs_storage: &[u8] = lhs.bytes();
        let rhs_storage: &[u8] = rhs.bytes();

        let result: Vec<u8> = match (
            lhs.layout().contiguous_offsets(),
            rhs.layout().contiguous_offsets(),
        ) {
            (Some((l_start, l_end)), Some((r_start, r_end))) => {
                let l_slice = &lhs_storage[l_start..l_end];
                let r_slice = &rhs_storage[r_start..r_end];
                l_slice
                    .iter()
                    .zip(r_slice)
                    .map(|(&a, &b)| (a == b) as u8)
                    .collect()
            }
            _ => {
                let lhs_iter = StridedIter::new(lhs.layout());
                let rhs_iter = StridedIter::new(rhs.layout());
                lhs_iter
                    .zip(rhs_iter)
                    .map(|(li, ri)| (lhs_storage[li] == rhs_storage[ri]) as u8)
                    .collect()
            }
        };

        let bytes = Bytes::from_elems(result);
        FlexTensor::new(
            bytes,
            Layout::contiguous(shape),
            DType::Bool(burn_std::BoolStore::Native),
        )
    }

    fn bool_not(mut tensor: BoolTensor<Flex>) -> BoolTensor<Flex> {
        use crate::Layout;
        use crate::strided_index::StridedIter;
        use burn_backend::DType;
        use burn_std::Bytes;

        // Fast path: in-place for unique, contiguous tensors at offset 0
        if tensor.is_unique()
            && tensor.layout().is_contiguous()
            && tensor.layout().start_offset() == 0
        {
            let storage = tensor.storage_mut::<u8>();
            crate::simd::bool_not_inplace_u8(storage);
            return tensor;
        }

        // Allocating path for shared or non-contiguous tensors
        let shape = tensor.layout().shape().clone();
        let storage: &[u8] = tensor.bytes();

        let result: Vec<u8> = match tensor.layout().contiguous_offsets() {
            Some((start, end)) => {
                let slice = &storage[start..end];
                let mut out = vec![0u8; slice.len()];
                crate::simd::bool_not_u8(slice, &mut out);
                out
            }
            None => StridedIter::new(tensor.layout())
                .map(|idx| (storage[idx] == 0) as u8)
                .collect(),
        };

        let bytes = Bytes::from_elems(result);
        FlexTensor::new(
            bytes,
            Layout::contiguous(shape),
            DType::Bool(burn_std::BoolStore::Native),
        )
    }

    /// Element-wise AND; forwards to `bool_binary_op_simd` with
    /// `BoolBinaryOp::And`.
    fn bool_and(lhs: BoolTensor<Flex>, rhs: BoolTensor<Flex>) -> BoolTensor<Flex> {
        bool_binary_op_simd(lhs, rhs, BoolBinaryOp::And)
    }

    /// Element-wise OR; forwards to `bool_binary_op_simd` with
    /// `BoolBinaryOp::Or`.
    fn bool_or(lhs: BoolTensor<Flex>, rhs: BoolTensor<Flex>) -> BoolTensor<Flex> {
        bool_binary_op_simd(lhs, rhs, BoolBinaryOp::Or)
    }

    /// Element-wise XOR; forwards to `bool_binary_op_simd` with
    /// `BoolBinaryOp::Xor`.
    fn bool_xor(lhs: BoolTensor<Flex>, rhs: BoolTensor<Flex>) -> BoolTensor<Flex> {
        bool_binary_op_simd(lhs, rhs, BoolBinaryOp::Xor)
    }

    /// Expands the tensor to `shape` by delegating to
    /// `crate::ops::expand::expand`.
    fn bool_expand(tensor: BoolTensor<Flex>, shape: Shape) -> BoolTensor<Flex> {
        crate::ops::expand::expand(tensor, shape)
    }

    // Creation, masking, indexing, comparison and reduction ops.
    fn bool_zeros(
        shape: Shape,
        device: &Device<Flex>,
        dtype: burn_std::BoolDType,
    ) -> BoolTensor<Flex> {
        Self::bool_empty(shape, device, dtype)
    }

    /// Creates a bool tensor of `shape` in which every element is `true`.
    ///
    /// The buffer holds one `1u8` per element. `_device` and `_dtype` are
    /// ignored; the dtype is always `DType::Bool(BoolStore::Native)`.
    fn bool_ones(
        shape: Shape,
        _device: &Device<Flex>,
        _dtype: burn_std::BoolDType,
    ) -> BoolTensor<Flex> {
        // Size the buffer before `shape` is moved into the layout.
        let ones = vec![1u8; shape.num_elements()];
        let layout = Layout::contiguous(shape);
        FlexTensor::new(
            Bytes::from_elems(ones),
            layout,
            DType::Bool(burn_std::BoolStore::Native),
        )
    }

    /// Combines `tensor` and `value` under `mask` by delegating to
    /// `crate::ops::mask::mask_where_bool`.
    fn bool_mask_where(
        tensor: BoolTensor<Flex>,
        mask: BoolTensor<Flex>,
        value: BoolTensor<Flex>,
    ) -> BoolTensor<Flex> {
        crate::ops::mask::mask_where_bool(tensor, mask, value)
    }

    /// Fills `tensor` under `mask` with a scalar by delegating to
    /// `crate::ops::mask::mask_fill_bool`.
    ///
    /// The scalar is first converted to `bool` with `Scalar::elem`.
    fn bool_mask_fill(
        tensor: BoolTensor<Flex>,
        mask: BoolTensor<Flex>,
        value: burn_backend::Scalar,
    ) -> BoolTensor<Flex> {
        let value: bool = value.elem();
        crate::ops::mask::mask_fill_bool(tensor, mask, value)
    }

    /// Gathers elements along `dim` using `indices` by delegating to
    /// `crate::ops::gather_scatter::gather_bool`.
    ///
    /// Note the argument order differs from the callee: the trait passes
    /// `dim` first, the helper takes the tensor first.
    fn bool_gather(
        dim: usize,
        tensor: BoolTensor<Flex>,
        indices: IntTensor<Flex>,
    ) -> BoolTensor<Flex> {
        crate::ops::gather_scatter::gather_bool(tensor, dim, indices)
    }

    /// Scatters `value` into `tensor` along `dim` at `indices` by delegating
    /// to `crate::ops::gather_scatter::scatter_or`.
    ///
    /// Note the argument order differs from the callee: the trait passes
    /// `dim` first, the helper takes the tensor first.
    fn bool_scatter_or(
        dim: usize,
        tensor: BoolTensor<Flex>,
        indices: IntTensor<Flex>,
        value: BoolTensor<Flex>,
    ) -> BoolTensor<Flex> {
        crate::ops::gather_scatter::scatter_or(tensor, dim, indices, value)
    }

    fn bool_equal_elem(lhs: BoolTensor<Flex>, rhs: burn_backend::Scalar) -> BoolTensor<Flex> {
        use crate::Layout;
        use crate::strided_index::StridedIter;
        use burn_std::Bytes;

        let shape = lhs.layout().shape().clone();
        let storage: &[u8] = lhs.bytes();
        let rhs_bool: bool = rhs.elem();
        let rhs_val = rhs_bool as u8;

        let result: Vec<u8> = match lhs.layout().contiguous_offsets() {
            Some((start, end)) => storage[start..end]
                .iter()
                .map(|&v| (v == rhs_val) as u8)
                .collect(),
            None => StridedIter::new(lhs.layout())
                .map(|idx| (storage[idx] == rhs_val) as u8)
                .collect(),
        };

        let bytes = Bytes::from_elems(result);
        FlexTensor::new(
            bytes,
            Layout::contiguous(shape),
            DType::Bool(burn_std::BoolStore::Native),
        )
    }

    /// Unfolds the tensor along `dim` with the given window `size` and `step`
    /// by delegating to `crate::ops::unfold::unfold_bool`.
    fn bool_unfold(
        tensor: BoolTensor<Flex>,
        dim: usize,
        size: usize,
        step: usize,
    ) -> BoolTensor<Flex> {
        crate::ops::unfold::unfold_bool(tensor, dim, size, step)
    }

    /// Element-wise inequality; delegates to
    /// `crate::ops::comparison::bool_not_equal`.
    fn bool_not_equal(lhs: BoolTensor<Flex>, rhs: BoolTensor<Flex>) -> BoolTensor<Flex> {
        crate::ops::comparison::bool_not_equal(lhs, rhs)
    }

    /// Element-wise inequality against a scalar; delegates to
    /// `crate::ops::comparison::bool_not_equal_elem`.
    ///
    /// The scalar is first converted to `bool` with `Scalar::elem`.
    fn bool_not_equal_elem(lhs: BoolTensor<Flex>, rhs: burn_backend::Scalar) -> BoolTensor<Flex> {
        let rhs: bool = rhs.elem();
        crate::ops::comparison::bool_not_equal_elem(lhs, rhs)
    }

    /// Delegates to `crate::ops::comparison::any_bool`.
    ///
    /// Presumably reduces the whole tensor to "is any element true"; see the
    /// helper for the exact output shape.
    fn bool_any(tensor: BoolTensor<Flex>) -> BoolTensor<Flex> {
        crate::ops::comparison::any_bool(tensor)
    }

    /// Delegates to `crate::ops::comparison::any_bool_dim` for dimension `dim`.
    ///
    /// Presumably an "any true" reduction along `dim`; see the helper for the
    /// exact output shape.
    fn bool_any_dim(tensor: BoolTensor<Flex>, dim: usize) -> BoolTensor<Flex> {
        crate::ops::comparison::any_bool_dim(tensor, dim)
    }

    /// Delegates to `crate::ops::comparison::all_bool`.
    ///
    /// Presumably reduces the whole tensor to "are all elements true"; see the
    /// helper for the exact output shape.
    fn bool_all(tensor: BoolTensor<Flex>) -> BoolTensor<Flex> {
        crate::ops::comparison::all_bool(tensor)
    }

    /// Delegates to `crate::ops::comparison::all_bool_dim` for dimension `dim`.
    ///
    /// Presumably an "all true" reduction along `dim`; see the helper for the
    /// exact output shape.
    fn bool_all_dim(tensor: BoolTensor<Flex>, dim: usize) -> BoolTensor<Flex> {
        crate::ops::comparison::all_bool_dim(tensor, dim)
    }

    /// Selects entries along `dim` at `indices` by delegating to
    /// `crate::ops::gather_scatter::select`, instantiated for `u8` elements.
    fn bool_select(
        tensor: BoolTensor<Flex>,
        dim: usize,
        indices: IntTensor<Flex>,
    ) -> BoolTensor<Flex> {
        crate::ops::gather_scatter::select::<u8>(tensor, dim, indices)
    }

    /// OR-accumulates `value` into `tensor` along `dim` at `indices`.
    ///
    /// Implemented as `select_add::<u8>` followed by a clamp: any byte greater
    /// than `1` in the result storage is reset to `1`.
    ///
    /// NOTE(review): this relies on `select_add::<u8>` not wrapping; if it
    /// uses wrapping `u8` addition, 256 or more contributions to one slot
    /// could wrap back to `0` before the clamp runs. Confirm against the
    /// helper.
    fn bool_select_or(
        tensor: BoolTensor<Flex>,
        dim: usize,
        indices: IntTensor<Flex>,
        value: BoolTensor<Flex>,
    ) -> BoolTensor<Flex> {
        let mut accumulated =
            crate::ops::gather_scatter::select_add::<u8>(tensor, dim, indices, value);
        // select_add sums u8 values, whereas a boolean OR saturates at 1.
        accumulated
            .storage_mut::<u8>()
            .iter_mut()
            .filter(|byte| **byte > 1)
            .for_each(|byte| *byte = 1);
        accumulated
    }
}

/// Boolean binary operation type.
///
/// Selects which kernel `bool_binary_op_simd` applies to a pair of operands.
#[derive(Clone, Copy)]
enum BoolBinaryOp {
    /// Element-wise AND (`&` on the storage bytes).
    And,
    /// Element-wise OR (`|` on the storage bytes).
    Or,
    /// Element-wise XOR (`^` on the storage bytes).
    Xor,
}

fn bool_binary_op_simd(mut lhs: FlexTensor, mut rhs: FlexTensor, op: BoolBinaryOp) -> FlexTensor {
    use crate::Layout;
    use crate::strided_index::StridedIter;
    use burn_std::Bytes;

    debug_assert_eq!(
        lhs.layout().shape(),
        rhs.layout().shape(),
        "bool_binary_op: shape mismatch"
    );

    let shape = lhs.layout().shape().clone();
    let l_offsets = lhs.layout().contiguous_offsets();
    let r_offsets = rhs.layout().contiguous_offsets();

    // Fast path 1: lhs is unique and contiguous at offset 0 -> in-place on lhs
    if lhs.is_unique()
        && let (Some((0, l_end)), Some((r_start, r_end))) = (l_offsets, r_offsets)
    {
        let rhs_storage: &[u8] = rhs.bytes();
        let r_slice = &rhs_storage[r_start..r_end];
        let lhs_storage: &mut [u8] = lhs.storage_mut();
        let l_slice = &mut lhs_storage[..l_end];

        match op {
            BoolBinaryOp::And => crate::simd::bool_and_inplace_u8(l_slice, r_slice),
            BoolBinaryOp::Or => crate::simd::bool_or_inplace_u8(l_slice, r_slice),
            BoolBinaryOp::Xor => crate::simd::bool_xor_inplace_u8(l_slice, r_slice),
        }
        return lhs;
    }

    // Fast path 2: rhs is unique and contiguous at offset 0 -> in-place on rhs
    // (And/Or/Xor are commutative, so we can swap operands)
    if rhs.is_unique()
        && let (Some((l_start, l_end)), Some((0, r_end))) = (l_offsets, r_offsets)
    {
        let lhs_storage: &[u8] = lhs.bytes();
        let l_slice = &lhs_storage[l_start..l_end];
        let rhs_storage: &mut [u8] = rhs.storage_mut();
        let r_slice = &mut rhs_storage[..r_end];

        match op {
            BoolBinaryOp::And => crate::simd::bool_and_inplace_u8(r_slice, l_slice),
            BoolBinaryOp::Or => crate::simd::bool_or_inplace_u8(r_slice, l_slice),
            BoolBinaryOp::Xor => crate::simd::bool_xor_inplace_u8(r_slice, l_slice),
        }
        return rhs;
    }

    // Allocating path: neither tensor is suitable for in-place
    let lhs_storage: &[u8] = lhs.bytes();
    let rhs_storage: &[u8] = rhs.bytes();

    let result: Vec<u8> = match (l_offsets, r_offsets) {
        (Some((l_start, l_end)), Some((r_start, r_end))) => {
            let l_slice = &lhs_storage[l_start..l_end];
            let r_slice = &rhs_storage[r_start..r_end];
            let mut out = vec![0u8; l_slice.len()];
            match op {
                BoolBinaryOp::And => crate::simd::bool_and_u8(l_slice, r_slice, &mut out),
                BoolBinaryOp::Or => crate::simd::bool_or_u8(l_slice, r_slice, &mut out),
                BoolBinaryOp::Xor => crate::simd::bool_xor_u8(l_slice, r_slice, &mut out),
            }
            out
        }
        _ => {
            let lhs_iter = StridedIter::new(lhs.layout());
            let rhs_iter = StridedIter::new(rhs.layout());
            match op {
                BoolBinaryOp::And => lhs_iter
                    .zip(rhs_iter)
                    .map(|(li, ri)| lhs_storage[li] & rhs_storage[ri])
                    .collect(),
                BoolBinaryOp::Or => lhs_iter
                    .zip(rhs_iter)
                    .map(|(li, ri)| lhs_storage[li] | rhs_storage[ri])
                    .collect(),
                BoolBinaryOp::Xor => lhs_iter
                    .zip(rhs_iter)
                    .map(|(li, ri)| lhs_storage[li] ^ rhs_storage[ri])
                    .collect(),
            }
        }
    };

    let bytes = Bytes::from_elems(result);
    FlexTensor::new(
        bytes,
        Layout::contiguous(shape),
        DType::Bool(burn_std::BoolStore::Native),
    )
}

#[cfg(test)]
mod tests {
    use burn_tensor::{Bool, Int, Tensor};

    use crate::Flex;

    // Short aliases for the tensor types exercised below.
    type Bool1 = Tensor<Flex, 1, Bool>;
    type Bool2 = Tensor<Flex, 2, Bool>;
    type Int1 = Tensor<Flex, 1, Int>;
    type Int2 = Tensor<Flex, 2, Int>;

    #[test]
    fn test_bool_into_int() {
        let device = Default::default();
        let ints: Int1 = Bool1::from_data([true, false, true, false], &device).int();

        assert_eq!(
            ints.into_data().to_vec::<i64>().unwrap(),
            vec![1i64, 0, 1, 0]
        );
    }

    #[test]
    fn test_bool_into_float() {
        let device = Default::default();
        let floats = Bool1::from_data([true, false, true, false], &device).float();

        assert_eq!(
            floats.into_data().to_vec::<f32>().unwrap(),
            vec![1.0f32, 0.0, 1.0, 0.0]
        );
    }

    #[test]
    fn test_bool_into_int_2d() {
        let device = Default::default();
        let ints: Int2 = Bool2::from_data([[true, false], [false, true]], &device).int();

        assert_eq!(
            ints.into_data().to_vec::<i64>().unwrap(),
            vec![1i64, 0, 0, 1]
        );
    }

    #[test]
    fn test_bool_into_float_2d() {
        let device = Default::default();
        let floats = Bool2::from_data([[true, false], [false, true]], &device).float();

        assert_eq!(
            floats.into_data().to_vec::<f32>().unwrap(),
            vec![1.0f32, 0.0, 0.0, 1.0]
        );
    }

    // === Non-contiguous (negative stride) tests ===

    #[test]
    fn test_bool_into_int_flipped() {
        // Input [T, F, T, F] reversed is [F, T, F, T], i.e. [0, 1, 0, 1] as ints.
        let device = Default::default();
        let reversed = Bool1::from_data([true, false, true, false], &device).flip([0]);
        let ints: Int1 = reversed.int();

        assert_eq!(
            ints.into_data().to_vec::<i64>().unwrap(),
            vec![0i64, 1, 0, 1]
        );
    }

    #[test]
    fn test_bool_into_float_flipped() {
        // Input [T, F, T, F] reversed is [F, T, F, T], i.e. [0.0, 1.0, 0.0, 1.0].
        let device = Default::default();
        let reversed = Bool1::from_data([true, false, true, false], &device).flip([0]);
        let floats = reversed.float();

        assert_eq!(
            floats.into_data().to_vec::<f32>().unwrap(),
            vec![0.0f32, 1.0, 0.0, 1.0]
        );
    }

    #[test]
    fn test_bool_not_flipped() {
        // Input [T, F, T, F] reversed is [F, T, F, T]; negated: [T, F, T, F].
        let device = Default::default();
        let reversed = Bool1::from_data([true, false, true, false], &device).flip([0]);
        let negated = reversed.bool_not();

        assert_eq!(
            negated.into_data().to_vec::<bool>().unwrap(),
            vec![true, false, true, false]
        );
    }

    #[test]
    fn test_bool_and_flipped() {
        // a reversed: [F, T, F, T]
        // b:          [T, T, F, F]
        // a AND b:    [F, T, F, F]
        let device = Default::default();
        let a = Bool1::from_data([true, false, true, false], &device).flip([0]);
        let b = Bool1::from_data([true, true, false, false], &device);

        let combined = a.bool_and(b);

        assert_eq!(
            combined.into_data().to_vec::<bool>().unwrap(),
            vec![false, true, false, false]
        );
    }

    #[test]
    fn test_bool_or_flipped() {
        // a reversed: [F, T, F, T]
        // b:          [T, F, F, F]
        // a OR b:     [T, T, F, T]
        let device = Default::default();
        let a = Bool1::from_data([true, false, true, false], &device).flip([0]);
        let b = Bool1::from_data([true, false, false, false], &device);

        let combined = a.bool_or(b);

        assert_eq!(
            combined.into_data().to_vec::<bool>().unwrap(),
            vec![true, true, false, true]
        );
    }

    #[test]
    fn test_bool_xor_flipped() {
        // a reversed: [F, T, F, T]
        // b:          [T, T, F, F]
        // a XOR b:    [T, F, F, T]
        let device = Default::default();
        let a = Bool1::from_data([true, false, true, false], &device).flip([0]);
        let b = Bool1::from_data([true, true, false, false], &device);

        let combined = a.bool_xor(b);

        assert_eq!(
            combined.into_data().to_vec::<bool>().unwrap(),
            vec![true, false, false, true]
        );
    }

    #[test]
    fn test_bool_equal_flipped() {
        // a reversed: [F, T, F, T]
        // b:          [F, T, F, T]
        // a == b:     [T, T, T, T]
        let device = Default::default();
        let a = Bool1::from_data([true, false, true, false], &device).flip([0]);
        let b = Bool1::from_data([false, true, false, true], &device);

        let compared = a.equal(b);

        assert_eq!(
            compared.into_data().to_vec::<bool>().unwrap(),
            vec![true, true, true, true]
        );
    }

    #[test]
    fn test_bool_and_both_flipped() {
        // a reversed: [F, T, F, T]
        // b reversed: [F, F, T, T]
        // a AND b:    [F, F, F, T]
        let device = Default::default();
        let a = Bool1::from_data([true, false, true, false], &device).flip([0]);
        let b = Bool1::from_data([true, true, false, false], &device).flip([0]);

        let combined = a.bool_and(b);

        assert_eq!(
            combined.into_data().to_vec::<bool>().unwrap(),
            vec![false, false, false, true]
        );
    }

    #[test]
    fn test_bool_into_int_flipped_2d() {
        // [[T, F], [F, T]] reversed along axis 0 is [[F, T], [T, F]],
        // i.e. [[0, 1], [1, 0]] as ints.
        let device = Default::default();
        let reversed = Bool2::from_data([[true, false], [false, true]], &device).flip([0]);
        let ints: Int2 = reversed.int();

        assert_eq!(
            ints.into_data().to_vec::<i64>().unwrap(),
            vec![0i64, 1, 1, 0]
        );
    }
}