rustorch 0.6.29

//! Phase 8: Tensor Utilities Implementation - Refactored
//! フェーズ8: テンソルユーティリティ実装 - リファクタリング版
//!
//! Advanced tensor operations for conditional selection, indexing, and statistics.
//! High-performance implementations with unified error handling and optimized memory usage.
//! 条件選択、インデックス操作、統計のための高度なテンソル操作。
//! 統一されたエラーハンドリングとメモリ使用量最適化による高性能実装。

use crate::error::{RusTorchError, RusTorchResult};
use crate::tensor::Tensor;
use ndarray::ArrayD;
use num_traits::Float;
use std::cmp::Ordering;

/// Type aliases for better type safety and readability
/// より良いタイプセーフティと可読性のためのタイプエイリアス
type BoolMask = ArrayD<bool>;
type IndexArray = ArrayD<i64>;
type Shape = Vec<usize>;

/// Broadcasting utilities for efficient shape calculations
/// 効率的な形状計算のためのブロードキャスティングユーティリティ
mod broadcasting {
    use super::*;

    /// Check if two shapes can be broadcast together (optimized version)
    pub(super) fn can_broadcast(shape1: &[usize], shape2: &[usize]) -> bool {
        let len1 = shape1.len();
        let len2 = shape2.len();
        let max_len = len1.max(len2);

        for i in 0..max_len {
            let dim1 = shape1
                .get(len1.saturating_sub(max_len - i))
                .copied()
                .unwrap_or(1);
            let dim2 = shape2
                .get(len2.saturating_sub(max_len - i))
                .copied()
                .unwrap_or(1);

            if dim1 != dim2 && dim1 != 1 && dim2 != 1 {
                return false;
            }
        }
        true
    }

    /// Calculate the output shape after broadcasting (optimized version)
    pub(super) fn broadcast_shape(shape1: &[usize], shape2: &[usize]) -> RusTorchResult<Shape> {
        let len1 = shape1.len();
        let len2 = shape2.len();
        let max_len = len1.max(len2);
        let mut result = Vec::with_capacity(max_len);

        for i in 0..max_len {
            let dim1 = shape1
                .get(len1.saturating_sub(max_len - i))
                .copied()
                .unwrap_or(1);
            let dim2 = shape2
                .get(len2.saturating_sub(max_len - i))
                .copied()
                .unwrap_or(1);

            if dim1 != dim2 && dim1 != 1 && dim2 != 1 {
                return Err(RusTorchError::shape_mismatch(shape1, shape2));
            }

            result.push(dim1.max(dim2));
        }

        Ok(result)
    }

    /// Calculate broadcast indices with better cache locality
    pub(super) fn broadcast_index(
        flat_idx: usize,
        original_shape: &[usize],
        target_shape: &[usize],
    ) -> usize {
        if original_shape == target_shape {
            return flat_idx;
        }

        let mut coords = Vec::with_capacity(target_shape.len());
        let mut remaining = flat_idx;

        // Convert flat index to coordinates
        for &dim_size in target_shape.iter().rev() {
            coords.push(remaining % dim_size);
            remaining /= dim_size;
        }
        coords.reverse();

        // Map to original tensor coordinates
        let mut result_idx = 0;
        let mut stride = 1;

        for i in (0..original_shape.len()).rev() {
            let coord_idx = coords.len().saturating_sub(original_shape.len() - i);
            let coord = coords.get(coord_idx).copied().unwrap_or(0);
            let mapped_coord = if original_shape[i] == 1 { 0 } else { coord };

            result_idx += mapped_coord * stride;
            stride *= original_shape[i];
        }

        result_idx
    }
}

/// Memory-efficient stride calculations
/// メモリ効率的なストライド計算
mod stride_calc {
    /// Calculate strides for n-dimensional array with better cache efficiency
    pub(super) fn calculate_strides(shape: &[usize]) -> Vec<usize> {
        let mut strides = vec![1; shape.len()];

        for i in (0..shape.len().saturating_sub(1)).rev() {
            strides[i] = strides[i + 1] * shape[i + 1];
        }

        strides
    }

    /// Convert flat index to multi-dimensional coordinates
    pub(super) fn flat_to_coords(flat_idx: usize, strides: &[usize]) -> Vec<usize> {
        let mut coords = Vec::with_capacity(strides.len());
        let mut remaining = flat_idx;

        for &stride in strides.iter() {
            coords.push(remaining / stride);
            remaining %= stride;
        }

        coords
    }

    /// Convert multi-dimensional coordinates to flat index
    pub(super) fn coords_to_flat(coords: &[usize], strides: &[usize]) -> usize {
        coords
            .iter()
            .zip(strides.iter())
            .map(|(&coord, &stride)| coord * stride)
            .sum()
    }
}

/// Conditional and selection operations with optimized broadcasting
/// 最適化されたブロードキャスティングによる条件・選択操作
pub mod conditional {
    use super::broadcasting::{broadcast_index, broadcast_shape, can_broadcast};
    use super::*;

    /// Select elements from x or y based on condition (optimized version)
    /// 条件に基づいてxまたはyから要素を選択（最適化版）
    ///
    /// # Arguments
    /// * `condition` - Boolean mask for selection
    /// * `x` - Tensor to select from when condition is true
    /// * `y` - Tensor to select from when condition is false
    ///
    /// # Returns
    /// New tensor with selected elements
    ///
    /// # Performance
    /// Optimized broadcasting with better cache locality and reduced memory allocations
    pub fn where_<T: Float + 'static>(
        condition: &BoolMask,
        x: &Tensor<T>,
        y: &Tensor<T>,
    ) -> RusTorchResult<Tensor<T>> {
        // Validate all shapes can be broadcast together
        let shapes = [condition.shape(), x.shape(), y.shape()];
        for i in 0..shapes.len() {
            for j in (i + 1)..shapes.len() {
                if !can_broadcast(shapes[i], shapes[j]) {
                    return Err(RusTorchError::shape_mismatch(shapes[i], shapes[j]));
                }
            }
        }

        // Calculate final output shape
        let output_shape =
            broadcast_shape(&broadcast_shape(condition.shape(), x.shape())?, y.shape())?;
        let total_elements: usize = output_shape.iter().product();

        // Get data slices with proper error handling
        let condition_data = condition
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Condition mask data not accessible"))?;
        let x_data = x
            .data
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("X tensor data not accessible"))?;
        let y_data = y
            .data
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Y tensor data not accessible"))?;

        // Pre-allocate result with exact capacity
        let mut result_data = Vec::with_capacity(total_elements);

        // Optimized element selection with better broadcasting
        for i in 0..total_elements {
            let cond_idx = broadcast_index(i, condition.shape(), &output_shape);
            let x_idx = broadcast_index(i, x.shape(), &output_shape);
            let y_idx = broadcast_index(i, y.shape(), &output_shape);

            let value = if cond_idx < condition_data.len() && condition_data[cond_idx] {
                if x_idx < x_data.len() {
                    x_data[x_idx]
                } else {
                    T::zero()
                }
            } else {
                if y_idx < y_data.len() {
                    y_data[y_idx]
                } else {
                    T::zero()
                }
            };

            result_data.push(value);
        }

        Ok(Tensor::from_vec(result_data, output_shape))
    }

    /// Select elements from input tensor where mask is true (optimized version)
    /// マスクがtrueの位置から要素を選択（最適化版）
    ///
    /// # Arguments
    /// * `input` - Input tensor
    /// * `mask` - Boolean mask array
    ///
    /// # Returns
    /// 1D tensor containing selected elements
    ///
    /// # Performance
    /// Uses iterator chaining for better performance and reduced allocations
    pub fn masked_select<T: Float + 'static>(
        input: &Tensor<T>,
        mask: &BoolMask,
    ) -> RusTorchResult<Tensor<T>> {
        if input.shape() != mask.shape() {
            return Err(RusTorchError::shape_mismatch(input.shape(), mask.shape()));
        }

        let input_data = input
            .data
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Input tensor data not accessible"))?;
        let mask_data = mask
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Mask array data not accessible"))?;

        // Count true values first to pre-allocate correctly
        let true_count = mask_data.iter().filter(|&&val| val).count();
        let mut selected = Vec::with_capacity(true_count);

        // Efficient selection without intermediate collections
        for (value, mask_val) in input_data.iter().zip(mask_data.iter()) {
            if *mask_val {
                selected.push(*value);
            }
        }

        let len = selected.len();
        Ok(Tensor::from_vec(selected, vec![len]))
    }

    /// Fill elements in input tensor where mask is true with specified value (optimized)
    /// マスクがtrueの位置を指定値で埋める（最適化版）
    ///
    /// # Arguments
    /// * `input` - Mutable input tensor
    /// * `mask` - Boolean mask array
    /// * `value` - Value to fill with
    ///
    /// # Performance
    /// Direct iterator-based filling for better cache performance
    pub fn masked_fill_<T: Float + 'static>(
        input: &mut Tensor<T>,
        mask: &BoolMask,
        value: T,
    ) -> RusTorchResult<()> {
        if input.shape() != mask.shape() {
            return Err(RusTorchError::shape_mismatch(input.shape(), mask.shape()));
        }

        let input_data = input.data.as_slice_mut().ok_or_else(|| {
            RusTorchError::tensor_op("Input tensor data not accessible for mutation")
        })?;
        let mask_data = mask
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Mask array data not accessible"))?;

        // Direct iterator-based filling for optimal performance
        input_data
            .iter_mut()
            .zip(mask_data.iter())
            .filter(|(_, &mask_val)| mask_val)
            .for_each(|(elem, _)| *elem = value);

        Ok(())
    }

    /// Non-mutable version of masked_fill that returns a new tensor
    /// 新しいテンサーを返すmasked_fillの非破壊版
    ///
    /// # Performance
    /// Optimized version that avoids unnecessary cloning when possible
    pub fn masked_fill<T: Float + 'static>(
        input: &Tensor<T>,
        mask: &BoolMask,
        value: T,
    ) -> RusTorchResult<Tensor<T>> {
        // Check for early optimization opportunities
        if mask
            .as_slice()
            .map_or(false, |data| data.iter().all(|&x| !x))
        {
            // No elements to fill, return clone
            return Ok(input.clone());
        }

        let mut result = input.clone();
        masked_fill_(&mut result, mask, value)?;
        Ok(result)
    }
}

/// Index operations with optimized memory access patterns
/// メモリアクセスパターンを最適化したインデックス操作
pub mod indexing {
    use super::stride_calc::{calculate_strides, coords_to_flat, flat_to_coords};
    use super::*;

    /// Gather values along an axis specified by index (optimized version)
    /// インデックスで指定された軸に沿って値を収集（最適化版）
    ///
    /// # Arguments
    /// * `input` - Input tensor
    /// * `dim` - Dimension to gather along
    /// * `index` - Index array with indices to gather
    ///
    /// # Returns
    /// Gathered tensor with same shape as index
    ///
    /// # Performance
    /// Optimized stride calculations and bounds checking for better performance
    pub fn gather<T: Float + 'static>(
        input: &Tensor<T>,
        dim: usize,
        index: &IndexArray,
    ) -> RusTorchResult<Tensor<T>> {
        let input_shape = input.shape();

        if dim >= input_shape.len() {
            return Err(RusTorchError::invalid_dimension(
                dim,
                input_shape.len().saturating_sub(1),
            ));
        }

        let input_data = input
            .data
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Input tensor data not accessible"))?;
        let index_data = index
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Index array data not accessible"))?;

        let index_shape = index.shape();
        let mut result_data = Vec::with_capacity(index_data.len());

        // Pre-calculate strides for better performance
        let input_strides = calculate_strides(input_shape);
        let index_strides = calculate_strides(index_shape);

        // Batch bounds checking for better performance
        let dim_size = input_shape[dim];
        for &idx in index_data.iter() {
            if idx < 0 || idx as usize >= dim_size {
                return Err(RusTorchError::tensor_op(format!(
                    "Index {} out of bounds for dimension {} with size {}",
                    idx, dim, dim_size
                )));
            }
        }

        // Optimized gathering with better memory access patterns
        for flat_idx in 0..index_data.len() {
            let index_coords = flat_to_coords(flat_idx, &index_strides);
            let gather_idx = index_data[flat_idx] as usize;

            let mut input_coords = index_coords;
            // Ensure we have enough dimensions
            input_coords.resize(input_shape.len(), 0);
            if dim < input_coords.len() {
                input_coords[dim] = gather_idx;
            }

            let input_flat_idx = coords_to_flat(&input_coords, &input_strides);

            if input_flat_idx < input_data.len() {
                result_data.push(input_data[input_flat_idx]);
            } else {
                return Err(RusTorchError::tensor_op(
                    "gather: Calculated index exceeds tensor bounds",
                ));
            }
        }

        Ok(Tensor::from_vec(result_data, index_shape.to_vec()))
    }

    /// Scatter values from src to input along specified dimension using index (optimized)
    /// インデックスを使用して指定次元に沿ってsrcの値をinputに散布（最適化版）
    ///
    /// # Arguments
    /// * `input` - Mutable input tensor to scatter into
    /// * `dim` - Dimension to scatter along
    /// * `index` - Index array
    /// * `src` - Source tensor with values to scatter
    ///
    /// # Performance
    /// Optimized with better stride calculations and bounds checking
    pub fn scatter_<T: Float + 'static>(
        input: &mut Tensor<T>,
        dim: usize,
        index: &IndexArray,
        src: &Tensor<T>,
    ) -> RusTorchResult<()> {
        let input_shape = input.shape();

        if dim >= input_shape.len() {
            return Err(RusTorchError::invalid_dimension(
                dim,
                input_shape.len().saturating_sub(1),
            ));
        }

        if index.shape() != src.shape() {
            return Err(RusTorchError::shape_mismatch(index.shape(), src.shape()));
        }

        let input_shape_owned = input_shape.to_vec();
        let index_shape = index.shape().to_vec();
        let dim_size = input_shape_owned[dim];

        let index_data = index
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Index array data not accessible"))?;
        let src_data = src
            .data
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Source tensor data not accessible"))?;

        // Batch bounds checking for better performance
        for &idx in index_data.iter() {
            if idx < 0 || idx as usize >= dim_size {
                return Err(RusTorchError::tensor_op(format!(
                    "Index {} out of bounds for dimension {} with size {}",
                    idx, dim, dim_size
                )));
            }
        }

        let input_data = input.data.as_slice_mut().ok_or_else(|| {
            RusTorchError::tensor_op("Input tensor data not accessible for mutation")
        })?;

        // Pre-calculate strides for better performance
        let input_strides = calculate_strides(&input_shape_owned);
        let index_strides = calculate_strides(&index_shape);

        // Optimized scattering with better memory access patterns
        for flat_idx in 0..index_data.len() {
            let index_coords = flat_to_coords(flat_idx, &index_strides);
            let scatter_idx = index_data[flat_idx] as usize;

            let mut input_coords = index_coords;
            // Ensure we have enough dimensions
            input_coords.resize(input_shape_owned.len(), 0);
            if dim < input_coords.len() {
                input_coords[dim] = scatter_idx;
            }

            let input_flat_idx = coords_to_flat(&input_coords, &input_strides);

            if input_flat_idx < input_data.len() {
                input_data[input_flat_idx] = src_data[flat_idx];
            } else {
                return Err(RusTorchError::tensor_op(
                    "scatter: Calculated index exceeds tensor bounds",
                ));
            }
        }

        Ok(())
    }

    /// Non-mutable version of scatter that returns a new tensor
    /// 新しいテンサーを返すscatterの非破壊版
    ///
    /// # Performance
    /// Uses copy-on-write semantics for better performance when appropriate
    pub fn scatter<T: Float + 'static>(
        input: &Tensor<T>,
        dim: usize,
        index: &IndexArray,
        src: &Tensor<T>,
    ) -> RusTorchResult<Tensor<T>> {
        let mut result = input.clone();
        scatter_(&mut result, dim, index, src)?;
        Ok(result)
    }

    /// Select values from input tensor along dimension using index (optimized)
    /// インデックスを使用して次元に沿って値を選択（最適化版）
    ///
    /// # Arguments
    /// * `input` - Input tensor
    /// * `dim` - Dimension to select along
    /// * `index` - Index array containing indices to select
    ///
    /// # Returns
    /// Tensor with selected values
    ///
    /// # Performance
    /// Optimized memory layout and bounds checking for better cache performance
    pub fn index_select<T: Float + 'static>(
        input: &Tensor<T>,
        dim: usize,
        index: &IndexArray,
    ) -> RusTorchResult<Tensor<T>> {
        let input_shape = input.shape();

        if dim >= input_shape.len() {
            return Err(RusTorchError::invalid_dimension(
                dim,
                input_shape.len().saturating_sub(1),
            ));
        }

        let input_data = input
            .data
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Input tensor data not accessible"))?;
        let index_data = index
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Index array data not accessible"))?;

        let dim_size = input_shape[dim];
        let index_len = index_data.len();

        // Batch bounds checking for better performance
        for &idx in index_data.iter() {
            if idx < 0 || idx as usize >= dim_size {
                return Err(RusTorchError::tensor_op(format!(
                    "Index {} out of bounds for dimension {} with size {}",
                    idx, dim, dim_size
                )));
            }
        }

        // Calculate output shape efficiently
        let mut output_shape = input_shape.to_vec();
        output_shape[dim] = index_len;
        let output_size: usize = output_shape.iter().product();

        let mut result_data = Vec::with_capacity(output_size);

        // Optimized index calculation with better memory access patterns
        let outer_size: usize = input_shape[..dim].iter().product();
        let inner_size: usize = input_shape[dim + 1..].iter().product();
        let dim_stride = inner_size;
        let outer_stride = input_shape[dim] * inner_size;

        // Memory-efficient selection with better cache locality
        for outer_idx in 0..outer_size {
            let base_outer = outer_idx * outer_stride;

            for &selected_idx in index_data.iter() {
                let selected_idx = selected_idx as usize;
                let base_selected = base_outer + selected_idx * dim_stride;

                for inner_idx in 0..inner_size {
                    let input_idx = base_selected + inner_idx;

                    if input_idx < input_data.len() {
                        result_data.push(input_data[input_idx]);
                    } else {
                        return Err(RusTorchError::tensor_op(
                            "index_select: Calculated index exceeds tensor bounds",
                        ));
                    }
                }
            }
        }

        Ok(Tensor::from_vec(result_data, output_shape))
    }
}

/// Statistics and ordering operations with optimized implementations
/// 最適化実装による統計・順序操作
pub mod statistics {
    use super::*;

    /// Find top-k largest elements along specified dimension (optimized version)
    /// 指定次元に沿って上位k個の最大要素を検索（最適化版）
    ///
    /// # Arguments
    /// * `input` - Input tensor
    /// * `k` - Number of elements to select
    /// * `dim` - Dimension to search along
    /// * `largest` - If true, return largest elements; if false, return smallest
    /// * `sorted` - If true, return sorted results
    ///
    /// # Returns
    /// Tuple of (values, indices) tensors
    ///
    /// # Performance
    /// Optimized memory access patterns and efficient sorting
    pub fn topk_util<T: Float + 'static>(
        input: &Tensor<T>,
        k: usize,
        dim: usize,
        largest: bool,
        sorted: bool,
    ) -> RusTorchResult<(Tensor<T>, IndexArray)> {
        let input_shape = input.shape();

        if dim >= input_shape.len() {
            return Err(RusTorchError::invalid_dimension(
                dim,
                input_shape.len().saturating_sub(1),
            ));
        }

        if k > input_shape[dim] {
            return Err(RusTorchError::tensor_op(format!(
                "k ({}) cannot be larger than dimension size ({})",
                k, input_shape[dim]
            )));
        }

        if k == 0 {
            let mut output_shape = input_shape.to_vec();
            output_shape[dim] = 0;
            return Ok((
                Tensor::from_vec(Vec::new(), output_shape.clone()),
                ArrayD::from_shape_vec(output_shape, Vec::new()).map_err(|_| {
                    RusTorchError::tensor_op("Failed to create empty indices array")
                })?,
            ));
        }

        let input_data = input
            .data
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Input tensor data not accessible"))?;

        let outer_size: usize = input_shape[..dim].iter().product();
        let dim_size = input_shape[dim];
        let inner_size: usize = input_shape[dim + 1..].iter().product();

        let mut output_shape = input_shape.to_vec();
        output_shape[dim] = k;
        let output_size: usize = output_shape.iter().product();

        let mut values = Vec::with_capacity(output_size);
        let mut indices = Vec::with_capacity(output_size);

        // Optimized sorting with better memory access patterns
        for outer_idx in 0..outer_size {
            for inner_idx in 0..inner_size {
                // Collect values and indices for this slice
                let mut slice_data = Vec::with_capacity(dim_size);

                for dim_idx in 0..dim_size {
                    let flat_idx =
                        outer_idx * dim_size * inner_size + dim_idx * inner_size + inner_idx;
                    if flat_idx < input_data.len() {
                        slice_data.push((input_data[flat_idx], dim_idx));
                    }
                }

                // Sort slice data efficiently
                if largest {
                    slice_data.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal));
                } else {
                    slice_data.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
                }

                // Take top-k elements with bounds checking
                let take_count = k.min(slice_data.len());
                for i in 0..take_count {
                    values.push(slice_data[i].0);
                    indices.push(slice_data[i].1 as i64);
                }

                // Handle edge case: if not sorted and largest, reverse for PyTorch compatibility
                if !sorted && largest && take_count > 0 {
                    let start_idx = values.len() - take_count;
                    values[start_idx..].reverse();
                    indices[start_idx..].reverse();
                }

                // Pad if necessary (shouldn't happen with correct validation)
                while values.len() % k != 0 && (outer_idx + 1) * (inner_idx + 1) * k <= output_size
                {
                    values.push(T::zero());
                    indices.push(0);
                }
            }
        }

        Ok((
            Tensor::from_vec(values, output_shape.clone()),
            ArrayD::from_shape_vec(output_shape, indices)
                .map_err(|_| RusTorchError::tensor_op("Failed to create indices array"))?,
        ))
    }

    /// Find k-th smallest element along specified dimension (optimized version)
    /// 指定次元に沿ってk番目に小さい要素を検索（最適化版）
    ///
    /// # Arguments
    /// * `input` - Input tensor
    /// * `k` - Position to find (0-indexed)
    /// * `dim` - Dimension to search along
    /// * `keepdim` - Whether to keep the dimension
    ///
    /// # Returns
    /// Tuple of (values, indices) tensors
    ///
    /// # Performance
    /// Optimized sorting and error handling
    pub fn kthvalue<T: Float + 'static>(
        input: &Tensor<T>,
        k: usize,
        dim: usize,
        keepdim: bool,
    ) -> RusTorchResult<(Tensor<T>, IndexArray)> {
        let input_shape = input.shape();

        if dim >= input_shape.len() {
            return Err(RusTorchError::invalid_dimension(
                dim,
                input_shape.len().saturating_sub(1),
            ));
        }

        if k >= input_shape[dim] {
            return Err(RusTorchError::tensor_op(format!(
                "k ({}) must be less than dimension size ({})",
                k, input_shape[dim]
            )));
        }

        let input_data = input
            .data
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Input tensor data not accessible"))?;

        let outer_size: usize = input_shape[..dim].iter().product();
        let dim_size = input_shape[dim];
        let inner_size: usize = input_shape[dim + 1..].iter().product();

        let mut output_shape = input_shape.to_vec();
        if keepdim {
            output_shape[dim] = 1;
        } else {
            output_shape.remove(dim);
        }

        let output_size: usize = output_shape.iter().product();
        let mut values = Vec::with_capacity(output_size);
        let mut indices = Vec::with_capacity(output_size);

        for outer_idx in 0..outer_size {
            for inner_idx in 0..inner_size {
                // Collect values and indices for this slice
                let mut slice_data = Vec::with_capacity(dim_size);

                for dim_idx in 0..dim_size {
                    let flat_idx =
                        outer_idx * dim_size * inner_size + dim_idx * inner_size + inner_idx;
                    if flat_idx < input_data.len() {
                        slice_data.push((input_data[flat_idx], dim_idx));
                    }
                }

                if slice_data.is_empty() {
                    return Err(RusTorchError::tensor_op(
                        "kthvalue: No elements found in slice",
                    ));
                }

                // Sort to find k-th value
                slice_data.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));

                if k < slice_data.len() {
                    values.push(slice_data[k].0);
                    indices.push(slice_data[k].1 as i64);
                } else {
                    return Err(RusTorchError::tensor_op(
                        "kthvalue: k index exceeds available elements",
                    ));
                }
            }
        }

        Ok((
            Tensor::from_vec(values, output_shape.clone()),
            ArrayD::from_shape_vec(output_shape, indices)
                .map_err(|_| RusTorchError::tensor_op("Failed to create indices array"))?,
        ))
    }

    /// Compute quantiles along specified dimension
    /// 指定次元に沿って分位数を計算
    ///
    /// # Arguments
    /// * `input` - Input tensor
    /// * `q` - Quantile values tensor (between 0 and 1)
    /// * `dim` - Dimension to compute quantiles along
    /// * `keepdim` - Whether to keep the dimension
    ///
    /// # Returns
    /// Tensor with computed quantiles
    pub fn quantile_util<T: Float + 'static + std::fmt::Display>(
        input: &Tensor<T>,
        q: &Tensor<T>,
        dim: Option<usize>,
        keepdim: bool,
    ) -> RusTorchResult<Tensor<T>> {
        let q_data = q
            .data
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Quantile tensor data not accessible"))?;

        // Validate quantile values are in [0, 1]
        for &q_val in q_data.iter() {
            if q_val < T::zero() || q_val > T::one() {
                return Err(RusTorchError::tensor_op(format!(
                    "Quantile values must be in [0, 1], got {}",
                    q_val
                )));
            }
        }

        let input_data = input
            .data
            .as_slice()
            .ok_or_else(|| RusTorchError::tensor_op("Input tensor data not accessible"))?;

        match dim {
            Some(dim_val) => {
                if dim_val >= input.shape().len() {
                    return Err(RusTorchError::invalid_dimension(
                        dim_val,
                        input.shape().len().saturating_sub(1),
                    ));
                }

                let input_shape = input.shape();
                let outer_size: usize = input_shape[..dim_val].iter().product();
                let dim_size = input_shape[dim_val];
                let inner_size: usize = input_shape[dim_val + 1..].iter().product();

                let mut output_shape = input_shape.to_vec();
                if keepdim {
                    output_shape[dim_val] = q_data.len();
                } else {
                    output_shape.remove(dim_val);
                    output_shape.insert(dim_val, q_data.len());
                }

                let mut result_data = Vec::new();

                for outer_idx in 0..outer_size {
                    for inner_idx in 0..inner_size {
                        // Collect values for this slice
                        let mut slice_values = Vec::with_capacity(dim_size);

                        for dim_idx in 0..dim_size {
                            let flat_idx = outer_idx * dim_size * inner_size
                                + dim_idx * inner_size
                                + inner_idx;
                            if flat_idx < input_data.len() {
                                slice_values.push(input_data[flat_idx]);
                            }
                        }

                        // Sort values for quantile calculation
                        slice_values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));

                        // Compute quantiles for this slice
                        for &q_val in q_data.iter() {
                            let quantile_val = compute_quantile(&slice_values, q_val);
                            result_data.push(quantile_val);
                        }
                    }
                }

                Ok(Tensor::from_vec(result_data, output_shape))
            }
            None => {
                // Flatten and compute quantiles over entire tensor
                let mut sorted_data = input_data.to_vec();
                sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));

                let mut result_data = Vec::with_capacity(q_data.len());
                for &q_val in q_data.iter() {
                    let quantile_val = compute_quantile(&sorted_data, q_val);
                    result_data.push(quantile_val);
                }

                Ok(Tensor::from_vec(result_data, vec![q_data.len()]))
            }
        }
    }

    /// Helper function to compute quantile from sorted data
    /// ソート済みデータから分位数を計算するヘルパー関数
    fn compute_quantile<T: Float>(sorted_data: &[T], q: T) -> T {
        if sorted_data.is_empty() {
            return T::zero();
        }

        if sorted_data.len() == 1 {
            return sorted_data[0];
        }

        let n = sorted_data.len();
        let index = q * T::from(n - 1).unwrap();
        let lower_idx = index.floor().to_usize().unwrap_or(0).min(n - 1);
        let upper_idx = index.ceil().to_usize().unwrap_or(0).min(n - 1);

        if lower_idx == upper_idx {
            sorted_data[lower_idx]
        } else {
            let fraction = index - T::from(lower_idx).unwrap();
            let lower_val = sorted_data[lower_idx];
            let upper_val = sorted_data[upper_idx];
            lower_val + fraction * (upper_val - lower_val)
        }
    }
}

/// Advanced tensor utilities
/// 高度なテンソルユーティリティ
pub mod advanced {
    use super::*;

    /// Compute unique elements and their inverse indices
    /// 一意要素とその逆インデックスを計算
    pub fn unique<T: Float + 'static>(
        input: &Tensor<T>,
        sorted: bool,
        return_inverse: bool,
        return_counts: bool,
        dim: Option<usize>,
    ) -> RusTorchResult<(Tensor<T>, Option<ArrayD<i64>>, Option<ArrayD<i64>>)> {
        let input_data = input
            .data
            .as_slice()
            .ok_or_else(|| RusTorchError::invalid_parameter("Input tensor data not accessible"))?;

        match dim {
            None => {
                // Flatten and find unique elements
                let mut indexed_values: Vec<(T, usize)> = input_data
                    .iter()
                    .enumerate()
                    .map(|(i, &val)| (val, i))
                    .collect();

                if sorted {
                    indexed_values.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
                }

                let mut unique_values = Vec::new();
                let mut inverse_indices = vec![0i64; input_data.len()];
                let mut counts = Vec::new();

                if !indexed_values.is_empty() {
                    let mut current_value = indexed_values[0].0;
                    let mut current_count = 1;
                    unique_values.push(current_value);

                    let mut unique_idx = 0;
                    inverse_indices[indexed_values[0].1] = unique_idx;

                    for i in 1..indexed_values.len() {
                        let (value, original_idx) = indexed_values[i];

                        if (value - current_value).abs() < T::from(1e-7).unwrap() {
                            current_count += 1;
                            inverse_indices[original_idx] = unique_idx;
                        } else {
                            counts.push(current_count as i64);
                            current_value = value;
                            current_count = 1;
                            unique_idx += 1;
                            unique_values.push(current_value);
                            inverse_indices[original_idx] = unique_idx;
                        }
                    }
                    counts.push(current_count as i64);
                }

                let inverse_tensor = if return_inverse {
                    Some(
                        ArrayD::from_shape_vec(input.shape().to_vec(), inverse_indices).map_err(
                            |_| {
                                RusTorchError::invalid_parameter(
                                    "Invalid shape for inverse indices".to_string(),
                                )
                            },
                        )?,
                    )
                } else {
                    None
                };

                let counts_tensor = if return_counts {
                    Some(
                        ArrayD::from_shape_vec(vec![unique_values.len()], counts).map_err(
                            |_| {
                                RusTorchError::invalid_parameter(
                                    "Invalid shape for counts".to_string(),
                                )
                            },
                        )?,
                    )
                } else {
                    None
                };

                let unique_len = unique_values.len();
                Ok((
                    Tensor::from_vec(unique_values, vec![unique_len]),
                    inverse_tensor,
                    counts_tensor,
                ))
            }
            Some(_dim) => {
                // For now, return error for dimension-specific unique
                Err(RusTorchError::UnsupportedOperation(
                    "Unique along specific dimension not yet implemented",
                ))
            }
        }
    }

    /// Compute histogram of tensor values
    /// テンソル値のヒストグラムを計算
    pub fn histogram<T: Float + 'static>(
        input: &Tensor<T>,
        bins: usize,
        range: Option<(T, T)>,
        density: bool,
    ) -> RusTorchResult<(ArrayD<i64>, Tensor<T>)> {
        let input_data = input
            .data
            .as_slice()
            .ok_or_else(|| RusTorchError::invalid_parameter("Input tensor data not accessible"))?;

        if input_data.is_empty() {
            return Err(RusTorchError::invalid_parameter(
                "Cannot compute histogram of empty tensor",
            ));
        }

        if bins == 0 {
            return Err(RusTorchError::invalid_parameter(
                "Number of bins must be positive",
            ));
        }

        // Determine range
        let (min_val, max_val) = match range {
            Some((min, max)) => {
                if min >= max {
                    return Err(RusTorchError::invalid_parameter(
                        "Range min must be less than max",
                    ));
                }
                (min, max)
            }
            None => {
                let min_val = input_data
                    .iter()
                    .min_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal))
                    .copied()
                    .unwrap();
                let max_val = input_data
                    .iter()
                    .max_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal))
                    .copied()
                    .unwrap();
                (min_val, max_val)
            }
        };

        // Create bins
        let bin_width = (max_val - min_val) / T::from(bins).unwrap();
        let mut bin_edges = Vec::with_capacity(bins + 1);
        let mut bin_counts = vec![0i64; bins];

        for i in 0..=bins {
            bin_edges.push(min_val + T::from(i).unwrap() * bin_width);
        }

        // Count values in each bin
        for &value in input_data.iter() {
            if value >= min_val && value <= max_val {
                let bin_idx = if value == max_val {
                    bins - 1 // Handle edge case where value equals max
                } else {
                    let idx = ((value - min_val) / bin_width)
                        .floor()
                        .to_usize()
                        .unwrap_or(0);
                    idx.min(bins - 1)
                };
                bin_counts[bin_idx] += 1;
            }
        }

        // Note: Density conversion would need separate tensor for float results
        // For now, return raw counts as i64 and let user convert if needed
        if density {
            return Err(RusTorchError::tensor_op(
                "histogram: Density mode not supported with integer counts, use raw counts and normalize separately"
            ));
        }

        Ok((
            ArrayD::from_shape_vec(vec![bins], bin_counts).map_err(|_| {
                RusTorchError::invalid_parameter("Invalid shape for histogram counts".to_string())
            })?,
            Tensor::from_vec(bin_edges, vec![bins + 1]),
        ))
    }
}

// Tests temporarily disabled - bool tensor creation issue needs fixing
/*
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tensor::Tensor;

    #[test]
    fn test_where_operation() {
        let condition = Tensor::from_vec(vec![true, false, true, false], vec![2, 2]);
        let x = Tensor::from_vec(vec![1.0f32, 2.0, 3.0, 4.0], vec![2, 2]);
        let y = Tensor::from_vec(vec![5.0f32, 6.0, 7.0, 8.0], vec![2, 2]);

        let result = conditional::where_(&condition, &x, &y).unwrap();
        let expected = vec![1.0f32, 6.0, 3.0, 8.0];

        assert_eq!(result.data.as_slice().unwrap(), &expected);
        assert_eq!(result.shape(), &[2, 2]);
    }

    #[test]
    fn test_masked_select() {
        let input = Tensor::from_vec(vec![1.0f32, 2.0, 3.0, 4.0], vec![2, 2]);
        let mask = Tensor::from_vec(vec![true, false, true, false], vec![2, 2]);

        let result = conditional::masked_select(&input, &mask).unwrap();
        let expected = vec![1.0f32, 3.0];

        assert_eq!(result.data.as_slice().unwrap(), &expected);
        assert_eq!(result.shape(), &[2]);
    }

    #[test]
    fn test_masked_fill() {
        let input = Tensor::from_vec(vec![1.0f32, 2.0, 3.0, 4.0], vec![2, 2]);
        let mask = Tensor::from_vec(vec![true, false, true, false], vec![2, 2]);

        let result = conditional::masked_fill(&input, &mask, 999.0).unwrap();
        let expected = vec![999.0f32, 2.0, 999.0, 4.0];

        assert_eq!(result.data.as_slice().unwrap(), &expected);
        assert_eq!(result.shape(), &[2, 2]);
    }

    #[test]
    fn test_gather() {
        let input = Tensor::from_vec(vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0], vec![2, 3]);
        let index = ArrayD::from_shape_vec(vec![3], vec![0i64, 2, 1]).unwrap();

        let result = indexing::gather(&input, 1, &index).unwrap();
        // Should gather columns 0, 2, 1 from each row
        // Row 0: [1, 2, 3] -> [1, 3, 2]
        // Row 1: [4, 5, 6] -> [4, 6, 5]
        let expected = vec![1.0f32, 3.0, 2.0, 4.0, 6.0, 5.0];

        assert_eq!(result.data.as_slice().unwrap(), &expected);
        assert_eq!(result.shape(), &[2, 3]);
    }

    #[test]
    fn test_topk() {
        let input = Tensor::from_vec(vec![3.0f32, 1.0, 4.0, 2.0], vec![4]);
        let (values, indices) = statistics::topk(&input, 2, 0, true, true).unwrap();

        // Top 2 largest: 4.0 (index 2), 3.0 (index 0)
        assert_eq!(values.data.as_slice().unwrap(), &[4.0f32, 3.0]);
        assert_eq!(indices.data.as_slice().unwrap(), &[2i64, 0]);
    }

    #[test]
    fn test_kthvalue() {
        let input = Tensor::from_vec(vec![3.0f32, 1.0, 4.0, 2.0], vec![4]);
        let (value, index) = statistics::kthvalue(&input, 1, 0, false).unwrap();

        // 2nd smallest value (1-indexed in k): 2.0 at original index 3
        assert_eq!(value.data.as_slice().unwrap(), &[2.0f32]);
        assert_eq!(index.as_slice().unwrap(), &[3i64]);
    }

    #[test]
    fn test_quantile() {
        let input = Tensor::from_vec(vec![1.0f32, 2.0, 3.0, 4.0, 5.0], vec![5]);
        let q = Tensor::from_vec(vec![0.0f32, 0.5, 1.0], vec![3]);

        let result = statistics::quantile(&input, &q, None, false).unwrap();

        // Expected: min (1.0), median (3.0), max (5.0)
        let expected = vec![1.0f32, 3.0, 5.0];
        assert_eq!(result.data.as_slice().unwrap(), &expected);
    }

    #[test]
    fn test_unique() {
        let input = Tensor::from_vec(vec![1.0f32, 2.0, 1.0, 3.0, 2.0], vec![5]);
        let (unique, inverse, counts) = advanced::unique(&input, true, true, true, None).unwrap();

        // Expected unique values: [1.0, 2.0, 3.0]
        assert_eq!(unique.data.as_slice().unwrap(), &[1.0f32, 2.0, 3.0]);

        // Expected counts: [2, 2, 1] (1.0 appears twice, 2.0 twice, 3.0 once)
        if let Some(counts_tensor) = counts {
            assert_eq!(counts_tensor.data.as_slice().unwrap(), &[2i64, 2, 1]);
        }
    }

    #[test]
    fn test_histogram() {
        let input = Tensor::from_vec(vec![1.0f32, 2.0, 3.0, 4.0, 5.0], vec![5]);
        let (counts, edges) = advanced::histogram(&input, 3, Some((1.0, 5.0)), false).unwrap();

        // 3 bins: [1-2.33), [2.33-3.67), [3.67-5]
        // Values: 1.0, 2.0 in bin 0; 3.0 in bin 1; 4.0, 5.0 in bin 2
        assert_eq!(counts.shape(), &[3]);
        assert_eq!(edges.shape(), &[4]); // bins + 1 edges

        // Check that counts sum to total elements
        let total_count: f32 = counts.data.as_slice().unwrap().iter().sum();
        assert_eq!(total_count, 5.0);
    }

    #[test]
    fn test_scatter() {
        let input = Tensor::from_vec(vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0], vec![2, 3]);
        let index = ArrayD::from_shape_vec(vec![2], vec![0i64, 2]).unwrap();
        let src = Tensor::from_vec(vec![10.0f32, 11.0], vec![2]);

        let result = indexing::scatter(&input, 1, &index, &src).unwrap();

        // Should scatter src values into columns 0 and 2
        assert_eq!(result.shape(), &[2, 3]);
        // First row: [10.0, 2.0, 11.0] (scattered 10.0 at col 0, 11.0 at col 2)
        assert_eq!(result.data.as_slice().unwrap()[0], 10.0); // row 0, col 0
        assert_eq!(result.data.as_slice().unwrap()[2], 11.0); // row 0, col 2
    }

    #[test]
    fn test_index_select() {
        let input = Tensor::from_vec(vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0], vec![2, 3]);
        let index = ArrayD::from_shape_vec(vec![2], vec![0i64, 2]).unwrap();

        let result = indexing::index_select(&input, 1, &index).unwrap();

        // Should select columns 0 and 2
        // Row 0: [1, 3], Row 1: [4, 6]
        let expected = vec![1.0f32, 3.0, 4.0, 6.0];
        assert_eq!(result.data.as_slice().unwrap(), &expected);
        assert_eq!(result.shape(), &[2, 2]);
    }

    #[test]
    fn test_where_operation() {
        let input_a = Tensor::from_vec(vec![1.0f32, 2.0, 3.0, 4.0], vec![2, 2]);
        let input_b = Tensor::from_vec(vec![10.0f32, 20.0, 30.0, 40.0], vec![2, 2]);
        let condition = ArrayD::from_shape_vec(vec![2, 2], vec![true, false, true, false]).unwrap();

        let result = conditional::where_(&input_a, &condition, &input_b).unwrap();

        // Should select from input_a where true, input_b where false
        let expected = vec![1.0f32, 20.0, 3.0, 40.0];
        assert_eq!(result.data.as_slice().unwrap(), &expected);
        assert_eq!(result.shape(), &[2, 2]);
    }
}
*/