tenflowers-core 0.1.1

Core tensor operations and execution engine for TenfloweRS
//! Ultra-Performance Histogram Computation.
//!
//! Provides SIMD-accelerated, parallel, and sequential histogram algorithms
//! for tensor data.
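//!
//! Algorithm selection is driven by `STATS_CONFIG`: the SIMD and parallel
//! paths are used only when the input length reaches `simd_threshold` or
//! `parallel_threshold` respectively (and the corresponding config switch is
//! enabled); smaller inputs fall back to the sequential baseline.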

use super::config::{record_stats_metrics, STATS_CONFIG};
use crate::tensor::TensorStorage;
use crate::{Result, Tensor, TensorError};
use scirs2_core::numeric::{Float, ToPrimitive};
use std::time::Instant;

/// Helper macro to convert numeric constants without `unwrap` (per the
/// crate's no-`unwrap` policy); uses `expect` with a descriptive message,
/// which cannot fail for standard float types.
macro_rules! float_const {
    ($val:expr, $t:ty) => {
        <$t as scirs2_core::num_traits::NumCast>::from($val)
            .expect("float constant conversion should never fail for standard float types")
    };
}

/// Ultra-Performance Histogram Computation
///
/// Computes the histogram of tensor values with advanced optimization strategies including
/// SIMD acceleration, parallel processing, and memory-efficient algorithms.
///
/// # Arguments
/// * `x` - Input tensor
/// * `bins` - Number of bins
/// * `range` - Optional range (min, max) for histogram. If None, uses data range.
///
/// # Returns
/// A tuple of (counts, bin_edges) where:
/// - counts: tensor of shape `[bins]` containing the count in each bin
/// - bin_edges: tensor of shape `[bins+1]` containing the bin edges
///
/// # Performance Features
/// - SIMD-accelerated range finding and bin counting
/// - Parallel histogram computation for large datasets
/// - Cache-friendly memory access patterns
/// - Adaptive algorithm selection based on data size
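///
/// # Examples
///
/// A minimal usage sketch (illustrative; assumes `histogram` is in scope and
/// that `Tensor::from_vec` / `to_vec` behave as used elsewhere in this file):
///
/// ```ignore
/// let x = Tensor::from_vec(vec![0.0f32, 0.5, 1.0, 1.5, 2.0], &[5])?;
/// // Four bins over the explicit range [0.0, 2.0].
/// let (counts, edges) = histogram(&x, 4, Some((0.0f32, 2.0)))?;
/// assert_eq!(counts.to_vec()?, vec![1, 1, 1, 2]); // 2.0 is clamped into the last bin
/// assert_eq!(edges.to_vec()?.len(), 5);
/// ```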
pub fn histogram<T>(
    x: &Tensor<T>,
    bins: usize,
    range: Option<(T, T)>,
) -> Result<(Tensor<usize>, Tensor<T>)>
where
    T: Float + Default + Send + Sync + 'static + ToPrimitive + bytemuck::Pod + bytemuck::Zeroable,
{
    let start_time = Instant::now();
    let config = STATS_CONFIG
        .read()
        .expect("read lock should not be poisoned");

    match &x.storage {
        TensorStorage::Cpu(arr) => {
            let flat_data: Vec<T> = arr.iter().cloned().collect();
            let data_size = flat_data.len();

            // Determine range with SIMD optimization for large arrays
            let (min_val, max_val) = if let Some((min, max)) = range {
                (min, max)
            } else if config.enable_simd && data_size >= config.simd_threshold {
                // SIMD-accelerated min/max finding
                ultra_fast_min_max_simd(&flat_data)
            } else if config.enable_parallel && data_size >= config.parallel_threshold {
                // Parallel min/max finding
                ultra_fast_min_max_parallel(&flat_data)
            } else {
                // Sequential min/max finding
                let min = flat_data.iter().fold(T::infinity(), |acc, &x| acc.min(x));
                let max = flat_data
                    .iter()
                    .fold(T::neg_infinity(), |acc, &x| acc.max(x));
                (min, max)
            };

            // Create bin edges with optimized computation
            let bin_edges = create_bin_edges_optimized(min_val, max_val, bins);

            // Count values in each bin with adaptive algorithm selection
            let counts = if config.enable_parallel && data_size >= config.parallel_threshold {
                // Parallel histogram computation
                ultra_fast_histogram_parallel(&flat_data, &bin_edges, min_val, max_val, bins)
            } else if config.enable_simd && data_size >= config.simd_threshold {
                // SIMD-accelerated histogram computation
                ultra_fast_histogram_simd(&flat_data, &bin_edges, min_val, max_val, bins)
            } else {
                // Sequential histogram computation
                ultra_fast_histogram_sequential(&flat_data, &bin_edges, min_val, max_val, bins)
            };

            // Record performance metrics
            if config.enable_performance_monitoring {
                record_stats_metrics("histogram", data_size, start_time.elapsed(), 0.0, 0.0);
            }

            let counts_tensor = Tensor::from_vec(counts, &[bins])?;
            let edges_tensor = Tensor::from_vec(bin_edges, &[bins + 1])?;

            Ok((counts_tensor, edges_tensor))
        }
        #[cfg(feature = "gpu")]
        TensorStorage::Gpu(gpu_buffer) => histogram_gpu(x, gpu_buffer, bins, range),
    }
}

/// SIMD-accelerated min/max finding for large arrays
fn ultra_fast_min_max_simd<T>(data: &[T]) -> (T, T)
where
    T: Float + Default + Send + Sync + 'static + PartialOrd,
{
    // Portable stand-in for a true SIMD kernel: fixed-width chunks keep the
    // loop amenable to compiler auto-vectorization. A real implementation
    // would use platform-specific intrinsics; see the sketch after this
    // function.
    let chunk_size = 8; // SIMD width
    let mut global_min = T::infinity();
    let mut global_max = T::neg_infinity();

    // Process data in SIMD-sized chunks
    for chunk in data.chunks(chunk_size) {
        let chunk_min = chunk.iter().fold(T::infinity(), |acc, &x| acc.min(x));
        let chunk_max = chunk.iter().fold(T::neg_infinity(), |acc, &x| acc.max(x));

        global_min = global_min.min(chunk_min);
        global_max = global_max.max(chunk_max);
    }

    (global_min, global_max)
}
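
// Illustrative only: one shape a real platform-specific specialization could
// take for `f32` on x86_64 with AVX. This is a sketch, not part of the
// crate's dispatch; the generic chunked fallback above is what actually runs.
#[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
#[allow(dead_code)]
fn min_max_f32_avx(data: &[f32]) -> (f32, f32) {
    use std::arch::x86_64::*;

    let chunks = data.chunks_exact(8);
    let tail = chunks.remainder();
    let mut min = f32::INFINITY;
    let mut max = f32::NEG_INFINITY;

    // SAFETY: gated on `target_feature = "avx"` at compile time above.
    unsafe {
        let mut vmin = _mm256_set1_ps(f32::INFINITY);
        let mut vmax = _mm256_set1_ps(f32::NEG_INFINITY);
        for chunk in chunks {
            let v = _mm256_loadu_ps(chunk.as_ptr());
            vmin = _mm256_min_ps(vmin, v);
            vmax = _mm256_max_ps(vmax, v);
        }
        // Horizontal reduction of the 8 lanes via a stack spill.
        let mut lanes = [0.0f32; 8];
        _mm256_storeu_ps(lanes.as_mut_ptr(), vmin);
        min = lanes.iter().fold(min, |acc, &l| acc.min(l));
        _mm256_storeu_ps(lanes.as_mut_ptr(), vmax);
        max = lanes.iter().fold(max, |acc, &l| acc.max(l));
    }
    // Scalar cleanup for elements that did not fill a full 8-lane chunk.
    for &v in tail {
        min = min.min(v);
        max = max.max(v);
    }
    (min, max)
}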

/// Parallel min/max finding for large arrays
fn ultra_fast_min_max_parallel<T>(data: &[T]) -> (T, T)
where
    T: Float + Default + Send + Sync + 'static + PartialOrd,
{
    use rayon::prelude::*;

    let chunk_size = data.len() / rayon::current_num_threads().max(1);
    let chunk_size = chunk_size.max(1000); // Minimum chunk size for efficiency

    let results: Vec<(T, T)> = data
        .par_chunks(chunk_size)
        .map(|chunk| {
            let min = chunk.iter().fold(T::infinity(), |acc, &x| acc.min(x));
            let max = chunk.iter().fold(T::neg_infinity(), |acc, &x| acc.max(x));
            (min, max)
        })
        .collect();

    let global_min = results
        .iter()
        .map(|(min, _)| *min)
        .fold(T::infinity(), |acc, x| acc.min(x));
    let global_max = results
        .iter()
        .map(|(_, max)| *max)
        .fold(T::neg_infinity(), |acc, x| acc.max(x));

    (global_min, global_max)
}
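
// Alternative sketch (not used above): rayon's map/reduce can combine the
// per-chunk results directly, avoiding the intermediate `Vec` of pairs.
#[allow(dead_code)]
fn min_max_parallel_reduce<T>(data: &[T], chunk_size: usize) -> (T, T)
where
    T: Float + Send + Sync,
{
    use rayon::prelude::*;

    data.par_chunks(chunk_size.max(1))
        .map(|chunk| {
            let min = chunk.iter().fold(T::infinity(), |acc, &x| acc.min(x));
            let max = chunk.iter().fold(T::neg_infinity(), |acc, &x| acc.max(x));
            (min, max)
        })
        .reduce(
            || (T::infinity(), T::neg_infinity()),
            |(amin, amax), (bmin, bmax)| (amin.min(bmin), amax.max(bmax)),
        )
}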

/// Optimized bin edge creation
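///
/// Edges are uniform: `edges[i] = min_val + i * (max_val - min_val) / bins`.
/// For example, `min_val = 0.0`, `max_val = 1.0`, `bins = 4` gives
/// `[0.0, 0.25, 0.5, 0.75, 1.0]`.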
fn create_bin_edges_optimized<T>(min_val: T, max_val: T, bins: usize) -> Vec<T>
where
    T: Float + Default + Send + Sync + 'static,
{
    let mut bin_edges = Vec::with_capacity(bins + 1);
    let bin_width = (max_val - min_val) / float_const!(bins, T);

    // Vectorized bin edge computation
    for i in 0..=bins {
        bin_edges.push(min_val + float_const!(i, T) * bin_width);
    }

    bin_edges
}

/// SIMD-accelerated histogram computation
fn ultra_fast_histogram_simd<T>(
    data: &[T],
    _bin_edges: &[T],
    min_val: T,
    max_val: T,
    bins: usize,
) -> Vec<usize>
where
    T: Float + Default + Send + Sync + 'static + ToPrimitive,
{
    let mut counts = vec![0usize; bins];
    let bin_width = (max_val - min_val) / float_const!(bins, T);

    // Chunked scalar loop structured for auto-vectorization (portable
    // stand-in for a true SIMD kernel)
    let chunk_size = 8; // SIMD width
    for chunk in data.chunks(chunk_size) {
        for &value in chunk {
            if value >= min_val && value <= max_val {
                let bin_index = ((value - min_val) / bin_width).to_usize().unwrap_or(0);
                let bin_index = bin_index.min(bins - 1);
                counts[bin_index] += 1;
            }
        }
    }

    counts
}

/// Parallel histogram computation for large datasets
fn ultra_fast_histogram_parallel<T>(
    data: &[T],
    _bin_edges: &[T],
    min_val: T,
    max_val: T,
    bins: usize,
) -> Vec<usize>
where
    T: Float + Default + Send + Sync + 'static + ToPrimitive,
{
    use rayon::prelude::*;

    let bin_width = (max_val - min_val) / float_const!(bins, T);
    let chunk_size = data.len() / rayon::current_num_threads().max(1);
    let chunk_size = chunk_size.max(1000);

    // Parallel histogram computation with reduction
    let partial_histograms: Vec<Vec<usize>> = data
        .par_chunks(chunk_size)
        .map(|chunk| {
            let mut local_counts = vec![0usize; bins];
            for &value in chunk {
                if value >= min_val && value <= max_val {
                    let bin_index = ((value - min_val) / bin_width).to_usize().unwrap_or(0);
                    let bin_index = bin_index.min(bins - 1);
                    local_counts[bin_index] += 1;
                }
            }
            local_counts
        })
        .collect();

    // Reduce partial histograms
    let mut final_counts = vec![0usize; bins];
    for partial in partial_histograms {
        for (i, count) in partial.into_iter().enumerate() {
            final_counts[i] += count;
        }
    }

    final_counts
}

/// Sequential histogram computation (optimized baseline)
fn ultra_fast_histogram_sequential<T>(
    data: &[T],
    _bin_edges: &[T],
    min_val: T,
    max_val: T,
    bins: usize,
) -> Vec<usize>
where
    T: Float + Default + Send + Sync + 'static + ToPrimitive,
{
    let mut counts = vec![0usize; bins];
    let bin_width = (max_val - min_val) / float_const!(bins, T);

    // Cache-friendly sequential computation
    for &value in data {
        if value >= min_val && value <= max_val {
            let bin_index = ((value - min_val) / bin_width).to_usize().unwrap_or(0);
            let bin_index = bin_index.min(bins - 1);
            counts[bin_index] += 1;
        }
    }

    counts
}
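
#[cfg(test)]
mod binning_tests {
    use super::*;

    // A small sanity test (added for illustration) of the binning rule used
    // by the counting kernels above: bin = floor((value - min) / width),
    // clamped to `bins - 1` so that `value == max` falls into the last bin
    // instead of overflowing.
    #[test]
    fn max_value_lands_in_last_bin() {
        let data = [0.0f32, 0.25, 0.5, 0.75, 1.0];
        let edges = create_bin_edges_optimized(0.0f32, 1.0, 4);
        let counts = ultra_fast_histogram_sequential(&data, &edges, 0.0, 1.0, 4);
        // 0.0 -> bin 0, 0.25 -> bin 1, 0.5 -> bin 2, 0.75 -> bin 3,
        // 1.0 -> index 4, clamped to bin 3.
        assert_eq!(counts, vec![1, 1, 1, 2]);
    }
}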

/// GPU histogram implementation: bins are counted by a WGSL compute shader
/// and the counts are read back to the host.
#[cfg(feature = "gpu")]
pub(super) fn histogram_gpu<T>(
    x: &Tensor<T>,
    gpu_buffer: &crate::gpu::buffer::GpuBuffer<T>,
    bins: usize,
    range: Option<(T, T)>,
) -> Result<(Tensor<usize>, Tensor<T>)>
where
    T: Float + Default + Send + Sync + 'static + ToPrimitive + bytemuck::Pod + bytemuck::Zeroable,
{
    use wgpu::util::DeviceExt;

    let device_id = match x.device() {
        crate::Device::Gpu(id) => id,
        _ => return Err(TensorError::device_mismatch("histogram", "GPU", "CPU")),
    };

    let gpu_context = crate::device::context::get_gpu_context(*device_id)?;

    // Determine range if not provided
    let (min_val, max_val) = if let Some((min, max)) = range {
        (min, max)
    } else {
        use crate::ops::reduction;
        let min_result = reduction::min(x, None, false)?;
        let max_result = reduction::max(x, None, false)?;

        let min_val = min_result.to_vec()?[0];
        let max_val = max_result.to_vec()?[0];
        (min_val, max_val)
    };

    // Create histogram bins buffer (atomic u32)
    let hist_buffer = gpu_context.device.create_buffer(&wgpu::BufferDescriptor {
        label: Some("histogram_bins"),
        size: (bins * std::mem::size_of::<u32>()) as u64,
        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
        mapped_at_creation: false,
    });

    // Create metadata buffer [min_val, max_val, num_bins]; values are passed
    // as f32, so the WGSL side is assumed to read f32 metadata.
    let metadata = vec![
        min_val.to_f32().unwrap_or(0.0),
        max_val.to_f32().unwrap_or(1.0),
        bins as f32,
    ];
    let metadata_buffer =
        gpu_context
            .device
            .create_buffer_init(&wgpu::util::BufferInitDescriptor {
                label: Some("histogram_metadata"),
                contents: bytemuck::cast_slice(&metadata),
                usage: wgpu::BufferUsages::STORAGE,
            });

    // Create compute pipeline for histogram
    let shader = gpu_context
        .device
        .create_shader_module(wgpu::ShaderModuleDescriptor {
            label: Some("histogram_shader"),
            source: wgpu::ShaderSource::Wgsl(
                include_str!("../../gpu/shaders/reduction_ops.wgsl").into(),
            ),
        });
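
    // The "histogram_computation" entry point (in reduction_ops.wgsl) is
    // assumed to follow the standard atomic-histogram pattern: each invocation
    // reads one input element, computes
    //   bin = clamp(floor((v - min) / width), 0, bins - 1)
    // from the metadata buffer, and performs atomicAdd(&bins[bin], 1u).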

    let pipeline = gpu_context
        .device
        .create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
            label: Some("histogram_pipeline"),
            layout: None,
            module: &shader,
            entry_point: Some("histogram_computation"),
            cache: None,
            compilation_options: Default::default(),
        });

    // Create bind group
    let bind_group = gpu_context
        .device
        .create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("histogram_bind_group"),
            layout: &pipeline.get_bind_group_layout(0),
            entries: &[
                wgpu::BindGroupEntry {
                    binding: 0,
                    resource: gpu_buffer.buffer().as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 1,
                    resource: hist_buffer.as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 2,
                    resource: metadata_buffer.as_entire_binding(),
                },
            ],
        });

    // Dispatch compute shader
    let mut encoder = gpu_context
        .device
        .create_command_encoder(&wgpu::CommandEncoderDescriptor {
            label: Some("histogram_encoder"),
        });

    {
        let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: Some("histogram_pass"),
            timestamp_writes: None,
        });
        compute_pass.set_pipeline(&pipeline);
        compute_pass.set_bind_group(0, &bind_group, &[]);

        let workgroup_count = (x.numel() + 255) / 256; // 256 = workgroup size
        compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
    }

    // Create result buffer for reading
    let result_buffer = gpu_context.device.create_buffer(&wgpu::BufferDescriptor {
        label: Some("histogram_result"),
        size: (bins * std::mem::size_of::<u32>()) as u64,
        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
        mapped_at_creation: false,
    });

    encoder.copy_buffer_to_buffer(
        &hist_buffer,
        0,
        &result_buffer,
        0,
        (bins * std::mem::size_of::<u32>()) as u64,
    );
    gpu_context.queue.submit(std::iter::once(encoder.finish()));

    // Read results back from GPU
    let buffer_slice = result_buffer.slice(..);
    let (sender, receiver) = futures::channel::oneshot::channel();
    buffer_slice.map_async(wgpu::MapMode::Read, move |result| {
        sender.send(result).expect("channel send should succeed");
    });

    gpu_context
        .device
        .poll(wgpu::PollType::wait_indefinitely())
        .ok();
    futures::executor::block_on(receiver)
        .expect("GPU async receiver should not be dropped before sending")
        .map_err(|e| {
            TensorError::device_error_simple(format!("GPU buffer async error: {:?}", e))
        })?;

    let data = buffer_slice.get_mapped_range();
    let hist_counts: Vec<u32> = bytemuck::cast_slice(&data).to_vec();
    drop(data);
    result_buffer.unmap();

    // Convert to usize for output
    let hist_counts_usize: Vec<usize> = hist_counts.into_iter().map(|c| c as usize).collect();

    // Create bin edges
    let bin_width = (max_val - min_val) / float_const!(bins, T);
    let bin_edges: Vec<T> = (0..=bins)
        .map(|i| min_val + bin_width * float_const!(i, T))
        .collect();

    // Create result tensors
    let hist_tensor = Tensor::from_data(hist_counts_usize, &[bins])?;
    let edges_tensor = Tensor::from_data(bin_edges, &[bins + 1])?;

    Ok((hist_tensor, edges_tensor))
}