use super::config::{record_stats_metrics, STATS_CONFIG};
use crate::tensor::TensorStorage;
use crate::{Result, Tensor, TensorError};
use scirs2_core::numeric::{Float, ToPrimitive};
use std::time::Instant;
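/// Converts an integer or float literal into the float type `T`. The cast is
/// infallible for the standard float types used in this module, hence the `expect`.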
macro_rules! float_const {
($val:expr, $t:ty) => {
<$t as scirs2_core::num_traits::NumCast>::from($val)
.expect("float constant conversion should never fail for standard float types")
};
}
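/// Computes a histogram of the values in `x`.
///
/// The data is split into `bins` equally spaced bins covering `range`, or the
/// observed `[min, max]` of the data when `range` is `None`. Returns the per-bin
/// counts (length `bins`) together with the bin edges (length `bins + 1`).
/// Depending on the global `STATS_CONFIG` and the input size, the work is
/// dispatched to a SIMD-friendly, parallel, or sequential CPU path, or to the
/// GPU when the tensor lives on a GPU device.
///
/// # Examples
///
/// ```ignore
/// // Illustrative sketch only; assumes a CPU tensor built with `Tensor::from_vec`.
/// let x = Tensor::from_vec(vec![0.0f32, 0.5, 1.0, 1.5, 2.0], &[5])?;
/// let (counts, edges) = histogram(&x, 4, Some((0.0, 2.0)))?;
/// assert_eq!(counts.to_vec()?.len(), 4);
/// assert_eq!(edges.to_vec()?.len(), 5);
/// ```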
pub fn histogram<T>(
x: &Tensor<T>,
bins: usize,
range: Option<(T, T)>,
) -> Result<(Tensor<usize>, Tensor<T>)>
where
T: Float + Default + Send + Sync + 'static + ToPrimitive + bytemuck::Pod + bytemuck::Zeroable,
{
let start_time = Instant::now();
let config = STATS_CONFIG
.read()
.expect("read lock should not be poisoned");
match &x.storage {
TensorStorage::Cpu(arr) => {
let flat_data: Vec<T> = arr.iter().cloned().collect();
let data_size = flat_data.len();
let (min_val, max_val) = if let Some((min, max)) = range {
(min, max)
} else if config.enable_simd && data_size >= config.simd_threshold {
ultra_fast_min_max_simd(&flat_data)
} else if config.enable_parallel && data_size >= config.parallel_threshold {
ultra_fast_min_max_parallel(&flat_data)
} else {
let min = flat_data.iter().fold(T::infinity(), |acc, &x| acc.min(x));
let max = flat_data
.iter()
.fold(T::neg_infinity(), |acc, &x| acc.max(x));
(min, max)
};
let bin_edges = create_bin_edges_optimized(min_val, max_val, bins);
let counts = if config.enable_parallel && data_size >= config.parallel_threshold {
ultra_fast_histogram_parallel(&flat_data, &bin_edges, min_val, max_val, bins)
} else if config.enable_simd && data_size >= config.simd_threshold {
ultra_fast_histogram_simd(&flat_data, &bin_edges, min_val, max_val, bins)
} else {
ultra_fast_histogram_sequential(&flat_data, &bin_edges, min_val, max_val, bins)
};
if config.enable_performance_monitoring {
record_stats_metrics("histogram", data_size, start_time.elapsed(), 0.0, 0.0);
}
let counts_tensor = Tensor::from_vec(counts, &[bins])?;
let edges_tensor = Tensor::from_vec(bin_edges, &[bins + 1])?;
Ok((counts_tensor, edges_tensor))
}
#[cfg(feature = "gpu")]
TensorStorage::Gpu(gpu_buffer) => histogram_gpu(x, gpu_buffer, bins, range),
}
}
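/// Computes the global minimum and maximum of `data` in fixed-size chunks of 8
/// elements, a layout intended to let the compiler auto-vectorize the scalar
/// min/max folds (no explicit SIMD intrinsics are used).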
fn ultra_fast_min_max_simd<T>(data: &[T]) -> (T, T)
where
T: Float + Default + Send + Sync + 'static + PartialOrd,
{
let chunk_size = 8;
let mut global_min = T::infinity();
let mut global_max = T::neg_infinity();
for chunk in data.chunks(chunk_size) {
let chunk_min = chunk.iter().fold(T::infinity(), |acc, &x| acc.min(x));
let chunk_max = chunk.iter().fold(T::neg_infinity(), |acc, &x| acc.max(x));
global_min = global_min.min(chunk_min);
global_max = global_max.max(chunk_max);
}
(global_min, global_max)
}
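/// Computes the global minimum and maximum of `data` with rayon, reducing the
/// per-chunk `(min, max)` pairs from `par_chunks` into a single result.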
fn ultra_fast_min_max_parallel<T>(data: &[T]) -> (T, T)
where
T: Float + Default + Send + Sync + 'static + PartialOrd,
{
use rayon::prelude::*;
let chunk_size = data.len() / rayon::current_num_threads().max(1);
let chunk_size = chunk_size.max(1000);
let results: Vec<(T, T)> = data
.par_chunks(chunk_size)
.map(|chunk| {
let min = chunk.iter().fold(T::infinity(), |acc, &x| acc.min(x));
let max = chunk.iter().fold(T::neg_infinity(), |acc, &x| acc.max(x));
(min, max)
})
.collect();
let global_min = results
.iter()
.map(|(min, _)| *min)
.fold(T::infinity(), |acc, x| acc.min(x));
let global_max = results
.iter()
.map(|(_, max)| *max)
.fold(T::neg_infinity(), |acc, x| acc.max(x));
(global_min, global_max)
}
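/// Builds `bins + 1` equally spaced bin edges spanning `[min_val, max_val]`.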
fn create_bin_edges_optimized<T>(min_val: T, max_val: T, bins: usize) -> Vec<T>
where
T: Float + Default + Send + Sync + 'static,
{
let mut bin_edges = Vec::with_capacity(bins + 1);
let bin_width = (max_val - min_val) / float_const!(bins, T);
for i in 0..=bins {
bin_edges.push(min_val + float_const!(i, T) * bin_width);
}
bin_edges
}
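/// Sequential histogram over fixed-size chunks of 8 elements; like
/// [`ultra_fast_min_max_simd`], it relies on auto-vectorization rather than
/// explicit intrinsics. Values outside `[min_val, max_val]` are ignored, and
/// `max_val` itself falls into the last bin via the clamp on the bin index.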
fn ultra_fast_histogram_simd<T>(
data: &[T],
_bin_edges: &[T],
min_val: T,
max_val: T,
bins: usize,
) -> Vec<usize>
where
T: Float + Default + Send + Sync + 'static + ToPrimitive,
{
let mut counts = vec![0usize; bins];
let bin_width = (max_val - min_val) / float_const!(bins, T);
let chunk_size = 8;
for chunk in data.chunks(chunk_size) {
for &value in chunk {
if value >= min_val && value <= max_val {
let bin_index = ((value - min_val) / bin_width).to_usize().unwrap_or(0);
let bin_index = bin_index.min(bins - 1);
counts[bin_index] += 1;
}
}
}
counts
}
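/// Parallel histogram: each rayon chunk fills a private bin-count vector, and
/// the partial histograms are summed afterwards, avoiding contended updates to
/// a shared counts array.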
fn ultra_fast_histogram_parallel<T>(
data: &[T],
_bin_edges: &[T],
min_val: T,
max_val: T,
bins: usize,
) -> Vec<usize>
where
T: Float + Default + Send + Sync + 'static + ToPrimitive,
{
use rayon::prelude::*;
let bin_width = (max_val - min_val) / float_const!(bins, T);
let chunk_size = data.len() / rayon::current_num_threads().max(1);
let chunk_size = chunk_size.max(1000);
let partial_histograms: Vec<Vec<usize>> = data
.par_chunks(chunk_size)
.map(|chunk| {
let mut local_counts = vec![0usize; bins];
for &value in chunk {
if value >= min_val && value <= max_val {
let bin_index = ((value - min_val) / bin_width).to_usize().unwrap_or(0);
let bin_index = bin_index.min(bins - 1);
local_counts[bin_index] += 1;
}
}
local_counts
})
.collect();
let mut final_counts = vec![0usize; bins];
for partial in partial_histograms {
for (i, count) in partial.into_iter().enumerate() {
final_counts[i] += count;
}
}
final_counts
}
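/// Straightforward sequential histogram used for small inputs or when both the
/// SIMD and parallel paths are disabled.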
fn ultra_fast_histogram_sequential<T>(
data: &[T],
_bin_edges: &[T],
min_val: T,
max_val: T,
bins: usize,
) -> Vec<usize>
where
T: Float + Default + Send + Sync + 'static + ToPrimitive,
{
let mut counts = vec![0usize; bins];
let bin_width = (max_val - min_val) / float_const!(bins, T);
for &value in data {
if value >= min_val && value <= max_val {
let bin_index = ((value - min_val) / bin_width).to_usize().unwrap_or(0);
let bin_index = bin_index.min(bins - 1);
counts[bin_index] += 1;
}
}
counts
}
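/// GPU histogram: binning runs in the `histogram_computation` compute shader and
/// the `u32` per-bin counts are copied back to the host. When `range` is `None`,
/// the data minimum and maximum are first computed with the GPU reduction ops.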
#[cfg(feature = "gpu")]
pub(super) fn histogram_gpu<T>(
x: &Tensor<T>,
gpu_buffer: &crate::gpu::buffer::GpuBuffer<T>,
bins: usize,
range: Option<(T, T)>,
) -> Result<(Tensor<usize>, Tensor<T>)>
where
T: Float + Default + Send + Sync + 'static + ToPrimitive + bytemuck::Pod + bytemuck::Zeroable,
{
use crate::gpu::{buffer::GpuBuffer, GpuContext};
use crate::ReductionOp;
use wgpu::util::DeviceExt;
let device_id = match x.device() {
crate::Device::Gpu(id) => id,
_ => return Err(TensorError::device_mismatch("histogram", "GPU", "CPU")),
};
let gpu_context = crate::device::context::get_gpu_context(*device_id)?;
let (min_val, max_val) = if let Some((min, max)) = range {
(min, max)
} else {
use crate::ops::reduction;
let min_result = reduction::min(x, None, false)?;
let max_result = reduction::max(x, None, false)?;
let min_val = min_result.to_vec()?[0];
let max_val = max_result.to_vec()?[0];
(min_val, max_val)
};
let hist_buffer = gpu_context.device.create_buffer(&wgpu::BufferDescriptor {
label: Some("histogram_bins"),
size: (bins * std::mem::size_of::<u32>()) as u64,
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
mapped_at_creation: false,
});
let metadata = vec![
min_val.to_f32().unwrap_or(0.0),
max_val.to_f32().unwrap_or(1.0),
bins as f32,
];
let metadata_buffer =
gpu_context
.device
.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("histogram_metadata"),
contents: bytemuck::cast_slice(&metadata),
usage: wgpu::BufferUsages::STORAGE,
});
let shader = gpu_context
.device
.create_shader_module(wgpu::ShaderModuleDescriptor {
label: Some("histogram_shader"),
source: wgpu::ShaderSource::Wgsl(
include_str!("../../gpu/shaders/reduction_ops.wgsl").into(),
),
});
let pipeline = gpu_context
.device
.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: Some("histogram_pipeline"),
layout: None,
module: &shader,
entry_point: Some("histogram_computation"),
cache: None,
compilation_options: Default::default(),
});
let bind_group = gpu_context
.device
.create_bind_group(&wgpu::BindGroupDescriptor {
label: Some("histogram_bind_group"),
layout: &pipeline.get_bind_group_layout(0),
entries: &[
wgpu::BindGroupEntry {
binding: 0,
resource: gpu_buffer.buffer().as_entire_binding(),
},
wgpu::BindGroupEntry {
binding: 1,
resource: hist_buffer.as_entire_binding(),
},
wgpu::BindGroupEntry {
binding: 2,
resource: metadata_buffer.as_entire_binding(),
},
],
});
let mut encoder = gpu_context
.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("histogram_encoder"),
});
{
let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
label: Some("histogram_pass"),
timestamp_writes: None,
});
compute_pass.set_pipeline(&pipeline);
compute_pass.set_bind_group(0, &bind_group, &[]);
// One invocation per element; 256 is assumed to match the shader's workgroup size.
let workgroup_count = (x.numel() + 255) / 256;
compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
}
let result_buffer = gpu_context.device.create_buffer(&wgpu::BufferDescriptor {
label: Some("histogram_result"),
size: (bins * std::mem::size_of::<u32>()) as u64,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
});
encoder.copy_buffer_to_buffer(
&hist_buffer,
0,
&result_buffer,
0,
(bins * std::mem::size_of::<u32>()) as u64,
);
gpu_context.queue.submit(std::iter::once(encoder.finish()));
let buffer_slice = result_buffer.slice(..);
let (sender, receiver) = futures::channel::oneshot::channel();
buffer_slice.map_async(wgpu::MapMode::Read, move |result| {
sender.send(result).expect("channel send should succeed");
});
gpu_context
.device
.poll(wgpu::PollType::wait_indefinitely())
.ok();
futures::executor::block_on(receiver)
.expect("GPU async receiver should not be dropped before sending")
.map_err(|e| {
TensorError::device_error_simple(format!("GPU buffer async error: {:?}", e))
})?;
let data = buffer_slice.get_mapped_range();
let hist_counts: Vec<u32> = bytemuck::cast_slice(&data).to_vec();
drop(data);
result_buffer.unmap();
let hist_counts_usize: Vec<usize> = hist_counts.into_iter().map(|x| x as usize).collect();
let bin_width = (max_val - min_val) / float_const!(bins, T);
let bin_edges: Vec<T> = (0..=bins)
.map(|i| min_val + bin_width * float_const!(i, T))
.collect();
let hist_tensor = Tensor::from_data(hist_counts_usize, &[bins])?;
let edges_tensor = Tensor::from_data(bin_edges, &[bins + 1])?;
Ok((hist_tensor, edges_tensor))
}