use crate::{Device, Result, TensorError};
use crate::benchmarks::{BenchmarkConfig, KernelBenchmark, BenchmarkResult};
use crate::device::async_execution::{AsyncKernel, Priority, kernels::*};
use std::pin::Pin;
use std::future::Future;
use std::time::Duration;
use std::sync::Arc;
/// Benchmark suite whose methods exercise the gather, scatter, roll and where
/// tensor-manipulation kernels and render their results as a Markdown report.
pub struct ManipulationBenchmarks {
/// Shared handle to the `KernelBenchmark` runner created in `new`; every
/// benchmark method submits its kernels through it.
benchmark: Arc<KernelBenchmark>,
}
impl ManipulationBenchmarks {
    /// Builds the suite around a fresh [`KernelBenchmark`] created from `config`.
    pub fn new(config: BenchmarkConfig) -> Self {
        Self {
            benchmark: Arc::new(KernelBenchmark::new(config)),
        }
    }

    /// Benchmarks gather kernels over a fixed list of sizes.
    ///
    /// Each case is submitted under the name `gather_{input_size}x{indices_size}`.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first error produced by `benchmark_kernel`.
    pub async fn benchmark_gather_operations(&self) -> Result<Vec<BenchmarkResult>> {
        // Each entry is (input_size, indices_size) as passed to `new_gather`.
        let test_sizes = [(1000, 100), (10000, 1000), (100000, 10000), (1000000, 50000)];

        let mut all_results = Vec::new();
        for (input_size, indices_size) in test_sizes {
            let name = format!("gather_{}x{}", input_size, indices_size);
            // The kernel is constructed identically whatever device is handed in,
            // so the argument is deliberately ignored.
            let kernel_factory = move |_device: Device| {
                TensorManipulationKernel::new_gather(input_size, indices_size)
            };
            let results = self.benchmark.benchmark_kernel(&name, kernel_factory).await?;
            all_results.extend(results);
        }
        Ok(all_results)
    }

    /// Benchmarks scatter kernels over a fixed list of sizes.
    ///
    /// Each case is submitted under the name `scatter_{tensor_size}x{updates_size}`.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first error produced by `benchmark_kernel`.
    pub async fn benchmark_scatter_operations(&self) -> Result<Vec<BenchmarkResult>> {
        // Each entry is (tensor_size, updates_size) as passed to `new_scatter`.
        let test_sizes = [(1000, 100), (10000, 1000), (100000, 10000), (1000000, 50000)];

        let mut all_results = Vec::new();
        for (tensor_size, updates_size) in test_sizes {
            let name = format!("scatter_{}x{}", tensor_size, updates_size);
            // Device argument is unused: the same kernel is built for every device.
            let kernel_factory = move |_device: Device| {
                TensorManipulationKernel::new_scatter(tensor_size, updates_size)
            };
            let results = self.benchmark.benchmark_kernel(&name, kernel_factory).await?;
            all_results.extend(results);
        }
        Ok(all_results)
    }

    /// Benchmarks roll kernels over a fixed list of sizes.
    ///
    /// Each case is submitted under the name `roll_{size}`.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first error produced by `benchmark_kernel`.
    pub async fn benchmark_roll_operations(&self) -> Result<Vec<BenchmarkResult>> {
        let test_sizes = [1000, 10000, 100000, 1000000];

        let mut all_results = Vec::new();
        for size in test_sizes {
            let name = format!("roll_{}", size);
            // Device argument is unused: the same kernel is built for every device.
            let kernel_factory = move |_device: Device| TensorManipulationKernel::new_roll(size);
            let results = self.benchmark.benchmark_kernel(&name, kernel_factory).await?;
            all_results.extend(results);
        }
        Ok(all_results)
    }

    /// Benchmarks where kernels over a fixed list of sizes.
    ///
    /// Each case is submitted under the name `where_{condition_size}x{total_size}`.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first error produced by `benchmark_kernel`.
    pub async fn benchmark_where_operations(&self) -> Result<Vec<BenchmarkResult>> {
        // Each entry is (condition_size, total_size) as passed to `new_where`.
        let test_sizes = [(1000, 3000), (10000, 30000), (100000, 300000), (500000, 1500000)];

        let mut all_results = Vec::new();
        for (condition_size, total_size) in test_sizes {
            let name = format!("where_{}x{}", condition_size, total_size);
            // Device argument is unused: the same kernel is built for every device.
            let kernel_factory = move |_device: Device| {
                TensorManipulationKernel::new_where(condition_size, total_size)
            };
            let results = self.benchmark.benchmark_kernel(&name, kernel_factory).await?;
            all_results.extend(results);
        }
        Ok(all_results)
    }

    /// Runs the gather, scatter, roll and where benchmarks, in that order, and
    /// returns their results concatenated in the same order.
    ///
    /// # Errors
    ///
    /// Returns the first error from any of the four benchmark groups; groups
    /// after the failing one are not run.
    pub async fn run_all_benchmarks(&self) -> Result<Vec<BenchmarkResult>> {
        let mut all_results = self.benchmark_gather_operations().await?;

        let scatter_results = self.benchmark_scatter_operations().await?;
        all_results.extend(scatter_results);

        let roll_results = self.benchmark_roll_operations().await?;
        all_results.extend(roll_results);

        let where_results = self.benchmark_where_operations().await?;
        all_results.extend(where_results);

        Ok(all_results)
    }

    /// Runs every manipulation benchmark and renders the results as Markdown.
    ///
    /// Results are grouped by the part of `kernel_name` before the first `_`;
    /// each group becomes one section containing one table row per result.
    /// Sections are emitted in sorted order of the operation name so that the
    /// report layout is the same on every run.
    ///
    /// # Errors
    ///
    /// Propagates any error from [`Self::run_all_benchmarks`].
    pub async fn generate_manipulation_report(&self) -> Result<String> {
        let results = self.run_all_benchmarks().await?;

        // A BTreeMap (rather than a HashMap) keeps the section order stable.
        let mut operation_groups: std::collections::BTreeMap<String, Vec<BenchmarkResult>> =
            std::collections::BTreeMap::new();
        for result in results {
            let operation = result
                .kernel_name
                .split('_')
                .next()
                .unwrap_or("unknown")
                .to_string();
            operation_groups.entry(operation).or_default().push(result);
        }

        let mut report = String::from("# GPU Manipulation Operations Benchmark Report\n\n");
        for (operation, group) in operation_groups {
            report.push_str(&format!(
                "## {} Operation Performance\n\n",
                operation.to_uppercase()
            ));
            report.push_str("| Input Size | Device | Mean Time | Throughput (ops/s) | Bandwidth (GB/s) |\n");
            report.push_str("|-----------|---------|-----------|-------------------|------------------|\n");
            for result in group {
                // Second `_`-separated segment of the kernel name, e.g. the
                // "1000x100" in "gather_1000x100".
                let input_size = result.kernel_name.split('_').nth(1).unwrap_or("unknown");
                report.push_str(&format!(
                    "| {} | {:?} | {:?} | {:.2e} | {:.2} |\n",
                    input_size,
                    result.device,
                    result.mean_time,
                    result.throughput_ops_per_sec,
                    result.memory_bandwidth_gb_per_sec
                ));
            }
            report.push('\n');
        }
        Ok(report)
    }
}
/// Benchmark suite whose methods exercise the conv2d, matmul and reduction
/// (sum/mean/max) kernels and render their results as a Markdown report.
pub struct ConvolutionBenchmarks {
/// Shared handle to the `KernelBenchmark` runner created in `new`; every
/// benchmark method submits its kernels through it.
benchmark: Arc<KernelBenchmark>,
}
impl ConvolutionBenchmarks {
    /// Builds the suite around a fresh [`KernelBenchmark`] created from `config`.
    pub fn new(config: BenchmarkConfig) -> Self {
        Self {
            benchmark: Arc::new(KernelBenchmark::new(config)),
        }
    }

    /// Benchmarks `ConvKernel` over a fixed list of configurations.
    ///
    /// Each case is submitted under the name
    /// `conv2d_{batch}x{in}x{out}_{input_h}x{input_w}_{kernel_h}x{kernel_w}`,
    /// where the last four numbers are the two components of `input_size` and
    /// `kernel_size` in tuple order.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first error produced by `benchmark_kernel`.
    pub async fn benchmark_conv2d_operations(&self) -> Result<Vec<BenchmarkResult>> {
        // Each entry is (batch_size, in_channels, out_channels, input_size, kernel_size).
        let test_configs = [
            (1, 3, 32, (224, 224), (3, 3)),
            (1, 64, 64, (112, 112), (3, 3)),
            (1, 128, 256, (56, 56), (3, 3)),
            (8, 3, 64, (224, 224), (7, 7)),
            (8, 64, 128, (112, 112), (1, 1)),
            (1, 512, 512, (14, 14), (3, 3)),
        ];

        let mut all_results = Vec::new();
        for (batch_size, in_channels, out_channels, input_size, kernel_size) in test_configs {
            let name = format!(
                "conv2d_{}x{}x{}_{}x{}_{}x{}",
                batch_size,
                in_channels,
                out_channels,
                input_size.0,
                input_size.1,
                kernel_size.0,
                kernel_size.1
            );
            // Device argument is unused: the same kernel is built for every device.
            let kernel_factory = move |_device: Device| ConvKernel {
                batch_size,
                in_channels,
                out_channels,
                input_size,
                kernel_size,
            };
            let results = self.benchmark.benchmark_kernel(&name, kernel_factory).await?;
            all_results.extend(results);
        }
        Ok(all_results)
    }

    /// Benchmarks `MatMulKernel` over a fixed list of shapes.
    ///
    /// Each case is submitted under the name `matmul_{m}x{k}x{n}_batch{batch_size}`.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first error produced by `benchmark_kernel`.
    pub async fn benchmark_matmul_operations(&self) -> Result<Vec<BenchmarkResult>> {
        // Each entry is (m, k, n, batch_size).
        let test_sizes = [
            (128, 128, 128, 1),
            (256, 256, 256, 1),
            (512, 512, 512, 1),
            (1024, 1024, 1024, 1),
            (128, 128, 128, 8),
            (256, 256, 256, 8),
        ];

        let mut all_results = Vec::new();
        for (m, k, n, batch_size) in test_sizes {
            let name = format!("matmul_{}x{}x{}_batch{}", m, k, n, batch_size);
            // Device argument is unused: the same kernel is built for every device.
            let kernel_factory = move |_device: Device| MatMulKernel { m, k, n, batch_size };
            let results = self.benchmark.benchmark_kernel(&name, kernel_factory).await?;
            all_results.extend(results);
        }
        Ok(all_results)
    }

    /// Benchmarks sum, mean and max `ReductionKernel`s over a fixed list of
    /// configurations.
    ///
    /// For every configuration the three reductions run in the order sum, mean,
    /// max, under the names `{op}_{input_size}_{output_size}_axes{num_axes}`.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first error produced by `benchmark_kernel`.
    pub async fn benchmark_reduction_operations(&self) -> Result<Vec<BenchmarkResult>> {
        // Each entry is (input_size, output_size, num_axes).
        let test_configs = [
            (10000, 1000, 1),
            (100000, 10000, 1),
            (1000000, 1000, 2),
            (10000000, 10000, 3),
        ];

        let mut all_results = Vec::new();
        for (input_size, output_size, num_axes) in test_configs {
            // In all three factories the device argument is unused: the same
            // kernel is built for every device.
            let sum_name = format!("sum_{}_{}_axes{}", input_size, output_size, num_axes);
            let sum_kernel_factory = move |_device: Device| {
                ReductionKernel::new_sum(input_size, output_size, num_axes)
            };
            let sum_results = self
                .benchmark
                .benchmark_kernel(&sum_name, sum_kernel_factory)
                .await?;
            all_results.extend(sum_results);

            let mean_name = format!("mean_{}_{}_axes{}", input_size, output_size, num_axes);
            let mean_kernel_factory = move |_device: Device| {
                ReductionKernel::new_mean(input_size, output_size, num_axes)
            };
            let mean_results = self
                .benchmark
                .benchmark_kernel(&mean_name, mean_kernel_factory)
                .await?;
            all_results.extend(mean_results);

            let max_name = format!("max_{}_{}_axes{}", input_size, output_size, num_axes);
            let max_kernel_factory = move |_device: Device| {
                ReductionKernel::new_max(input_size, output_size, num_axes)
            };
            let max_results = self
                .benchmark
                .benchmark_kernel(&max_name, max_kernel_factory)
                .await?;
            all_results.extend(max_results);
        }
        Ok(all_results)
    }

    /// Runs the conv2d, matmul and reduction benchmarks, in that order, and
    /// returns their results concatenated in the same order.
    ///
    /// # Errors
    ///
    /// Returns the first error from any of the three benchmark groups; groups
    /// after the failing one are not run.
    pub async fn run_all_benchmarks(&self) -> Result<Vec<BenchmarkResult>> {
        let mut all_results = self.benchmark_conv2d_operations().await?;

        let matmul_results = self.benchmark_matmul_operations().await?;
        all_results.extend(matmul_results);

        let reduction_results = self.benchmark_reduction_operations().await?;
        all_results.extend(reduction_results);

        Ok(all_results)
    }

    /// Runs every benchmark of this suite and renders the results as Markdown.
    ///
    /// Results are grouped by the part of `kernel_name` before the first `_`;
    /// each group becomes one section containing one table row per result.
    /// Sections are emitted in sorted order of the operation name so that the
    /// report layout is the same on every run.
    ///
    /// # Errors
    ///
    /// Propagates any error from [`Self::run_all_benchmarks`].
    pub async fn generate_convolution_report(&self) -> Result<String> {
        let results = self.run_all_benchmarks().await?;

        // A BTreeMap (rather than a HashMap) keeps the section order stable.
        let mut operation_groups: std::collections::BTreeMap<String, Vec<BenchmarkResult>> =
            std::collections::BTreeMap::new();
        for result in results {
            let operation = result
                .kernel_name
                .split('_')
                .next()
                .unwrap_or("unknown")
                .to_string();
            operation_groups.entry(operation).or_default().push(result);
        }

        let mut report = String::from("# Convolution Operations Benchmark Report\n\n");
        for (operation, group) in operation_groups {
            report.push_str(&format!(
                "## {} Operation Performance\n\n",
                operation.to_uppercase()
            ));
            report.push_str("| Configuration | Device | Mean Time | Throughput (ops/s) | Bandwidth (GB/s) | Efficiency (%) |\n");
            report.push_str("|---------------|---------|-----------|-------------------|------------------|----------------|\n");
            for result in group {
                // Everything after the first `_` of the kernel name; empty when
                // the name contains no `_`.
                let config = result
                    .kernel_name
                    .split_once('_')
                    .map_or("", |(_, rest)| rest);
                report.push_str(&format!(
                    "| {} | {:?} | {:?} | {:.2e} | {:.2} | {:.1} |\n",
                    config,
                    result.device,
                    result.mean_time,
                    result.throughput_ops_per_sec,
                    result.memory_bandwidth_gb_per_sec,
                    result.efficiency_percentage
                ));
            }
            report.push('\n');
        }
        Ok(report)
    }
}
/// Bundles the manipulation and convolution benchmark suites so they can be
/// run together and reported on as one unit.
pub struct ComprehensiveBenchmarkSuite {
/// Suite covering the gather/scatter/roll/where benchmarks.
manipulation_benchmarks: ManipulationBenchmarks,
/// Suite covering the conv2d/matmul/reduction benchmarks.
convolution_benchmarks: ConvolutionBenchmarks,
}
impl ComprehensiveBenchmarkSuite {
pub fn new(config: BenchmarkConfig) -> Self {
Self {
manipulation_benchmarks: ManipulationBenchmarks::new(config.clone()),
convolution_benchmarks: ConvolutionBenchmarks::new(config),
}
}
pub async fn run_complete_benchmark_suite(&self) -> Result<String> {
let mut full_report = String::new();
full_report.push_str("# TenfloweRS Complete Benchmark Suite\n\n");
full_report.push_str("This report contains comprehensive benchmarks for all recently implemented operations.\n\n");
let manipulation_report = self.manipulation_benchmarks.generate_manipulation_report().await?;
full_report.push_str(&manipulation_report);
full_report.push_str("\n---\n\n");
let convolution_report = self.convolution_benchmarks.generate_convolution_report().await?;
full_report.push_str(&convolution_report);
Ok(full_report)
}
pub async fn generate_cpu_gpu_comparison(&self) -> Result<String> {
let mut report = String::new();
report.push_str("# CPU vs GPU Performance Comparison\n\n");
let manipulation_results = self.manipulation_benchmarks.run_all_benchmarks().await?;
let convolution_results = self.convolution_benchmarks.run_all_benchmarks().await?;
let mut all_results = manipulation_results;
all_results.extend(convolution_results);
let mut operation_comparisons = std::collections::HashMap::new();
for result in all_results {
let operation = result.kernel_name.split('_').next().unwrap_or("unknown").to_string();
operation_comparisons.entry(operation).or_insert_with(Vec::new).push(result);
}
report.push_str("| Operation | CPU Time | GPU Time | Speedup | Winner |\n");
report.push_str("|-----------|----------|----------|---------|--------|\n");
for (operation, results) in operation_comparisons {
let cpu_results: Vec<_> = results.iter().filter(|r| matches!(r.device, Device::Cpu)).collect();
let gpu_results: Vec<_> = results.iter().filter(|r| !matches!(r.device, Device::Cpu)).collect();
if let (Some(cpu_result), Some(gpu_result)) = (cpu_results.first(), gpu_results.first()) {
let speedup = cpu_result.mean_time.as_nanos() as f64 / gpu_result.mean_time.as_nanos() as f64;
let winner = if speedup > 1.0 { "GPU" } else { "CPU" };
report.push_str(&format!(
"| {} | {:?} | {:?} | {:.2}x | {} |\n",
operation,
cpu_result.mean_time,
gpu_result.mean_time,
speedup.max(1.0 / speedup),
winner
));
}
}
Ok(report)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `ManipulationBenchmarks::new` wraps a fresh runner in an `Arc` that
    /// nothing else holds yet.
    #[test]
    fn test_manipulation_benchmarks_creation() {
        let benchmarks = ManipulationBenchmarks::new(BenchmarkConfig::default());
        assert_eq!(std::sync::Arc::strong_count(&benchmarks.benchmark), 1);
    }

    /// `ConvolutionBenchmarks::new` wraps a fresh runner in an `Arc` that
    /// nothing else holds yet.
    #[test]
    fn test_convolution_benchmarks_creation() {
        let benchmarks = ConvolutionBenchmarks::new(BenchmarkConfig::default());
        assert_eq!(std::sync::Arc::strong_count(&benchmarks.benchmark), 1);
    }

    /// The comprehensive suite builds two independent sub-suites, each owning
    /// its own runner.
    #[test]
    fn test_comprehensive_suite_creation() {
        let suite = ComprehensiveBenchmarkSuite::new(BenchmarkConfig::default());
        let manipulation_runner = &suite.manipulation_benchmarks.benchmark;
        let convolution_runner = &suite.convolution_benchmarks.benchmark;

        assert_eq!(std::sync::Arc::strong_count(manipulation_runner), 1);
        assert_eq!(std::sync::Arc::strong_count(convolution_runner), 1);
        assert!(!std::sync::Arc::ptr_eq(manipulation_runner, convolution_runner));
    }
}