use crate::kernel_fusion::graph::Device;
use crate::kernel_fusion::operation_types::OperationType;
use std::collections::HashMap;
/// In-memory store for the performance data handled in this module:
/// per-operation costs, recorded fusion benefits, and per-device
/// characteristics.
///
/// The derived `Default` produces three empty maps.
#[derive(Debug, Default)]
pub struct PerformanceDatabase {
    /// One cost entry per operation type.
    pub operation_costs: HashMap<OperationType, OperationCost>,
    /// Speedup value stored under a pattern-hash string.
    pub fusion_benefits: HashMap<String, f64>,
    /// One characteristics entry per device.
    pub device_characteristics: HashMap<Device, DeviceCharacteristics>,
}
/// Cost parameters recorded for a single operation type.
///
/// The precise units of the floating-point fields are not defined in this
/// file; the notes below are name-based readings to be confirmed against the
/// code that consumes them.
#[derive(Debug, Clone)]
pub struct OperationCost {
    /// Operation count attributed to each element (presumably arithmetic
    /// operations per processed element — TODO confirm).
    pub ops_per_element: f64,
    /// Memory-bandwidth weighting for the operation (exact meaning not
    /// established here — TODO confirm).
    pub memory_bandwidth_factor: f64,
    /// Launch overhead; the `_ns` suffix suggests nanoseconds.
    pub launch_overhead_ns: u64,
    /// Parallelization efficiency; presumably a fraction in `[0, 1]`, but no
    /// range is enforced by the type.
    pub parallelization_efficiency: f64,
}
/// Hardware parameters describing one execution device.
///
/// Units are not defined in this file; the notes below are name-based
/// readings to be confirmed against the code that consumes them.
#[derive(Debug, Clone)]
pub struct DeviceCharacteristics {
    /// Peak compute throughput (unit not stated here — TODO confirm).
    pub peak_compute_ops: f64,
    /// Memory bandwidth; the `_gbps` suffix suggests gigabytes (or gigabits)
    /// per second — TODO confirm which.
    pub memory_bandwidth_gbps: f64,
    /// Cache size; the `_kb` suffix suggests kilobytes.
    pub cache_size_kb: usize,
    /// Warp size (presumably threads per warp on GPU-style devices).
    pub warp_size: usize,
    /// Maximum number of threads per block.
    pub max_threads_per_block: usize,
    /// Register file size (unit not stated here — TODO confirm).
    pub register_file_size: usize,
}
/// Running counters describing fusion activity.
///
/// The derived `Default` starts every counter at zero with an empty
/// per-pattern map.
#[derive(Debug, Clone, Default)]
pub struct FusionStatistics {
    /// Number of fusion attempts recorded.
    pub total_fusions_attempted: u64,
    /// Number of successful fusions recorded.
    pub successful_fusions: u64,
    /// Sum of the speedup values reported for successful fusions.
    pub total_speedup: f64,
    /// Sum of the memory-saved amounts reported for successful fusions
    /// (bytes, per the field name).
    pub memory_saved_bytes: u64,
    /// Count of successful fusions per pattern name.
    pub patterns_used: HashMap<String, u64>,
}
impl PerformanceDatabase {
    /// Creates an empty database (identical to `Default::default()`).
    pub fn new() -> Self {
        Default::default()
    }

    /// Stores `cost` for `op_type`, replacing any entry already stored for
    /// that operation type.
    pub fn add_operation_cost(&mut self, op_type: OperationType, cost: OperationCost) {
        // The previous entry, if any, is intentionally dropped.
        let _replaced = self.operation_costs.insert(op_type, cost);
    }

    /// Looks up the cost stored for `op_type`; `None` when nothing is stored.
    pub fn get_operation_cost(&self, op_type: &OperationType) -> Option<&OperationCost> {
        let costs = &self.operation_costs;
        costs.get(op_type)
    }

    /// Stores `speedup` under `pattern_hash`, overwriting any value already
    /// recorded for the same hash.
    pub fn record_fusion_benefit(&mut self, pattern_hash: String, speedup: f64) {
        let _replaced = self.fusion_benefits.insert(pattern_hash, speedup);
    }

    /// Returns a copy of the speedup recorded under `pattern_hash`, or `None`
    /// when the hash has never been recorded.
    pub fn get_fusion_benefit(&self, pattern_hash: &str) -> Option<f64> {
        let recorded = self.fusion_benefits.get(pattern_hash);
        recorded.copied()
    }

    /// Stores `characteristics` for `device`, replacing any entry already
    /// stored for that device.
    pub fn add_device_characteristics(
        &mut self,
        device: Device,
        characteristics: DeviceCharacteristics,
    ) {
        let _replaced = self.device_characteristics.insert(device, characteristics);
    }

    /// Looks up the characteristics stored for `device`; `None` when nothing
    /// is stored.
    pub fn get_device_characteristics(&self, device: &Device) -> Option<&DeviceCharacteristics> {
        let devices = &self.device_characteristics;
        devices.get(device)
    }
}
impl OperationCost {
    /// Builds a cost entry from the two required parameters, filling in the
    /// defaults `launch_overhead_ns = 1000` and
    /// `parallelization_efficiency = 0.8`.
    pub fn new(ops_per_element: f64, memory_bandwidth_factor: f64) -> Self {
        const DEFAULT_LAUNCH_OVERHEAD_NS: u64 = 1000;
        const DEFAULT_PARALLELIZATION_EFFICIENCY: f64 = 0.8;

        Self {
            ops_per_element,
            memory_bandwidth_factor,
            launch_overhead_ns: DEFAULT_LAUNCH_OVERHEAD_NS,
            parallelization_efficiency: DEFAULT_PARALLELIZATION_EFFICIENCY,
        }
    }

    /// Builder-style setter: returns `self` with `launch_overhead_ns`
    /// replaced by `overhead_ns`; all other fields are carried over.
    pub fn with_launch_overhead(self, overhead_ns: u64) -> Self {
        Self {
            launch_overhead_ns: overhead_ns,
            ..self
        }
    }

    /// Builder-style setter: returns `self` with
    /// `parallelization_efficiency` replaced by `efficiency`. The value is
    /// stored as given; no range check is applied.
    pub fn with_parallelization_efficiency(self, efficiency: f64) -> Self {
        Self {
            parallelization_efficiency: efficiency,
            ..self
        }
    }
}
impl DeviceCharacteristics {
    /// Builds a device description from its peak compute rate and memory
    /// bandwidth. The remaining fields get fixed defaults: 256 for
    /// `cache_size_kb`, 32 for `warp_size`, 1024 for `max_threads_per_block`
    /// and 65536 for `register_file_size`.
    pub fn new(peak_compute_ops: f64, memory_bandwidth_gbps: f64) -> Self {
        Self {
            peak_compute_ops,
            memory_bandwidth_gbps,
            cache_size_kb: 256,
            warp_size: 32,
            max_threads_per_block: 1024,
            register_file_size: 65536,
        }
    }

    /// Preset describing a CPU.
    ///
    /// `max_threads_per_block` is taken from
    /// `std::thread::available_parallelism()`, falling back to 1 when that
    /// query fails; every other field is a hard-coded constant.
    pub fn cpu_characteristics() -> Self {
        let available_threads = match std::thread::available_parallelism() {
            Ok(count) => count.get(),
            Err(_) => 1,
        };

        Self {
            peak_compute_ops: 100.0,
            memory_bandwidth_gbps: 50.0,
            cache_size_kb: 32768,
            warp_size: 1,
            max_threads_per_block: available_threads,
            // Evaluates to 512; the meaning of the two factors is not stated
            // in this file (presumably register count x width — TODO confirm).
            register_file_size: 16 * 32,
        }
    }

    /// Preset describing a GPU; every field is a hard-coded constant.
    pub fn gpu_characteristics() -> Self {
        Self {
            peak_compute_ops: 10000.0,
            memory_bandwidth_gbps: 900.0,
            cache_size_kb: 6144,
            warp_size: 32,
            max_threads_per_block: 1024,
            register_file_size: 65536,
        }
    }

    /// Returns `true` when `ops_per_byte` is strictly greater than
    /// `peak_compute_ops / memory_bandwidth_gbps`.
    ///
    /// A value exactly equal to that ratio yields `false`.
    pub fn is_compute_bound(&self, ops_per_byte: f64) -> bool {
        let balance_point = self.peak_compute_ops / self.memory_bandwidth_gbps;
        ops_per_byte > balance_point
    }
}
impl FusionStatistics {
    /// Creates a statistics record with every counter at zero and no
    /// patterns recorded.
    pub fn new() -> Self {
        Self::default()
    }

    /// Counts one fusion attempt.
    ///
    /// The counter saturates at `u64::MAX` instead of overflowing.
    pub fn record_fusion_attempt(&mut self) {
        self.total_fusions_attempted = self.total_fusions_attempted.saturating_add(1);
    }

    /// Records one successful fusion.
    ///
    /// Increments the success counter, adds `speedup` to the running speedup
    /// total, adds `memory_saved` to the running byte total, and bumps the
    /// usage count for `pattern_name`. This does not touch
    /// `total_fusions_attempted`; attempts are counted only by
    /// [`record_fusion_attempt`](Self::record_fusion_attempt).
    ///
    /// All integer accumulators saturate at `u64::MAX`, so a very large
    /// `memory_saved` value cannot panic (debug builds) or wrap around
    /// (release builds).
    pub fn record_successful_fusion(
        &mut self,
        pattern_name: &str,
        speedup: f64,
        memory_saved: u64,
    ) {
        self.successful_fusions = self.successful_fusions.saturating_add(1);
        self.total_speedup += speedup;
        self.memory_saved_bytes = self.memory_saved_bytes.saturating_add(memory_saved);

        // Look up by `&str` first so an owned key is only allocated the first
        // time a pattern is seen.
        match self.patterns_used.get_mut(pattern_name) {
            Some(uses) => *uses = uses.saturating_add(1),
            None => {
                self.patterns_used.insert(pattern_name.to_owned(), 1);
            }
        }
    }

    /// Returns `successful_fusions / total_fusions_attempted`, or `0.0` when
    /// no attempt has been recorded.
    ///
    /// The result can exceed `1.0` if successes were recorded without
    /// matching attempts, since the two counters are updated independently.
    pub fn success_rate(&self) -> f64 {
        if self.total_fusions_attempted == 0 {
            0.0
        } else {
            self.successful_fusions as f64 / self.total_fusions_attempted as f64
        }
    }

    /// Returns the mean of the recorded speedups, or `1.0` when no successful
    /// fusion has been recorded.
    pub fn average_speedup(&self) -> f64 {
        if self.successful_fusions == 0 {
            1.0
        } else {
            self.total_speedup / self.successful_fusions as f64
        }
    }
}