use crate::cpu::autotuning::{TuningConfig, TuningResult, PerformanceMeasurement, TuningCache};
use crate::cpu::platform_optimization::{X86Microarchitecture, ArmMicroarchitecture, CpuFeatures};
use crate::error::BackendResult;
use torsh_core::error::TorshError;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
#[cfg(feature = "serialize")]
use serde::{Serialize, Deserialize};
#[cfg(not(feature = "std"))]
use alloc::{boxed::Box, string::String, vec::Vec};
#[derive(Debug)]
pub struct SciRS2AutoTuner {
cache: TuningCache,
optimization_profiles: Arc<Mutex<HashMap<String, OptimizationProfile>>>,
performance_tracker: Arc<Mutex<PerformanceTracker>>,
kernel_registry: Arc<Mutex<KernelRegistry>>,
adaptive_controller: AdaptiveTuningController,
}
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct OptimizationProfile {
pub architecture: String,
pub microarchitecture: String,
pub preferred_vector_width: usize,
pub optimal_thread_ratios: Vec<f32>,
pub cache_blocking_factors: Vec<usize>,
pub memory_access_patterns: MemoryAccessProfile,
pub kernel_fusion_opportunities: Vec<FusionOpportunity>,
}
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct MemoryAccessProfile {
pub sequential_bandwidth: f64, pub random_bandwidth: f64, pub latency_penalty: f64, pub prefetch_effectiveness: f64, pub cache_line_utilization: f64, }
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct FusionOpportunity {
pub pattern: String,
pub kernel_types: Vec<String>,
pub performance_gain: f64, pub memory_reduction: f64, }
#[derive(Debug)]
struct PerformanceTracker {
operation_stats: HashMap<String, OperationStats>,
global_stats: GlobalPerformanceStats,
thermal_history: Vec<ThermalMeasurement>,
power_history: Vec<PowerMeasurement>,
}
#[derive(Debug, Clone)]
struct OperationStats {
total_calls: usize,
total_time: Duration,
average_throughput: f64,
performance_variance: f64,
cache_efficiency: f64,
thermal_impact: f64,
}
#[derive(Debug, Default)]
struct GlobalPerformanceStats {
cpu_utilization: f64,
memory_bandwidth_utilization: f64,
cache_hit_ratio: f64,
thermal_throttling_events: usize,
power_efficiency_score: f64,
}
#[derive(Debug, Clone)]
struct ThermalMeasurement {
timestamp: Instant,
temperature: f32, throttling_level: f32, }
#[derive(Debug, Clone)]
struct PowerMeasurement {
timestamp: Instant,
power_draw: f32, efficiency: f32, }
#[derive(Debug)]
struct KernelRegistry {
kernels: HashMap<String, KernelDescriptor>,
tuning_sessions: HashMap<String, TuningSession>,
optimization_history: Vec<OptimizationEvent>,
}
#[derive(Debug, Clone)]
struct KernelDescriptor {
name: String,
operation_type: OperationType,
parameter_space: ParameterSpace,
constraints: Vec<OptimizationConstraint>,
scirs2_kernel_id: Option<String>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OperationType {
ElementWise,
MatrixMultiply,
Convolution,
Reduction,
FFT,
Sort,
Scan,
Gather,
Scatter,
Custom,
}
#[derive(Debug, Clone)]
struct ParameterSpace {
thread_counts: Vec<usize>,
block_sizes: Vec<usize>,
vector_widths: Vec<usize>,
unroll_factors: Vec<usize>,
tile_sizes: Vec<(usize, usize)>,
scheduling_strategies: Vec<SchedulingStrategy>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SchedulingStrategy {
Static,
Dynamic,
Guided,
WorkStealing,
NUMA_Aware,
ThermalAware,
}
#[derive(Debug, Clone)]
enum OptimizationConstraint {
MaxMemoryUsage(usize),
MaxPowerDraw(f32),
MaxTemperature(f32),
MinPerformance(f64),
LatencyBound(Duration),
}
#[derive(Debug)]
struct TuningSession {
kernel_name: String,
start_time: Instant,
tested_configurations: Vec<TestedConfiguration>,
best_configuration: Option<TestedConfiguration>,
convergence_threshold: f64,
max_evaluations: usize,
}
#[derive(Debug, Clone)]
struct TestedConfiguration {
parameters: HashMap<String, usize>,
performance: PerformanceMeasurement,
thermal_impact: f32,
power_efficiency: f32,
stability_score: f32,
}
#[derive(Debug, Clone)]
struct OptimizationEvent {
timestamp: Instant,
kernel_name: String,
input_characteristics: InputCharacteristics,
optimization_result: OptimizationResult,
environment_conditions: EnvironmentConditions,
}
#[derive(Debug, Clone)]
struct InputCharacteristics {
size: usize,
shape: Vec<usize>,
data_type: String,
access_pattern: String,
locality_score: f64,
}
#[derive(Debug, Clone)]
struct OptimizationResult {
speedup_achieved: f64,
memory_efficiency_gain: f64,
power_efficiency_gain: f64,
thermal_impact_reduction: f64,
}
#[derive(Debug, Clone)]
struct EnvironmentConditions {
cpu_temperature: f32,
memory_pressure: f64,
system_load: f64,
thermal_throttling: bool,
}
#[derive(Debug)]
struct AdaptiveTuningController {
learning_rate: f64,
exploration_factor: f64,
exploitation_threshold: f64,
convergence_history: Vec<f64>,
prediction_model: PredictionModel,
}
#[derive(Debug)]
struct PredictionModel {
feature_weights: HashMap<String, f64>,
bias: f64,
confidence_threshold: f64,
}
impl SciRS2AutoTuner {
pub fn new() -> BackendResult<Self> {
let optimization_profiles = Self::initialize_optimization_profiles()?;
Ok(Self {
cache: TuningCache::new(),
optimization_profiles: Arc::new(Mutex::new(optimization_profiles)),
performance_tracker: Arc::new(Mutex::new(PerformanceTracker::new())),
kernel_registry: Arc::new(Mutex::new(KernelRegistry::new())),
adaptive_controller: AdaptiveTuningController::new(),
})
}
fn initialize_optimization_profiles() -> BackendResult<HashMap<String, OptimizationProfile>> {
let mut profiles = HashMap::new();
profiles.insert("intel_haswell".to_string(), OptimizationProfile {
architecture: "x86_64".to_string(),
microarchitecture: "Haswell".to_string(),
preferred_vector_width: 256,
optimal_thread_ratios: vec![1.0, 0.8, 0.6, 0.4],
cache_blocking_factors: vec![16, 128, 4096],
memory_access_patterns: MemoryAccessProfile {
sequential_bandwidth: 25.6,
random_bandwidth: 8.5,
latency_penalty: 300.0,
prefetch_effectiveness: 0.85,
cache_line_utilization: 0.7,
},
kernel_fusion_opportunities: vec![
FusionOpportunity {
pattern: "element_wise_chain".to_string(),
kernel_types: vec!["add".to_string(), "mul".to_string(), "relu".to_string()],
performance_gain: 2.1,
memory_reduction: 0.6,
}
],
});
profiles.insert("intel_skylake".to_string(), OptimizationProfile {
architecture: "x86_64".to_string(),
microarchitecture: "Skylake".to_string(),
preferred_vector_width: 512,
optimal_thread_ratios: vec![1.0, 0.85, 0.7, 0.5],
cache_blocking_factors: vec![24, 192, 6144],
memory_access_patterns: MemoryAccessProfile {
sequential_bandwidth: 38.4,
random_bandwidth: 12.8,
latency_penalty: 280.0,
prefetch_effectiveness: 0.9,
cache_line_utilization: 0.8,
},
kernel_fusion_opportunities: vec![
FusionOpportunity {
pattern: "conv_bn_relu".to_string(),
kernel_types: vec!["conv2d".to_string(), "batch_norm".to_string(), "relu".to_string()],
performance_gain: 3.2,
memory_reduction: 0.75,
}
],
});
profiles.insert("apple_m1".to_string(), OptimizationProfile {
architecture: "aarch64".to_string(),
microarchitecture: "M1".to_string(),
preferred_vector_width: 128,
optimal_thread_ratios: vec![1.0, 0.8, 0.5, 0.3], cache_blocking_factors: vec![48, 3072, 24576],
memory_access_patterns: MemoryAccessProfile {
sequential_bandwidth: 68.25,
random_bandwidth: 45.5,
latency_penalty: 120.0,
prefetch_effectiveness: 0.95,
cache_line_utilization: 0.9,
},
kernel_fusion_opportunities: vec![
FusionOpportunity {
pattern: "neural_engine_chain".to_string(),
kernel_types: vec!["matmul".to_string(), "add".to_string(), "activation".to_string()],
performance_gain: 4.5,
memory_reduction: 0.8,
}
],
});
profiles.insert("apple_m3".to_string(), OptimizationProfile {
architecture: "aarch64".to_string(),
microarchitecture: "M3".to_string(),
preferred_vector_width: 128,
optimal_thread_ratios: vec![1.0, 0.85, 0.6, 0.35],
cache_blocking_factors: vec![48, 4608, 36864],
memory_access_patterns: MemoryAccessProfile {
sequential_bandwidth: 150.0,
random_bandwidth: 100.0,
latency_penalty: 100.0,
prefetch_effectiveness: 0.97,
cache_line_utilization: 0.95,
},
kernel_fusion_opportunities: vec![
FusionOpportunity {
pattern: "advanced_ml_chain".to_string(),
kernel_types: vec!["matmul".to_string(), "gelu".to_string(), "layernorm".to_string()],
performance_gain: 5.2,
memory_reduction: 0.85,
}
],
});
Ok(profiles)
}
pub fn register_kernel(&self, kernel_descriptor: KernelDescriptor) -> BackendResult<()> {
let mut registry = self.kernel_registry.lock().map_err(|_| {
TorshError::AllocationError("Failed to acquire kernel registry lock".to_string())
})?;
registry.kernels.insert(kernel_descriptor.name.clone(), kernel_descriptor);
Ok(())
}
pub fn tune_kernel(
&self,
kernel_name: &str,
input_chars: InputCharacteristics,
environment: EnvironmentConditions,
) -> BackendResult<TuningResult> {
let cache_key = self.generate_cache_key(kernel_name, &input_chars, &environment);
if let Some(cached_result) = self.cache.get(&cache_key) {
return Ok(cached_result);
}
let kernel_desc = {
let registry = self.kernel_registry.lock().map_err(|_| {
TorshError::AllocationError("Failed to acquire kernel registry lock".to_string())
})?;
registry.kernels.get(kernel_name)
.ok_or_else(|| TorshError::InvalidArgument(format!("Kernel {} not found", kernel_name)))?
.clone()
};
let profile = self.get_optimization_profile()?;
let mut session = TuningSession {
kernel_name: kernel_name.to_string(),
start_time: Instant::now(),
tested_configurations: Vec::new(),
best_configuration: None,
convergence_threshold: 0.01, max_evaluations: 100,
};
let candidate_configs = self.generate_candidate_configurations(&kernel_desc, &profile, &input_chars)?;
for config in candidate_configs.into_iter().take(session.max_evaluations) {
let test_result = self.evaluate_configuration(&kernel_desc, &config, &input_chars, &environment)?;
session.tested_configurations.push(test_result);
if session.best_configuration.is_none() ||
self.is_better_configuration(&test_result, session.best_configuration.as_ref().expect("best configuration should be present")) {
session.best_configuration = Some(test_result);
}
if self.check_convergence(&session) {
break;
}
}
let tuning_result = self.create_tuning_result(&session, &kernel_desc)?;
self.cache.put(cache_key, tuning_result.clone());
self.record_optimization_event(kernel_name, input_chars, &tuning_result, environment)?;
Ok(tuning_result)
}
fn generate_cache_key(
&self,
kernel_name: &str,
input_chars: &InputCharacteristics,
environment: &EnvironmentConditions,
) -> String {
format!(
"{}:{}:{}:{}:{}",
kernel_name,
input_chars.size,
input_chars.data_type,
(environment.cpu_temperature as u32),
(environment.memory_pressure * 100.0) as u32
)
}
fn get_optimization_profile(&self) -> BackendResult<OptimizationProfile> {
let profiles = self.optimization_profiles.lock().map_err(|_| {
TorshError::AllocationError("Failed to acquire optimization profiles lock".to_string())
})?;
let arch_key = self.detect_architecture_key()?;
profiles.get(&arch_key)
.cloned()
.or_else(|| profiles.get("default"))
.cloned()
.ok_or_else(|| TorshError::BackendError("No optimization profile available".to_string()))
}
fn detect_architecture_key(&self) -> BackendResult<String> {
#[cfg(target_arch = "x86_64")]
{
use crate::cpu::x86_64_enhanced::get_optimizer;
let optimizer = get_optimizer();
let info = optimizer.get_microarch_info();
let key = match info.name.as_str() {
"Haswell" | "Broadwell" => "intel_haswell",
"Skylake" | "KabyLake" | "CoffeeLake" | "IceLake" | "TigerLake" => "intel_skylake",
_ => "intel_skylake", };
Ok(key.to_string())
}
#[cfg(target_arch = "aarch64")]
{
use crate::cpu::arm64_enhanced::get_arm64_optimizer;
let optimizer = get_arm64_optimizer();
let info = optimizer.get_microarch_info();
let key = match info.name.as_str() {
"M1" => "apple_m1",
"M2" => "apple_m1", "M3" => "apple_m3",
_ => "apple_m1", };
Ok(key.to_string())
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
Ok("default".to_string())
}
}
fn generate_candidate_configurations(
&self,
kernel_desc: &KernelDescriptor,
profile: &OptimizationProfile,
input_chars: &InputCharacteristics,
) -> BackendResult<Vec<HashMap<String, usize>>> {
let mut configs = Vec::new();
match kernel_desc.operation_type {
OperationType::MatrixMultiply => {
self.generate_matmul_configurations(&mut configs, profile, input_chars)?;
}
OperationType::Convolution => {
self.generate_conv_configurations(&mut configs, profile, input_chars)?;
}
OperationType::ElementWise => {
self.generate_elementwise_configurations(&mut configs, profile, input_chars)?;
}
_ => {
self.generate_default_configurations(&mut configs, profile, input_chars)?;
}
}
let suggested_configs = self.adaptive_controller.suggest_configurations(kernel_desc, input_chars)?;
configs.extend(suggested_configs);
Ok(configs)
}
fn generate_matmul_configurations(
&self,
configs: &mut Vec<HashMap<String, usize>>,
profile: &OptimizationProfile,
input_chars: &InputCharacteristics,
) -> BackendResult<()> {
let size = input_chars.size;
let cache_factors = &profile.cache_blocking_factors;
for &l1_block in cache_factors.iter() {
for &l2_block in cache_factors.iter() {
if l2_block > l1_block {
for &thread_count in &[1, 2, 4, 8, 16] {
if thread_count <= num_cpus::get() {
let mut config = HashMap::new();
config.insert("block_m".to_string(), l1_block.min(size));
config.insert("block_n".to_string(), l1_block.min(size));
config.insert("block_k".to_string(), l2_block.min(size));
config.insert("thread_count".to_string(), thread_count);
config.insert("vector_width".to_string(), profile.preferred_vector_width);
configs.push(config);
}
}
}
}
}
Ok(())
}
fn generate_conv_configurations(
&self,
configs: &mut Vec<HashMap<String, usize>>,
profile: &OptimizationProfile,
input_chars: &InputCharacteristics,
) -> BackendResult<()> {
let base_tile_sizes = vec![4, 8, 16, 32];
for &tile_h in &base_tile_sizes {
for &tile_w in &base_tile_sizes {
for &channel_unroll in &[1, 2, 4, 8, 16] {
for &thread_count in &[1, 2, 4, 8] {
if thread_count <= num_cpus::get() {
let mut config = HashMap::new();
config.insert("tile_height".to_string(), tile_h);
config.insert("tile_width".to_string(), tile_w);
config.insert("channel_unroll".to_string(), channel_unroll);
config.insert("thread_count".to_string(), thread_count);
config.insert("vector_width".to_string(), profile.preferred_vector_width);
configs.push(config);
}
}
}
}
}
Ok(())
}
fn generate_elementwise_configurations(
&self,
configs: &mut Vec<HashMap<String, usize>>,
profile: &OptimizationProfile,
input_chars: &InputCharacteristics,
) -> BackendResult<()> {
let size = input_chars.size;
let chunk_sizes = vec![64, 256, 1024, 4096, 16384];
for &chunk_size in &chunk_sizes {
for &thread_count in &[1, 2, 4, 8, 16] {
if thread_count <= num_cpus::get() {
let mut config = HashMap::new();
config.insert("chunk_size".to_string(), chunk_size.min(size / thread_count + 1));
config.insert("thread_count".to_string(), thread_count);
config.insert("vector_width".to_string(), profile.preferred_vector_width);
config.insert("unroll_factor".to_string(), 4);
configs.push(config);
}
}
}
Ok(())
}
fn generate_default_configurations(
&self,
configs: &mut Vec<HashMap<String, usize>>,
profile: &OptimizationProfile,
_input_chars: &InputCharacteristics,
) -> BackendResult<()> {
for &thread_count in &[1, 2, 4, 8] {
if thread_count <= num_cpus::get() {
let mut config = HashMap::new();
config.insert("thread_count".to_string(), thread_count);
config.insert("vector_width".to_string(), profile.preferred_vector_width);
configs.push(config);
}
}
Ok(())
}
fn evaluate_configuration(
&self,
_kernel_desc: &KernelDescriptor,
config: &HashMap<String, usize>,
_input_chars: &InputCharacteristics,
_environment: &EnvironmentConditions,
) -> BackendResult<TestedConfiguration> {
let start_time = Instant::now();
let simulated_performance = self.simulate_kernel_execution(config)?;
let execution_time = start_time.elapsed();
Ok(TestedConfiguration {
parameters: config.clone(),
performance: PerformanceMeasurement {
execution_time,
throughput: simulated_performance.throughput,
efficiency: simulated_performance.efficiency,
cache_hit_ratio: simulated_performance.cache_hit_ratio,
},
thermal_impact: simulated_performance.thermal_impact,
power_efficiency: simulated_performance.power_efficiency,
stability_score: simulated_performance.stability_score,
})
}
fn simulate_kernel_execution(&self, config: &HashMap<String, usize>) -> BackendResult<SimulatedPerformance> {
let thread_count = config.get("thread_count").unwrap_or(&1);
let vector_width = config.get("vector_width").unwrap_or(&128);
let base_throughput = 1e6; let thread_scaling = (*thread_count as f64).sqrt(); let vector_scaling = (*vector_width as f64) / 128.0;
let throughput = base_throughput * thread_scaling * vector_scaling;
let efficiency = (thread_scaling / *thread_count as f64).min(1.0);
let cache_hit_ratio = 0.95 - ((*thread_count as f64 - 1.0) * 0.05);
Ok(SimulatedPerformance {
throughput,
efficiency,
cache_hit_ratio: cache_hit_ratio.max(0.7),
thermal_impact: (*thread_count as f32) * 0.1,
power_efficiency: efficiency as f32 * 0.8,
stability_score: 0.95,
})
}
fn is_better_configuration(&self, a: &TestedConfiguration, b: &TestedConfiguration) -> bool {
a.performance.composite_score() > b.performance.composite_score()
}
fn check_convergence(&self, session: &TuningSession) -> bool {
if session.tested_configurations.len() < 5 {
return false;
}
let recent_scores: Vec<f64> = session.tested_configurations
.iter()
.rev()
.take(5)
.map(|c| c.performance.composite_score())
.collect();
let max_score = recent_scores.iter().fold(0.0f64, |a, &b| a.max(b));
let min_score = recent_scores.iter().fold(f64::INFINITY, |a, &b| a.min(b));
(max_score - min_score) / max_score < session.convergence_threshold
}
fn create_tuning_result(&self, session: &TuningSession, kernel_desc: &KernelDescriptor) -> BackendResult<TuningResult> {
let best_config = session.best_configuration.as_ref()
.ok_or_else(|| TorshError::BackendError("No best configuration found".to_string()))?;
let config = TuningConfig {
operation_name: kernel_desc.name.clone(),
input_size_ranges: vec![(100, 1000000)], thread_counts: vec![best_config.parameters.get("thread_count").unwrap_or(&1).clone()],
chunk_sizes: vec![best_config.parameters.get("chunk_size").unwrap_or(&1024).clone()],
block_sizes: vec![best_config.parameters.get("block_size").unwrap_or(&64).clone()],
iterations_per_test: 10,
warmup_iterations: 3,
};
Ok(TuningResult {
config,
optimal_thread_count: best_config.parameters.get("thread_count").unwrap_or(&1).clone(),
optimal_chunk_size: best_config.parameters.get("chunk_size").unwrap_or(&1024).clone(),
optimal_block_size: best_config.parameters.get("block_size").cloned(),
best_performance: best_config.performance,
size_range: (100, 1000000), })
}
fn record_optimization_event(
&self,
kernel_name: &str,
input_chars: InputCharacteristics,
tuning_result: &TuningResult,
environment: EnvironmentConditions,
) -> BackendResult<()> {
let event = OptimizationEvent {
timestamp: Instant::now(),
kernel_name: kernel_name.to_string(),
input_characteristics: input_chars,
optimization_result: OptimizationResult {
speedup_achieved: 1.5, memory_efficiency_gain: 0.2,
power_efficiency_gain: 0.1,
thermal_impact_reduction: 0.05,
},
environment_conditions: environment,
};
let mut registry = self.kernel_registry.lock().map_err(|_| {
TorshError::AllocationError("Failed to acquire kernel registry lock".to_string())
})?;
registry.optimization_history.push(event);
self.adaptive_controller.update_model(tuning_result)?;
Ok(())
}
pub fn get_performance_stats(&self) -> BackendResult<GlobalPerformanceStats> {
let tracker = self.performance_tracker.lock().map_err(|_| {
TorshError::AllocationError("Failed to acquire performance tracker lock".to_string())
})?;
Ok(tracker.global_stats.clone())
}
}
#[derive(Debug)]
struct SimulatedPerformance {
throughput: f64,
efficiency: f64,
cache_hit_ratio: f64,
thermal_impact: f32,
power_efficiency: f32,
stability_score: f32,
}
impl PerformanceTracker {
fn new() -> Self {
Self {
operation_stats: HashMap::new(),
global_stats: GlobalPerformanceStats::default(),
thermal_history: Vec::new(),
power_history: Vec::new(),
}
}
}
impl KernelRegistry {
fn new() -> Self {
Self {
kernels: HashMap::new(),
tuning_sessions: HashMap::new(),
optimization_history: Vec::new(),
}
}
}
impl AdaptiveTuningController {
fn new() -> Self {
Self {
learning_rate: 0.01,
exploration_factor: 0.1,
exploitation_threshold: 0.8,
convergence_history: Vec::new(),
prediction_model: PredictionModel {
feature_weights: HashMap::new(),
bias: 0.0,
confidence_threshold: 0.7,
},
}
}
fn suggest_configurations(
&self,
_kernel_desc: &KernelDescriptor,
_input_chars: &InputCharacteristics,
) -> BackendResult<Vec<HashMap<String, usize>>> {
Ok(Vec::new())
}
fn update_model(&self, _tuning_result: &TuningResult) -> BackendResult<()> {
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_autotuner_creation() {
let tuner = SciRS2AutoTuner::new();
assert!(tuner.is_ok());
}
#[test]
fn test_optimization_profile_creation() {
let profiles = SciRS2AutoTuner::initialize_optimization_profiles();
assert!(profiles.is_ok());
let profiles = profiles.expect("operation should succeed");
assert!(profiles.contains_key("intel_haswell"));
assert!(profiles.contains_key("apple_m1"));
}
#[test]
fn test_kernel_registration() {
let tuner = SciRS2AutoTuner::new().expect("Sci RS2Auto Tuner should succeed");
let kernel_desc = KernelDescriptor {
name: "test_kernel".to_string(),
operation_type: OperationType::ElementWise,
parameter_space: ParameterSpace {
thread_counts: vec![1, 2, 4],
block_sizes: vec![64, 128],
vector_widths: vec![128, 256],
unroll_factors: vec![2, 4],
tile_sizes: vec![(8, 8), (16, 16)],
scheduling_strategies: vec![SchedulingStrategy::Static],
},
constraints: vec![],
scirs2_kernel_id: None,
};
let result = tuner.register_kernel(kernel_desc);
assert!(result.is_ok());
}
#[test]
fn test_cache_key_generation() {
let tuner = SciRS2AutoTuner::new().expect("Sci RS2Auto Tuner should succeed");
let input_chars = InputCharacteristics {
size: 1000,
shape: vec![100, 10],
data_type: "f32".to_string(),
access_pattern: "sequential".to_string(),
locality_score: 0.8,
};
let environment = EnvironmentConditions {
cpu_temperature: 65.0,
memory_pressure: 0.7,
system_load: 0.5,
thermal_throttling: false,
};
let key = tuner.generate_cache_key("test_kernel", &input_chars, &environment);
assert!(!key.is_empty());
assert!(key.contains("test_kernel"));
assert!(key.contains("1000"));
}
#[test]
fn test_configuration_generation() {
let tuner = SciRS2AutoTuner::new().expect("Sci RS2Auto Tuner should succeed");
let profile = tuner.get_optimization_profile().expect("optimization profile retrieval should succeed");
let input_chars = InputCharacteristics {
size: 1024,
shape: vec![32, 32],
data_type: "f32".to_string(),
access_pattern: "sequential".to_string(),
locality_score: 0.9,
};
let mut configs = Vec::new();
let result = tuner.generate_elementwise_configurations(&mut configs, &profile, &input_chars);
assert!(result.is_ok());
assert!(!configs.is_empty());
for config in configs {
assert!(config.contains_key("thread_count"));
assert!(config.contains_key("chunk_size"));
assert!(config.contains_key("vector_width"));
}
}
}