use crate::error::BackendResult as Result;
use crate::performance_modeling::{
EnvironmentalFactors, PerformanceMeasurement, RuntimePerformanceModeler,
};
use crate::performance_tuning::{
AccessPattern, ActualPerformance, DataType, OperationType, PerformancePrediction, SystemState,
TuningParameters, WorkloadCharacteristics,
};
use crate::{BackendType, Device};
use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex, RwLock};
use std::time::{Duration, Instant};
use torsh_core::error::TorshError;
#[cfg(feature = "serialize")]
use serde::{Deserialize, Serialize};
#[cfg(not(feature = "std"))]
use alloc::{boxed::Box, format, string::String, vec::Vec};
/// Process-wide counter backing [`generate_measurement_id`]; starts at 1.
static MEASUREMENT_ID_COUNTER: AtomicU64 = AtomicU64::new(1);

/// Returns a unique, monotonically increasing measurement id.
///
/// Thread-safe: a single sequentially-consistent fetch-and-add, which
/// yields the counter value prior to the increment.
fn generate_measurement_id() -> u64 {
    MEASUREMENT_ID_COUNTER.fetch_add(1, Ordering::SeqCst)
}
/// Selects the best registered kernel implementation for an operation at
/// runtime, combining kernel metadata, performance predictions, and
/// historical selection feedback.
pub struct AdaptiveKernelSelector {
    /// All registered kernels, keyed by (operation, backend).
    kernel_registry: Arc<RwLock<KernelRegistry>>,
    /// Predicts execution time for candidate kernels.
    performance_modeler: Arc<RuntimePerformanceModeler>,
    /// Strategy used to rank candidates (score-based, ML, or hybrid).
    selection_algorithm: SelectionAlgorithm,
    /// Usage statistics and selection-accuracy bookkeeping.
    performance_tracker: Arc<Mutex<PerformanceTracker>>,
    /// Tuning knobs for learning and benchmarking behavior.
    config: AdaptiveSelectionConfig,
}
/// Storage for kernel implementations and their metadata.
#[derive(Debug)]
pub struct KernelRegistry {
    /// Kernel variants available per (operation, backend) pair.
    kernels: HashMap<(OperationType, BackendType), Vec<KernelImplementation>>,
    /// User-provided kernels, keyed by their reported name.
    custom_kernels: HashMap<String, Box<dyn CustomKernel + Send + Sync>>,
    /// Characteristics indexed by kernel id (mirrors entries in `kernels`).
    kernel_characteristics: HashMap<String, KernelCharacteristics>,
    /// Fallback kernel id per (operation, backend); not consulted yet.
    #[allow(dead_code)]
    default_kernels: HashMap<(OperationType, BackendType), String>,
}
/// A concrete kernel registered for one (operation, backend) pair.
#[derive(Debug, Clone)]
pub struct KernelImplementation {
    /// Unique identifier used for tracking and lookup.
    pub id: String,
    /// Human-readable name.
    pub name: String,
    /// Operation this kernel implements.
    pub operation_type: OperationType,
    /// Backend this kernel targets.
    pub backend_type: BackendType,
    /// Implementation strategy (naive, tiled, vectorized, ...).
    pub variant: KernelVariant,
    /// Static performance characteristics used for scoring.
    pub characteristics: KernelCharacteristics,
    /// Applicability limits (sizes, dtypes, alignment, features).
    pub constraints: KernelConstraints,
    /// Executable implementation; `Clone` is cheap via the `Arc`.
    pub implementation: Arc<dyn KernelExecutor + Send + Sync>,
}
/// Implementation strategy of a kernel.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub enum KernelVariant {
    /// Straightforward reference implementation.
    Naive,
    /// Generally optimized implementation.
    Optimized,
    /// Cache-blocking / tiled implementation.
    Tiled,
    /// SIMD-vectorized implementation.
    Vectorized,
    /// Multi-threaded implementation.
    Parallel,
    /// Fused with adjacent operations.
    Fused,
    /// Tied to specific hardware, identified by the string.
    HardwareSpecific(String),
    /// User-defined variant, identified by the string.
    Custom(String),
}
/// Static performance profile of a kernel, consumed by the scoring logic.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct KernelCharacteristics {
    /// (min, max) problem size where this kernel performs best
    /// — assumed to be an element count; TODO confirm units.
    pub optimal_size_range: (usize, usize),
    /// Dominant memory access pattern.
    pub memory_pattern: AccessPattern,
    /// Compute-to-memory ratio (arithmetic intensity).
    pub compute_intensity: f64,
    /// Fraction of ideal speedup achieved when parallelized (0.0..=1.0).
    pub parallelization_efficiency: f64,
    /// Cache friendliness; used directly as the cache score (0.0..=1.0).
    pub cache_efficiency: f64,
    /// Fraction of peak memory bandwidth typically used.
    pub memory_bandwidth_utilization: f64,
    /// One-time setup cost before the kernel starts doing useful work.
    pub initialization_overhead: Duration,
    /// How performance scales with size, threads, and memory hierarchy.
    pub scalability: ScalabilityCharacteristics,
}
/// Scaling behavior of a kernel along independent dimensions.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct ScalabilityCharacteristics {
    /// How runtime scales with problem size.
    pub size_scaling: ScalingBehavior,
    /// How runtime scales with the number of threads.
    pub thread_scaling: ScalingBehavior,
    /// How runtime scales across memory hierarchy levels.
    pub memory_hierarchy_scaling: ScalingBehavior,
}
/// Qualitative scaling curve shape.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub enum ScalingBehavior {
    Linear,
    Logarithmic,
    Exponential,
    Constant,
    /// Behavior described by a free-form label.
    Custom(String),
}
/// Hard applicability limits for a kernel.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct KernelConstraints {
    /// Smallest supported problem size.
    pub min_size: usize,
    /// Largest supported problem size; `None` means unbounded.
    pub max_size: Option<usize>,
    /// Data types the kernel accepts.
    pub supported_dtypes: Vec<DataType>,
    /// Required buffer alignment in bytes — TODO confirm units.
    pub required_alignment: usize,
    /// Explicit whitelist of supported shapes; `None` means any shape.
    pub supported_shapes: Option<Vec<Vec<usize>>>,
    /// Hardware/software feature names the kernel depends on.
    pub required_features: Vec<String>,
}
/// Executable kernel interface used by the selector and benchmarker.
pub trait KernelExecutor: std::fmt::Debug + Send + Sync {
    /// Runs the kernel on the given inputs.
    fn execute(&self, inputs: &KernelInputs) -> Result<KernelOutputs>;
    /// Cheap a-priori estimate of how long `execute` would take.
    fn estimate_execution_time(&self, inputs: &KernelInputs) -> Duration;
    /// Whether this kernel supports the given inputs.
    fn can_handle(&self, inputs: &KernelInputs) -> bool;
    /// Resources `execute` would need for the given inputs.
    fn get_resource_requirements(&self, inputs: &KernelInputs) -> ResourceRequirements;
}
/// Description of the operands a kernel will be invoked with.
#[derive(Debug, Clone)]
pub struct KernelInputs {
    /// Shape of each input tensor/buffer.
    pub input_shapes: Vec<Vec<usize>>,
    /// Data type of each input.
    pub data_types: Vec<DataType>,
    /// Total problem size — treated as bytes by the bandwidth
    /// estimate in `benchmark_kernel`; TODO confirm units.
    pub total_size: usize,
    /// Operation-specific parameters by name.
    pub operation_params: HashMap<String, KernelParameter>,
    /// Device the kernel will run on.
    pub device: Device,
}
/// Result summary of one kernel execution.
#[derive(Debug, Clone)]
pub struct KernelOutputs {
    /// Shape of each produced output.
    pub output_shapes: Vec<Vec<usize>>,
    /// Wall-clock time the execution took.
    pub execution_time: Duration,
    /// Memory consumed during execution.
    pub memory_usage: usize,
    /// Whether the execution completed successfully.
    pub success: bool,
    /// Failure description when `success` is false.
    pub error_message: Option<String>,
}
/// Dynamically-typed value for operation-specific kernel parameters.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub enum KernelParameter {
    Integer(i64),
    Float(f64),
    String(String),
    Boolean(bool),
    IntegerArray(Vec<i64>),
    FloatArray(Vec<f64>),
}
/// Resources a kernel execution is expected to need.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct ResourceRequirements {
    /// Working-set memory — presumably bytes; TODO confirm units.
    pub memory: usize,
    /// Number of compute units (cores/SMs) used.
    pub compute_units: usize,
    /// Memory bandwidth needed — units unspecified in SOURCE.
    pub bandwidth: usize,
    /// Scratch/temporary storage beyond the working set.
    pub temporary_storage: usize,
}
/// User-defined kernel that can be registered alongside built-in ones.
pub trait CustomKernel: std::fmt::Debug + Send + Sync {
    /// Unique name; used as the registry key.
    fn name(&self) -> &str;
    /// Operation the kernel implements.
    fn operation_type(&self) -> OperationType;
    /// Backend the kernel targets.
    fn backend_type(&self) -> BackendType;
    /// Static performance profile for scoring.
    fn characteristics(&self) -> KernelCharacteristics;
    /// Applicability limits.
    fn constraints(&self) -> KernelConstraints;
    /// Runs the kernel.
    fn execute(&self, inputs: &KernelInputs) -> Result<KernelOutputs>;
    /// Runs the kernel `iterations` times and reports timing statistics.
    fn benchmark(&self, inputs: &KernelInputs, iterations: usize) -> Result<BenchmarkResult>;
}
/// Aggregated timing statistics from repeated kernel executions.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct BenchmarkResult {
    pub avg_execution_time: Duration,
    pub min_execution_time: Duration,
    pub max_execution_time: Duration,
    /// Standard deviation of execution times across iterations.
    pub std_deviation: Duration,
    /// Executions per second (1 / avg time in `benchmark_kernel`).
    pub throughput: f64,
    /// Estimated bytes/sec moved — heuristic, see `benchmark_kernel`.
    pub memory_bandwidth: f64,
    /// Heuristic estimate in 0.0..=1.0 — not a hardware counter.
    pub cache_hit_rate: f64,
}
/// Strategy used to rank candidate kernels.
#[derive(Debug, Clone)]
pub enum SelectionAlgorithm {
    /// Weighted-sum scoring of predicted time, memory, cache, history.
    ScoreBased(ScoreBasedConfig),
    /// ML-model-driven selection (currently falls back to score-based).
    MachineLearning(MLBasedConfig),
    /// ML when confident enough, otherwise score-based.
    Hybrid(HybridConfig),
}
/// Weights for the score-based selection formula.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct ScoreBasedConfig {
    /// Weight of the predicted-execution-time score.
    pub execution_time_weight: f64,
    /// Weight of the memory-footprint score.
    pub memory_usage_weight: f64,
    /// Weight of the kernel's cache-efficiency characteristic.
    pub cache_efficiency_weight: f64,
    /// Weight of the historical success/recency score.
    pub historical_weight: f64,
    /// Flat score deduction for switching away from the current kernel.
    pub switching_penalty: f64,
}
/// Configuration for ML-driven selection.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct MLBasedConfig {
    /// Kind of model to train/use.
    pub model_type: MLModelType,
    /// Hyperparameters for training.
    pub training_params: MLTrainingParams,
    /// Per-feature importance weights, keyed by feature name.
    pub feature_weights: HashMap<String, f64>,
}
/// Supported model families for ML-based selection.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub enum MLModelType {
    DecisionTree,
    RandomForest,
    NeuralNetwork,
    SupportVectorMachine,
    LinearRegression,
    /// Externally-defined model, identified by the string.
    Custom(String),
}
/// Training hyperparameters for the selection model.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct MLTrainingParams {
    pub learning_rate: f64,
    pub epochs: usize,
    pub batch_size: usize,
    /// Regularization strength.
    pub regularization: f64,
}
/// Configuration for the hybrid (ML + score-based) strategy.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct HybridConfig {
    /// Fallback score-based weights.
    pub score_based: ScoreBasedConfig,
    /// ML configuration used when the model is confident.
    pub ml_based: MLBasedConfig,
    /// Minimum ML confidence required to use the ML path.
    pub ml_threshold: f64,
}
/// Records kernel usage and selection accuracy over time.
#[derive(Debug)]
pub struct PerformanceTracker {
    /// Per-kernel performance records; populated nowhere in this file yet.
    #[allow(dead_code)]
    performance_history: HashMap<String, Vec<KernelPerformanceRecord>>,
    /// Per-kernel usage counters, keyed by kernel id.
    usage_stats: HashMap<String, KernelUsageStats>,
    /// Aggregate accuracy of past selections.
    selection_accuracy: SelectionAccuracyTracker,
}
/// One observed execution of a selected kernel.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct KernelPerformanceRecord {
    /// When the execution happened.
    pub timestamp: std::time::SystemTime,
    /// Workload the kernel ran against.
    pub input_characteristics: WorkloadCharacteristics,
    /// System conditions at execution time.
    pub system_state: SystemState,
    /// Measured performance.
    pub actual_performance: ActualPerformance,
    /// Prediction made before execution, if any.
    pub predicted_performance: Option<PerformancePrediction>,
    /// Confidence attached to the selection that chose this kernel.
    pub selection_confidence: f64,
}
/// Running usage counters for one kernel.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct KernelUsageStats {
    /// Times the kernel was selected for execution.
    pub total_executions: usize,
    /// Executions with recorded performance feedback.
    pub successful_executions: usize,
    /// Average execution time over tracked executions.
    pub avg_execution_time: Duration,
    /// Last time this kernel was selected.
    pub last_used: std::time::SystemTime,
    /// How often this kernel is chosen; never updated in this file.
    pub selection_frequency: f64,
}
/// Counts how often selections were judged optimal.
#[derive(Debug)]
pub struct SelectionAccuracyTracker {
    /// All selections observed.
    total_selections: usize,
    /// Selections with confidence above the "optimal" threshold.
    optimal_selections: usize,
    /// Per-operation accuracy; never written in this file.
    accuracy_by_operation: HashMap<OperationType, f64>,
    /// Per-backend accuracy; never written in this file.
    accuracy_by_backend: HashMap<BackendType, f64>,
}
/// Behavior knobs for the adaptive selector.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct AdaptiveSelectionConfig {
    /// Record selections for accuracy tracking when true.
    pub enable_learning: bool,
    /// Exploration vs. exploitation balance; not read in this file.
    pub exploration_factor: f64,
    /// Minimum confidence required; not read in this file.
    pub min_confidence_threshold: f64,
    /// Benchmark parallelism cap; not read in this file.
    pub max_concurrent_benchmarks: usize,
    /// Per-benchmark time limit; not read in this file.
    pub benchmark_timeout: Duration,
    /// How long history is kept before expiry.
    pub history_retention: Duration,
}
impl Default for AdaptiveSelectionConfig {
    /// Learning enabled, mild exploration, and one week of history.
    fn default() -> Self {
        // 7 days * 24 hours * 3600 seconds.
        const ONE_WEEK_SECS: u64 = 7 * 24 * 3600;
        Self {
            enable_learning: true,
            exploration_factor: 0.1,
            min_confidence_threshold: 0.8,
            max_concurrent_benchmarks: 4,
            benchmark_timeout: Duration::from_secs(30),
            history_retention: Duration::from_secs(ONE_WEEK_SECS),
        }
    }
}
impl AdaptiveKernelSelector {
    /// Creates a selector using the score-based algorithm with default
    /// weights and the default configuration.
    pub fn new(performance_modeler: Arc<RuntimePerformanceModeler>) -> Self {
        Self {
            kernel_registry: Arc::new(RwLock::new(KernelRegistry::new())),
            performance_modeler,
            selection_algorithm: SelectionAlgorithm::ScoreBased(ScoreBasedConfig::default()),
            performance_tracker: Arc::new(Mutex::new(PerformanceTracker::new())),
            config: AdaptiveSelectionConfig::default(),
        }
    }
    /// Registers a built-in kernel implementation.
    pub fn register_kernel(&self, kernel: KernelImplementation) -> Result<()> {
        let mut registry = self
            .kernel_registry
            .write()
            .expect("lock should not be poisoned");
        registry.register_kernel(kernel)
    }
    /// Registers a user-supplied kernel, keyed by its reported name.
    pub fn register_custom_kernel(
        &self,
        kernel: Box<dyn CustomKernel + Send + Sync>,
    ) -> Result<()> {
        let mut registry = self
            .kernel_registry
            .write()
            .expect("lock should not be poisoned");
        registry.register_custom_kernel(kernel)
    }
    /// Picks the best kernel for the given operation/backend and inputs.
    ///
    /// # Errors
    /// Returns a `BackendError` when no registered kernel can handle the
    /// inputs. When learning is enabled, the selection is also recorded
    /// for accuracy tracking.
    pub fn select_kernel(
        &self,
        operation_type: OperationType,
        backend_type: BackendType,
        inputs: &KernelInputs,
        workload: &WorkloadCharacteristics,
        system_state: &SystemState,
    ) -> Result<KernelSelection> {
        let registry = self
            .kernel_registry
            .read()
            .expect("lock should not be poisoned");
        let candidates = registry.get_candidates(operation_type, backend_type, inputs)?;
        if candidates.is_empty() {
            return Err(TorshError::BackendError(format!(
                "No suitable kernels found for operation {:?} on backend {:?}",
                operation_type, backend_type
            )));
        }
        // Dispatch to the configured selection strategy.
        let selection = match &self.selection_algorithm {
            SelectionAlgorithm::ScoreBased(config) => {
                self.score_based_selection(&candidates, inputs, workload, system_state, config)?
            }
            SelectionAlgorithm::MachineLearning(config) => {
                self.ml_based_selection(&candidates, inputs, workload, system_state, config)?
            }
            SelectionAlgorithm::Hybrid(config) => {
                self.hybrid_selection(&candidates, inputs, workload, system_state, config)?
            }
        };
        if self.config.enable_learning {
            self.track_selection(&selection, workload, system_state)?;
        }
        Ok(selection)
    }
    /// Ranks candidates by weighted score and returns the highest scorer,
    /// with the remaining candidates listed as alternatives.
    fn score_based_selection(
        &self,
        candidates: &[KernelImplementation],
        inputs: &KernelInputs,
        workload: &WorkloadCharacteristics,
        system_state: &SystemState,
        config: &ScoreBasedConfig,
    ) -> Result<KernelSelection> {
        let mut best_kernel = None;
        let mut best_score = f64::NEG_INFINITY;
        for kernel in candidates {
            let score =
                self.calculate_kernel_score(kernel, inputs, workload, system_state, config)?;
            if score > best_score {
                best_score = score;
                best_kernel = Some(kernel);
            }
        }
        let selected_kernel = best_kernel
            .ok_or_else(|| TorshError::BackendError("No suitable kernel found".to_string()))?;
        Ok(KernelSelection {
            kernel: selected_kernel.clone(),
            // Maps the raw score into a rough confidence value.
            // NOTE(review): scores are not bounded to [-1, 1], so this can
            // leave [0, 1] — consider clamping.
            confidence: (best_score + 1.0) / 2.0,
            selection_reason: SelectionReason::ScoreBased(best_score),
            alternatives: candidates
                .iter()
                .filter(|k| k.id != selected_kernel.id)
                .cloned()
                .collect(),
        })
    }
    /// ML-based selection. Currently a stub that falls back to
    /// score-based selection with default weights; `_config` is unused.
    fn ml_based_selection(
        &self,
        candidates: &[KernelImplementation],
        inputs: &KernelInputs,
        workload: &WorkloadCharacteristics,
        system_state: &SystemState,
        _config: &MLBasedConfig,
    ) -> Result<KernelSelection> {
        let score_config = ScoreBasedConfig::default();
        self.score_based_selection(candidates, inputs, workload, system_state, &score_config)
    }
    /// Uses the ML path when its confidence exceeds the configured
    /// threshold, otherwise the score-based path.
    fn hybrid_selection(
        &self,
        candidates: &[KernelImplementation],
        inputs: &KernelInputs,
        workload: &WorkloadCharacteristics,
        system_state: &SystemState,
        config: &HybridConfig,
    ) -> Result<KernelSelection> {
        let ml_confidence = self.get_ml_confidence(inputs, workload, system_state)?;
        if ml_confidence > config.ml_threshold {
            self.ml_based_selection(candidates, inputs, workload, system_state, &config.ml_based)
        } else {
            self.score_based_selection(
                candidates,
                inputs,
                workload,
                system_state,
                &config.score_based,
            )
        }
    }
    /// Weighted score for one candidate: faster predicted time, smaller
    /// memory footprint, better cache efficiency, and good history raise
    /// the score; switching away from the current kernel is penalized.
    fn calculate_kernel_score(
        &self,
        kernel: &KernelImplementation,
        inputs: &KernelInputs,
        workload: &WorkloadCharacteristics,
        system_state: &SystemState,
        config: &ScoreBasedConfig,
    ) -> Result<f64> {
        let mut score = 0.0;
        // Time score maps (0, inf) seconds into (0, 1]; faster is higher.
        let predicted_time = self.predict_execution_time(kernel, inputs, workload, system_state)?;
        let time_score = 1.0 / (1.0 + predicted_time.as_secs_f64());
        score += config.execution_time_weight * time_score;
        // Memory score decays with footprint in MiB.
        let memory_requirements = kernel.implementation.get_resource_requirements(inputs);
        let memory_score = 1.0 / (1.0 + memory_requirements.memory as f64 / 1024.0 / 1024.0);
        score += config.memory_usage_weight * memory_score;
        let cache_score = kernel.characteristics.cache_efficiency;
        score += config.cache_efficiency_weight * cache_score;
        let historical_score = self.get_historical_performance_score(&kernel.id)?;
        score += config.historical_weight * historical_score;
        // Penalize switching kernels (get_current_kernel is currently a
        // stub that always returns None, so this never fires yet).
        if let Some(current_kernel) = self.get_current_kernel(workload)? {
            if current_kernel != kernel.id {
                score -= config.switching_penalty;
            }
        }
        Ok(score)
    }
    /// Predicts execution time via the runtime performance model, using
    /// default tuning parameters and environmental factors.
    fn predict_execution_time(
        &self,
        kernel: &KernelImplementation,
        inputs: &KernelInputs,
        workload: &WorkloadCharacteristics,
        system_state: &SystemState,
    ) -> Result<Duration> {
        let device_id = inputs.device.id();
        // NOTE(review): this measurement is constructed but never used or
        // recorded — presumably intended for future feedback plumbing.
        let _measurement = PerformanceMeasurement {
            id: generate_measurement_id(),
            timestamp: std::time::SystemTime::now(),
            backend_type: kernel.backend_type,
            device_id,
            workload: workload.clone(),
            parameters: TuningParameters::default(),
            system_state: system_state.clone(),
            actual_performance: ActualPerformance::default(),
            predicted_performance: None,
            prediction_accuracy: None,
            environment: crate::performance_modeling::EnvironmentalFactors::default(),
        };
        let default_params = TuningParameters::default();
        let default_env = EnvironmentalFactors::default();
        let prediction = self.performance_modeler.predict_performance(
            kernel.backend_type,
            workload,
            &default_params,
            system_state,
            &default_env,
        )?;
        Ok(prediction.execution_time)
    }
    /// Success-rate score decayed by recency; 0.5 (neutral) for kernels
    /// with no recorded usage.
    fn get_historical_performance_score(&self, kernel_id: &str) -> Result<f64> {
        let tracker = self
            .performance_tracker
            .lock()
            .expect("lock should not be poisoned");
        if let Some(stats) = tracker.usage_stats.get(kernel_id) {
            let success_rate = stats.successful_executions as f64 / stats.total_executions as f64;
            let recency_factor = self.calculate_recency_factor(stats.last_used);
            Ok(success_rate * recency_factor)
        } else {
            Ok(0.5)
        }
    }
    /// Exponential decay with a one-week time constant: 1.0 for "just
    /// used", ~0.37 after a week.
    fn calculate_recency_factor(&self, last_used: std::time::SystemTime) -> f64 {
        let now = std::time::SystemTime::now();
        // Clock skew can make `last_used` appear to be in the future;
        // treat that as "just used".
        let elapsed = now
            .duration_since(last_used)
            .unwrap_or(Duration::from_secs(0));
        let days_elapsed = elapsed.as_secs() as f64 / (24.0 * 3600.0);
        (-days_elapsed / 7.0).exp()
    }
    /// Stub: no per-workload "current kernel" tracking exists yet, so the
    /// switching penalty in `calculate_kernel_score` is never applied.
    fn get_current_kernel(&self, _workload: &WorkloadCharacteristics) -> Result<Option<String>> {
        Ok(None)
    }
    /// Stub: fixed 0.5 confidence until an ML model is actually trained.
    fn get_ml_confidence(
        &self,
        _inputs: &KernelInputs,
        _workload: &WorkloadCharacteristics,
        _system_state: &SystemState,
    ) -> Result<f64> {
        Ok(0.5)
    }
    /// Records a selection in the performance tracker.
    fn track_selection(
        &self,
        selection: &KernelSelection,
        workload: &WorkloadCharacteristics,
        system_state: &SystemState,
    ) -> Result<()> {
        let mut tracker = self
            .performance_tracker
            .lock()
            .expect("lock should not be poisoned");
        tracker.track_selection(selection, workload, system_state)
    }
    /// Feeds measured (and optionally predicted) performance back into
    /// the tracker for the given kernel.
    pub fn update_performance_feedback(
        &self,
        kernel_id: &str,
        actual_performance: ActualPerformance,
        predicted_performance: Option<PerformancePrediction>,
    ) -> Result<()> {
        let mut tracker = self
            .performance_tracker
            .lock()
            .expect("lock should not be poisoned");
        tracker.update_performance_feedback(kernel_id, actual_performance, predicted_performance)
    }
    /// Snapshot of selection statistics.
    pub fn get_selection_statistics(&self) -> Result<SelectionStatistics> {
        let tracker = self
            .performance_tracker
            .lock()
            .expect("lock should not be poisoned");
        Ok(tracker.get_statistics())
    }
    /// Benchmarks every kernel registered for the operation/backend
    /// against each test input it can handle.
    pub fn benchmark_kernels(
        &self,
        operation_type: OperationType,
        backend_type: BackendType,
        test_inputs: &[KernelInputs],
    ) -> Result<BenchmarkResults> {
        let registry = self
            .kernel_registry
            .read()
            .expect("lock should not be poisoned");
        let kernels = registry.get_kernels_for_operation(operation_type, backend_type);
        let mut results = BenchmarkResults::new();
        for kernel in kernels {
            for inputs in test_inputs {
                if kernel.implementation.can_handle(inputs) {
                    let benchmark = self.benchmark_kernel(&kernel, inputs)?;
                    // NOTE(review): keyed by kernel id only, so a later
                    // input overwrites the result for an earlier one.
                    results.add_result(kernel.id.clone(), benchmark);
                }
            }
        }
        Ok(results)
    }
    /// Runs a fixed 10 iterations and derives timing statistics plus
    /// heuristic bandwidth and cache-hit estimates.
    fn benchmark_kernel(
        &self,
        kernel: &KernelImplementation,
        inputs: &KernelInputs,
    ) -> Result<BenchmarkResult> {
        let iterations = 10;
        let mut execution_times = Vec::new();
        for _ in 0..iterations {
            let start = Instant::now();
            let result = kernel.implementation.execute(inputs)?;
            let execution_time = start.elapsed();
            // Only successful runs contribute to the statistics.
            if result.success {
                execution_times.push(execution_time);
            }
        }
        if execution_times.is_empty() {
            return Err(TorshError::BackendError(
                "All benchmark iterations failed".to_string(),
            ));
        }
        let avg_time = execution_times.iter().sum::<Duration>() / execution_times.len() as u32;
        let min_time = *execution_times
            .iter()
            .min()
            .expect("execution_times should not be empty after check");
        let max_time = *execution_times
            .iter()
            .max()
            .expect("execution_times should not be empty after check");
        // Population variance of execution times, in seconds^2.
        let variance = execution_times
            .iter()
            .map(|t| {
                let diff = t.as_secs_f64() - avg_time.as_secs_f64();
                diff * diff
            })
            .sum::<f64>()
            / execution_times.len() as f64;
        let std_dev = Duration::from_secs_f64(variance.sqrt());
        // Bandwidth heuristic: assumes total_size bytes read + written
        // once each — TODO confirm total_size is in bytes.
        let total_bytes_accessed = (inputs.total_size * 2) as f64;
        let memory_bandwidth = total_bytes_accessed / avg_time.as_secs_f64();
        // Cache-hit heuristic: fixed 32 MiB cache estimate, scaled by
        // timing stability (high variance suggests cache misses).
        let cache_size_estimate = 32.0 * 1024.0 * 1024.0;
        let data_size_ratio = (inputs.total_size as f64 / cache_size_estimate).min(1.0);
        let variance_factor = 1.0 - (std_dev.as_secs_f64() / avg_time.as_secs_f64()).min(0.5);
        let cache_hit_rate = (1.0 - data_size_ratio) * variance_factor;
        Ok(BenchmarkResult {
            avg_execution_time: avg_time,
            min_execution_time: min_time,
            max_execution_time: max_time,
            std_deviation: std_dev,
            throughput: 1.0 / avg_time.as_secs_f64(),
            memory_bandwidth,
            cache_hit_rate,
        })
    }
}
/// Outcome of a kernel-selection decision.
#[derive(Debug, Clone)]
pub struct KernelSelection {
    /// The chosen kernel.
    pub kernel: KernelImplementation,
    /// Confidence in the choice; values above 0.8 count as "optimal"
    /// in the accuracy tracker.
    pub confidence: f64,
    /// Which strategy produced the choice, with its score/confidence.
    pub selection_reason: SelectionReason,
    /// Remaining candidates that were considered but not chosen.
    pub alternatives: Vec<KernelImplementation>,
}
/// Strategy that produced a selection; the payload is the winning
/// score or model confidence.
#[derive(Debug, Clone)]
pub enum SelectionReason {
    ScoreBased(f64),
    MachineLearning(f64),
    Hybrid(f64),
    /// Fallback default choice; not produced anywhere in this file.
    Default,
}
/// Benchmark results collected per kernel id.
#[derive(Debug)]
pub struct BenchmarkResults {
    /// Latest result for each kernel id.
    results: HashMap<String, BenchmarkResult>,
}
impl BenchmarkResults {
    /// Creates an empty result set.
    pub fn new() -> Self {
        Self {
            results: HashMap::new(),
        }
    }
    /// Stores (or replaces) the benchmark result for `kernel_id`.
    pub fn add_result(&mut self, kernel_id: String, result: BenchmarkResult) {
        self.results.insert(kernel_id, result);
    }
    /// Looks up the result for a kernel, if it was benchmarked.
    pub fn get_result(&self, kernel_id: &str) -> Option<&BenchmarkResult> {
        self.results.get(kernel_id)
    }
    /// Returns the kernel with the lowest average execution time,
    /// or `None` when no results were recorded.
    pub fn get_best_kernel(&self) -> Option<(&String, &BenchmarkResult)> {
        self.results
            .iter()
            .min_by_key(|(_, result)| result.avg_execution_time)
    }
}

// `new()` takes no arguments, so provide the conventional `Default`
// (clippy::new_without_default).
impl Default for BenchmarkResults {
    fn default() -> Self {
        Self::new()
    }
}
/// Aggregate view of tracker state, returned by `get_selection_statistics`.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct SelectionStatistics {
    /// Total selections observed.
    pub total_selections: usize,
    /// Fraction of selections judged optimal (confidence > 0.8).
    pub overall_accuracy: f64,
    pub accuracy_by_operation: HashMap<OperationType, f64>,
    pub accuracy_by_backend: HashMap<BackendType, f64>,
    /// Up to ten (kernel id, execution count) pairs, most used first.
    pub popular_kernels: Vec<(String, usize)>,
}
impl KernelRegistry {
    /// Creates an empty registry.
    pub fn new() -> Self {
        Self {
            kernels: HashMap::new(),
            custom_kernels: HashMap::new(),
            kernel_characteristics: HashMap::new(),
            default_kernels: HashMap::new(),
        }
    }
    /// Registers a kernel under its (operation, backend) pair and indexes
    /// its characteristics by id.
    pub fn register_kernel(&mut self, kernel: KernelImplementation) -> Result<()> {
        // Record the characteristics first so the kernel itself can be
        // moved into the registry without a full deep clone.
        self.kernel_characteristics
            .insert(kernel.id.clone(), kernel.characteristics.clone());
        let key = (kernel.operation_type, kernel.backend_type);
        self.kernels.entry(key).or_default().push(kernel);
        Ok(())
    }
    /// Registers a custom kernel under its reported name.
    ///
    /// NOTE(review): a kernel with a duplicate name silently replaces the
    /// previous one — confirm this is intended.
    pub fn register_custom_kernel(
        &mut self,
        kernel: Box<dyn CustomKernel + Send + Sync>,
    ) -> Result<()> {
        let name = kernel.name().to_string();
        self.custom_kernels.insert(name, kernel);
        Ok(())
    }
    /// Returns the registered kernels for the (operation, backend) pair
    /// that report they can handle `inputs`; empty when none match.
    pub fn get_candidates(
        &self,
        operation_type: OperationType,
        backend_type: BackendType,
        inputs: &KernelInputs,
    ) -> Result<Vec<KernelImplementation>> {
        let key = (operation_type, backend_type);
        let candidates = self
            .kernels
            .get(&key)
            .map(|kernels| {
                kernels
                    .iter()
                    .filter(|k| k.implementation.can_handle(inputs))
                    .cloned()
                    .collect()
            })
            .unwrap_or_default();
        Ok(candidates)
    }
    /// All kernels registered for the (operation, backend) pair,
    /// regardless of input compatibility.
    pub fn get_kernels_for_operation(
        &self,
        operation_type: OperationType,
        backend_type: BackendType,
    ) -> Vec<KernelImplementation> {
        let key = (operation_type, backend_type);
        self.kernels.get(&key).cloned().unwrap_or_default()
    }
}

// `new()` takes no arguments, so provide the conventional `Default`
// (clippy::new_without_default).
impl Default for KernelRegistry {
    fn default() -> Self {
        Self::new()
    }
}
impl PerformanceTracker {
pub fn new() -> Self {
Self {
performance_history: HashMap::new(),
usage_stats: HashMap::new(),
selection_accuracy: SelectionAccuracyTracker::new(),
}
}
pub fn track_selection(
&mut self,
selection: &KernelSelection,
workload: &WorkloadCharacteristics,
system_state: &SystemState,
) -> Result<()> {
let kernel_id = &selection.kernel.id;
let stats = self
.usage_stats
.entry(kernel_id.clone())
.or_insert_with(KernelUsageStats::default);
stats.total_executions += 1;
stats.last_used = std::time::SystemTime::now();
self.selection_accuracy
.track_selection(selection, workload, system_state);
Ok(())
}
pub fn update_performance_feedback(
&mut self,
kernel_id: &str,
actual_performance: ActualPerformance,
_predicted_performance: Option<PerformancePrediction>,
) -> Result<()> {
if let Some(stats) = self.usage_stats.get_mut(kernel_id) {
stats.successful_executions += 1;
stats.avg_execution_time = actual_performance.execution_time;
}
Ok(())
}
pub fn get_statistics(&self) -> SelectionStatistics {
SelectionStatistics {
total_selections: self.selection_accuracy.total_selections,
overall_accuracy: self.selection_accuracy.get_overall_accuracy(),
accuracy_by_operation: self.selection_accuracy.accuracy_by_operation.clone(),
accuracy_by_backend: self.selection_accuracy.accuracy_by_backend.clone(),
popular_kernels: self.get_popular_kernels(),
}
}
fn get_popular_kernels(&self) -> Vec<(String, usize)> {
let mut kernels: Vec<_> = self
.usage_stats
.iter()
.map(|(id, stats)| (id.clone(), stats.total_executions))
.collect();
kernels.sort_by(|a, b| b.1.cmp(&a.1));
kernels.into_iter().take(10).collect()
}
}
impl SelectionAccuracyTracker {
    /// Creates an empty accuracy tracker.
    pub fn new() -> Self {
        Self {
            total_selections: 0,
            optimal_selections: 0,
            accuracy_by_operation: HashMap::new(),
            accuracy_by_backend: HashMap::new(),
        }
    }
    /// Records one selection; confidence above 0.8 counts as optimal.
    pub fn track_selection(
        &mut self,
        selection: &KernelSelection,
        _workload: &WorkloadCharacteristics,
        _system_state: &SystemState,
    ) {
        self.total_selections += 1;
        self.optimal_selections += usize::from(selection.confidence > 0.8);
    }
    /// Fraction of recorded selections judged optimal; 0.0 when nothing
    /// has been recorded yet.
    pub fn get_overall_accuracy(&self) -> f64 {
        match self.total_selections {
            0 => 0.0,
            total => self.optimal_selections as f64 / total as f64,
        }
    }
}
impl Default for ScoreBasedConfig {
    fn default() -> Self {
        Self {
            // Predicted execution time dominates; the four weights sum
            // to 0.95, with a flat 0.05 penalty for switching kernels.
            execution_time_weight: 0.4,
            memory_usage_weight: 0.2,
            cache_efficiency_weight: 0.2,
            historical_weight: 0.15,
            switching_penalty: 0.05,
        }
    }
}
impl Default for KernelUsageStats {
    /// Zeroed counters; `last_used` is stamped with the creation time so
    /// the recency decay starts from "now".
    fn default() -> Self {
        Self {
            total_executions: 0,
            successful_executions: 0,
            avg_execution_time: Duration::from_secs(0),
            last_used: std::time::SystemTime::now(),
            selection_frequency: 0.0,
        }
    }
}
/// Neutral defaults: no measurements available, zero load and activity.
impl Default for crate::performance_modeling::EnvironmentalFactors {
    fn default() -> Self {
        Self {
            ambient_temperature: None,
            system_load: 0.0,
            background_processes: 0,
            network_activity: 0.0,
            storage_io: 0.0,
            available_memory: 0,
            cpu_frequency: None,
            gpu_frequency: None,
        }
    }
}
/// Conservative single-threaded baseline used when no tuning has run
/// (e.g. by `predict_execution_time`).
impl Default for TuningParameters {
    fn default() -> Self {
        Self {
            thread_count: 1,
            vector_width: 1,
            block_size: Some(1024),
            tile_size: None,
            unroll_factor: 1,
            scheduling_strategy: crate::performance_tuning::SchedulingStrategy::Static,
            memory_allocation_strategy:
                crate::performance_tuning::MemoryAllocationStrategy::Default,
            optimization_level: crate::performance_tuning::OptimizationLevel::Default,
            backend_specific: HashMap::new(),
        }
    }
}
/// All-zero placeholder used before any real measurement is recorded
/// (e.g. in the unused `_measurement` of `predict_execution_time`).
impl Default for ActualPerformance {
    fn default() -> Self {
        Self {
            execution_time: Duration::from_secs(0),
            throughput: 0.0,
            memory_usage_peak: 0,
            power_consumption_avg: 0.0,
            cache_hit_ratio: 0.0,
            thermal_increase: 0.0,
            cpu_utilization: 0.0,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Registering a kernel makes it discoverable as a candidate for
    /// matching (operation, backend) inputs.
    #[test]
    fn test_kernel_registry() {
        let mut registry = KernelRegistry::new();
        let kernel = KernelImplementation {
            id: "test_kernel".to_string(),
            name: "Test Kernel".to_string(),
            operation_type: OperationType::MatrixMultiply,
            backend_type: BackendType::Cpu,
            variant: KernelVariant::Naive,
            characteristics: KernelCharacteristics {
                optimal_size_range: (1, 1000),
                memory_pattern: AccessPattern::Sequential,
                compute_intensity: 1.0,
                parallelization_efficiency: 0.8,
                cache_efficiency: 0.7,
                memory_bandwidth_utilization: 0.6,
                initialization_overhead: Duration::from_millis(1),
                scalability: ScalabilityCharacteristics {
                    size_scaling: ScalingBehavior::Linear,
                    thread_scaling: ScalingBehavior::Linear,
                    memory_hierarchy_scaling: ScalingBehavior::Constant,
                },
            },
            constraints: KernelConstraints {
                min_size: 1,
                max_size: Some(1000),
                supported_dtypes: vec![DataType::F32],
                required_alignment: 4,
                supported_shapes: None,
                required_features: vec![],
            },
            implementation: std::sync::Arc::new(MockKernelExecutor),
        };
        assert!(registry.register_kernel(kernel).is_ok());
        let inputs = KernelInputs {
            input_shapes: vec![vec![10, 10]],
            data_types: vec![DataType::F32],
            total_size: 400,
            operation_params: HashMap::new(),
            device: Device::cpu().unwrap(),
        };
        // The mock executor accepts everything, so the registered kernel
        // must appear as the single candidate.
        let candidates = registry
            .get_candidates(OperationType::MatrixMultiply, BackendType::Cpu, &inputs)
            .unwrap();
        assert_eq!(candidates.len(), 1);
    }

    /// Tracking a selection increments the tracker's selection count.
    #[test]
    fn test_performance_tracker() {
        let mut tracker = PerformanceTracker::new();
        let selection = KernelSelection {
            kernel: create_test_kernel(),
            confidence: 0.9,
            selection_reason: SelectionReason::ScoreBased(0.8),
            alternatives: vec![],
        };
        let workload = WorkloadCharacteristics {
            operation_type: OperationType::MatrixMultiply,
            data_size: 1000,
            data_shape: vec![10, 10],
            data_type: DataType::F32,
            access_pattern: AccessPattern::Sequential,
            compute_intensity: 1.0,
            memory_bandwidth_requirement: 0.5,
            parallelization_potential: 0.8,
            cache_locality: 0.7,
            branch_predictability: 0.9,
            vectorization_potential: 0.8,
        };
        let system_state = SystemState {
            cpu_utilization: 0.5,
            memory_utilization: 0.6,
            thermal_state: crate::performance_tuning::ThermalState {
                cpu_temperature: 65.0,
                gpu_temperature: Some(70.0),
                thermal_throttling_active: false,
                cooling_efficiency: 0.85,
            },
            power_state: crate::performance_tuning::PowerState {
                power_limit: Some(100.0),
                current_power_draw: 75.0,
                battery_level: Some(0.8),
                power_efficiency_mode: crate::performance_tuning::PowerEfficiencyMode::Balanced,
            },
            concurrent_workloads: 2,
            available_memory_bandwidth: 0.7,
            cache_pressure: 0.4,
            numa_topology: crate::performance_tuning::NumaTopologyState {
                node_count: 1,
                current_node: 0,
                memory_distribution: vec![0.6],
                cross_node_traffic: 0.0,
            },
        };
        assert!(tracker
            .track_selection(&selection, &workload, &system_state)
            .is_ok());
        let stats = tracker.get_statistics();
        assert_eq!(stats.total_selections, 1);
    }

    /// Builds a minimal CPU matrix-multiply kernel backed by the mock
    /// executor, for use as a test fixture.
    fn create_test_kernel() -> KernelImplementation {
        KernelImplementation {
            id: "test_kernel".to_string(),
            name: "Test Kernel".to_string(),
            operation_type: OperationType::MatrixMultiply,
            backend_type: BackendType::Cpu,
            variant: KernelVariant::Naive,
            characteristics: KernelCharacteristics {
                optimal_size_range: (1, 1000),
                memory_pattern: AccessPattern::Sequential,
                compute_intensity: 1.0,
                parallelization_efficiency: 0.8,
                cache_efficiency: 0.7,
                memory_bandwidth_utilization: 0.6,
                initialization_overhead: Duration::from_millis(1),
                scalability: ScalabilityCharacteristics {
                    size_scaling: ScalingBehavior::Linear,
                    thread_scaling: ScalingBehavior::Linear,
                    memory_hierarchy_scaling: ScalingBehavior::Constant,
                },
            },
            constraints: KernelConstraints {
                min_size: 1,
                max_size: Some(1000),
                supported_dtypes: vec![DataType::F32],
                required_alignment: 4,
                supported_shapes: None,
                required_features: vec![],
            },
            implementation: std::sync::Arc::new(MockKernelExecutor),
        }
    }

    /// Executor stub: accepts any input, "runs" in a fixed 10 ms, and
    /// echoes the input shapes back as outputs.
    #[derive(Debug)]
    struct MockKernelExecutor;
    impl KernelExecutor for MockKernelExecutor {
        fn execute(&self, inputs: &KernelInputs) -> Result<KernelOutputs> {
            Ok(KernelOutputs {
                output_shapes: inputs.input_shapes.clone(),
                execution_time: Duration::from_millis(10),
                memory_usage: inputs.total_size,
                success: true,
                error_message: None,
            })
        }
        fn estimate_execution_time(&self, _inputs: &KernelInputs) -> Duration {
            Duration::from_millis(10)
        }
        fn can_handle(&self, _inputs: &KernelInputs) -> bool {
            true
        }
        fn get_resource_requirements(&self, inputs: &KernelInputs) -> ResourceRequirements {
            ResourceRequirements {
                memory: inputs.total_size,
                compute_units: 1,
                bandwidth: 1000,
                temporary_storage: 0,
            }
        }
    }
}