// File: scirs2_core/jit.rs
1//! Just-In-Time (JIT) Compilation Framework for Dynamic Kernel Generation
2//!
3//! This module provides a comprehensive JIT compilation system for generating optimized
4//! kernels at runtime. It supports multiple backends including LLVM IR generation,
5//! GPU kernel compilation, and adaptive optimization based on runtime characteristics.
6//!
7//! Features:
8//! - LLVM-based code generation for CPU and GPU
9//! - Runtime optimization and specialization
10//! - Adaptive compilation based on execution patterns
11//! - Multi-backend support (CUDA, OpenCL, CPU)
12//! - Kernel caching and reuse
13//! - Performance profiling and auto-tuning
14
15use crate::error::{CoreError, ErrorContext, ErrorLocation};
16#[cfg(feature = "gpu")]
17#[allow(unused_imports)]
18use crate::gpu::{GpuBackend, GpuContext, GpuError};
19use std::collections::HashMap;
20use std::fmt;
21use std::sync::{Arc, Mutex, RwLock};
22use std::time::{Duration, Instant};
23use thiserror::Error;
24
25#[cfg(feature = "parallel")]
26#[allow(unused_imports)]
27use crate::parallel_ops::*;
28
/// JIT compilation error types.
///
/// All variants carry a human-readable message. `GpuError` exists only when
/// the `gpu` feature is enabled and wraps the underlying GPU error via
/// `#[from]`.
#[derive(Error, Debug)]
pub enum JitError {
    /// Compilation failed
    #[error("JIT compilation failed: {0}")]
    CompilationError(String),

    /// Code generation error
    #[error("Code generation error: {0}")]
    CodeGenerationError(String),

    /// Optimization error
    #[error("Optimization error: {0}")]
    OptimizationError(String),

    /// Requested backend is not registered or unavailable on this platform
    #[error("Backend not supported: {backend}")]
    BackendNotSupported { backend: String },

    /// Invalid kernel source
    #[error("Invalid kernel source: {0}")]
    InvalidKernelSource(String),

    /// Runtime execution error
    #[error("Runtime execution error: {0}")]
    RuntimeError(String),

    /// Cache error (e.g. a kernel id that is not present in the cache)
    #[error("Kernel cache error: {0}")]
    CacheError(String),

    /// Profiling error
    #[error("Profiling error: {0}")]
    ProfilingError(String),

    /// Underlying GPU error
    #[cfg(feature = "gpu")]
    #[error("GPU error: {0}")]
    GpuError(#[from] GpuError),
}
69
70impl From<JitError> for CoreError {
71    fn from(err: JitError) -> Self {
72        match err {
73            JitError::CompilationError(msg) => CoreError::ComputationError(
74                ErrorContext::new(format!("{msg}"))
75                    .with_location(ErrorLocation::new(file!(), line!())),
76            ),
77            JitError::CodeGenerationError(msg) => CoreError::ComputationError(
78                ErrorContext::new(format!("{msg}"))
79                    .with_location(ErrorLocation::new(file!(), line!())),
80            ),
81            JitError::OptimizationError(msg) => CoreError::ComputationError(
82                ErrorContext::new(format!("{msg}"))
83                    .with_location(ErrorLocation::new(file!(), line!())),
84            ),
85            JitError::BackendNotSupported { backend } => CoreError::NotImplementedError(
86                ErrorContext::new(format!("{backend}"))
87                    .with_location(ErrorLocation::new(file!(), line!())),
88            ),
89            JitError::RuntimeError(msg) => CoreError::ComputationError(
90                ErrorContext::new(format!("{msg}"))
91                    .with_location(ErrorLocation::new(file!(), line!())),
92            ),
93            _ => CoreError::ComputationError(
94                ErrorContext::new(format!("{err}"))
95                    .with_location(ErrorLocation::new(file!(), line!())),
96            ),
97        }
98    }
99}
100
/// JIT compilation backends.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum JitBackend {
    /// LLVM-based compilation
    Llvm,
    /// NVIDIA CUDA GPU backend
    Cuda,
    /// OpenCL GPU backend
    OpenCl,
    /// Apple Metal GPU backend
    Metal,
    /// WebGPU backend
    WebGpu,
    /// Interpreter-based execution
    Interpreter,
    /// Native code generation
    NativeCode,
    /// Custom backend identified by a static name
    Custom(&'static str),
}
118
119impl fmt::Display for JitBackend {
120    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
121        match self {
122            JitBackend::Llvm => write!(f, "LLVM"),
123            JitBackend::Cuda => write!(f, "CUDA"),
124            JitBackend::OpenCl => write!(f, "OpenCL"),
125            JitBackend::Metal => write!(f, "Metal"),
126            JitBackend::WebGpu => write!(f, "WebGPU"),
127            JitBackend::Interpreter => write!(f, "Interpreter"),
128            JitBackend::NativeCode => write!(f, "NativeCode"),
129            JitBackend::Custom(name) => write!(f, "Custom({})", name),
130        }
131    }
132}
133
/// JIT compilation target architectures.
///
/// Selects which machine/device code a backend should emit for.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TargetArchitecture {
    /// x86-64 CPU
    X86_64,
    /// ARM64 CPU
    Arm64,
    /// NVIDIA GPU (CUDA)
    NvidiaGpu,
    /// AMD GPU (ROCm)
    AmdGpu,
    /// Intel GPU
    IntelGpu,
    /// Apple GPU (Metal)
    AppleGpu,
    /// WebGPU
    WebGpu,
}
152
/// Optimization levels for JIT compilation.
///
/// Roughly mirrors the familiar compiler `-O` flags, plus an adaptive mode.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OptimizationLevel {
    /// No optimization
    None,
    /// Basic optimizations
    O1,
    /// Standard optimizations
    O2,
    /// Aggressive optimizations
    O3,
    /// Size optimizations
    Os,
    /// Fast math optimizations (may relax floating-point semantics)
    Ofast,
    /// Adaptive optimization based on profiling
    Adaptive,
}
171
/// JIT compilation configuration.
#[derive(Debug, Clone)]
pub struct JitConfig {
    /// Target backend
    pub backend: JitBackend,
    /// Target architecture
    pub target_arch: TargetArchitecture,
    /// Optimization level
    pub optimization_level: OptimizationLevel,
    /// Enable caching of compiled kernels
    pub enable_caching: bool,
    /// Enable execution profiling
    pub enable_profiling: bool,
    /// Maximum kernel cache size, in bytes
    pub max_cache_size: usize,
    /// Compilation timeout
    pub compilation_timeout: Duration,
    /// Enable adaptive optimization
    pub adaptive_optimization: bool,
    /// Custom compilation flags passed through to the backend
    pub custom_flags: Vec<String>,
}
194
195impl Default for JitConfig {
196    fn default() -> Self {
197        Self {
198            backend: JitBackend::Llvm,
199            target_arch: TargetArchitecture::X86_64,
200            optimization_level: OptimizationLevel::O2,
201            enable_caching: true,
202            enable_profiling: true,
203            max_cache_size: 256 * 1024 * 1024, // 256MB
204            compilation_timeout: Duration::from_secs(30),
205            adaptive_optimization: true,
206            custom_flags: Vec::new(),
207        }
208    }
209}
210
/// Kernel source code abstraction.
///
/// Bundles the source text with everything a backend needs to compile it:
/// language, entry point, parameter signature and optimization hints.
#[derive(Debug, Clone)]
pub struct KernelSource {
    /// Unique identifier for the kernel (also used as the cache key)
    pub id: String,
    /// Source code text
    pub source: String,
    /// Kernel language/dialect
    pub language: KernelLanguage,
    /// Entry point function name within `source`
    pub entry_point: String,
    /// Input parameter types
    pub input_types: Vec<DataType>,
    /// Output parameter types
    pub output_types: Vec<DataType>,
    /// Compilation hints
    pub hints: CompilationHints,
}
229
/// Kernel programming languages/dialects accepted as kernel source.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KernelLanguage {
    /// LLVM IR
    LlvmIr,
    /// CUDA C/C++
    Cuda,
    /// OpenCL C
    OpenCl,
    /// HLSL (DirectX)
    Hlsl,
    /// Metal Shading Language
    Metal,
    /// WGSL (WebGPU)
    Wgsl,
    /// High-level DSL
    // NOTE(review): the DSL's grammar is defined elsewhere — not visible here.
    HighLevel,
    /// Assembly language
    Assembly,
}
250
/// Data types for kernel parameters.
///
/// Scalar, pointer, fixed-size array and small-vector types; compound
/// variants box their element type to keep the enum itself small.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DataType {
    /// 8-bit signed integer
    I8,
    /// 16-bit signed integer
    I16,
    /// 32-bit signed integer
    I32,
    /// 64-bit signed integer
    I64,
    /// 8-bit unsigned integer
    U8,
    /// 16-bit unsigned integer
    U16,
    /// 32-bit unsigned integer
    U32,
    /// 64-bit unsigned integer
    U64,
    /// 16-bit floating point
    F16,
    /// 32-bit floating point
    F32,
    /// 64-bit floating point
    F64,
    /// Boolean
    Bool,
    /// Pointer to memory holding the element type
    Ptr(Box<DataType>),
    /// Array of the element type with a fixed length
    Array(Box<DataType>, usize),
    /// 2-component vector of the element type
    Vec2(Box<DataType>),
    /// 3-component vector of the element type
    Vec3(Box<DataType>),
    /// 4-component vector of the element type
    Vec4(Box<DataType>),
}
287
/// Compilation hints for optimization.
///
/// All hints are optional; `Default` yields an empty hint set, letting the
/// backend choose its own heuristics.
#[derive(Debug, Clone, Default)]
pub struct CompilationHints {
    /// Expected workload size (element count)
    pub workload_size: Option<usize>,
    /// Memory access pattern
    pub memory_pattern: Option<MemoryPattern>,
    /// Computational intensity
    pub compute_intensity: Option<ComputeIntensity>,
    /// Parallelization hints
    pub parallelization: Option<ParallelizationHints>,
    /// Target-specific hints, keyed by backend-defined names
    pub target_hints: HashMap<String, String>,
}
302
/// Memory access patterns a kernel is expected to exhibit.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemoryPattern {
    /// Sequential access
    Sequential,
    /// Random access
    Random,
    /// Strided access
    Strided,
    /// Coalesced access
    Coalesced,
    /// Scattered access
    Scattered,
}
317
/// Computational intensity levels (compute vs. memory boundedness).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ComputeIntensity {
    /// Memory-bound operations
    MemoryBound,
    /// Compute-bound operations
    ComputeBound,
    /// Balanced compute and memory
    Balanced,
    /// Bandwidth-intensive
    BandwidthIntensive,
}
330
331impl Default for ComputeIntensity {
332    fn default() -> Self {
333        ComputeIntensity::Balanced
334    }
335}
336
/// Parallelization hints.
#[derive(Debug, Clone)]
pub struct ParallelizationHints {
    /// Preferred work group size, up to three dimensions
    pub work_group_size: Option<[usize; 3]>,
    /// Vectorization width (lanes)
    pub vector_width: Option<usize>,
    /// Loop unrolling factor
    pub unroll_factor: Option<usize>,
    /// Enable auto-vectorization
    pub auto_vectorize: bool,
}
349
350impl Default for ParallelizationHints {
351    fn default() -> Self {
352        Self {
353            work_group_size: None,
354            vector_width: None,
355            unroll_factor: None,
356            auto_vectorize: true,
357        }
358    }
359}
360
/// Compiled kernel representation.
#[derive(Debug, Clone)]
pub struct CompiledKernel {
    /// Kernel identifier (matches the originating `KernelSource::id`)
    pub id: String,
    /// Compiled binary/bytecode; format is backend-specific
    pub binary: Vec<u8>,
    /// Backend used for compilation
    pub backend: JitBackend,
    /// Target architecture
    pub target_arch: TargetArchitecture,
    /// Compilation metadata
    pub metadata: KernelMetadata,
    /// Performance characteristics
    pub performance: KernelPerformance,
}
377
/// Kernel compilation metadata.
#[derive(Debug, Clone)]
pub struct KernelMetadata {
    /// Compilation timestamp
    pub compiled_at: Instant,
    /// Time spent compiling
    pub compilation_time: Duration,
    /// Optimization level used
    pub optimization_level: OptimizationLevel,
    /// Binary size, in bytes
    pub binary_size: usize,
    /// Register usage (GPU kernels only; `None` for CPU targets)
    pub register_usage: Option<usize>,
    /// Shared memory usage (GPU kernels only; `None` for CPU targets)
    pub shared_memory_usage: Option<usize>,
    /// Compiler version/info string
    pub compiler_info: String,
}
396
/// Kernel performance characteristics accumulated over executions.
// NOTE(review): several field names are missing underscores
// (`totalexecution_time` etc.); kept as-is because they are public API.
#[derive(Debug, Clone, Default)]
pub struct KernelPerformance {
    /// Number of recorded executions
    pub execution_count: usize,
    /// Total execution time across all runs
    pub totalexecution_time: Duration,
    /// Average execution time
    pub avgexecution_time: Duration,
    /// Best (shortest) execution time
    pub bestexecution_time: Duration,
    /// Worst (longest) execution time
    pub worstexecution_time: Duration,
    /// Throughput (operations per second)
    pub throughput: f64,
    /// Energy efficiency (operations per joule), when measurable
    pub energy_efficiency: Option<f64>,
}
415
/// JIT compiler interface.
///
/// Owns the registered backends; cache, profiler and adaptive optimizer are
/// behind `Arc`-wrapped locks so compilation/execution can be shared across
/// threads.
pub struct JitCompiler {
    /// Configuration
    config: JitConfig,
    /// Backend implementations, keyed by backend kind
    backends: HashMap<JitBackend, Box<dyn JitBackendImpl>>,
    /// Kernel cache (read-mostly, hence `RwLock`)
    cache: Arc<RwLock<KernelCache>>,
    /// Performance profiler
    profiler: Arc<Mutex<KernelProfiler>>,
    /// Adaptive optimizer
    adaptive_optimizer: Arc<Mutex<AdaptiveOptimizer>>,
}
429
/// Kernel cache for compiled kernels, bounded by total binary size and
/// evicting least-recently-used entries when full.
#[derive(Debug)]
pub struct KernelCache {
    /// Cached kernels keyed by kernel id
    kernels: HashMap<String, CompiledKernel>,
    /// Current total size of cached binaries, in bytes
    current_size: usize,
    /// Maximum cache size, in bytes
    maxsize: usize,
    /// Access frequency tracking (per kernel id)
    access_counts: HashMap<String, usize>,
    /// Last access times, used for LRU eviction
    last_accessed: HashMap<String, Instant>,
}
444
/// Kernel performance profiler.
#[derive(Debug)]
pub struct KernelProfiler {
    /// Execution profiles, keyed by kernel id
    profiles: HashMap<String, Vec<ExecutionProfile>>,
    /// Hardware performance counters
    hw_counters: HardwareCounters,
    /// When false, `record_execution` is a no-op
    enabled: bool,
}
455
/// Individual execution profile for a single kernel run.
#[derive(Debug, Clone)]
pub struct ExecutionProfile {
    /// Execution timestamp
    pub timestamp: Instant,
    /// Execution time
    pub execution_time: Duration,
    /// Memory bandwidth utilized
    // NOTE(review): units unspecified (bytes/s vs GB/s) — confirm with producers.
    pub memorybandwidth: f64,
    /// Compute utilization
    pub compute_utilization: f64,
    /// Cache hit rates (one entry per cache level, presumably — verify)
    pub cache_hit_rates: Vec<f64>,
    /// Power consumption, when measurable
    pub power_consumption: Option<f64>,
}
472
/// Hardware performance counters.
#[derive(Debug, Default)]
pub struct HardwareCounters {
    /// CPU cycles
    pub cpu_cycles: u64,
    /// Instructions executed
    pub instructions: u64,
    /// Cache misses
    pub cache_misses: u64,
    /// Memory transactions
    pub memory_transactions: u64,
    /// GPU-specific counters, keyed by counter name
    pub gpu_counters: HashMap<String, u64>,
}
487
/// Adaptive optimizer for runtime optimization.
///
/// History is keyed by kernel id; aggregate trends are stored under the
/// special key `"__perf_trends__"` (see `update_performance_data`).
#[derive(Debug)]
pub struct AdaptiveOptimizer {
    /// Optimization history per key
    optimization_history: HashMap<String, Vec<OptimizationResult>>,
    /// Optional learning model for optimization decisions
    learning_model: Option<Box<dyn OptimizationModel>>,
    /// Pool of optimization strategies to cycle through
    strategies: Vec<OptimizationStrategy>,
}
498
/// Optimization result tracking.
#[derive(Debug, Clone)]
pub struct OptimizationResult {
    /// Strategy used
    pub strategy: OptimizationStrategy,
    /// Performance improvement (as produced by `update_performance_data`,
    /// a normalized value clamped to `0.0..=1.0`)
    pub improvement: f64,
    /// Compilation overhead incurred by applying the strategy
    pub compilation_overhead: Duration,
    /// Whether the optimization was considered successful
    pub success: bool,
}
511
/// Optimization strategies the adaptive optimizer can apply.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OptimizationStrategy {
    /// Loop unrolling
    LoopUnrolling,
    /// Vectorization
    Vectorization,
    /// Memory prefetching
    MemoryPrefetching,
    /// Register allocation optimization
    RegisterAllocation,
    /// Instruction scheduling
    InstructionScheduling,
    /// Constant folding
    ConstantFolding,
    /// Dead code elimination
    DeadCodeElimination,
    /// Function inlining
    FunctionInlining,
}
532
/// Machine learning model for optimization decisions.
///
/// Implementors must be `Send + Sync` because the optimizer is shared behind
/// an `Arc<Mutex<_>>` across threads.
pub trait OptimizationModel: Send + Sync + std::fmt::Debug {
    /// Predict optimal strategy for a kernel described by `features`.
    fn predict(&self, features: &KernelFeatures) -> OptimizationStrategy;

    /// Update the model with feedback from an applied optimization.
    fn update_model(&mut self, features: &KernelFeatures, result: &OptimizationResult);
}
541
/// Kernel feature extraction for ML optimization.
#[derive(Debug, Clone)]
pub struct KernelFeatures {
    /// Static source code metrics
    pub source_metrics: SourceMetrics,
    /// Observed runtime characteristics
    pub runtime_metrics: RuntimeMetrics,
    /// Target platform characteristics
    pub target_metrics: TargetMetrics,
}
552
/// Static source code metrics.
#[derive(Debug, Clone, Default)]
pub struct SourceMetrics {
    /// Lines of code
    // NOTE(review): field name is missing an underscore (`lines_of_code`);
    // kept as-is because it is public API.
    pub lines_ofcode: usize,
    /// Loop count
    pub loop_count: usize,
    /// Branching factor
    pub branching_factor: f64,
    /// Memory operations count
    pub memory_ops_count: usize,
    /// Arithmetic operations count
    pub arithmetic_ops_count: usize,
    /// Function call count
    pub function_call_count: usize,
}
569
/// Observed runtime characteristics of a kernel.
#[derive(Debug, Clone, Default)]
pub struct RuntimeMetrics {
    /// Typical input sizes seen at runtime
    pub typical_input_sizes: Vec<usize>,
    /// Execution frequency
    pub execution_frequency: f64,
    /// Observed memory access patterns
    pub memory_patterns: Vec<MemoryPattern>,
    /// Computational intensity classification
    pub compute_intensity: ComputeIntensity,
}
582
/// Target platform metrics.
#[derive(Debug, Clone, Default)]
pub struct TargetMetrics {
    /// Available compute units
    pub compute_units: usize,
    /// Memory bandwidth
    // NOTE(review): units unspecified — confirm with producers.
    pub memorybandwidth: f64,
    /// Cache sizes, innermost level first (presumably — verify)
    pub cache_sizes: Vec<usize>,
    /// Vector width (lanes)
    pub vector_width: usize,
}
595
/// JIT backend implementation trait.
///
/// Implementors must be `Send + Sync` since backends are stored in a shared
/// `JitCompiler`.
pub trait JitBackendImpl: Send + Sync {
    /// Compile kernel source to a backend-specific binary.
    ///
    /// # Errors
    /// Returns a `JitError` if the source is invalid or compilation fails.
    fn compile_kernel(
        &self,
        source: &KernelSource,
        config: &JitConfig,
    ) -> Result<CompiledKernel, JitError>;

    /// Execute a compiled kernel, writing results into `outputs` and
    /// returning a profile of the run.
    ///
    /// # Errors
    /// Returns a `JitError` if execution fails.
    fn execute_kernel(
        &self,
        kernel: &CompiledKernel,
        inputs: &[&dyn std::any::Any],
        outputs: &mut [&mut dyn std::any::Any],
    ) -> Result<ExecutionProfile, JitError>;

    /// Check if the backend is usable in the current environment.
    fn is_available(&self) -> bool;

    /// Get backend capabilities.
    fn get_capabilities(&self) -> BackendCapabilities;
}
619
/// Backend capabilities advertised by a `JitBackendImpl`.
#[derive(Debug, Clone)]
pub struct BackendCapabilities {
    /// Supported data types
    pub supported_types: Vec<DataType>,
    /// Supported optimization levels
    pub optimization_levels: Vec<OptimizationLevel>,
    /// Maximum kernel size, if the backend imposes one
    pub max_kernel_size: Option<usize>,
    /// Supports debugging
    pub supports_debugging: bool,
    /// Supports profiling
    pub supports_profiling: bool,
    /// Target architectures the backend can emit code for
    pub target_architectures: Vec<TargetArchitecture>,
}
636
637impl JitCompiler {
638    /// Create a new JIT compiler
639    pub fn new(config: JitConfig) -> Result<Self, JitError> {
640        let mut backends = HashMap::new();
641
642        // Initialize available backends
643        if config.backend == JitBackend::Llvm || config.backend == JitBackend::NativeCode {
644            backends.insert(
645                JitBackend::Llvm,
646                Box::new(LlvmBackend::new()?) as Box<dyn JitBackendImpl>,
647            );
648        }
649
650        backends.insert(
651            JitBackend::Interpreter,
652            Box::new(InterpreterBackend::new()) as Box<dyn JitBackendImpl>,
653        );
654
655        let cache = Arc::new(RwLock::new(KernelCache::size(config.max_cache_size)));
656        let profiler = Arc::new(Mutex::new(KernelProfiler::new(config.enable_profiling)));
657        let adaptive_optimizer = Arc::new(Mutex::new(AdaptiveOptimizer::new()));
658
659        Ok(Self {
660            config,
661            backends,
662            cache,
663            profiler,
664            adaptive_optimizer,
665        })
666    }
667
668    /// Compile a kernel from source
669    pub fn compile_kernel(&self, source: KernelSource) -> Result<String, JitError> {
670        let kernel_id = source.id.clone();
671
672        // Check cache first
673        if self.config.enable_caching {
674            let cache = self.cache.read().expect("Operation failed");
675            if cache.contains_kernel(&kernel_id) {
676                return Ok(kernel_id);
677            }
678        }
679
680        // Get backend
681        let backend = self.backends.get(&self.config.backend).ok_or_else(|| {
682            JitError::BackendNotSupported {
683                backend: format!("{:?}", self.config.backend),
684            }
685        })?;
686
687        // Compile kernel
688        let compiled_kernel = backend.compile_kernel(&source, &self.config)?;
689
690        // Cache compiled kernel
691        if self.config.enable_caching {
692            let mut cache = self.cache.write().expect("Operation failed");
693            cache.insert(compiled_kernel);
694        }
695
696        Ok(kernel_id)
697    }
698
699    /// Execute a compiled kernel
700    pub fn execute_kernel(
701        &self,
702        kernel_id: &str,
703        inputs: &[&dyn std::any::Any],
704        outputs: &mut [&mut dyn std::any::Any],
705    ) -> Result<(), JitError> {
706        // Get compiled kernel from cache
707        let kernel = {
708            let cache = self.cache.read().expect("Operation failed");
709            cache
710                .get_readonly(kernel_id)
711                .ok_or_else(|| JitError::CacheError(format!("{kernel_id}")))?
712                .clone()
713        };
714
715        // Get backend
716        let backend =
717            self.backends
718                .get(&kernel.backend)
719                .ok_or_else(|| JitError::BackendNotSupported {
720                    backend: format!("{:?}", kernel.backend),
721                })?;
722
723        // Execute kernel
724        let profile = backend.execute_kernel(&kernel, inputs, outputs)?;
725
726        // Update profiling data
727        if self.config.enable_profiling {
728            let mut profiler = self.profiler.lock().expect("Operation failed");
729            profiler.record_execution(kernel_id, profile);
730        }
731
732        // Update adaptive optimization
733        if self.config.adaptive_optimization {
734            let mut optimizer = self.adaptive_optimizer.lock().expect("Operation failed");
735            optimizer.update_performance_data(&kernel.performance);
736        }
737
738        Ok(())
739    }
740
741    /// Get kernel performance statistics
742    pub fn get_kernel_performance(&self, kernel_id: &str) -> Option<KernelPerformance> {
743        let mut cache = self.cache.write().expect("Operation failed");
744        cache.get(kernel_id).map(|k| k.performance.clone())
745    }
746
747    /// Get compilation statistics
748    pub fn get_compilation_stats(&self) -> CompilationStats {
749        let cache = self.cache.read().expect("Operation failed");
750        cache.get_stats()
751    }
752
753    /// Clear kernel cache
754    pub fn clear_cache(&self) {
755        let mut cache = self.cache.write().expect("Operation failed");
756        cache.clear();
757    }
758
759    /// Optimize existing kernel
760    pub fn optimize_kernel(&self, kernel_id: &str) -> Result<String, JitError> {
761        let optimizer = self.adaptive_optimizer.lock().expect("Operation failed");
762        optimizer.optimize_kernel(kernel_id, &self.config)
763    }
764}
765
/// Compilation statistics, produced by `KernelCache::get_stats`.
#[derive(Debug, Clone, Default)]
pub struct CompilationStats {
    /// Total kernels currently compiled and cached
    pub total_compiled: usize,
    /// Cache hit rate
    // NOTE(review): computed as unique-kernels / total-accesses, an
    // approximation rather than a true hit rate (misses are not tracked).
    pub cache_hit_rate: f64,
    /// Average compilation time (currently a placeholder value)
    pub avg_compilation_time: Duration,
    /// Total cache size, in bytes
    pub cache_size: usize,
    /// Most frequently used kernels as `(id, access_count)`, top 10
    pub top_kernels: Vec<(String, usize)>,
}
780
781impl KernelCache {
782    /// Create a new kernel cache
783    pub fn size(value: usize) -> Self {
784        Self {
785            kernels: HashMap::new(),
786            current_size: 0,
787            maxsize: value,
788            access_counts: HashMap::new(),
789            last_accessed: HashMap::new(),
790        }
791    }
792
793    /// Check if kernel is cached
794    pub fn contains_kernel(&self, kernel_id: &str) -> bool {
795        self.kernels.contains_key(kernel_id)
796    }
797
798    /// Get kernel from cache
799    pub fn get(&mut self, kernel_id: &str) -> Option<&CompiledKernel> {
800        if let Some(kernel) = self.kernels.get(kernel_id) {
801            // Update access tracking
802            *self.access_counts.entry(kernel_id.to_string()).or_insert(0) += 1;
803            self.last_accessed
804                .insert(kernel_id.to_string(), Instant::now());
805            Some(kernel)
806        } else {
807            None
808        }
809    }
810
811    /// Get a kernel from the cache without updating access tracking
812    pub fn get_readonly(&self, kernel_id: &str) -> Option<&CompiledKernel> {
813        self.kernels.get(kernel_id)
814    }
815
816    /// Insert kernel into cache
817    pub fn insert(&mut self, kernel: CompiledKernel) {
818        let kernel_id = kernel.id.clone();
819        let kernel_size = kernel.binary.len();
820
821        // Check if we need to evict
822        while self.current_size + kernel_size > self.maxsize && !self.kernels.is_empty() {
823            self.evict_lru();
824        }
825
826        self.current_size += kernel_size;
827        self.kernels.insert(kernel_id.clone(), kernel);
828        self.access_counts.insert(kernel_id.clone(), 1);
829        self.last_accessed.insert(kernel_id, Instant::now());
830    }
831
832    /// Evict least recently used kernel
833    fn evict_lru(&mut self) {
834        if let Some((lru_id, _)) = self.last_accessed.iter().min_by_key(|(_, &time)| time) {
835            let lru_id = lru_id.clone();
836            if let Some(kernel) = self.kernels.remove(&lru_id) {
837                self.current_size -= kernel.binary.len();
838                self.access_counts.remove(&lru_id);
839                self.last_accessed.remove(&lru_id);
840            }
841        }
842    }
843
844    /// Clear all cached kernels
845    pub fn clear(&mut self) {
846        self.kernels.clear();
847        self.access_counts.clear();
848        self.last_accessed.clear();
849        self.current_size = 0;
850    }
851
852    /// Get cache statistics
853    pub fn get_stats(&self) -> CompilationStats {
854        let total_accesses: usize = self.access_counts.values().sum();
855        let cache_hit_rate = if total_accesses > 0 {
856            self.access_counts.len() as f64 / total_accesses as f64
857        } else {
858            0.0
859        };
860
861        let mut top_kernels: Vec<_> = self
862            .access_counts
863            .iter()
864            .map(|(id, count)| (id.clone(), *count))
865            .collect();
866        top_kernels.sort_by_key(|b| std::cmp::Reverse(b.1));
867        top_kernels.truncate(10);
868
869        CompilationStats {
870            total_compiled: self.kernels.len(),
871            cache_hit_rate,
872            avg_compilation_time: Duration::from_millis(100), // Placeholder
873            cache_size: self.current_size,
874            top_kernels,
875        }
876    }
877}
878
879impl KernelProfiler {
880    /// Create a new profiler
881    pub fn new(enabled: bool) -> Self {
882        Self {
883            profiles: HashMap::new(),
884            hw_counters: HardwareCounters::default(),
885            enabled,
886        }
887    }
888
889    /// Record kernel execution
890    pub fn record_execution(&mut self, kernel_id: &str, profile: ExecutionProfile) {
891        if !self.enabled {
892            return;
893        }
894
895        self.profiles
896            .entry(kernel_id.to_string())
897            .or_insert_with(Vec::new)
898            .push(profile);
899    }
900
901    /// Get profiling data for a kernel
902    pub fn id_2(&self, kernelid: &str) -> Option<&Vec<ExecutionProfile>> {
903        self.profiles.get(kernelid)
904    }
905}
906
907impl AdaptiveOptimizer {
908    /// Create a new adaptive optimizer
909    pub fn new() -> Self {
910        Self {
911            optimization_history: HashMap::new(),
912            learning_model: None,
913            strategies: vec![
914                OptimizationStrategy::LoopUnrolling,
915                OptimizationStrategy::Vectorization,
916                OptimizationStrategy::MemoryPrefetching,
917                OptimizationStrategy::RegisterAllocation,
918            ],
919        }
920    }
921
922    /// Update performance data by recording an `OptimizationResult` synthesised
923    /// from the observed `KernelPerformance`.  The result is stored under the
924    /// special key `"__perf_trends__"` so that `optimize_kernel` can later
925    /// inspect aggregate patterns when no per-kernel history is available.
926    ///
927    /// Strategy selection heuristic:
928    /// - High throughput (> 1 GOp/s) → prefer `Vectorization`
929    /// - Low execution count (< 10 executions) → prefer `ConstantFolding`
930    /// - Otherwise → cycle through the available strategies in insertion order
931    pub fn update_performance_data(&mut self, data: &KernelPerformance) {
932        // Derive a rough "improvement" signal: normalised throughput (clamped 0..1).
933        // A throughput of 1e9 ops/s is treated as "perfect" (improvement = 1.0).
934        let improvement = (data.throughput / 1.0e9).clamp(0.0, 1.0);
935
936        // Pick a strategy based on the current performance snapshot.
937        let strategy = if data.throughput > 1.0e9 {
938            OptimizationStrategy::Vectorization
939        } else if data.execution_count < 10 {
940            OptimizationStrategy::ConstantFolding
941        } else {
942            // Round-robin over available strategies.
943            let existing = self
944                .optimization_history
945                .get("__perf_trends__")
946                .map(|v| v.len())
947                .unwrap_or(0);
948            let idx = existing % self.strategies.len();
949            self.strategies[idx]
950        };
951
952        let result = OptimizationResult {
953            strategy,
954            improvement,
955            compilation_overhead: data.avgexecution_time,
956            success: improvement > 0.1,
957        };
958
959        self.optimization_history
960            .entry("__perf_trends__".to_string())
961            .or_default()
962            .push(result);
963    }
964
965    /// Suggest an optimization directive for the kernel identified by `kernel_id`.
966    ///
967    /// If the kernel has dedicated history (populated by callers that use
968    /// `kernel_id` as the history key), the most successful `OptimizationResult`
969    /// is selected.  Otherwise the method falls back to the aggregate performance
970    /// trends recorded by `update_performance_data`.  The returned `String` is a
971    /// human-readable directive (e.g. `"vectorize"`, `"unroll"`) that the JIT
972    /// back-end may interpret as a compilation hint.
973    pub fn optimize_kernel(&self, kernel_id: &str, config: &JitConfig) -> Result<String, JitError> {
974        // Try per-kernel history first.
975        let history = self
976            .optimization_history
977            .get(kernel_id)
978            .or_else(|| self.optimization_history.get("__perf_trends__"));
979
980        if let Some(records) = history {
981            // Find the record with the highest improvement that succeeded.
982            let best = records.iter().filter(|r| r.success).max_by(|a, b| {
983                a.improvement
984                    .partial_cmp(&b.improvement)
985                    .unwrap_or(std::cmp::Ordering::Equal)
986            });
987
988            if let Some(best_result) = best {
989                let directive = match best_result.strategy {
990                    OptimizationStrategy::LoopUnrolling => "unroll",
991                    OptimizationStrategy::Vectorization => "vectorize",
992                    OptimizationStrategy::MemoryPrefetching => "prefetch",
993                    OptimizationStrategy::RegisterAllocation => "regalloc",
994                    OptimizationStrategy::InstructionScheduling => "schedule",
995                    OptimizationStrategy::ConstantFolding => "constfold",
996                    OptimizationStrategy::DeadCodeElimination => "dce",
997                    OptimizationStrategy::FunctionInlining => "inline",
998                };
999                let level_flag = optimization_level_flag(config.optimization_level);
1000                return Ok(format!("{directive} {level_flag}"));
1001            }
1002        }
1003
1004        // No history yet — derive a default directive from the config.
1005        let default_directive = match config.optimization_level {
1006            OptimizationLevel::None => "none",
1007            OptimizationLevel::O1 => "constfold",
1008            OptimizationLevel::O2 => "vectorize",
1009            OptimizationLevel::O3 => "unroll vectorize prefetch",
1010            OptimizationLevel::Os => "constfold dce",
1011            OptimizationLevel::Ofast => "unroll vectorize prefetch inline",
1012            OptimizationLevel::Adaptive => "vectorize",
1013        };
1014        let level_flag = optimization_level_flag(config.optimization_level);
1015        Ok(format!("{default_directive} {level_flag}"))
1016    }
1017}
1018
1019/// Return a short flag string that names the optimisation level, e.g. `"-O2"`.
1020fn optimization_level_flag(level: OptimizationLevel) -> &'static str {
1021    match level {
1022        OptimizationLevel::None => "-O0",
1023        OptimizationLevel::O1 => "-O1",
1024        OptimizationLevel::O2 => "-O2",
1025        OptimizationLevel::O3 => "-O3",
1026        OptimizationLevel::Os => "-Os",
1027        OptimizationLevel::Ofast => "-Ofast",
1028        OptimizationLevel::Adaptive => "-O2",
1029    }
1030}
1031
/// LLVM-based backend implementation.
///
/// Currently a placeholder: no real LLVM machinery is linked in.  The
/// `context` field merely records whether the backend was initialised;
/// `is_available` reports this via `context.is_some()`.
pub struct LlvmBackend {
    /// Stand-in for a real LLVM context handle; `Some(())` means "initialised".
    context: Option<()>, // Placeholder for LLVM context
}
1037
1038impl LlvmBackend {
1039    /// Create new LLVM backend
1040    pub fn new() -> Result<Self, JitError> {
1041        // In a real implementation, this would initialize LLVM
1042        Ok(Self { context: Some(()) })
1043    }
1044}
1045
1046impl JitBackendImpl for LlvmBackend {
1047    fn compile_kernel(
1048        &self,
1049        source: &KernelSource,
1050        config: &JitConfig,
1051    ) -> Result<CompiledKernel, JitError> {
1052        // Placeholder implementation
1053        let compilation_start = Instant::now();
1054
1055        // In a real implementation, this would:
1056        // 1. Parse the source code
1057        // 2. Generate LLVM IR
1058        // 3. Apply optimizations
1059        // 4. Generate machine code
1060
1061        let compilation_time = compilation_start.elapsed();
1062
1063        Ok(CompiledKernel {
1064            id: source.id.clone(),
1065            binary: vec![0; 1024], // Placeholder binary
1066            backend: config.backend,
1067            target_arch: config.target_arch,
1068            metadata: KernelMetadata {
1069                compiled_at: Instant::now(),
1070                compilation_time,
1071                optimization_level: config.optimization_level,
1072                binary_size: 1024,
1073                register_usage: Some(32),
1074                shared_memory_usage: Some(1024),
1075                compiler_info: "LLVM 15.0".to_string(),
1076            },
1077            performance: KernelPerformance::default(),
1078        })
1079    }
1080
1081    fn execute_kernel(
1082        &self,
1083        kernel: &CompiledKernel,
1084        inputs: &[&dyn std::any::Any],
1085        outputs: &mut [&mut dyn std::any::Any],
1086    ) -> Result<ExecutionProfile, JitError> {
1087        // Placeholder implementation
1088        let start = Instant::now();
1089
1090        // Simulate execution
1091        std::thread::sleep(Duration::from_micros(100));
1092
1093        Ok(ExecutionProfile {
1094            timestamp: start,
1095            execution_time: start.elapsed(),
1096            memorybandwidth: 100.0, // GB/s
1097            compute_utilization: 0.8,
1098            cache_hit_rates: vec![0.95, 0.87, 0.72],
1099            power_consumption: Some(50.0), // Watts
1100        })
1101    }
1102
1103    fn is_available(&self) -> bool {
1104        self.context.is_some()
1105    }
1106
1107    fn get_capabilities(&self) -> BackendCapabilities {
1108        BackendCapabilities {
1109            supported_types: vec![
1110                DataType::I32,
1111                DataType::I64,
1112                DataType::F32,
1113                DataType::F64,
1114                DataType::Vec4(Box::new(DataType::F32)),
1115            ],
1116            optimization_levels: vec![
1117                OptimizationLevel::None,
1118                OptimizationLevel::O1,
1119                OptimizationLevel::O2,
1120                OptimizationLevel::O3,
1121            ],
1122            max_kernel_size: None,
1123            supports_debugging: true,
1124            supports_profiling: true,
1125            target_architectures: vec![TargetArchitecture::X86_64, TargetArchitecture::Arm64],
1126        }
1127    }
1128}
1129
/// Interpreter-based backend for debugging and fallback.
///
/// A zero-sized unit struct: the interpreter carries no state, and its
/// `JitBackendImpl` impl reports it as always available.
pub struct InterpreterBackend;
1132
1133impl InterpreterBackend {
1134    /// Create new interpreter backend
1135    pub fn new() -> Self {
1136        Self
1137    }
1138}
1139
1140impl JitBackendImpl for InterpreterBackend {
1141    fn compile_kernel(
1142        &self,
1143        source: &KernelSource,
1144        config: &JitConfig,
1145    ) -> Result<CompiledKernel, JitError> {
1146        // For interpreter, "compilation" is just validation
1147        let compilation_start = Instant::now();
1148
1149        // Basic validation
1150        if source.source.is_empty() {
1151            return Err(JitError::InvalidKernelSource("Empty source".to_string()));
1152        }
1153
1154        let compilation_time = compilation_start.elapsed();
1155
1156        Ok(CompiledKernel {
1157            id: source.id.clone(),
1158            binary: source.source.as_bytes().to_vec(),
1159            backend: config.backend,
1160            target_arch: config.target_arch,
1161            metadata: KernelMetadata {
1162                compiled_at: Instant::now(),
1163                compilation_time,
1164                optimization_level: OptimizationLevel::None,
1165                binary_size: source.source.len(),
1166                register_usage: None,
1167                shared_memory_usage: None,
1168                compiler_info: JitBackend::Interpreter.to_string(),
1169            },
1170            performance: KernelPerformance::default(),
1171        })
1172    }
1173
1174    fn execute_kernel(
1175        &self,
1176        kernel: &CompiledKernel,
1177        inputs: &[&dyn std::any::Any],
1178        outputs: &mut [&mut dyn std::any::Any],
1179    ) -> Result<ExecutionProfile, JitError> {
1180        // Placeholder interpreter execution
1181        let start = Instant::now();
1182
1183        // Simulate interpretation
1184        std::thread::sleep(Duration::from_micros(500));
1185
1186        Ok(ExecutionProfile {
1187            timestamp: start,
1188            execution_time: start.elapsed(),
1189            memorybandwidth: 10.0, // Lower bandwidth for interpreter
1190            compute_utilization: 0.1,
1191            cache_hit_rates: vec![1.0], // Perfect cache hit for interpreter
1192            power_consumption: Some(5.0), // Low power
1193        })
1194    }
1195
1196    fn is_available(&self) -> bool {
1197        true // Interpreter is always available
1198    }
1199
1200    fn get_capabilities(&self) -> BackendCapabilities {
1201        BackendCapabilities {
1202            supported_types: vec![DataType::I32, DataType::F32, DataType::F64, DataType::Bool],
1203            optimization_levels: vec![OptimizationLevel::None],
1204            max_kernel_size: Some(1024 * 1024), // 1MB limit for interpreter
1205            supports_debugging: true,
1206            supports_profiling: false,
1207            target_architectures: vec![TargetArchitecture::X86_64],
1208        }
1209    }
1210}
1211
1212/// Convenience functions for common JIT operations
1213pub mod jit_dsl {
1214    use super::*;
1215
1216    /// Create a simple arithmetic kernel
1217    pub fn create_arithmetic_kernel(
1218        operation: &str,
1219        input_type: DataType,
1220        output_type: DataType,
1221    ) -> KernelSource {
1222        let input_type_str = format!("{input_type:?}").to_lowercase();
1223        let output_type_str = format!("{output_type:?}").to_lowercase();
1224
1225        let source = format!(
1226            r#"
1227kernel void arithmetic_op(global {input_type}* input, global {output_type}* output, int size) {{
1228    int idx = get_global_id(0);
1229    if (idx < size) {{
1230        output[idx] = {operation}(input[idx]);
1231    }}
1232}}
1233"#,
1234            input_type = input_type_str,
1235            output_type = output_type_str,
1236            operation = operation
1237        );
1238
1239        KernelSource {
1240            id: format!("arithmetic_{operation}"),
1241            source,
1242            language: KernelLanguage::OpenCl,
1243            entry_point: "arithmetic_op".to_string(),
1244            input_types: vec![input_type],
1245            output_types: vec![output_type],
1246            hints: CompilationHints::default(),
1247        }
1248    }
1249
1250    /// Create a reduction kernel
1251    pub fn create_reduction_kernel(operation: &str, datatype: DataType) -> KernelSource {
1252        let datatype_str = format!("{datatype:?}").to_lowercase();
1253
1254        let source = format!(
1255            r#"
1256kernel void reduction_op(global {datatype}* input, global {datatype}* output, int size) {{
1257    local {datatype} shared_data[256];
1258    int tid = get_local_id(0);
1259    int gid = get_global_id(0);
1260
1261    // Load data into shared memory
1262    shared_data[tid] = (gid < size) ? input[gid] : 0;
1263    barrier(CLK_LOCAL_MEM_FENCE);
1264
1265    // Perform reduction
1266    for (int stride = get_local_size(0) / 2; stride > 0; stride /= 2) {{
1267        if (tid < stride) {{
1268            shared_data[tid] = {operation}(shared_data[tid], shared_data[tid + stride]);
1269        }}
1270        barrier(CLK_LOCAL_MEM_FENCE);
1271    }}
1272
1273    // Write result
1274    if (tid == 0) {{
1275        output[get_group_id(0)] = shared_data[0];
1276    }}
1277}}
1278"#,
1279            datatype = datatype_str,
1280            operation = operation
1281        );
1282
1283        KernelSource {
1284            id: format!("reduction_{operation}"),
1285            source,
1286            language: KernelLanguage::OpenCl,
1287            entry_point: "reduction_op".to_string(),
1288            input_types: vec![datatype.clone()],
1289            output_types: vec![datatype.clone()],
1290            hints: CompilationHints {
1291                workload_size: Some(1024),
1292                memory_pattern: Some(MemoryPattern::Sequential),
1293                compute_intensity: Some(ComputeIntensity::ComputeBound),
1294                parallelization: Some(ParallelizationHints {
1295                    work_group_size: Some([256, 1, 1]),
1296                    vector_width: Some(4),
1297                    unroll_factor: Some(4),
1298                    auto_vectorize: true,
1299                }),
1300                target_hints: HashMap::new(),
1301            },
1302        }
1303    }
1304}
1305
1306#[cfg(test)]
1307mod tests {
1308    use super::*;
1309
1310    #[test]
1311    fn test_jit_compiler_creation() {
1312        let config = JitConfig::default();
1313        let compiler = JitCompiler::new(config);
1314        assert!(compiler.is_ok());
1315    }
1316
1317    #[test]
1318    fn test_kernel_source_creation() {
1319        let source = KernelSource {
1320            id: "test_kernel".to_string(),
1321            source: "kernel void test() {}".to_string(),
1322            language: KernelLanguage::OpenCl,
1323            entry_point: "test".to_string(),
1324            input_types: vec![DataType::F32],
1325            output_types: vec![DataType::F32],
1326            hints: CompilationHints::default(),
1327        };
1328
1329        assert_eq!(source.id, "test_kernel");
1330        assert_eq!(source.language, KernelLanguage::OpenCl);
1331    }
1332
    /// The DSL arithmetic helper emits a named, non-empty kernel with exactly
    /// one input and one output type.
    #[test]
    fn test_dsl_arithmetic_kernel() {
        let kernel = jit_dsl::create_arithmetic_kernel("sqrt", DataType::F32, DataType::F32);
        assert_eq!(kernel.id, "arithmetic_sqrt");
        assert!(!kernel.source.is_empty());
        assert_eq!(kernel.input_types.len(), 1);
        assert_eq!(kernel.output_types.len(), 1);
    }
1341
    /// The DSL reduction helper emits a named, non-empty kernel and populates
    /// its compilation hints (workload size is set by the helper).
    #[test]
    fn test_dsl_reduction_kernel() {
        let kernel = jit_dsl::create_reduction_kernel("max", DataType::F32);
        assert_eq!(kernel.id, "reduction_max");
        assert!(!kernel.source.is_empty());
        assert!(kernel.hints.workload_size.is_some());
    }
1349
    /// Inserting a compiled kernel into the cache makes it retrievable by id.
    /// NOTE(review): `KernelCache::size` presumably takes a byte-capacity
    /// bound — confirm against the cache's definition earlier in the file.
    #[test]
    fn test_kernel_cache() {
        let mut cache = KernelCache::size(1024 * 1024); // 1MB cache

        // A minimal interpreter-backend kernel with placeholder metadata.
        let kernel = CompiledKernel {
            id: "test".to_string(),
            binary: vec![0; 1024],
            backend: JitBackend::Interpreter,
            target_arch: TargetArchitecture::X86_64,
            metadata: KernelMetadata {
                compiled_at: Instant::now(),
                compilation_time: Duration::from_millis(100),
                optimization_level: OptimizationLevel::O2,
                binary_size: 1024,
                register_usage: None,
                shared_memory_usage: None,
                compiler_info: "test".to_string(),
            },
            performance: KernelPerformance::default(),
        };

        cache.insert(kernel);
        assert!(cache.contains_kernel("test"));
        assert!(cache.get("test").is_some());
    }
1375
    /// The interpreter backend is always available and advertises debugging
    /// support plus a non-empty set of supported data types.
    #[test]
    fn test_interpreter_backend() {
        let backend = InterpreterBackend::new();
        assert!(backend.is_available());

        let capabilities = backend.get_capabilities();
        assert!(!capabilities.supported_types.is_empty());
        assert!(capabilities.supports_debugging);
    }
1385
    /// End-to-end: a compiler configured for the interpreter backend accepts
    /// a non-empty high-level kernel source.
    #[test]
    fn test_compilation_with_interpreter() {
        let config = JitConfig {
            backend: JitBackend::Interpreter,
            ..Default::default()
        };

        let compiler = JitCompiler::new(config).expect("Operation failed");

        // Non-empty source — the interpreter rejects empty sources only.
        let source = KernelSource {
            id: "test_kernel".to_string(),
            source: "void test() { /* test kernel */ }".to_string(),
            language: KernelLanguage::HighLevel,
            entry_point: "test".to_string(),
            input_types: vec![],
            output_types: vec![],
            hints: CompilationHints::default(),
        };

        let result = compiler.compile_kernel(source);
        assert!(result.is_ok());
    }
1408
1409    // ── AdaptiveOptimizer tests ──────────────────────────────────────────────
1410
    /// `update_performance_data` should record at least one entry in history.
    ///
    /// A record is pushed unconditionally — even here, where the throughput of
    /// 1e8 ops/s yields improvement = 0.1, which is not > 0.1, so the record's
    /// `success` flag is false.
    #[test]
    fn test_adaptive_optimizer_update_records_history() {
        let mut optimizer = AdaptiveOptimizer::new();

        let perf = KernelPerformance {
            execution_count: 5,
            totalexecution_time: Duration::from_millis(50),
            avgexecution_time: Duration::from_millis(10),
            bestexecution_time: Duration::from_millis(8),
            worstexecution_time: Duration::from_millis(15),
            throughput: 1.0e8,
            energy_efficiency: None,
        };

        optimizer.update_performance_data(&perf);

        // Should have recorded into the aggregate key.
        let history = optimizer
            .optimization_history
            .get("__perf_trends__")
            .expect("Expected history to be populated after update");
        assert_eq!(
            history.len(),
            1,
            "Exactly one record should have been added"
        );
    }
1439
    /// Multiple `update_performance_data` calls accumulate multiple records.
    ///
    /// Each call appends exactly one record to the aggregate key regardless
    /// of the chosen strategy or success flag.
    #[test]
    fn test_adaptive_optimizer_update_accumulates() {
        let mut optimizer = AdaptiveOptimizer::new();

        for i in 0..5u64 {
            let perf = KernelPerformance {
                execution_count: (i + 1) as usize,
                totalexecution_time: Duration::from_millis(10 * (i + 1)),
                avgexecution_time: Duration::from_millis(10),
                bestexecution_time: Duration::from_millis(8),
                worstexecution_time: Duration::from_millis(15),
                throughput: 2.0e9 * (i + 1) as f64, // varying throughput
                energy_efficiency: None,
            };
            optimizer.update_performance_data(&perf);
        }

        let history = optimizer
            .optimization_history
            .get("__perf_trends__")
            .expect("history should exist");
        assert_eq!(history.len(), 5);
    }
1464
    /// `optimize_kernel` with no history should return a default directive.
    ///
    /// With the O2 default config and an empty history, the config-driven
    /// fallback yields "vectorize" plus the "-O2" level flag.
    #[test]
    fn test_adaptive_optimizer_default_directive_no_history() {
        let optimizer = AdaptiveOptimizer::new();
        let config = JitConfig::default(); // O2
        let result = optimizer.optimize_kernel("unknown_kernel", &config);
        assert!(result.is_ok(), "Should succeed even with no history");
        let directive = result.expect("optimizer returned Err unexpectedly");
        assert!(!directive.is_empty(), "Directive string must not be empty");
        assert!(
            directive.contains("-O2"),
            "O2 config should produce -O2 flag, got: {directive}"
        );
    }
1479
    /// `optimize_kernel` after recording successful improvements picks
    /// the strategy with the highest improvement.
    ///
    /// Both updates land under the aggregate "__perf_trends__" key; since
    /// "my_kernel" has no dedicated history, `optimize_kernel` falls back to
    /// that aggregate history.
    #[test]
    fn test_adaptive_optimizer_picks_best_strategy_from_history() {
        let mut optimizer = AdaptiveOptimizer::new();

        // Record a low-throughput run first (triggers ConstantFolding for count < 10).
        // Its improvement is 5.0e7 / 1e9 = 0.05, so `success` is false and the
        // record is filtered out of the best-strategy search entirely.
        let perf_low = KernelPerformance {
            execution_count: 3,
            totalexecution_time: Duration::from_millis(300),
            avgexecution_time: Duration::from_millis(100),
            bestexecution_time: Duration::from_millis(90),
            worstexecution_time: Duration::from_millis(120),
            throughput: 5.0e7, // low
            energy_efficiency: None,
        };
        optimizer.update_performance_data(&perf_low);

        // Record a very high-throughput run (triggers Vectorization).
        let perf_high = KernelPerformance {
            execution_count: 20,
            totalexecution_time: Duration::from_millis(20),
            avgexecution_time: Duration::from_millis(1),
            bestexecution_time: Duration::from_millis(1),
            worstexecution_time: Duration::from_millis(2),
            throughput: 5.0e9, // > 1 GOp/s
            energy_efficiency: None,
        };
        optimizer.update_performance_data(&perf_high);

        let config = JitConfig {
            optimization_level: OptimizationLevel::O3,
            ..Default::default()
        };
        let result = optimizer
            .optimize_kernel("my_kernel", &config)
            .expect("optimize_kernel should succeed");

        // The vectorize record has improvement = 5.0e9 / 1e9 = 5.0 (clamped to 1.0)
        // which is higher than the constfold record, so "vectorize" should appear.
        assert!(
            result.contains("vectorize"),
            "Expected 'vectorize' in directive, got: {result}"
        );
    }
1525}