1use crate::{Result, TensorError};
7use scirs2_core::profiling::Profiler;
8use std::sync::{Arc, Mutex, OnceLock};
9
10#[cfg(target_arch = "aarch64")]
11use std::arch::aarch64::*;
12
/// Cache-line-aligned (64-byte) SIMD dispatch engine that selects optimized
/// kernels at runtime based on detected CPU features.
#[repr(C, align(64))] pub struct UltraSimdEngine {
    /// CPU capabilities detected once at construction (see `detect_cpu_features`).
    cpu_features: Arc<CpuFeatures>,
    /// Shared profiler handle. NOTE(review): stored but never consulted in
    /// this file — confirm whether it is used elsewhere or can be dropped.
    profiler: Arc<Profiler>,
    /// Registered kernels, grouped by operation category; populated during `new`.
    operation_registry: Arc<Mutex<SimdOperationRegistry>>,
    /// Caller-supplied tuning options.
    config: SimdEngineConfig,
}
25
/// Snapshot of CPU SIMD capabilities and cache geometry used for kernel selection.
#[derive(Debug, Clone)]
pub struct CpuFeatures {
    /// AVX-512 Foundation (x86_64).
    pub has_avx512f: bool,
    /// AVX-512 Doubleword/Quadword extensions.
    pub has_avx512dq: bool,
    /// AVX-512 Byte/Word extensions.
    pub has_avx512bw: bool,
    /// AVX-512 Vector Length extensions.
    pub has_avx512vl: bool,
    /// AVX2 (x86_64).
    pub has_avx2: bool,
    /// Fused multiply-add (x86_64).
    pub has_fma: bool,
    /// Arm NEON (aarch64); set from compile-time target features only.
    pub has_neon: bool,
    /// Arm SVE. NOTE(review): never set by `detect_cpu_features` — always false.
    pub has_sve: bool,
    // Cache sizes in bytes. NOTE(review): hardcoded defaults (32 KiB / 256 KiB /
    // 8 MiB), not queried from hardware — confirm before using for blocking.
    pub l1_cache_size: usize,
    pub l2_cache_size: usize,
    pub l3_cache_size: usize,
    /// Widest usable SIMD register in bits (128 / 256 / 512).
    pub max_vector_width: usize,
    /// Number of architectural SIMD registers.
    pub simd_register_count: usize,
}
50
/// Tuning knobs for the SIMD engine.
///
/// NOTE(review): only construction consumes this struct in the visible code;
/// confirm which fields are actually honored by callers elsewhere.
#[derive(Debug, Clone)]
pub struct SimdEngineConfig {
    /// Allow aggressive (potentially accuracy-affecting) optimizations.
    pub enable_aggressive_opts: bool,
    /// Minimum element count before SIMD paths are considered.
    pub simd_threshold: usize,
    /// Detect CPU features at runtime instead of trusting compile-time flags.
    pub enable_runtime_detection: bool,
    /// Collect profiling data for kernel execution.
    pub enable_profiling: bool,
    /// Preferred SIMD register width in bits.
    pub preferred_vector_width: usize,
    /// Permit `unsafe`-backed fast paths.
    pub enable_unsafe_opts: bool,
}
67
/// Kernel storage, one list per operation category.
///
/// Lists are searched in insertion order by `select_optimal_kernel`, so
/// registration order doubles as preference order.
struct SimdOperationRegistry {
    matmul_kernels: Vec<SimdKernel>,
    elementwise_kernels: Vec<SimdKernel>,
    reduction_kernels: Vec<SimdKernel>,
    convolution_kernels: Vec<SimdKernel>,
}
79
/// Description of a single registered compute kernel.
#[derive(Debug, Clone)]
pub struct SimdKernel {
    /// Human-readable identifier, e.g. "avx2_matmul_f32".
    pub name: String,
    /// Feature strings (as understood by `kernel_supports_features`) that must
    /// all be present for this kernel to run; empty means portable fallback.
    pub required_features: Vec<String>,
    /// Inclusive (min, max) problem-size range this kernel is preferred for.
    pub optimal_size_range: (usize, usize),
    /// Static performance estimates used for reporting.
    pub performance_profile: KernelPerformanceProfile,
    /// The function pointer, tagged by operation category.
    pub kernel_fn: KernelFunction,
}
94
/// Performance characteristics attached to a kernel at registration time.
///
/// NOTE(review): every value in this file is a static estimate written into
/// the registration code, not a measurement.
#[derive(Debug, Clone)]
pub struct KernelPerformanceProfile {
    /// Estimated operations per second.
    pub ops_per_second: f64,
    /// Fraction of peak memory bandwidth utilized (0.0..=1.0).
    pub memory_bandwidth_utilization: f64,
    /// Cache hit efficiency estimate (0.0..=1.0).
    pub cache_efficiency: f64,
    /// Energy efficiency estimate (0.0..=1.0).
    pub energy_efficiency: f64,
    /// Fraction of SIMD lanes kept busy (0.0..=1.0).
    pub simd_utilization: f64,
}
109
/// Matmul kernel signature: (a, b, c, m, n, k), row-major slices.
type MatMulKernel = fn(&[f32], &[f32], &mut [f32], usize, usize, usize) -> Result<()>;
/// Elementwise kernel signature: writes op(a[i], b[i]) into c.
type ElementWiseKernel = fn(&[f32], &[f32], &mut [f32], ElementWiseOp) -> Result<()>;
/// Reduction kernel signature: folds the input slice into the output scalar.
type ReductionKernel = fn(&[f32], &mut f32, ReductionOp) -> Result<()>;
/// Convolution kernel signature: (input, kernel, output, params).
type ConvolutionKernel = fn(&[f32], &[f32], &mut [f32], ConvolutionParams) -> Result<()>;

/// A kernel function pointer tagged with its operation category so the
/// dispatchers can verify they received the matching kind at runtime.
#[derive(Debug, Clone)]
pub enum KernelFunction {
    MatMul(MatMulKernel),
    ElementWise(ElementWiseKernel),
    Reduction(ReductionKernel),
    Convolution(ConvolutionKernel),
}
124
/// Binary/unary element-wise operations supported by the elementwise kernels.
///
/// For the unary variants (Sigmoid, Tanh, Relu, Exp, Log, Sqrt) the kernels
/// in this file apply the function to `a[i]` and ignore `b[i]`.
#[derive(Debug, Clone, Copy)]
pub enum ElementWiseOp {
    Add,
    Mul,
    Sub,
    Div,
    Max,
    Min,
    Sigmoid,
    Tanh,
    Relu,
    Exp,
    Log,
    Sqrt,
}
141
/// Reduction operations a `ReductionKernel` may implement.
///
/// NOTE(review): no reduction kernels are registered in this file, so these
/// variants are currently unexercised here.
#[derive(Debug, Clone, Copy)]
pub enum ReductionOp {
    Sum,
    Product,
    Max,
    Min,
    Mean,
    Norm1,
    Norm2,
}
153
/// Geometry parameters for a 2-D convolution, each as (height, width) /
/// (vertical, horizontal) pairs.
#[derive(Debug, Clone)]
pub struct ConvolutionParams {
    pub kernel_size: (usize, usize),
    pub stride: (usize, usize),
    pub padding: (usize, usize),
    pub dilation: (usize, usize),
}
162
163impl UltraSimdEngine {
164 pub fn new(config: SimdEngineConfig) -> Result<Self> {
166 let cpu_features = Arc::new(Self::detect_cpu_features()?);
167 let profiler = Arc::new(Profiler::new());
168 let operation_registry = Arc::new(Mutex::new(SimdOperationRegistry::new()));
169
170 let mut engine = Self {
171 cpu_features,
172 profiler,
173 operation_registry,
174 config,
175 };
176
177 engine.initialize_optimized_kernels()?;
179
180 Ok(engine)
181 }
182
    /// Detects SIMD capabilities of the current CPU.
    ///
    /// On x86_64 this probes AVX-512F/DQ/BW/VL, AVX2 and FMA at runtime via
    /// `is_x86_feature_detected!`. On aarch64, NEON is taken from the
    /// compile-time `target_feature` only (no runtime probe), and SVE is
    /// never detected here — `has_sve` always remains `false`.
    ///
    /// NOTE(review): cache sizes are fixed defaults (32 KiB / 256 KiB / 8 MiB),
    /// not read from the hardware — confirm before relying on them.
    fn detect_cpu_features() -> Result<CpuFeatures> {
        // Start from defaults (256-bit width, 16 registers, no features) and
        // upgrade per-architecture below.
        let mut features = CpuFeatures {
            has_avx512f: false,
            has_avx512dq: false,
            has_avx512bw: false,
            has_avx512vl: false,
            has_avx2: false,
            has_fma: false,
            has_neon: false,
            has_sve: false,
            l1_cache_size: 32768,
            l2_cache_size: 262144,
            l3_cache_size: 8388608,
            max_vector_width: 256,
            simd_register_count: 16,
        };

        #[cfg(target_arch = "x86_64")]
        {
            if is_x86_feature_detected!("avx512f") {
                features.has_avx512f = true;
                // 512-bit ZMM registers, 32 of them.
                features.max_vector_width = 512;
                features.simd_register_count = 32;
            }
            if is_x86_feature_detected!("avx512dq") {
                features.has_avx512dq = true;
            }
            if is_x86_feature_detected!("avx512bw") {
                features.has_avx512bw = true;
            }
            if is_x86_feature_detected!("avx512vl") {
                features.has_avx512vl = true;
            }
            if is_x86_feature_detected!("avx2") {
                features.has_avx2 = true;
                // Only pin the width to 256 when AVX-512 did not already set 512.
                if !features.has_avx512f {
                    features.max_vector_width = 256;
                }
            }
            if is_x86_feature_detected!("fma") {
                features.has_fma = true;
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            // Compile-time check only: a binary built without `neon` in its
            // target features reports has_neon = false even on NEON hardware.
            #[cfg(target_feature = "neon")]
            {
                features.has_neon = true;
                // 128-bit Q registers, 32 of them.
                features.max_vector_width = 128;
                features.simd_register_count = 32;
            }

        }

        Ok(features)
    }
243
244 fn initialize_optimized_kernels(&mut self) -> Result<()> {
246 let mut registry = self.operation_registry.lock().map_err(|_| {
247 TensorError::compute_error_simple("Failed to lock operation registry".to_string())
248 })?;
249
250 if self.cpu_features.has_avx512f {
252 self.initialize_avx512_kernels(&mut registry)?;
253 }
254
255 if self.cpu_features.has_neon {
257 self.initialize_neon_kernels(&mut registry)?;
258 }
259
260 if self.cpu_features.has_avx2 {
262 self.initialize_avx2_kernels(&mut registry)?;
263 }
264
265 self.initialize_scalar_kernels(&mut registry)?;
267
268 Ok(())
269 }
270
    /// Registers AVX-512-class kernels.
    ///
    /// NOTE(review): the registered `kernel_fn`s are placeholders — the matmul
    /// entry points at `avx2_matmul_f32` and the elementwise entry at
    /// `scalar_elementwise_f32`, so the advertised "avx512_*" names do not yet
    /// dispatch to real AVX-512 code.
    fn initialize_avx512_kernels(&self, registry: &mut SimdOperationRegistry) -> Result<()> {
        registry.matmul_kernels.push(SimdKernel {
            name: "avx512_matmul_f32".to_string(),
            required_features: vec!["avx512f".to_string()],
            // Preferred for large problems; smaller sizes fall through to
            // other kernels during selection.
            optimal_size_range: (256, usize::MAX),
            // NOTE(review): static estimates, not measured values.
            performance_profile: KernelPerformanceProfile {
                ops_per_second: 4e12,
                memory_bandwidth_utilization: 0.95,
                cache_efficiency: 0.9,
                energy_efficiency: 0.85,
                simd_utilization: 0.98,
            },
            kernel_fn: KernelFunction::MatMul(avx2_matmul_f32), // placeholder, see note above
        });

        registry.elementwise_kernels.push(SimdKernel {
            name: "avx512_elementwise_f32".to_string(),
            required_features: vec!["avx512f".to_string()],
            optimal_size_range: (64, usize::MAX),
            // NOTE(review): static estimates, not measured values.
            performance_profile: KernelPerformanceProfile {
                ops_per_second: 8e12,
                memory_bandwidth_utilization: 0.92,
                cache_efficiency: 0.88,
                energy_efficiency: 0.9,
                simd_utilization: 0.96,
            },
            kernel_fn: KernelFunction::ElementWise(scalar_elementwise_f32), // placeholder, see note above
        });

        Ok(())
    }
305
    /// Registers NEON-class kernels (aarch64).
    ///
    /// NOTE(review): both entries currently point at portable fallbacks
    /// (`scalar_matmul_f32` / `scalar_elementwise_f32`), so the "neon_*"
    /// names do not yet dispatch to NEON intrinsics.
    fn initialize_neon_kernels(&self, registry: &mut SimdOperationRegistry) -> Result<()> {
        registry.matmul_kernels.push(SimdKernel {
            name: "neon_matmul_f32".to_string(),
            required_features: vec!["neon".to_string()],
            optimal_size_range: (64, usize::MAX),
            // NOTE(review): static estimates, not measured values.
            performance_profile: KernelPerformanceProfile {
                ops_per_second: 1.5e12,
                memory_bandwidth_utilization: 0.88,
                cache_efficiency: 0.85,
                energy_efficiency: 0.95,
                simd_utilization: 0.92,
            },
            kernel_fn: KernelFunction::MatMul(scalar_matmul_f32), // placeholder, see note above
        });

        registry.elementwise_kernels.push(SimdKernel {
            name: "neon_elementwise_f32".to_string(),
            required_features: vec!["neon".to_string()],
            optimal_size_range: (32, usize::MAX),
            // NOTE(review): static estimates, not measured values.
            performance_profile: KernelPerformanceProfile {
                ops_per_second: 3e12,
                memory_bandwidth_utilization: 0.85,
                cache_efficiency: 0.8,
                energy_efficiency: 0.98,
                simd_utilization: 0.9,
            },
            kernel_fn: KernelFunction::ElementWise(scalar_elementwise_f32), // placeholder, see note above
        });

        Ok(())
    }
340
    /// Registers AVX2-class kernels.
    ///
    /// NOTE(review): `avx2_matmul_f32` is itself a portable scalar loop at
    /// present — see its definition.
    fn initialize_avx2_kernels(&self, registry: &mut SimdOperationRegistry) -> Result<()> {
        registry.matmul_kernels.push(SimdKernel {
            name: "avx2_matmul_f32".to_string(),
            required_features: vec!["avx2".to_string()],
            // Bounded above, unlike the AVX-512/NEON entries.
            optimal_size_range: (128, 10000),
            // NOTE(review): static estimates, not measured values.
            performance_profile: KernelPerformanceProfile {
                ops_per_second: 2e12,
                memory_bandwidth_utilization: 0.8,
                cache_efficiency: 0.75,
                energy_efficiency: 0.8,
                simd_utilization: 0.85,
            },
            kernel_fn: KernelFunction::MatMul(avx2_matmul_f32),
        });

        Ok(())
    }
359
    /// Registers the portable scalar fallback kernels (no feature requirements).
    ///
    /// NOTE(review): only a matmul fallback is registered. There is no
    /// feature-free elementwise/reduction/convolution kernel, so
    /// `select_optimal_kernel` can fail for those operations when no
    /// feature-gated kernel applies.
    fn initialize_scalar_kernels(&self, registry: &mut SimdOperationRegistry) -> Result<()> {
        registry.matmul_kernels.push(SimdKernel {
            name: "scalar_matmul_f32".to_string(),
            // Empty requirements: runs on any CPU and serves as the
            // last-resort fallback in kernel selection.
            required_features: vec![],
            optimal_size_range: (1, 256),
            // NOTE(review): static estimates, not measured values.
            performance_profile: KernelPerformanceProfile {
                ops_per_second: 1e11,
                memory_bandwidth_utilization: 0.6,
                cache_efficiency: 0.7,
                energy_efficiency: 0.9,
                simd_utilization: 0.0,
            },
            kernel_fn: KernelFunction::MatMul(scalar_matmul_f32),
        });

        Ok(())
    }
378
379 pub fn select_optimal_kernel(&self, operation: &str, data_size: usize) -> Result<SimdKernel> {
381 let registry = self.operation_registry.lock().map_err(|_| {
382 TensorError::compute_error_simple("Failed to lock operation registry".to_string())
383 })?;
384
385 let kernels = match operation {
386 "matmul" => ®istry.matmul_kernels,
387 "elementwise" => ®istry.elementwise_kernels,
388 "reduction" => ®istry.reduction_kernels,
389 "convolution" => ®istry.convolution_kernels,
390 _ => {
391 return Err(TensorError::compute_error_simple(
392 "Unknown operation".to_string(),
393 ))
394 }
395 };
396
397 for kernel in kernels {
399 if self.kernel_supports_features(kernel)
400 && data_size >= kernel.optimal_size_range.0
401 && data_size <= kernel.optimal_size_range.1
402 {
403 return Ok(kernel.clone());
404 }
405 }
406
407 for kernel in kernels {
409 if kernel.required_features.is_empty() {
410 return Ok(kernel.clone());
411 }
412 }
413
414 Err(TensorError::compute_error_simple(
415 "No suitable kernel found".to_string(),
416 ))
417 }
418
419 fn kernel_supports_features(&self, kernel: &SimdKernel) -> bool {
421 for feature in &kernel.required_features {
422 match feature.as_str() {
423 "avx512f" => {
424 if !self.cpu_features.has_avx512f {
425 return false;
426 }
427 }
428 "avx512dq" => {
429 if !self.cpu_features.has_avx512dq {
430 return false;
431 }
432 }
433 "avx2" => {
434 if !self.cpu_features.has_avx2 {
435 return false;
436 }
437 }
438 "fma" => {
439 if !self.cpu_features.has_fma {
440 return false;
441 }
442 }
443 "neon" => {
444 if !self.cpu_features.has_neon {
445 return false;
446 }
447 }
448 _ => return false,
449 }
450 }
451 true
452 }
453
454 #[inline(always)]
456 pub fn optimized_matmul(
457 &self,
458 a: &[f32],
459 b: &[f32],
460 c: &mut [f32],
461 m: usize,
462 n: usize,
463 k: usize,
464 ) -> Result<()> {
465 let data_size = m * n * k;
466 let kernel = self.select_optimal_kernel("matmul", data_size)?;
467
468 if let KernelFunction::MatMul(kernel_fn) = kernel.kernel_fn {
469 kernel_fn(a, b, c, m, n, k)
470 } else {
471 Err(TensorError::compute_error_simple(
472 "Invalid kernel function type".to_string(),
473 ))
474 }
475 }
476
    /// Applies `op` element-wise over `a` and `b`, writing results into `c`.
    ///
    /// NOTE(review): unlike `optimized_matmul`, this bypasses
    /// `select_optimal_kernel` and always calls the scalar implementation.
    /// Every registered elementwise kernel in this file points at
    /// `scalar_elementwise_f32` anyway, and no feature-free elementwise
    /// kernel exists, so registry dispatch would fail for inputs below the
    /// registered size thresholds — confirm before "fixing" this to dispatch.
    ///
    /// NOTE(review): lengths are not validated here; the scalar kernel
    /// indexes `b` and `c` by `a`'s length and panics if they are shorter.
    #[inline(always)]
    pub fn optimized_elementwise(
        &self,
        a: &[f32],
        b: &[f32],
        c: &mut [f32],
        op: ElementWiseOp,
    ) -> Result<()> {
        scalar_elementwise_f32(a, b, c, op)
    }
489
490 pub fn get_performance_stats(&self) -> Result<SimdPerformanceStats> {
492 let registry = self.operation_registry.lock().map_err(|_| {
493 TensorError::compute_error_simple("Failed to lock operation registry".to_string())
494 })?;
495
496 let total_kernels = registry.matmul_kernels.len()
497 + registry.elementwise_kernels.len()
498 + registry.reduction_kernels.len()
499 + registry.convolution_kernels.len();
500
501 Ok(SimdPerformanceStats {
502 cpu_features: (*self.cpu_features).clone(),
503 total_kernels_available: total_kernels,
504 max_theoretical_throughput: self.calculate_max_throughput(),
505 simd_utilization_efficiency: self.calculate_simd_efficiency(),
506 memory_bandwidth_utilization: 0.9,
507 cache_efficiency: 0.85,
508 })
509 }
510
511 fn calculate_max_throughput(&self) -> f64 {
512 let base_throughput = 1e12; let vector_multiplier = self.cpu_features.max_vector_width as f64 / 128.0;
515 let feature_multiplier = if self.cpu_features.has_avx512f {
516 4.0
517 } else if self.cpu_features.has_avx2 {
518 2.0
519 } else {
520 1.0
521 };
522
523 base_throughput * vector_multiplier * feature_multiplier
524 }
525
526 fn calculate_simd_efficiency(&self) -> f64 {
527 if self.cpu_features.has_avx512f {
529 0.95
530 } else if self.cpu_features.has_avx2 {
531 0.85
532 } else if self.cpu_features.has_neon {
533 0.9
534 } else {
535 0.0
536 }
537 }
538}
539
/// Snapshot returned by `UltraSimdEngine::get_performance_stats`.
///
/// NOTE(review): the throughput/efficiency figures are heuristics derived
/// from detected features, not measurements.
#[derive(Debug, Clone)]
pub struct SimdPerformanceStats {
    /// Copy of the detected CPU capabilities.
    pub cpu_features: CpuFeatures,
    /// Total kernels registered across all operation categories.
    pub total_kernels_available: usize,
    /// Estimated peak throughput (see `calculate_max_throughput`).
    pub max_theoretical_throughput: f64,
    /// Estimated SIMD lane utilization (see `calculate_simd_efficiency`).
    pub simd_utilization_efficiency: f64,
    /// Static estimate (0.9 in this file).
    pub memory_bandwidth_utilization: f64,
    /// Static estimate (0.85 in this file).
    pub cache_efficiency: f64,
}
550
551impl SimdOperationRegistry {
552 fn new() -> Self {
553 Self {
554 matmul_kernels: Vec::new(),
555 elementwise_kernels: Vec::new(),
556 reduction_kernels: Vec::new(),
557 convolution_kernels: Vec::new(),
558 }
559 }
560}
561
562impl Default for SimdEngineConfig {
563 fn default() -> Self {
564 Self {
565 enable_aggressive_opts: true,
566 simd_threshold: 64,
567 enable_runtime_detection: true,
568 enable_profiling: true,
569 preferred_vector_width: 256,
570 enable_unsafe_opts: false,
571 }
572 }
573}
574
575static GLOBAL_SIMD_ENGINE: OnceLock<Arc<Mutex<UltraSimdEngine>>> = OnceLock::new();
577
578pub fn global_simd_engine() -> Arc<Mutex<UltraSimdEngine>> {
580 GLOBAL_SIMD_ENGINE
581 .get_or_init(|| {
582 let config = SimdEngineConfig::default();
583 let engine = UltraSimdEngine::new(config).expect("Failed to create SIMD engine");
584 Arc::new(Mutex::new(engine))
585 })
586 .clone()
587}
588
589#[cfg(target_arch = "x86_64")]
593fn avx512_matmul_f32(
594 a: &[f32],
595 b: &[f32],
596 c: &mut [f32],
597 m: usize,
598 n: usize,
599 k: usize,
600) -> Result<()> {
601 avx2_matmul_f32(a, b, c, m, n, k)
604}
605
/// NEON-accelerated f32 matmul: computes `c = a * b` where `a` is m×k, `b` is
/// k×n and `c` is m×n, all row-major. Falls back to `scalar_matmul_f32` when
/// NEON is not compiled in.
///
/// The vector loop broadcasts each `a[i][l]` against four consecutive
/// elements of row `l` of `b` via fused multiply-add, producing four output
/// columns at a time; leftover columns use a scalar tail.
///
/// Fix: the previous implementation loaded `b` row-wise where column-strided
/// values were needed (multiplying unrelated elements), wrote the same sums
/// into every row of its output tile, and accumulated into whatever `c`
/// already held. `c` is now fully overwritten, matching the contract of the
/// other matmul kernels in this file.
///
/// # Panics
/// Panics if any slice is smaller than its `m`/`n`/`k` dimensions imply.
#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
fn neon_matmul_f32(
    a: &[f32],
    b: &[f32],
    c: &mut [f32],
    m: usize,
    n: usize,
    k: usize,
) -> Result<()> {
    #[cfg(target_feature = "neon")]
    {
        assert!(a.len() >= m * k, "matrix A smaller than m*k");
        assert!(b.len() >= k * n, "matrix B smaller than k*n");
        assert!(c.len() >= m * n, "matrix C smaller than m*n");

        // SAFETY: the asserts above guarantee every pointer offset computed
        // below stays within the corresponding slice.
        unsafe {
            for i in 0..m {
                let mut j = 0;
                // Vectorized main loop: 4 output columns per iteration.
                while j + 4 <= n {
                    let mut acc = vdupq_n_f32(0.0);
                    for l in 0..k {
                        // acc += b[l][j..j+4] * a[i][l]
                        let b_vec = vld1q_f32(b.as_ptr().add(l * n + j));
                        acc = vfmaq_n_f32(acc, b_vec, a[i * k + l]);
                    }
                    vst1q_f32(c.as_mut_ptr().add(i * n + j), acc);
                    j += 4;
                }
                // Scalar tail for the remaining (n % 4) columns.
                for jj in j..n {
                    let mut sum = 0.0;
                    for l in 0..k {
                        sum += a[i * k + l] * b[l * n + jj];
                    }
                    c[i * n + jj] = sum;
                }
            }
        }

        Ok(())
    }

    #[cfg(not(target_feature = "neon"))]
    {
        scalar_matmul_f32(a, b, c, m, n, k)
    }
}
685
686fn avx2_matmul_f32(
688 a: &[f32],
689 b: &[f32],
690 c: &mut [f32],
691 m: usize,
692 n: usize,
693 k: usize,
694) -> Result<()> {
695 for i in 0..m {
698 for j in 0..n {
699 let mut sum = 0.0;
700 for l in 0..k {
701 sum += a[i * k + l] * b[l * n + j];
702 }
703 c[i * n + j] = sum;
704 }
705 }
706 Ok(())
707}
708
709fn scalar_matmul_f32(
711 a: &[f32],
712 b: &[f32],
713 c: &mut [f32],
714 m: usize,
715 n: usize,
716 k: usize,
717) -> Result<()> {
718 for i in 0..m {
719 for j in 0..n {
720 let mut sum = 0.0;
721 for l in 0..k {
722 sum += a[i * k + l] * b[l * n + j];
723 }
724 c[i * n + j] = sum;
725 }
726 }
727 Ok(())
728}
729
/// AVX-512 elementwise entry point.
///
/// NOTE(review): placeholder — delegates to the scalar kernel; no AVX-512
/// intrinsics are used yet.
#[allow(dead_code)]
fn avx512_elementwise_f32(a: &[f32], b: &[f32], c: &mut [f32], op: ElementWiseOp) -> Result<()> {
    scalar_elementwise_f32(a, b, c, op)
}
737
/// NEON elementwise entry point.
///
/// NOTE(review): placeholder — delegates to the scalar kernel; no NEON
/// intrinsics are used yet.
#[allow(dead_code)]
fn neon_elementwise_f32(a: &[f32], b: &[f32], c: &mut [f32], op: ElementWiseOp) -> Result<()> {
    scalar_elementwise_f32(a, b, c, op)
}
745
746#[inline(always)]
748fn scalar_elementwise_f32(a: &[f32], b: &[f32], c: &mut [f32], op: ElementWiseOp) -> Result<()> {
749 let len = a.len();
751 let chunks = len / 4;
752 let _remainder = len % 4;
753
754 for chunk in 0..chunks {
756 let base = chunk * 4;
757 for i in base..base + 4 {
758 c[i] = match op {
759 ElementWiseOp::Add => a[i] + b[i],
760 ElementWiseOp::Mul => a[i] * b[i],
761 ElementWiseOp::Sub => a[i] - b[i],
762 ElementWiseOp::Div => a[i] / b[i],
763 ElementWiseOp::Max => a[i].max(b[i]),
764 ElementWiseOp::Min => a[i].min(b[i]),
765 ElementWiseOp::Sigmoid => 1.0 / (1.0 + (-a[i]).exp()),
766 ElementWiseOp::Tanh => a[i].tanh(),
767 ElementWiseOp::Relu => a[i].max(0.0),
768 ElementWiseOp::Exp => a[i].exp(),
769 ElementWiseOp::Log => a[i].ln(),
770 ElementWiseOp::Sqrt => a[i].sqrt(),
771 };
772 }
773 }
774
775 for i in chunks * 4..len {
777 c[i] = match op {
778 ElementWiseOp::Add => a[i] + b[i],
779 ElementWiseOp::Mul => a[i] * b[i],
780 ElementWiseOp::Sub => a[i] - b[i],
781 ElementWiseOp::Div => a[i] / b[i],
782 ElementWiseOp::Max => a[i].max(b[i]),
783 ElementWiseOp::Min => a[i].min(b[i]),
784 ElementWiseOp::Sigmoid => 1.0 / (1.0 + (-a[i]).exp()),
785 ElementWiseOp::Tanh => a[i].tanh(),
786 ElementWiseOp::Relu => a[i].max(0.0),
787 ElementWiseOp::Exp => a[i].exp(),
788 ElementWiseOp::Log => a[i].ln(),
789 ElementWiseOp::Sqrt => a[i].sqrt(),
790 };
791 }
792 Ok(())
793}
794
#[cfg(test)]
mod tests {
    use super::*;

    // Engine construction should succeed on any supported host.
    #[test]
    fn test_simd_engine_creation() {
        let config = SimdEngineConfig::default();
        let engine = UltraSimdEngine::new(config);
        assert!(engine.is_ok());
    }

    // Feature detection must report at least the conservative defaults
    // (128-bit vectors, 16 registers) regardless of host CPU.
    #[test]
    fn test_cpu_feature_detection() {
        let features = UltraSimdEngine::detect_cpu_features();
        assert!(features.is_ok());

        let features = features.expect("test: operation should succeed");
        assert!(features.max_vector_width >= 128);
        assert!(features.simd_register_count >= 16);
    }

    // The scalar matmul fallback guarantees selection succeeds for "matmul".
    #[test]
    fn test_kernel_selection() {
        let config = SimdEngineConfig::default();
        let engine = UltraSimdEngine::new(config).expect("test: new should succeed");

        let kernel = engine.select_optimal_kernel("matmul", 1024);
        assert!(kernel.is_ok());
    }

    // 4x4 all-ones times all-twos: every output element should be positive (8.0).
    #[test]
    fn test_optimized_matmul() {
        let config = SimdEngineConfig::default();
        let engine = UltraSimdEngine::new(config).expect("test: new should succeed");

        let a = vec![1.0; 16];
        let b = vec![2.0; 16];
        let mut c = vec![0.0; 16];

        let result = engine.optimized_matmul(&a, &b, &mut c, 4, 4, 4);
        assert!(result.is_ok());

        for value in &c {
            assert!(*value > 0.0);
        }
    }

    // Element-wise Add with exact expected output.
    #[test]
    fn test_optimized_elementwise() {
        let config = SimdEngineConfig::default();
        let engine = UltraSimdEngine::new(config).expect("test: new should succeed");

        let a = vec![1.0, 2.0, 3.0, 4.0];
        let b = vec![2.0, 3.0, 4.0, 5.0];
        let mut c = vec![0.0; 4];

        let result = engine.optimized_elementwise(&a, &b, &mut c, ElementWiseOp::Add);
        assert!(result.is_ok());

        assert_eq!(c, vec![3.0, 5.0, 7.0, 9.0]);
    }

    // The global accessor must hand out the same shared instance every time.
    #[test]
    fn test_global_simd_engine() {
        let engine1 = global_simd_engine();
        let engine2 = global_simd_engine();

        assert!(Arc::ptr_eq(&engine1, &engine2));
    }

    // Stats should reflect at least the always-registered scalar kernel.
    #[test]
    fn test_performance_stats() {
        let config = SimdEngineConfig::default();
        let engine = UltraSimdEngine::new(config).expect("test: new should succeed");

        let stats = engine.get_performance_stats();
        assert!(stats.is_ok());

        let stats = stats.expect("test: operation should succeed");
        assert!(stats.total_kernels_available > 0);
        assert!(stats.max_theoretical_throughput > 0.0);
    }
}