tenflowers-core 0.1.1

Core tensor operations and execution engine for TenfloweRS
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
//! GPU Performance Benchmarking vs TensorFlow
//!
//! This module provides comprehensive GPU benchmarking capabilities to measure
//! TenfloweRS GPU performance against TensorFlow GPU and work towards achieving
//! 90% of TensorFlow GPU performance as specified in the project goals.

use super::benchmark::{BenchmarkConfig, BenchmarkResult, BenchmarkSuite};
use super::framework_comparison::{FrameworkBenchmarkConfig, FrameworkComparisonResult};
use crate::gpu::performance_optimizer::{
    GpuCapabilities, GpuOpMetrics, GpuPerformanceOptimizer, OptimizationConfig,
};
use crate::{DType, Device, Result, Tensor, TensorError};
use std::collections::HashMap;
use std::process::Command;
use std::sync::Arc;
use std::time::{Duration, Instant};

#[cfg(feature = "serialize")]
use serde::{Deserialize, Serialize};

/// GPU-specific benchmark configuration
#[derive(Debug, Clone)]
pub struct GpuBenchmarkConfig {
    pub base_config: BenchmarkConfig,
    pub gpu_device_ids: Vec<usize>,
    pub test_mixed_precision: bool,
    pub test_tensor_cores: bool,
    pub enable_cuda_graphs: bool,
    pub target_tensorflow_efficiency: f64, // Target: 0.9 for 90%
    pub python_executable: String,
}

impl Default for GpuBenchmarkConfig {
    fn default() -> Self {
        Self {
            base_config: BenchmarkConfig::default(),
            gpu_device_ids: vec![0],
            test_mixed_precision: true,
            test_tensor_cores: true,
            enable_cuda_graphs: true,
            target_tensorflow_efficiency: 0.9,
            python_executable: "python3".to_string(),
        }
    }
}

/// GPU benchmark result comparing TenfloweRS vs TensorFlow GPU
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
pub struct GpuBenchmarkResult {
    pub operation: String,
    pub input_shapes: Vec<Vec<usize>>,
    pub dtype: DType,
    pub device_id: usize,
    pub tenflowers_metrics: GpuOpMetrics,
    pub tensorflow_gpu_time: Option<Duration>,
    pub pytorch_gpu_time: Option<Duration>,
    pub performance_ratio: f64, // TenfloweRS / TensorFlow (target: >= 0.9)
    pub memory_efficiency: f64,
    pub throughput_comparison: HashMap<String, f64>,
    pub bottlenecks_identified: Vec<String>,
    pub optimization_suggestions: Vec<String>,
    pub meets_target: bool, // Whether it meets 90% target
}

/// Comprehensive GPU benchmark suite for TensorFlow comparison
pub struct GpuTensorFlowBenchmark {
    config: GpuBenchmarkConfig,
    gpu_optimizer: Arc<GpuPerformanceOptimizer>,
    benchmark_suite: BenchmarkSuite,
}

impl GpuTensorFlowBenchmark {
    /// Create a new GPU TensorFlow benchmark suite
    pub fn new(config: GpuBenchmarkConfig, gpu_optimizer: Arc<GpuPerformanceOptimizer>) -> Self {
        let benchmark_suite = BenchmarkSuite::new(config.base_config.clone());

        Self {
            config,
            gpu_optimizer,
            benchmark_suite,
        }
    }

    /// Run comprehensive GPU benchmark against TensorFlow
    pub fn run_comprehensive_benchmark(&self) -> Result<Vec<GpuBenchmarkResult>> {
        println!("🚀 Running Comprehensive GPU Benchmark vs TensorFlow");
        println!("Target: Achieve 90% of TensorFlow GPU performance\n");

        let mut results = Vec::new();

        // Core operations to benchmark
        let operations = [
            ("MatMul", vec![vec![1024, 1024], vec![1024, 1024]]),
            ("MatMul", vec![vec![2048, 2048], vec![2048, 2048]]),
            ("MatMul", vec![vec![4096, 4096], vec![4096, 4096]]),
            ("Add", vec![vec![10000000], vec![10000000]]),
            ("Mul", vec![vec![10000000], vec![10000000]]),
            ("Conv2D", vec![vec![32, 256, 256, 3], vec![3, 3, 3, 64]]),
            ("Conv2D", vec![vec![64, 128, 128, 64], vec![3, 3, 64, 128]]),
            ("BatchNorm", vec![vec![32, 128, 128, 64]]),
            ("ReLU", vec![vec![32, 128, 128, 64]]),
            ("Softmax", vec![vec![1024, 10000]]),
        ];

        for &device_id in &self.config.gpu_device_ids {
            println!("Benchmarking GPU Device {device_id}:");

            for (op_name, input_shapes) in &operations {
                println!("  Testing {op_name} with shapes {:?}", input_shapes);

                match self.benchmark_gpu_operation(op_name, input_shapes, device_id) {
                    Ok(result) => {
                        let status = if result.meets_target {
                            "✅ MEETS TARGET"
                        } else {
                            "❌ BELOW TARGET"
                        };
                        println!(
                            "    Performance ratio: {:.3} - {status}",
                            result.performance_ratio
                        );
                        results.push(result);
                    }
                    Err(e) => {
                        println!("    ❌ Benchmark failed: {e}");
                    }
                }
            }
        }

        // Mixed precision benchmarks if enabled
        if self.config.test_mixed_precision {
            println!("\n🔄 Running Mixed Precision Benchmarks (FP16):");
            let fp16_results = self.run_mixed_precision_benchmarks()?;
            results.extend(fp16_results);
        }

        // Generate summary report
        self.print_benchmark_summary(&results);

        Ok(results)
    }

    /// Benchmark a specific GPU operation against TensorFlow
    fn benchmark_gpu_operation(
        &self,
        operation: &str,
        input_shapes: &[Vec<usize>],
        device_id: usize,
    ) -> Result<GpuBenchmarkResult> {
        // Create tensors on GPU
        let mut gpu_tensors = Vec::new();
        for shape in input_shapes {
            let tensor: Tensor<f32> = Tensor::zeros(shape).to(Device::Gpu(device_id))?;
            gpu_tensors.push(tensor);
        }

        // Benchmark TenfloweRS GPU performance
        self.gpu_optimizer.start_profiling(
            operation,
            device_id,
            input_shapes.to_vec(),
            DType::Float32,
        );

        let start = Instant::now();
        let tenflowers_result = self.execute_tenflowers_operation(operation, &gpu_tensors)?;
        let tenflowers_time = start.elapsed();

        // Record GPU metrics
        self.gpu_optimizer.record_kernel_execution(
            tenflowers_time,
            input_shapes
                .iter()
                .map(|s| s.iter().product::<usize>())
                .sum(),
            crate::gpu::performance_optimizer::WorkgroupConfig::default(),
        );

        let gpu_metrics = self
            .gpu_optimizer
            .finish_profiling()
            .ok_or_else(|| TensorError::other("Failed to collect GPU metrics".to_string()))?;

        // Benchmark TensorFlow GPU performance
        let tensorflow_gpu_time = self.benchmark_tensorflow_gpu(operation, input_shapes)?;
        let pytorch_gpu_time = self.benchmark_pytorch_gpu(operation, input_shapes)?;

        // Calculate performance metrics
        let performance_ratio = if let Some(tf_time) = tensorflow_gpu_time {
            tf_time.as_secs_f64() / tenflowers_time.as_secs_f64()
        } else {
            0.0
        };

        let meets_target = performance_ratio >= self.config.target_tensorflow_efficiency;

        // Analyze bottlenecks and generate suggestions
        let bottlenecks = self.gpu_optimizer.analyze_bottlenecks(&gpu_metrics);
        let bottlenecks_identified: Vec<String> =
            bottlenecks.iter().map(|b| b.description.clone()).collect();

        let optimization_suggestions = self
            .gpu_optimizer
            .get_optimization_recommendations(operation);

        // Calculate throughput comparisons
        let mut throughput_comparison = HashMap::new();
        let tenflowers_throughput = input_shapes
            .iter()
            .map(|s| s.iter().product::<usize>())
            .sum::<usize>() as f64
            / tenflowers_time.as_secs_f64();

        throughput_comparison.insert("tenflowers".to_string(), tenflowers_throughput);

        if let Some(tf_time) = tensorflow_gpu_time {
            let tf_throughput = input_shapes
                .iter()
                .map(|s| s.iter().product::<usize>())
                .sum::<usize>() as f64
                / tf_time.as_secs_f64();
            throughput_comparison.insert("tensorflow".to_string(), tf_throughput);
        }

        Ok(GpuBenchmarkResult {
            operation: operation.to_string(),
            input_shapes: input_shapes.to_vec(),
            dtype: DType::Float32,
            device_id,
            tenflowers_metrics: gpu_metrics,
            tensorflow_gpu_time,
            pytorch_gpu_time,
            performance_ratio,
            memory_efficiency: self.calculate_memory_efficiency(&tenflowers_result)?,
            throughput_comparison,
            bottlenecks_identified,
            optimization_suggestions,
            meets_target,
        })
    }

    /// Execute TenfloweRS operation
    fn execute_tenflowers_operation(
        &self,
        operation: &str,
        tensors: &[Tensor<f32>],
    ) -> Result<Tensor<f32>> {
        match operation {
            "MatMul" => {
                if tensors.len() >= 2 {
                    crate::ops::matmul::matmul(&tensors[0], &tensors[1])
                } else {
                    Err(TensorError::invalid_argument(
                        "MatMul requires 2 tensors".to_string(),
                    ))
                }
            }
            "Add" => {
                if tensors.len() >= 2 {
                    crate::ops::binary::add(&tensors[0], &tensors[1])
                } else {
                    Err(TensorError::invalid_argument(
                        "Add requires 2 tensors".to_string(),
                    ))
                }
            }
            "Mul" => {
                if tensors.len() >= 2 {
                    crate::ops::binary::mul(&tensors[0], &tensors[1])
                } else {
                    Err(TensorError::invalid_argument(
                        "Mul requires 2 tensors".to_string(),
                    ))
                }
            }
            "Conv2D" => {
                if tensors.len() >= 2 {
                    crate::ops::conv::conv2d(&tensors[0], &tensors[1], None, (1, 1), "same")
                } else {
                    Err(TensorError::invalid_argument(
                        "Conv2D requires 2 tensors".to_string(),
                    ))
                }
            }
            "BatchNorm" => {
                if !tensors.is_empty() {
                    // Create dummy gamma, beta, running_mean, running_var tensors for benchmark
                    let num_features = tensors[0].shape().dims()[1]; // Assume NCHW format
                    let ones = crate::Tensor::ones(&[num_features]);
                    let zeros = crate::Tensor::zeros(&[num_features]);
                    crate::ops::normalization::batch_norm(
                        &tensors[0],
                        &ones,   // gamma
                        &zeros,  // beta
                        &zeros,  // running_mean
                        &ones,   // running_var
                        1e-5f32, // epsilon
                        true,    // training
                    )
                } else {
                    Err(TensorError::invalid_argument(
                        "BatchNorm requires 1 tensor".to_string(),
                    ))
                }
            }
            "ReLU" => {
                if !tensors.is_empty() {
                    crate::ops::activation::relu(&tensors[0])
                } else {
                    Err(TensorError::invalid_argument(
                        "ReLU requires 1 tensor".to_string(),
                    ))
                }
            }
            "Softmax" => {
                if !tensors.is_empty() {
                    crate::ops::activation::softmax(&tensors[0], Some(-1))
                } else {
                    Err(TensorError::invalid_argument(
                        "Softmax requires 1 tensor".to_string(),
                    ))
                }
            }
            _ => Err(TensorError::invalid_argument(format!(
                "Unknown operation: {operation}"
            ))),
        }
    }

    /// Benchmark TensorFlow GPU performance
    fn benchmark_tensorflow_gpu(
        &self,
        operation: &str,
        input_shapes: &[Vec<usize>],
    ) -> Result<Option<Duration>> {
        let script = self.generate_tensorflow_gpu_script(operation, input_shapes)?;
        let output = Command::new(&self.config.python_executable)
            .arg("-c")
            .arg(&script)
            .output()
            .map_err(|e| {
                TensorError::other(format!("Failed to execute TensorFlow benchmark: {e}"))
            })?;

        if !output.status.success() {
            println!(
                "Warning: TensorFlow GPU benchmark failed: {}",
                String::from_utf8_lossy(&output.stderr)
            );
            return Ok(None);
        }

        let elapsed_ns_str = String::from_utf8_lossy(&output.stdout);
        let elapsed_ns: f64 = elapsed_ns_str
            .trim()
            .parse()
            .map_err(|e| TensorError::other(format!("Failed to parse TensorFlow timing: {e}")))?;

        Ok(Some(Duration::from_nanos(elapsed_ns as u64)))
    }

    /// Benchmark PyTorch GPU performance  
    fn benchmark_pytorch_gpu(
        &self,
        operation: &str,
        input_shapes: &[Vec<usize>],
    ) -> Result<Option<Duration>> {
        let script = self.generate_pytorch_gpu_script(operation, input_shapes)?;
        let output = Command::new(&self.config.python_executable)
            .arg("-c")
            .arg(&script)
            .output()
            .map_err(|e| TensorError::other(format!("Failed to execute PyTorch benchmark: {e}")))?;

        if !output.status.success() {
            println!(
                "Warning: PyTorch GPU benchmark failed: {}",
                String::from_utf8_lossy(&output.stderr)
            );
            return Ok(None);
        }

        let elapsed_ns_str = String::from_utf8_lossy(&output.stdout);
        let elapsed_ns: f64 = elapsed_ns_str
            .trim()
            .parse()
            .map_err(|e| TensorError::other(format!("Failed to parse PyTorch timing: {e}")))?;

        Ok(Some(Duration::from_nanos(elapsed_ns as u64)))
    }

    /// Generate TensorFlow GPU benchmark script
    fn generate_tensorflow_gpu_script(
        &self,
        operation: &str,
        input_shapes: &[Vec<usize>],
    ) -> Result<String> {
        let shape_strs: Vec<String> = input_shapes
            .iter()
            .map(|shape| {
                format!(
                    "[{}]",
                    shape
                        .iter()
                        .map(|x| x.to_string())
                        .collect::<Vec<_>>()
                        .join(", ")
                )
            })
            .collect();

        let setup = format!(
            r#"
import tensorflow as tf
import time
import numpy as np

# Ensure GPU is available
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

with tf.device('/GPU:0'):
    # Create tensors
    tensors = []
    for shape in [{}]:
        tensor = tf.random.normal(shape, dtype=tf.float32)
        tensors.append(tensor)
"#,
            shape_strs.join(", ")
        );

        let operation_code = match operation {
            "MatMul" => {
                "result = tf.linalg.matmul(tensors[0], tensors[1])"
            }
            "Add" => {
                "result = tf.add(tensors[0], tensors[1])"
            }
            "Mul" => {
                "result = tf.multiply(tensors[0], tensors[1])"
            }
            "Conv2D" => {
                r#"
    # Ensure correct shapes for Conv2D: [batch, height, width, channels]
    inputs = tensors[0]  # [batch, height, width, in_channels]
    filters = tf.random.normal([3, 3, inputs.shape[-1], 64], dtype=tf.float32)
    result = tf.nn.conv2d(inputs, filters, strides=[1, 1, 1, 1], padding='SAME')"#
            }
            "BatchNorm" => {
                "result = tf.nn.batch_normalization(tensors[0], mean=tf.reduce_mean(tensors[0]), variance=tf.math.reduce_variance(tensors[0]), offset=None, scale=None, variance_epsilon=1e-5)"
            }
            "ReLU" => {
                "result = tf.nn.relu(tensors[0])"
            }
            "Softmax" => {
                "result = tf.nn.softmax(tensors[0])"
            }
            _ => return Err(TensorError::invalid_argument(format!("Unsupported operation: {operation}"))),
        };

        let script = format!(
            r#"
{setup}
    
    # Warmup
    for _ in range(10):
        {operation_code}
    
    # Benchmark
    start_time = time.perf_counter()
    for _ in range(100):
        {operation_code}
    
    # Ensure execution completes
    tf.experimental.numpy.copy(result)
    end_time = time.perf_counter()
    
    elapsed_ns = (end_time - start_time) * 1e9 / 100
    print(f"{{elapsed_ns:.0f}}")
"#
        );

        Ok(script)
    }

    /// Generate PyTorch GPU benchmark script
    fn generate_pytorch_gpu_script(
        &self,
        operation: &str,
        input_shapes: &[Vec<usize>],
    ) -> Result<String> {
        let shape_strs: Vec<String> = input_shapes
            .iter()
            .map(|shape| {
                format!(
                    "[{}]",
                    shape
                        .iter()
                        .map(|x| x.to_string())
                        .collect::<Vec<_>>()
                        .join(", ")
                )
            })
            .collect();

        let setup = format!(
            r#"
import torch
import time

# Ensure GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu':
    print("0")  # Return 0 if no GPU available
    exit()

# Create tensors on GPU
tensors = []
for shape in [{}]:
    tensor = torch.randn(shape, dtype=torch.float32, device=device)
    tensors.append(tensor)
"#,
            shape_strs.join(", ")
        );

        let operation_code = match operation {
            "MatMul" => "result = torch.matmul(tensors[0], tensors[1])",
            "Add" => "result = torch.add(tensors[0], tensors[1])",
            "Mul" => "result = torch.mul(tensors[0], tensors[1])",
            "Conv2D" => {
                r#"
# Create conv2d layer
conv = torch.nn.Conv2d(tensors[0].shape[1], 64, 3, padding=1).to(device)
result = conv(tensors[0])"#
            }
            "BatchNorm" => {
                r#"
# Create batch norm layer
bn = torch.nn.BatchNorm2d(tensors[0].shape[1]).to(device)
result = bn(tensors[0])"#
            }
            "ReLU" => "result = torch.relu(tensors[0])",
            "Softmax" => "result = torch.softmax(tensors[0], dim=-1)",
            _ => {
                return Err(TensorError::invalid_argument(format!(
                    "Unsupported operation: {operation}"
                )))
            }
        };

        let script = format!(
            r#"
{setup}

# Warmup
for _ in range(10):
    {operation_code}

torch.cuda.synchronize()

# Benchmark
start_time = time.perf_counter()
for _ in range(100):
    {operation_code}

torch.cuda.synchronize()
end_time = time.perf_counter()

elapsed_ns = (end_time - start_time) * 1e9 / 100
print(f"{{elapsed_ns:.0f}}")
"#
        );

        Ok(script)
    }

    /// Run mixed precision benchmarks
    fn run_mixed_precision_benchmarks(&self) -> Result<Vec<GpuBenchmarkResult>> {
        println!("Testing FP16 mixed precision performance...");

        // This would involve creating FP16 tensors and benchmarking
        // For now, return empty results as mixed precision support
        // needs to be implemented at the tensor level
        Ok(Vec::new())
    }

    /// Calculate memory efficiency
    fn calculate_memory_efficiency(&self, _result: &Tensor<f32>) -> Result<f64> {
        // Simplified memory efficiency calculation
        // In a real implementation, this would analyze memory usage patterns
        Ok(0.85) // Placeholder efficiency score
    }

    /// Print comprehensive benchmark summary
    fn print_benchmark_summary(&self, results: &[GpuBenchmarkResult]) {
        println!("\n{}", "=".repeat(80));
        println!("🎯 GPU BENCHMARK SUMMARY - TensorFlow Comparison");
        println!("{}", "=".repeat(80));

        if results.is_empty() {
            println!("No benchmark results available.");
            return;
        }

        let total_tests = results.len();
        let meets_target = results.iter().filter(|r| r.meets_target).count();
        let overall_success_rate = meets_target as f64 / total_tests as f64;

        println!("📊 Overall Performance:");
        println!("  • Total operations tested: {total_tests}");
        println!("  • Operations meeting 90% target: {meets_target}/{total_tests}");
        println!("  • Success rate: {:.1}%", overall_success_rate * 100.0);

        let avg_performance_ratio: f64 =
            results.iter().map(|r| r.performance_ratio).sum::<f64>() / results.len() as f64;

        println!(
            "  • Average performance ratio: {:.3} (target: ≥0.900)",
            avg_performance_ratio
        );

        if avg_performance_ratio >= 0.9 {
            println!("  ✅ OVERALL TARGET ACHIEVED!");
        } else {
            println!("  ❌ Overall target not yet achieved");
            println!(
                "  📈 Performance gap: {:.1}%",
                (0.9 - avg_performance_ratio) * 100.0
            );
        }

        println!("\n📋 Detailed Results:");
        println!("{:-<120}", "");
        println!(
            "| {:^15} | {:^20} | {:^12} | {:^12} | {:^15} | {:^10} |",
            "Operation", "Shapes", "TF GPU (μs)", "TF RS (μs)", "Ratio", "Target Met"
        );
        println!("{:-<120}", "");

        for result in results {
            let tf_time_str = result
                .tensorflow_gpu_time
                .map(|t| format!("{}", t.as_micros()))
                .unwrap_or_else(|| "N/A".to_string());

            let tf_rs_time = result.tenflowers_metrics.total_time.as_micros();
            let ratio_str = format!("{:.3}", result.performance_ratio);
            let target_met = if result.meets_target {
                "✅ Yes"
            } else {
                "❌ No"
            };

            let shapes_str = result
                .input_shapes
                .iter()
                .map(|s| {
                    format!(
                        "[{}]",
                        s.iter()
                            .map(|x| x.to_string())
                            .collect::<Vec<_>>()
                            .join("×")
                    )
                })
                .collect::<Vec<_>>()
                .join(" ");

            println!(
                "| {:^15} | {:^20} | {:^12} | {:^12} | {:^15} | {:^10} |",
                result.operation,
                if shapes_str.len() > 20 {
                    shapes_str[..17].to_string() + "..."
                } else {
                    shapes_str.clone()
                },
                tf_time_str,
                tf_rs_time,
                ratio_str,
                target_met
            );
        }
        println!("{:-<120}", "");

        // Performance improvement recommendations
        println!("\n💡 Performance Improvement Recommendations:");
        let mut all_suggestions: Vec<String> = results
            .iter()
            .flat_map(|r| r.optimization_suggestions.iter())
            .cloned()
            .collect();
        all_suggestions.sort();
        all_suggestions.dedup();

        for (i, suggestion) in all_suggestions.iter().enumerate() {
            println!("  {}. {suggestion}", i + 1);
        }

        // Critical bottlenecks
        println!("\n⚠️ Critical Bottlenecks Identified:");
        let mut all_bottlenecks: Vec<String> = results
            .iter()
            .flat_map(|r| r.bottlenecks_identified.iter())
            .cloned()
            .collect();
        all_bottlenecks.sort();
        all_bottlenecks.dedup();

        for bottleneck in &all_bottlenecks {
            println!("{bottleneck}");
        }

        println!("\n{}", "=".repeat(80));
    }
}

/// Convenience function to run a quick GPU benchmark
pub fn run_quick_gpu_tensorflow_benchmark() -> Result<Vec<GpuBenchmarkResult>> {
    println!("🚀 Quick GPU TensorFlow Benchmark");

    // This is a simplified version for quick testing
    // In practice, you'd need proper GPU context setup
    let config = GpuBenchmarkConfig::default();

    // Mock GPU optimizer for now - in real use this would be properly initialized
    #[allow(invalid_value)]
    let gpu_capabilities = crate::gpu::performance_optimizer::detect_gpu_capabilities(
        &unsafe { std::mem::zeroed() }, // Mock device - unsafe but for testing
    );

    #[allow(invalid_value)]
    let gpu_optimizer = Arc::new(GpuPerformanceOptimizer::new(
        Arc::new(unsafe { std::mem::zeroed() }), // Mock device
        Arc::new(unsafe { std::mem::zeroed() }), // Mock queue
        gpu_capabilities,
        OptimizationConfig::default(),
    ));

    let benchmark = GpuTensorFlowBenchmark::new(config, gpu_optimizer);
    benchmark.run_comprehensive_benchmark()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_gpu_benchmark_config() {
        let config = GpuBenchmarkConfig::default();
        assert_eq!(config.target_tensorflow_efficiency, 0.9);
        assert!(config.test_mixed_precision);
        assert!(config.test_tensor_cores);
    }

    #[test]
    #[ignore = "Requires valid WGPU device - panics with zero-initialized device"]
    #[allow(invalid_value)]
    fn test_tensorflow_script_generation() {
        let config = GpuBenchmarkConfig::default();
        // SAFETY: This test is ignored and only serves as documentation.
        // Real implementation would use proper GPU device initialization.
        let gpu_capabilities =
            crate::gpu::performance_optimizer::detect_gpu_capabilities(&unsafe {
                std::mem::zeroed()
            });
        let gpu_optimizer = Arc::new(GpuPerformanceOptimizer::new(
            Arc::new(unsafe { std::mem::zeroed() }),
            Arc::new(unsafe { std::mem::zeroed() }),
            gpu_capabilities,
            OptimizationConfig::default(),
        ));

        let benchmark = GpuTensorFlowBenchmark::new(config, gpu_optimizer);

        let script = benchmark
            .generate_tensorflow_gpu_script("MatMul", &[vec![1024, 1024], vec![1024, 1024]])
            .expect("test: operation should succeed");

        assert!(script.contains("tf.linalg.matmul"));
        assert!(script.contains("tf.device('/GPU:0')"));
        assert!(script.contains("1024, 1024"));
    }

    #[test]
    #[ignore = "Requires valid WGPU device - panics with zero-initialized device"]
    #[allow(invalid_value)]
    fn test_pytorch_script_generation() {
        let config = GpuBenchmarkConfig::default();
        // SAFETY: This test is ignored and only serves as documentation.
        // Real implementation would use proper GPU device initialization.
        let gpu_capabilities =
            crate::gpu::performance_optimizer::detect_gpu_capabilities(&unsafe {
                std::mem::zeroed()
            });
        let gpu_optimizer = Arc::new(GpuPerformanceOptimizer::new(
            Arc::new(unsafe { std::mem::zeroed() }),
            Arc::new(unsafe { std::mem::zeroed() }),
            gpu_capabilities,
            OptimizationConfig::default(),
        ));

        let benchmark = GpuTensorFlowBenchmark::new(config, gpu_optimizer);

        let script = benchmark
            .generate_pytorch_gpu_script("Add", &[vec![1000000], vec![1000000]])
            .expect("test: operation should succeed");

        assert!(script.contains("torch.add"));
        assert!(script.contains("torch.cuda.is_available"));
        assert!(script.contains("1000000"));
    }
}