scirs2-linalg 0.4.0

Linear algebra module for SciRS2 (scirs2-linalg)
//! Advanced GPU tensor core scheduling and task execution
//!
//! This module implements sophisticated scheduling algorithms including:
//! - Tensor core-aware operation scheduling
//! - Performance monitoring and optimization
//! - Bandwidth prediction and resource management
//! - Multi-objective scheduling strategies
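//!
//! # Example
//!
//! A minimal usage sketch (marked `ignore` because the exact module path and the
//! construction of complete `TensorCoreOperation`s depend on the surrounding crate;
//! `operations` below stands for a slice of operations built elsewhere):
//!
//! ```ignore
//! let mut scheduler = AdvancedGpuTensorCoreScheduler::<f32>::new()?;
//! scheduler.add_tensor_core_unit(TensorCoreUnit {
//!     id: 0,
//!     supported_types: vec![ElementType::F32, ElementType::F16],
//!     peak_throughput: 100.0, // TOPS
//!     utilization: 0.0,
//!     temperature: 55.0,
//! });
//!
//! // Operations are analyzed and ordered by the active algorithm
//! // (throughput-optimal by default); the returned indices are the schedule.
//! let order = scheduler.schedule_operations(&operations)?;
//! ```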

use super::kernels::{ElementType, GpuOperationType, TensorShape};
use super::memory::{MemoryAccessPattern, TensorCorePrecision};
use crate::error::LinalgResult;
use scirs2_core::ndarray::Array2;
use std::collections::VecDeque;
use std::time::Instant;

/// Advanced GPU tensor core scheduler
#[derive(Debug)]
pub struct AdvancedGpuTensorCoreScheduler<T>
where
    T: Clone,
{
    /// Tensor core units
    tensor_core_units: Vec<TensorCoreUnit>,
    /// Scheduling algorithm
    scheduling_algorithm: TensorCoreSchedulingAlgorithm,
    /// Operation queue
    operation_queue: VecDeque<TensorCoreOperation<T>>,
    /// Performance monitor
    performance_monitor: TensorCorePerformanceMonitor,
}

/// Tensor core unit information
#[derive(Debug, Clone)]
pub struct TensorCoreUnit {
    /// Unit ID
    pub id: usize,
    /// Supported data types
    pub supported_types: Vec<ElementType>,
    /// Peak throughput (TOPS)
    pub peak_throughput: f64,
    /// Current utilization
    pub utilization: f64,
    /// Temperature
    pub temperature: f64,
}

/// Tensor core scheduling algorithms
#[derive(Debug, Clone)]
pub enum TensorCoreSchedulingAlgorithm {
    /// Round-robin scheduling
    RoundRobin,
    /// Priority-based scheduling
    PriorityBased,
    /// Throughput-optimal scheduling
    ThroughputOptimal,
    /// Energy-efficient scheduling
    EnergyEfficient,
    /// Latency-optimal scheduling
    LatencyOptimal,
    /// Load-balanced scheduling
    LoadBalanced,
    /// Latency-minimizing scheduling
    LatencyMinimizing,
    /// Machine learning driven scheduling
    MLDriven,
}

/// Tensor core operation
#[derive(Debug, Clone)]
pub struct TensorCoreOperation<T>
where
    T: Clone,
{
    /// Operation ID
    pub id: usize,
    /// Operation type
    pub operation_type: TensorCoreOpType,
    /// Input tensor shapes
    pub input_shapes: Vec<TensorShape>,
    /// Input tensors
    pub inputs: Vec<Array2<T>>,
    /// Output tensor
    pub output: Array2<T>,
    /// Precision requirement
    pub precision: TensorCorePrecision,
    /// Priority
    pub priority: u32,
    /// Deadline
    pub deadline: Option<Instant>,
}

/// Tensor core operation types
#[derive(Debug, Clone)]
pub enum TensorCoreOpType {
    /// Matrix multiplication
    MatrixMultiplication,
    /// Convolutional layer
    ConvolutionalLayer,
    /// Attention mechanism
    AttentionMechanism,
    /// Batch normalization
    BatchNormalization,
    /// Layer normalization
    LayerNormalization,
    /// Custom operation
    Custom(String),
}

/// Performance monitor for tensor cores
#[derive(Debug)]
pub struct TensorCorePerformanceMonitor {
    /// Throughput measurements
    pub throughput_history: VecDeque<f64>,
    /// Latency measurements
    pub latency_history: VecDeque<f64>,
    /// Energy consumption
    pub energy_history: VecDeque<f64>,
    /// Error rates
    pub error_rates: VecDeque<f64>,
}

/// Operation analysis results for scheduling optimization
#[derive(Debug, Clone)]
pub struct OperationAnalysis {
    /// Computational intensity score
    pub compute_intensity: f64,
    /// Memory bandwidth requirement (0-1 normalized)
    pub memory_bandwidth_requirement: f64,
    /// Precision requirement for the operation
    pub precision_requirement: TensorCorePrecision,
    /// Expected tensor core utilization efficiency
    pub tensor_core_utilization: f64,
    /// Estimated execution time in milliseconds
    pub estimated_execution_time: f64,
    /// Estimated energy consumption
    pub energy_consumption: f64,
    /// Parallelism potential (0-1 score)
    pub parallelism_potential: f64,
}

/// Memory bandwidth predictor
#[derive(Debug)]
pub struct BandwidthPredictor {
    /// Prediction models
    pub models: Vec<BandwidthPredictionModel>,
    /// Historical bandwidth measurements
    pub history: VecDeque<BandwidthMeasurement>,
    /// Prediction accuracy
    pub accuracy: f64,
}

/// Bandwidth prediction models
#[derive(Debug, Clone)]
pub enum BandwidthPredictionModel {
    /// Linear regression model
    LinearRegression,
    /// Neural network model
    NeuralNetwork,
    /// Time series model
    TimeSeries,
    /// Machine learning ensemble
    Ensemble,
}

/// Bandwidth measurement record
#[derive(Debug, Clone)]
pub struct BandwidthMeasurement {
    /// Timestamp of measurement
    pub timestamp: Instant,
    /// Measured bandwidth (GB/s)
    pub bandwidth_gbps: f64,
    /// Memory access pattern
    pub access_pattern: MemoryAccessPattern,
    /// Data size
    pub data_size: usize,
}

impl<T> AdvancedGpuTensorCoreScheduler<T>
where
    T: Clone,
{
    /// Create a new tensor core scheduler
    pub fn new() -> LinalgResult<Self> {
        Ok(Self {
            tensor_core_units: Vec::new(),
            scheduling_algorithm: TensorCoreSchedulingAlgorithm::ThroughputOptimal,
            operation_queue: VecDeque::new(),
            performance_monitor: TensorCorePerformanceMonitor::new(),
        })
    }

    /// Add tensor core unit
    pub fn add_tensor_core_unit(&mut self, unit: TensorCoreUnit) {
        self.tensor_core_units.push(unit);
    }

    /// Schedule operations using the current algorithm
    pub fn schedule_operations(
        &mut self,
        operations: &[TensorCoreOperation<T>],
    ) -> LinalgResult<Vec<usize>> {
        // Analyze each operation
        let mut analyses: Vec<(usize, OperationAnalysis)> = operations
            .iter()
            .enumerate()
            .map(|(idx, op)| (idx, self.analyze_operation_requirements(op)))
            .collect();

        // Apply scheduling algorithm
        let schedule = match self.scheduling_algorithm {
            TensorCoreSchedulingAlgorithm::ThroughputOptimal => {
                self.schedule_for_throughput(&mut analyses)?
            }
            TensorCoreSchedulingAlgorithm::LatencyOptimal => {
                self.schedule_for_latency(&mut analyses)?
            }
            TensorCoreSchedulingAlgorithm::EnergyEfficient => {
                self.schedule_for_energy_efficiency(&mut analyses)?
            }
            TensorCoreSchedulingAlgorithm::LoadBalanced => {
                self.schedule_for_load_balance(&mut analyses)?
            }
            _ => {
                // Default to simple ordering
                (0..operations.len()).collect()
            }
        };

        // Update performance metrics
        self.update_scheduling_metrics(&schedule, operations)?;

        // Add operations to queue
        for &op_idx in &schedule {
            if let Some(op) = operations.get(op_idx) {
                self.operation_queue.push_back((*op).clone());
            }
        }

        Ok(schedule)
    }

    /// Analyze individual operation requirements
    fn analyze_operation_requirements(
        &self,
        operation: &TensorCoreOperation<T>,
    ) -> OperationAnalysis {
        OperationAnalysis {
            compute_intensity: self.calculate_compute_intensity(operation),
            memory_bandwidth_requirement: self.calculate_memory_requirement(operation),
            precision_requirement: operation.precision.clone(),
            tensor_core_utilization: self.estimate_tensor_core_utilization(operation),
            estimated_execution_time: self.estimate_execution_time(operation),
            energy_consumption: self.estimate_energy_consumption(operation),
            parallelism_potential: self.analyze_parallelism(operation),
        }
    }

    /// Schedule operations for maximum throughput
    fn schedule_for_throughput(
        &self,
        analyses: &mut [(usize, OperationAnalysis)],
    ) -> LinalgResult<Vec<usize>> {
        // Sort by the product of compute intensity and tensor core utilization (highest first)
        analyses.sort_by(|a, b| {
            let score_a = a.1.compute_intensity * a.1.tensor_core_utilization;
            let score_b = b.1.compute_intensity * b.1.tensor_core_utilization;
            score_b
                .partial_cmp(&score_a)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        // Group operations with similar characteristics for batching
        let mut schedule = Vec::new();
        let mut current_batch = Vec::new();
        let mut last_compute_intensity = -1.0;

        for (idx, analysis) in analyses {
            // Start new batch if compute intensity differs significantly
            if (analysis.compute_intensity - last_compute_intensity).abs() > 0.3
                && !current_batch.is_empty()
            {
                schedule.extend(current_batch.drain(..));
            }

            current_batch.push(*idx);
            last_compute_intensity = analysis.compute_intensity;

            // Limit batch size for optimal tensor core utilization
            if current_batch.len() >= 8 {
                schedule.extend(current_batch.drain(..));
            }
        }

        // Add remaining operations
        schedule.extend(current_batch);
        Ok(schedule)
    }

    /// Schedule operations for minimum latency
    fn schedule_for_latency(
        &self,
        analyses: &mut [(usize, OperationAnalysis)],
    ) -> LinalgResult<Vec<usize>> {
        // Sort by estimated execution time (shortest first)
        analyses.sort_by(|a, b| {
            a.1.estimated_execution_time
                .partial_cmp(&b.1.estimated_execution_time)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        // Prioritize operations that can overlap with memory transfers
        let mut priority_ops = Vec::new();
        let mut regular_ops = Vec::new();

        for (idx, analysis) in analyses {
            if analysis.memory_bandwidth_requirement < 0.5 && analysis.parallelism_potential > 0.7 {
                priority_ops.push(*idx);
            } else {
                regular_ops.push(*idx);
            }
        }

        // Interleave high-priority and regular operations for optimal pipeline utilization
        let mut schedule = Vec::new();
        let mut priority_iter = priority_ops.into_iter();
        let mut regular_iter = regular_ops.into_iter();

        loop {
            match (priority_iter.next(), regular_iter.next()) {
                (Some(p), Some(r)) => {
                    schedule.push(p);
                    schedule.push(r);
                }
                (Some(p), None) => schedule.push(p),
                (None, Some(r)) => schedule.push(r),
                (None, None) => break,
            }
        }

        Ok(schedule)
    }

    /// Schedule operations for energy efficiency
    fn schedule_for_energy_efficiency(
        &self,
        analyses: &mut [(usize, OperationAnalysis)],
    ) -> LinalgResult<Vec<usize>> {
        // Sort by energy efficiency ratio (compute/energy)
        analyses.sort_by(|a, b| {
            let efficiency_a = a.1.compute_intensity / (a.1.energy_consumption + 1e-6);
            let efficiency_b = b.1.compute_intensity / (b.1.energy_consumption + 1e-6);
            efficiency_b
                .partial_cmp(&efficiency_a)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        // Group low-energy operations together to enable power scaling
        let mut schedule = Vec::new();
        let low_energy_threshold = 0.3;

        let (low_energy, high_energy): (Vec<_>, Vec<_>) = analyses
            .iter()
            .partition(|(_, analysis)| analysis.energy_consumption < low_energy_threshold);

        // Schedule low-energy operations first to allow for power down periods
        schedule.extend(low_energy.into_iter().map(|(idx, _)| *idx));
        schedule.extend(high_energy.into_iter().map(|(idx, _)| *idx));

        Ok(schedule)
    }

    /// Schedule operations for load balancing across tensor cores
    fn schedule_for_load_balance(
        &self,
        analyses: &mut [(usize, OperationAnalysis)],
    ) -> LinalgResult<Vec<usize>> {
        let num_tensor_cores = self.tensor_core_units.len().max(1);
        let mut core_loads = vec![0.0; num_tensor_cores];
        let mut schedule = vec![Vec::new(); num_tensor_cores];

        // Sort by execution time (longest first) for better load balancing
        analyses.sort_by(|a, b| {
            b.1.estimated_execution_time
                .partial_cmp(&a.1.estimated_execution_time)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        // Assign each operation to the least loaded tensor core
        for (idx, analysis) in analyses {
            let min_load_core = core_loads
                .iter()
                .enumerate()
                .min_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
                .map(|(core_idx, _)| core_idx)
                .unwrap_or(0);

            schedule[min_load_core].push(*idx);
            core_loads[min_load_core] += analysis.estimated_execution_time;
        }

        // Flatten schedule maintaining core assignment order
        let mut final_schedule = Vec::new();
        let max_ops_per_core = schedule.iter().map(|s| s.len()).max().unwrap_or(0);

        for i in 0..max_ops_per_core {
            for core_schedule in &schedule {
                if let Some(&op_idx) = core_schedule.get(i) {
                    final_schedule.push(op_idx);
                }
            }
        }

        Ok(final_schedule)
    }

    /// Calculate operation compute intensity
    fn calculate_compute_intensity(&self, operation: &TensorCoreOperation<T>) -> f64 {
        // Estimate based on operation type and matrix dimensions
        match operation.operation_type {
            TensorCoreOpType::MatrixMultiplication => {
                let dims = &operation.input_shapes[0].dimensions;
                if dims.len() >= 2 {
                    (dims[0] * dims[1]) as f64 / 1e6 // Rough proxy: element count of the first input, in millions
                } else {
                    1.0
                }
            }
            TensorCoreOpType::ConvolutionalLayer => 2.5, // High compute intensity
            TensorCoreOpType::AttentionMechanism => 3.0, // Very high compute intensity
            TensorCoreOpType::BatchNormalization => 0.5, // Medium compute intensity
            TensorCoreOpType::LayerNormalization => 0.6, // Medium compute intensity
            TensorCoreOpType::Custom(_) => 1.0,
        }
    }

    /// Calculate memory bandwidth requirement
    fn calculate_memory_requirement(&self, operation: &TensorCoreOperation<T>) -> f64 {
        let total_elements: usize = operation
            .input_shapes
            .iter()
            .map(|shape| shape.dimensions.iter().product::<usize>())
            .sum();

        // Normalize to 0-1 range based on typical tensor sizes
        (total_elements as f64 / 1e8).min(1.0)
    }

    /// Estimate tensor core utilization efficiency
    fn estimate_tensor_core_utilization(&self, operation: &TensorCoreOperation<T>) -> f64 {
        match operation.operation_type {
            TensorCoreOpType::MatrixMultiplication => {
                // Check if dimensions are multiples of 16 (optimal for tensor cores)
                let dims = &operation.input_shapes[0].dimensions;
                if dims.len() >= 2 && dims[0] % 16 == 0 && dims[1] % 16 == 0 {
                    0.95
                } else {
                    0.7
                }
            }
            TensorCoreOpType::ConvolutionalLayer => 0.8,
            TensorCoreOpType::AttentionMechanism => 0.85,
            _ => 0.3, // Non-tensor-core operations
        }
    }

    /// Estimate execution time for operation
    fn estimate_execution_time(&self, operation: &TensorCoreOperation<T>) -> f64 {
        let complexity = self.calculate_compute_intensity(operation);
        let memory_factor = self.calculate_memory_requirement(operation);

        // Simple model: time = compute_time + memory_time
        let compute_time = complexity * 0.1; // 0.1ms per million ops
        let memory_time = memory_factor * 0.05; // 0.05ms per normalized memory unit

        compute_time + memory_time
    }

    /// Estimate energy consumption
    fn estimate_energy_consumption(&self, operation: &TensorCoreOperation<T>) -> f64 {
        let intensity = self.calculate_compute_intensity(operation);
        let utilization = self.estimate_tensor_core_utilization(operation);

        // Higher utilization is more energy efficient
        intensity * (2.0 - utilization)
    }

    /// Analyze parallelism potential
    fn analyze_parallelism(&self, operation: &TensorCoreOperation<T>) -> f64 {
        match operation.operation_type {
            TensorCoreOpType::MatrixMultiplication => 0.9, // Highly parallelizable
            TensorCoreOpType::ConvolutionalLayer => 0.95,  // Perfectly parallelizable
            TensorCoreOpType::AttentionMechanism => 0.8,   // Good parallelization
            TensorCoreOpType::BatchNormalization => 0.6,   // Limited by reduction
            TensorCoreOpType::LayerNormalization => 0.6,   // Limited by reduction
            TensorCoreOpType::Custom(_) => 0.7,
        }
    }

    /// Update scheduling performance metrics
    fn update_scheduling_metrics(
        &mut self,
        schedule: &[usize],
        operations: &[TensorCoreOperation<T>],
    ) -> LinalgResult<()> {
        let total_time: f64 = schedule
            .iter()
            .filter_map(|&idx| operations.get(idx))
            .map(|op| self.estimate_execution_time(op))
            .sum();

        // Average utilization is computed for potential future use; it is not yet
        // recorded in the performance monitor, so prefix with `_` to avoid a warning.
        let _avg_utilization: f64 = schedule
            .iter()
            .filter_map(|&idx| operations.get(idx))
            .map(|op| self.estimate_tensor_core_utilization(op))
            .sum::<f64>()
            / schedule.len().max(1) as f64;

        // Update performance history, skipping empty schedules so an infinite
        // throughput value is never recorded.
        if total_time > 0.0 {
            self.performance_monitor
                .throughput_history
                .push_back(1.0 / total_time);
            self.performance_monitor
                .latency_history
                .push_back(total_time);
        }

        // Keep history size manageable
        if self.performance_monitor.throughput_history.len() > 1000 {
            self.performance_monitor.throughput_history.pop_front();
            self.performance_monitor.latency_history.pop_front();
        }

        Ok(())
    }

    /// Get scheduling performance statistics
    pub fn get_performance_stats(&self) -> SchedulingStats {
        let avg_throughput = if self.performance_monitor.throughput_history.is_empty() {
            0.0
        } else {
            self.performance_monitor
                .throughput_history
                .iter()
                .sum::<f64>()
                / self.performance_monitor.throughput_history.len() as f64
        };

        let avg_latency = if self.performance_monitor.latency_history.is_empty() {
            0.0
        } else {
            self.performance_monitor.latency_history.iter().sum::<f64>()
                / self.performance_monitor.latency_history.len() as f64
        };

        SchedulingStats {
            average_throughput: avg_throughput,
            average_latency: avg_latency,
            total_operations_scheduled: self.performance_monitor.throughput_history.len(),
            tensor_core_utilization: self.get_average_utilization(),
        }
    }

    fn get_average_utilization(&self) -> f64 {
        if self.tensor_core_units.is_empty() {
            0.0
        } else {
            self.tensor_core_units
                .iter()
                .map(|unit| unit.utilization)
                .sum::<f64>()
                / self.tensor_core_units.len() as f64
        }
    }
}

impl TensorCorePerformanceMonitor {
    fn new() -> Self {
        Self {
            throughput_history: VecDeque::new(),
            latency_history: VecDeque::new(),
            energy_history: VecDeque::new(),
            error_rates: VecDeque::new(),
        }
    }
}

impl BandwidthPredictor {
    /// Create a new bandwidth predictor
    pub fn new() -> Self {
        Self {
            models: vec![BandwidthPredictionModel::LinearRegression],
            history: VecDeque::new(),
            accuracy: 0.85,
        }
    }

    /// Predict bandwidth for given operations and data sizes
    pub fn predict_bandwidth(
        &self,
        operations: &[GpuOperationType],
        data_sizes: &[usize],
    ) -> LinalgResult<f64> {
        // Advanced bandwidth prediction

        // 1. Calculate operation complexity score
        let complexity_score = operations
            .iter()
            .enumerate()
            .map(|(i, op)| {
                let data_size = data_sizes.get(i).unwrap_or(&1);
                match op {
                    GpuOperationType::MatrixMultiplication => (*data_size as f64).powf(1.5) * 0.8,
                    GpuOperationType::ElementwiseAddition => *data_size as f64 * 0.2,
                    GpuOperationType::Convolution => (*data_size as f64).powf(1.3) * 1.2,
                    GpuOperationType::Reduction => (*data_size as f64).log2() * 0.5,
                    GpuOperationType::Transpose => *data_size as f64 * 0.3,
                    GpuOperationType::Normalization => *data_size as f64 * 0.4,
                    _ => *data_size as f64 * 0.1,
                }
            })
            .sum::<f64>();

        // 2. Memory hierarchy analysis
        let total_data = data_sizes.iter().sum::<usize>() as f64;

        // 3. Predict based on model
        let predicted_bandwidth = match self.models.first() {
            Some(BandwidthPredictionModel::LinearRegression) => {
                // Simple linear model
                let base_bandwidth = 400.0; // GB/s
                let complexity_factor = (complexity_score / 1e6).min(2.0);
                let size_factor = (total_data / 1e9).min(1.5);

                base_bandwidth * complexity_factor * size_factor
            }
            _ => 200.0, // Default fallback
        };

        Ok(predicted_bandwidth.clamp(10.0, 1000.0)) // Clamp to a reasonable range
    }

    /// Add bandwidth measurement
    pub fn add_measurement(&mut self, measurement: BandwidthMeasurement) {
        self.history.push_back(measurement);

        // Keep history size manageable
        if self.history.len() > 1000 {
            self.history.pop_front();
        }
    }
}

/// Scheduling performance statistics
#[derive(Debug, Clone)]
pub struct SchedulingStats {
    /// Average throughput (inverse of the estimated schedule time, per millisecond)
    pub average_throughput: f64,
    /// Average latency (estimated total schedule time, in milliseconds)
    pub average_latency: f64,
    /// Number of scheduling runs recorded
    pub total_operations_scheduled: usize,
    /// Average tensor core utilization
    pub tensor_core_utilization: f64,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tensor_core_scheduler_creation() {
        let scheduler = AdvancedGpuTensorCoreScheduler::<f32>::new().expect("Operation failed");
        assert_eq!(scheduler.tensor_core_units.len(), 0);
    }

    #[test]
    fn test_bandwidth_predictor() {
        let predictor = BandwidthPredictor::new();
        let operations = vec![GpuOperationType::MatrixMultiplication];
        let data_sizes = vec![1024];

        let bandwidth = predictor
            .predict_bandwidth(&operations, &data_sizes)
            .expect("Operation failed");
        assert!(bandwidth > 0.0);
    }

    #[test]
    fn test_tensor_core_unit() {
        let unit = TensorCoreUnit {
            id: 0,
            supported_types: vec![ElementType::F32, ElementType::F16],
            peak_throughput: 100.0,
            utilization: 0.5,
            temperature: 65.0,
        };
        assert_eq!(unit.id, 0);
        assert_eq!(unit.supported_types.len(), 2);
    }
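
    // Additional illustrative tests: sketches that rely only on the public fields
    // and methods defined in this module.

    #[test]
    fn test_schedule_empty_operation_list() {
        let mut scheduler =
            AdvancedGpuTensorCoreScheduler::<f32>::new().expect("Operation failed");
        // Scheduling an empty operation list yields an empty schedule.
        let ops: Vec<TensorCoreOperation<f32>> = Vec::new();
        let schedule = scheduler
            .schedule_operations(&ops)
            .expect("Operation failed");
        assert!(schedule.is_empty());
    }

    #[test]
    fn test_average_utilization_stat() {
        let mut scheduler =
            AdvancedGpuTensorCoreScheduler::<f32>::new().expect("Operation failed");
        scheduler.add_tensor_core_unit(TensorCoreUnit {
            id: 0,
            supported_types: vec![ElementType::F32],
            peak_throughput: 100.0,
            utilization: 0.4,
            temperature: 60.0,
        });
        scheduler.add_tensor_core_unit(TensorCoreUnit {
            id: 1,
            supported_types: vec![ElementType::F16],
            peak_throughput: 200.0,
            utilization: 0.8,
            temperature: 70.0,
        });

        // With no scheduling runs recorded, throughput/latency stats are zero,
        // while utilization averages over the registered units.
        let stats = scheduler.get_performance_stats();
        assert_eq!(stats.total_operations_scheduled, 0);
        assert!((stats.tensor_core_utilization - 0.6).abs() < 1e-12);
    }

    #[test]
    fn test_bandwidth_prediction_is_clamped() {
        let predictor = BandwidthPredictor::new();
        let operations = vec![
            GpuOperationType::MatrixMultiplication,
            GpuOperationType::ElementwiseAddition,
        ];
        let data_sizes = vec![1 << 20, 1 << 16];

        let bandwidth = predictor
            .predict_bandwidth(&operations, &data_sizes)
            .expect("Operation failed");
        // Predictions are clamped to the 10-1000 GB/s range.
        assert!((10.0..=1000.0).contains(&bandwidth));
    }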
}