// verificar/ml/backend.rs
//! Backend selection and cost model for GPU-accelerated generation
//!
//! Implements adaptive backend selection using a Mixture-of-Experts approach.
//! The MoE router analyzes operation complexity and data size to select the optimal
//! compute backend (Scalar/SIMD/GPU).
//!
//! # Operation Complexity Levels
//!
//! - **Low**: Element-wise operations (add, multiply, etc.) - Memory-bound, GPU rarely beneficial
//! - **Medium**: Reductions (dot product, sum, etc.) - Moderate compute, GPU at 100K+ elements
//! - **High**: Matrix operations (matmul, convolution) - Compute-intensive O(n²) or O(n³), GPU at 10K+ elements
//!
//! # Usage Example
//!
//! ```
//! use verificar::ml::{BackendSelector, OpComplexity};
//!
//! let selector = BackendSelector::new();
//!
//! // Element-wise operation
//! let backend = selector.select_with_moe(OpComplexity::Low, 500_000);
//! // Returns: Scalar (below 1M threshold, memory-bound)
//!
//! // Matrix multiplication
//! let backend = selector.select_with_moe(OpComplexity::High, 50_000);
//! // Returns: GPU (above 10K threshold for O(n²) ops)
//! ```
//!
//! # Performance Thresholds
//!
//! Based on empirical analysis and the 5× PCIe rule (Gregg & Hazelwood 2011):
//!
//! | Complexity | SIMD Threshold | GPU Threshold | Rationale |
//! |------------|----------------|---------------|-----------|
//! | Low        | 1M elements    | Never         | Memory-bound, PCIe overhead dominates |
//! | Medium     | 10K elements   | 100K elements | Moderate compute/transfer ratio |
//! | High       | 1K elements    | 10K elements  | O(n²/n³) complexity favors GPU |

use serde::{Deserialize, Serialize};
use std::fmt;

42/// Compute backend options
43#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
44pub enum Backend {
45    /// Scalar operations (baseline)
46    Scalar,
47    /// SIMD vectorization (AVX2, NEON)
48    Simd,
49    /// GPU acceleration (WebGPU/Vulkan via trueno)
50    Gpu,
51}
52
53impl fmt::Display for Backend {
54    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
55        match self {
56            Backend::Scalar => write!(f, "Scalar"),
57            Backend::Simd => write!(f, "SIMD"),
58            Backend::Gpu => write!(f, "GPU"),
59        }
60    }
61}
62
63impl Default for Backend {
64    fn default() -> Self {
65        Backend::Scalar
66    }
67}
68
69/// Operation complexity for MoE (Mixture-of-Experts) routing
70#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
71pub enum OpComplexity {
72    /// Simple operations (add, mul) - O(n), prefer SIMD unless very large
73    Low,
74    /// Moderate operations (dot, reduce) - O(n), GPU beneficial at 100K+ elements
75    Medium,
76    /// Complex operations (matmul, convolution) - O(n²) or O(n³), GPU beneficial at 10K+ elements
77    High,
78}
79
80impl Default for OpComplexity {
81    fn default() -> Self {
82        OpComplexity::Low
83    }
84}
85
/// Cost model driving backend selection.
///
/// Dispatch decisions follow the 5× PCIe rule from Gregg & Hazelwood (2011):
/// the GPU is only worthwhile when compute time dominates transfer overhead.
#[derive(Debug, Clone)]
pub struct BackendSelector {
    /// PCIe bandwidth in bytes/sec (default: 32 GB/s, PCIe 4.0 x16).
    pcie_bandwidth: f64,
    /// GPU compute throughput in FLOPS (default: 20 TFLOPS, A100-class).
    gpu_gflops: f64,
    /// Minimum compute/transfer ratio before GPU dispatch (default: 5×).
    min_dispatch_ratio: f64,
    /// Element count above which SIMD is used for low-complexity ops.
    simd_threshold_low: usize,
    /// Element count above which SIMD is used for medium-complexity ops.
    simd_threshold_medium: usize,
    /// Element count above which the GPU is used for medium-complexity ops.
    gpu_threshold_medium: usize,
    /// Element count above which SIMD is used for high-complexity ops.
    simd_threshold_high: usize,
    /// Element count above which the GPU is used for high-complexity ops.
    gpu_threshold_high: usize,
}

109impl Default for BackendSelector {
110    fn default() -> Self {
111        Self {
112            pcie_bandwidth: 32e9,    // 32 GB/s
113            gpu_gflops: 20e12,       // 20 TFLOPS
114            min_dispatch_ratio: 5.0, // 5× rule
115            simd_threshold_low: 1_000_000,
116            simd_threshold_medium: 10_000,
117            gpu_threshold_medium: 100_000,
118            simd_threshold_high: 1_000,
119            gpu_threshold_high: 10_000,
120        }
121    }
122}
123
124impl BackendSelector {
125    /// Create a new backend selector with default thresholds
126    #[must_use]
127    pub fn new() -> Self {
128        Self::default()
129    }
130
131    /// Configure custom PCIe bandwidth
132    #[must_use]
133    pub fn with_pcie_bandwidth(mut self, bandwidth: f64) -> Self {
134        self.pcie_bandwidth = bandwidth;
135        self
136    }
137
138    /// Configure custom GPU throughput
139    #[must_use]
140    pub fn with_gpu_gflops(mut self, gflops: f64) -> Self {
141        self.gpu_gflops = gflops;
142        self
143    }
144
145    /// Configure custom dispatch ratio threshold
146    #[must_use]
147    pub fn with_min_dispatch_ratio(mut self, ratio: f64) -> Self {
148        self.min_dispatch_ratio = ratio;
149        self
150    }
151
152    /// Select optimal backend based on workload characteristics
153    ///
154    /// # Arguments
155    /// * `data_bytes` - Amount of data to transfer (host → device)
156    /// * `flops` - Floating point operations required
157    ///
158    /// # Returns
159    /// Recommended backend based on cost model
160    ///
161    /// # Cost Model
162    /// GPU dispatch is beneficial when:
163    /// ```text
164    /// compute_time > min_dispatch_ratio × transfer_time
165    /// ```
166    ///
167    /// Per Gregg & Hazelwood (2011), the 5× rule accounts for:
168    /// - Host→Device transfer (PCIe overhead)
169    /// - Kernel launch latency
170    /// - Device→Host transfer
171    /// - CPU-GPU synchronization
172    #[must_use]
173    pub fn select_backend(&self, data_bytes: usize, flops: u64) -> Backend {
174        // Calculate transfer time (seconds)
175        let transfer_s = data_bytes as f64 / self.pcie_bandwidth;
176
177        // Calculate compute time (seconds)
178        let compute_s = flops as f64 / self.gpu_gflops;
179
180        // Apply 5× dispatch rule
181        if compute_s > self.min_dispatch_ratio * transfer_s {
182            Backend::Gpu
183        } else {
184            // Fallback to SIMD for intermediate workloads
185            Backend::Simd
186        }
187    }
188
189    /// Select backend for matrix multiplication
190    ///
191    /// # Arguments
192    /// * `m`, `n`, `k` - Matrix dimensions (M×K) × (K×N) = (M×N)
193    ///
194    /// # Complexity
195    /// - Data: O(mk + kn + mn) bytes
196    /// - FLOPs: O(2mnk) operations
197    #[must_use]
198    pub fn select_for_matmul(&self, m: usize, n: usize, k: usize) -> Backend {
199        // Data size: two input matrices + output (f32 = 4 bytes)
200        let data_bytes = (m * k + k * n + m * n) * 4;
201
202        // FLOPs: 2mnk (multiply-add per element)
203        let flops = (2 * m * n * k) as u64;
204
205        self.select_backend(data_bytes, flops)
206    }
207
208    /// Select backend for vector operations
209    ///
210    /// # Arguments
211    /// * `n` - Vector length
212    /// * `ops_per_element` - Operations per element (e.g., 2 for dot product)
213    #[must_use]
214    pub fn select_for_vector_op(&self, n: usize, ops_per_element: u64) -> Backend {
215        // Data size: typically two input vectors + output (f32 = 4 bytes)
216        let data_bytes = n * 3 * 4;
217
218        // FLOPs
219        let flops = n as u64 * ops_per_element;
220
221        self.select_backend(data_bytes, flops)
222    }
223
224    /// Select backend for element-wise operations
225    ///
226    /// Element-wise ops are memory-bound, so GPU is rarely beneficial
227    #[must_use]
228    pub fn select_for_elementwise(&self, n: usize) -> Backend {
229        // Element-wise ops: 1 FLOP per element, memory-bound
230        // GPU overhead rarely justified
231        if n > self.simd_threshold_low {
232            Backend::Simd
233        } else {
234            Backend::Scalar
235        }
236    }
237
238    /// MoE (Mixture-of-Experts) routing: select backend based on operation complexity
239    ///
240    /// # Arguments
241    /// * `complexity` - Operation complexity (Low/Medium/High)
242    /// * `data_size` - Number of elements in the operation
243    ///
244    /// # Returns
245    /// Recommended backend using adaptive thresholds per complexity level
246    ///
247    /// # MoE Thresholds (per empirical performance analysis)
248    /// - **Low complexity** (element-wise): SIMD at 1M+ elements, never GPU
249    /// - **Medium complexity** (reductions): SIMD at 10K+, GPU at 100K+ elements
250    /// - **High complexity** (matmul): SIMD at 1K+, GPU at 10K+ elements
251    #[must_use]
252    pub fn select_with_moe(&self, complexity: OpComplexity, data_size: usize) -> Backend {
253        match complexity {
254            OpComplexity::Low => {
255                // Element-wise: memory-bound, GPU overhead not justified
256                if data_size > self.simd_threshold_low {
257                    Backend::Simd
258                } else {
259                    Backend::Scalar
260                }
261            }
262            OpComplexity::Medium => {
263                // Reductions (dot product, sum): moderate compute
264                if data_size > self.gpu_threshold_medium {
265                    Backend::Gpu
266                } else if data_size > self.simd_threshold_medium {
267                    Backend::Simd
268                } else {
269                    Backend::Scalar
270                }
271            }
272            OpComplexity::High => {
273                // Matrix operations: compute-intensive, O(n²) or O(n³)
274                if data_size > self.gpu_threshold_high {
275                    Backend::Gpu
276                } else if data_size > self.simd_threshold_high {
277                    Backend::Simd
278                } else {
279                    Backend::Scalar
280                }
281            }
282        }
283    }
284
285    /// Get selection statistics for profiling
286    #[must_use]
287    pub fn selection_stats(&self, complexity: OpComplexity, data_size: usize) -> SelectionStats {
288        let backend = self.select_with_moe(complexity, data_size);
289
290        // Estimate performance multiplier
291        let speedup = match backend {
292            Backend::Scalar => 1.0,
293            Backend::Simd => {
294                // AVX2 gives ~4-8x for aligned data
295                match complexity {
296                    OpComplexity::Low => 4.0,
297                    OpComplexity::Medium => 6.0,
298                    OpComplexity::High => 8.0,
299                }
300            }
301            Backend::Gpu => {
302                // GPU speedup depends heavily on problem size
303                let base = match complexity {
304                    OpComplexity::Low => 1.0, // Never selected for Low
305                    OpComplexity::Medium => 10.0,
306                    OpComplexity::High => 50.0,
307                };
308                // Scale with data size (diminishing returns)
309                base * (data_size as f64 / 10_000.0).min(10.0)
310            }
311        };
312
313        SelectionStats {
314            backend,
315            complexity,
316            data_size,
317            estimated_speedup: speedup,
318        }
319    }
320}
321
322/// Statistics about backend selection decision
323#[derive(Debug, Clone)]
324pub struct SelectionStats {
325    /// Selected backend
326    pub backend: Backend,
327    /// Operation complexity
328    pub complexity: OpComplexity,
329    /// Data size (elements)
330    pub data_size: usize,
331    /// Estimated speedup vs scalar
332    pub estimated_speedup: f64,
333}
334
335impl fmt::Display for SelectionStats {
336    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
337        write!(
338            f,
339            "{} backend for {:?} complexity ({} elements) - ~{:.1}x speedup",
340            self.backend, self.complexity, self.data_size, self.estimated_speedup
341        )
342    }
343}
344
345/// Batch configuration for parallel generation
346#[derive(Debug, Clone)]
347pub struct BatchConfig {
348    /// Backend selector
349    selector: BackendSelector,
350    /// Batch size for generation
351    pub batch_size: usize,
352    /// Operation complexity hint
353    pub complexity: OpComplexity,
354}
355
356impl BatchConfig {
357    /// Create a new batch configuration
358    #[must_use]
359    pub fn new(batch_size: usize) -> Self {
360        Self {
361            selector: BackendSelector::new(),
362            batch_size,
363            complexity: OpComplexity::Low,
364        }
365    }
366
367    /// Set operation complexity
368    #[must_use]
369    pub fn with_complexity(mut self, complexity: OpComplexity) -> Self {
370        self.complexity = complexity;
371        self
372    }
373
374    /// Get recommended backend for this batch
375    #[must_use]
376    pub fn recommended_backend(&self) -> Backend {
377        self.selector
378            .select_with_moe(self.complexity, self.batch_size)
379    }
380
381    /// Check if GPU acceleration is recommended
382    #[must_use]
383    pub fn should_use_gpu(&self) -> bool {
384        self.recommended_backend() == Backend::Gpu
385    }
386
387    /// Check if SIMD acceleration is recommended
388    #[must_use]
389    pub fn should_use_simd(&self) -> bool {
390        matches!(self.recommended_backend(), Backend::Simd | Backend::Gpu)
391    }
392}
393
394impl Default for BatchConfig {
395    fn default() -> Self {
396        Self::new(1000)
397    }
398}
399
#[cfg(test)]
mod tests {
    //! Unit tests covering display/default impls, MoE routing thresholds,
    //! the cost-model dispatch rule, batch configuration, and serde round-trips.
    use super::*;

    #[test]
    fn test_backend_display() {
        assert_eq!(format!("{}", Backend::Scalar), "Scalar");
        assert_eq!(format!("{}", Backend::Simd), "SIMD");
        assert_eq!(format!("{}", Backend::Gpu), "GPU");
    }

    #[test]
    fn test_backend_default() {
        assert_eq!(Backend::default(), Backend::Scalar);
    }

    #[test]
    fn test_op_complexity_ordering() {
        // Derived Ord must follow declaration order: Low < Medium < High.
        assert!(OpComplexity::Low < OpComplexity::Medium);
        assert!(OpComplexity::Medium < OpComplexity::High);
    }

    #[test]
    fn test_selector_default() {
        let selector = BackendSelector::new();
        assert_eq!(selector.min_dispatch_ratio, 5.0);
    }

    #[test]
    fn test_select_elementwise_small() {
        let selector = BackendSelector::new();
        let backend = selector.select_for_elementwise(100);
        assert_eq!(backend, Backend::Scalar);
    }

    #[test]
    fn test_select_elementwise_large() {
        let selector = BackendSelector::new();
        let backend = selector.select_for_elementwise(10_000_000);
        assert_eq!(backend, Backend::Simd);
    }

    #[test]
    fn test_moe_low_complexity() {
        let selector = BackendSelector::new();

        // Small: Scalar
        assert_eq!(
            selector.select_with_moe(OpComplexity::Low, 100),
            Backend::Scalar
        );

        // Large: SIMD (never GPU for low complexity)
        assert_eq!(
            selector.select_with_moe(OpComplexity::Low, 10_000_000),
            Backend::Simd
        );
    }

    #[test]
    fn test_moe_medium_complexity() {
        let selector = BackendSelector::new();

        // Small: Scalar
        assert_eq!(
            selector.select_with_moe(OpComplexity::Medium, 100),
            Backend::Scalar
        );

        // Medium: SIMD
        assert_eq!(
            selector.select_with_moe(OpComplexity::Medium, 50_000),
            Backend::Simd
        );

        // Large: GPU
        assert_eq!(
            selector.select_with_moe(OpComplexity::Medium, 500_000),
            Backend::Gpu
        );
    }

    #[test]
    fn test_moe_high_complexity() {
        let selector = BackendSelector::new();

        // Small: Scalar
        assert_eq!(
            selector.select_with_moe(OpComplexity::High, 100),
            Backend::Scalar
        );

        // Medium: SIMD
        assert_eq!(
            selector.select_with_moe(OpComplexity::High, 5_000),
            Backend::Simd
        );

        // Large: GPU
        assert_eq!(
            selector.select_with_moe(OpComplexity::High, 50_000),
            Backend::Gpu
        );
    }

    #[test]
    fn test_select_matmul_small() {
        let selector = BackendSelector::new();
        // 10x10 matmul: very small
        let backend = selector.select_for_matmul(10, 10, 10);
        assert_eq!(backend, Backend::Simd);
    }

    #[test]
    fn test_select_matmul_large() {
        let selector = BackendSelector::new();
        // 1000x1000 matmul: large but 5× rule still favors SIMD for this size
        // (compute time doesn't exceed 5× transfer time with default params)
        let backend = selector.select_for_matmul(1000, 1000, 1000);
        assert_eq!(backend, Backend::Simd);

        // For extremely large matmul where O(n³) compute dominates O(n²) transfer
        // Need n > 7500 for GPU to be beneficial with 2× rule
        // Using n=10000 to ensure GPU selection
        let fast_gpu_selector = BackendSelector::new().with_min_dispatch_ratio(2.0);
        let backend = fast_gpu_selector.select_for_matmul(10000, 10000, 10000);
        assert_eq!(backend, Backend::Gpu);
    }

    #[test]
    fn test_selection_stats() {
        let selector = BackendSelector::new();
        let stats = selector.selection_stats(OpComplexity::High, 100_000);

        assert_eq!(stats.backend, Backend::Gpu);
        assert!(stats.estimated_speedup > 1.0);
        assert!(format!("{}", stats).contains("GPU"));
    }

    #[test]
    fn test_batch_config() {
        let config = BatchConfig::new(50_000).with_complexity(OpComplexity::Medium);

        assert_eq!(config.batch_size, 50_000);
        assert!(config.should_use_simd());
        assert!(!config.should_use_gpu());
    }

    #[test]
    fn test_batch_config_gpu() {
        let config = BatchConfig::new(500_000).with_complexity(OpComplexity::Medium);

        assert!(config.should_use_gpu());
    }

    #[test]
    fn test_batch_config_default() {
        let config = BatchConfig::default();
        assert_eq!(config.batch_size, 1000);
    }

    #[test]
    fn test_custom_thresholds() {
        let selector = BackendSelector::new()
            .with_pcie_bandwidth(64e9)
            .with_gpu_gflops(80e12)
            .with_min_dispatch_ratio(3.0);

        // With faster GPU, smaller workloads become viable
        assert!(selector.pcie_bandwidth > 32e9);
        assert!(selector.gpu_gflops > 20e12);
    }

    #[test]
    fn test_vector_op_selection() {
        let selector = BackendSelector::new();

        // Small vector dot product
        let backend = selector.select_for_vector_op(100, 2);
        assert_eq!(backend, Backend::Simd);

        // Large vector ops - 5× rule still favors SIMD for memory-bound ops
        // (vector ops are inherently memory-bound with low compute intensity)
        let backend = selector.select_for_vector_op(10_000_000, 2);
        assert_eq!(backend, Backend::Simd);

        // With very high flops per element and lower dispatch ratio, GPU is viable
        // Need flops/element to overcome the 3:1 data/compute ratio
        let fast_gpu_selector = BackendSelector::new()
            .with_min_dispatch_ratio(0.1) // Very aggressive GPU dispatch
            .with_gpu_gflops(1e12); // Model a slower GPU
        let backend = fast_gpu_selector.select_for_vector_op(10_000_000, 100);
        assert_eq!(backend, Backend::Gpu);
    }

    #[test]
    fn test_op_complexity_default() {
        assert_eq!(OpComplexity::default(), OpComplexity::Low);
    }

    #[test]
    fn test_backend_serialization() {
        // Unit enum variants serialize as plain JSON strings under serde defaults.
        let backend = Backend::Gpu;
        let json = serde_json::to_string(&backend).expect("serialize");
        assert_eq!(json, "\"Gpu\"");

        let parsed: Backend = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(parsed, Backend::Gpu);
    }

    #[test]
    fn test_complexity_serialization() {
        let complexity = OpComplexity::High;
        let json = serde_json::to_string(&complexity).expect("serialize");

        let parsed: OpComplexity = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(parsed, OpComplexity::High);
    }
}