// trueno_db/backend/mod.rs
1//! Compute backend dispatcher
2//!
3//! Toyota Way Principles:
4//! - Genchi Genbutsu: Physics-based cost model (`PCIe` Gen4 x16 = 32 GB/s)
5//! - Muda elimination: GPU only if compute > 5x transfer time
6
/// Cost-based backend selection
///
/// Zero-sized dispatcher: all functionality is exposed through associated
/// functions, so no instance is ever required.
///
/// References:
/// - Gregg & Hazelwood (2011): `PCIe` bus bottleneck analysis
/// - Breß et al. (2014): Operator variant selection on heterogeneous hardware
pub struct BackendDispatcher {
    // Private unit field: prevents struct-literal construction outside this
    // module, keeping the type usable only via its associated functions.
    _private: (),
}
15
/// `PCIe` Gen4 x16 link bandwidth assumed by the cost model: 32 GB/s.
const PCIE_BANDWIDTH_GBPS: f64 = 32.0;

/// Conservative GPU compute throughput estimate: 100 GFLOP/s.
/// Modern GPUs reach 1-10 TFLOP/s; we deliberately under-promise so the
/// dispatcher only offloads when the GPU is clearly worthwhile.
const GPU_THROUGHPUT_GFLOPS: f64 = 100.0;

/// Smallest payload (10 MB) for which the GPU is even considered.
const MIN_GPU_DATA_SIZE_BYTES: usize = 10_000_000;

/// The "5x rule" from the spec: estimated GPU compute time must exceed
/// five times the `PCIe` transfer time before offloading pays off.
const TRANSFER_OVERHEAD_MULTIPLIER: f64 = 5.0;
29
30impl BackendDispatcher {
31 /// Select backend based on arithmetic intensity (FLOPs/Byte)
32 ///
33 /// # Arguments
34 /// * `total_bytes` - Total data size in bytes
35 /// * `estimated_flops` - Estimated floating point operations
36 ///
37 /// # Returns
38 /// Backend selection (GPU, SIMD, or Scalar)
39 ///
40 /// # Algorithm
41 /// 1. Check minimum data size threshold (10 MB)
42 /// 2. Calculate `PCIe` transfer time: bytes / 32 GB/s
43 /// 3. Estimate GPU compute time: FLOPs / 100 GFLOP/s
44 /// 4. Apply 5x rule: GPU only if compute > 5x transfer
45 #[must_use]
46 #[allow(clippy::cast_precision_loss)]
47 pub fn select(total_bytes: usize, estimated_flops: f64) -> super::Backend {
48 // Rule 1: Minimum data size threshold (10 MB)
49 if total_bytes < MIN_GPU_DATA_SIZE_BYTES {
50 return super::Backend::Simd;
51 }
52
53 // Rule 2: Calculate transfer time (PCIe Gen4 x16 = 32 GB/s)
54 let pcie_transfer_time_ms =
55 (total_bytes as f64 / (PCIE_BANDWIDTH_GBPS * 1_000_000_000.0)) * 1000.0;
56
57 // Rule 3: Estimate GPU compute time
58 let estimated_gpu_compute_ms =
59 (estimated_flops / (GPU_THROUGHPUT_GFLOPS * 1_000_000_000.0)) * 1000.0;
60
61 // Rule 4: Apply 5x rule (Toyota Way: Genchi Genbutsu - physics-based decision)
62 if estimated_gpu_compute_ms > pcie_transfer_time_ms * TRANSFER_OVERHEAD_MULTIPLIER {
63 super::Backend::Gpu
64 } else {
65 super::Backend::Simd
66 }
67 }
68
69 /// Calculate arithmetic intensity (FLOPs per byte)
70 ///
71 /// Higher arithmetic intensity means more compute per data transfer,
72 /// making GPU acceleration more beneficial.
73 ///
74 /// # Arguments
75 /// * `total_flops` - Total floating point operations
76 /// * `total_bytes` - Total data size in bytes
77 ///
78 /// # Returns
79 /// Arithmetic intensity ratio (FLOPs/Byte)
80 ///
81 /// # Example
82 /// ```
83 /// use trueno_db::backend::BackendDispatcher;
84 ///
85 /// // Matrix multiply: N^3 FLOPs for N^2 elements = N FLOPs/element
86 /// let intensity = BackendDispatcher::arithmetic_intensity(1_000_000_000.0, 100_000_000);
87 /// assert_eq!(intensity, 10.0); // 10 FLOPs per byte
88 /// ```
89 #[must_use]
90 #[allow(clippy::cast_precision_loss)]
91 pub const fn arithmetic_intensity(total_flops: f64, total_bytes: usize) -> f64 {
92 total_flops / total_bytes as f64
93 }
94
95 /// Estimate FLOPs for simple aggregation (SUM, AVG, COUNT, MIN, MAX)
96 ///
97 /// Simple aggregations perform ~1 FLOP per element (single pass)
98 ///
99 /// # Arguments
100 /// * `num_elements` - Number of elements to aggregate
101 ///
102 /// # Returns
103 /// Estimated FLOPs
104 ///
105 /// # Example
106 /// ```
107 /// use trueno_db::backend::BackendDispatcher;
108 ///
109 /// // SUM over 100M elements = 100M FLOPs
110 /// let flops = BackendDispatcher::estimate_simple_aggregation_flops(100_000_000);
111 /// assert_eq!(flops, 100_000_000.0);
112 /// ```
113 #[must_use]
114 #[allow(clippy::cast_precision_loss)]
115 pub const fn estimate_simple_aggregation_flops(num_elements: usize) -> f64 {
116 num_elements as f64
117 }
118
119 /// Estimate FLOPs for GROUP BY aggregation
120 ///
121 /// GROUP BY requires hashing (5 FLOPs/element) + aggregation (1 FLOP/element)
122 ///
123 /// # Arguments
124 /// * `num_elements` - Number of elements to process
125 ///
126 /// # Returns
127 /// Estimated FLOPs
128 ///
129 /// # Example
130 /// ```
131 /// use trueno_db::backend::BackendDispatcher;
132 ///
133 /// // GROUP BY over 100M elements = 600M FLOPs
134 /// let flops = BackendDispatcher::estimate_group_by_flops(100_000_000);
135 /// assert_eq!(flops, 600_000_000.0);
136 /// ```
137 #[must_use]
138 #[allow(clippy::cast_precision_loss)]
139 pub const fn estimate_group_by_flops(num_elements: usize) -> f64 {
140 // Hashing: 5 FLOPs/element, Aggregation: 1 FLOP/element
141 (num_elements as f64) * 6.0
142 }
143
144 /// Estimate FLOPs for WHERE filter
145 ///
146 /// Filters require predicate evaluation (~2 FLOPs per element)
147 ///
148 /// # Arguments
149 /// * `num_elements` - Number of elements to filter
150 ///
151 /// # Returns
152 /// Estimated FLOPs
153 #[must_use]
154 #[allow(clippy::cast_precision_loss)]
155 pub const fn estimate_filter_flops(num_elements: usize) -> f64 {
156 (num_elements as f64) * 2.0
157 }
158
159 /// Estimate FLOPs for JOIN operation
160 ///
161 /// Hash join: Build hash table (5 FLOPs/elem) + Probe (5 FLOPs/elem)
162 ///
163 /// # Arguments
164 /// * `left_size` - Number of elements in left table
165 /// * `right_size` - Number of elements in right table
166 ///
167 /// # Returns
168 /// Estimated FLOPs
169 #[must_use]
170 #[allow(clippy::cast_precision_loss)]
171 pub const fn estimate_join_flops(left_size: usize, right_size: usize) -> f64 {
172 // Build phase: 5 FLOPs per left element
173 // Probe phase: 5 FLOPs per right element
174 ((left_size + right_size) as f64) * 5.0
175 }
176}