Skip to main content

trueno_db/backend/
mod.rs

1//! Compute backend dispatcher
2//!
3//! Toyota Way Principles:
4//! - Genchi Genbutsu: Physics-based cost model (`PCIe` Gen4 x16 = 32 GB/s)
5//! - Muda elimination: GPU only if compute > 5x transfer time
6
7/// Cost-based backend selection
8///
9/// References:
10/// - Gregg & Hazelwood (2011): `PCIe` bus bottleneck analysis
11/// - Breß et al. (2014): Operator variant selection on heterogeneous hardware
12pub struct BackendDispatcher {
13    _private: (),
14}
15
16/// `PCIe` Gen4 x16 bandwidth: 32 GB/s
17const PCIE_BANDWIDTH_GBPS: f64 = 32.0;
18
19/// Minimum data size for GPU consideration (10 MB)
20const MIN_GPU_DATA_SIZE_BYTES: usize = 10_000_000;
21
22/// GPU compute throughput (conservative estimate: 100 GFLOP/s)
23/// Modern GPUs can achieve 1-10 TFLOP/s, but we use conservative estimate
24const GPU_THROUGHPUT_GFLOPS: f64 = 100.0;
25
26/// Transfer overhead multiplier (5x rule from spec)
27/// GPU compute must be > 5x transfer time to be worthwhile
28const TRANSFER_OVERHEAD_MULTIPLIER: f64 = 5.0;
29
30impl BackendDispatcher {
31    /// Select backend based on arithmetic intensity (FLOPs/Byte)
32    ///
33    /// # Arguments
34    /// * `total_bytes` - Total data size in bytes
35    /// * `estimated_flops` - Estimated floating point operations
36    ///
37    /// # Returns
38    /// Backend selection (GPU, SIMD, or Scalar)
39    ///
40    /// # Algorithm
41    /// 1. Check minimum data size threshold (10 MB)
42    /// 2. Calculate `PCIe` transfer time: bytes / 32 GB/s
43    /// 3. Estimate GPU compute time: FLOPs / 100 GFLOP/s
44    /// 4. Apply 5x rule: GPU only if compute > 5x transfer
45    #[must_use]
46    #[allow(clippy::cast_precision_loss)]
47    pub fn select(total_bytes: usize, estimated_flops: f64) -> super::Backend {
48        // Rule 1: Minimum data size threshold (10 MB)
49        if total_bytes < MIN_GPU_DATA_SIZE_BYTES {
50            return super::Backend::Simd;
51        }
52
53        // Rule 2: Calculate transfer time (PCIe Gen4 x16 = 32 GB/s)
54        let pcie_transfer_time_ms =
55            (total_bytes as f64 / (PCIE_BANDWIDTH_GBPS * 1_000_000_000.0)) * 1000.0;
56
57        // Rule 3: Estimate GPU compute time
58        let estimated_gpu_compute_ms =
59            (estimated_flops / (GPU_THROUGHPUT_GFLOPS * 1_000_000_000.0)) * 1000.0;
60
61        // Rule 4: Apply 5x rule (Toyota Way: Genchi Genbutsu - physics-based decision)
62        if estimated_gpu_compute_ms > pcie_transfer_time_ms * TRANSFER_OVERHEAD_MULTIPLIER {
63            super::Backend::Gpu
64        } else {
65            super::Backend::Simd
66        }
67    }
68
69    /// Calculate arithmetic intensity (FLOPs per byte)
70    ///
71    /// Higher arithmetic intensity means more compute per data transfer,
72    /// making GPU acceleration more beneficial.
73    ///
74    /// # Arguments
75    /// * `total_flops` - Total floating point operations
76    /// * `total_bytes` - Total data size in bytes
77    ///
78    /// # Returns
79    /// Arithmetic intensity ratio (FLOPs/Byte)
80    ///
81    /// # Example
82    /// ```
83    /// use trueno_db::backend::BackendDispatcher;
84    ///
85    /// // Matrix multiply: N^3 FLOPs for N^2 elements = N FLOPs/element
86    /// let intensity = BackendDispatcher::arithmetic_intensity(1_000_000_000.0, 100_000_000);
87    /// assert_eq!(intensity, 10.0); // 10 FLOPs per byte
88    /// ```
89    #[must_use]
90    #[allow(clippy::cast_precision_loss)]
91    pub const fn arithmetic_intensity(total_flops: f64, total_bytes: usize) -> f64 {
92        total_flops / total_bytes as f64
93    }
94
95    /// Estimate FLOPs for simple aggregation (SUM, AVG, COUNT, MIN, MAX)
96    ///
97    /// Simple aggregations perform ~1 FLOP per element (single pass)
98    ///
99    /// # Arguments
100    /// * `num_elements` - Number of elements to aggregate
101    ///
102    /// # Returns
103    /// Estimated FLOPs
104    ///
105    /// # Example
106    /// ```
107    /// use trueno_db::backend::BackendDispatcher;
108    ///
109    /// // SUM over 100M elements = 100M FLOPs
110    /// let flops = BackendDispatcher::estimate_simple_aggregation_flops(100_000_000);
111    /// assert_eq!(flops, 100_000_000.0);
112    /// ```
113    #[must_use]
114    #[allow(clippy::cast_precision_loss)]
115    pub const fn estimate_simple_aggregation_flops(num_elements: usize) -> f64 {
116        num_elements as f64
117    }
118
119    /// Estimate FLOPs for GROUP BY aggregation
120    ///
121    /// GROUP BY requires hashing (5 FLOPs/element) + aggregation (1 FLOP/element)
122    ///
123    /// # Arguments
124    /// * `num_elements` - Number of elements to process
125    ///
126    /// # Returns
127    /// Estimated FLOPs
128    ///
129    /// # Example
130    /// ```
131    /// use trueno_db::backend::BackendDispatcher;
132    ///
133    /// // GROUP BY over 100M elements = 600M FLOPs
134    /// let flops = BackendDispatcher::estimate_group_by_flops(100_000_000);
135    /// assert_eq!(flops, 600_000_000.0);
136    /// ```
137    #[must_use]
138    #[allow(clippy::cast_precision_loss)]
139    pub const fn estimate_group_by_flops(num_elements: usize) -> f64 {
140        // Hashing: 5 FLOPs/element, Aggregation: 1 FLOP/element
141        (num_elements as f64) * 6.0
142    }
143
144    /// Estimate FLOPs for WHERE filter
145    ///
146    /// Filters require predicate evaluation (~2 FLOPs per element)
147    ///
148    /// # Arguments
149    /// * `num_elements` - Number of elements to filter
150    ///
151    /// # Returns
152    /// Estimated FLOPs
153    #[must_use]
154    #[allow(clippy::cast_precision_loss)]
155    pub const fn estimate_filter_flops(num_elements: usize) -> f64 {
156        (num_elements as f64) * 2.0
157    }
158
159    /// Estimate FLOPs for JOIN operation
160    ///
161    /// Hash join: Build hash table (5 FLOPs/elem) + Probe (5 FLOPs/elem)
162    ///
163    /// # Arguments
164    /// * `left_size` - Number of elements in left table
165    /// * `right_size` - Number of elements in right table
166    ///
167    /// # Returns
168    /// Estimated FLOPs
169    #[must_use]
170    #[allow(clippy::cast_precision_loss)]
171    pub const fn estimate_join_flops(left_size: usize, right_size: usize) -> f64 {
172        // Build phase: 5 FLOPs per left element
173        // Probe phase: 5 FLOPs per right element
174        ((left_size + right_size) as f64) * 5.0
175    }
176}