trueno 0.17.1

High-performance SIMD compute library with GPU support for matrix operations
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
//! Heijunka Scheduler (Leveled Testing)
//!
//! Implements Toyota Production System's Heijunka principle:
//! level the workload to reduce waste and variability.

use crate::Backend;
use std::collections::VecDeque;
use std::marker::PhantomData;

/// Tolerances applied when comparing results across backends
///
/// One value is stored per pairing family (scalar/SIMD, SIMD/GPU, GPU/GPU).
/// All fields are public, so custom tolerances can be built with a plain
/// struct literal as well.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct BackendTolerance {
    /// Tolerance for scalar vs SIMD comparisons (default: 0.0, i.e. exact)
    pub scalar_vs_simd: f32,
    /// Tolerance for SIMD vs GPU comparisons (default: 1e-5)
    pub simd_vs_gpu: f32,
    /// Tolerance for GPU vs GPU comparisons (default: 1e-6)
    pub gpu_vs_gpu: f32,
}

impl Default for BackendTolerance {
    /// Exact scalar/SIMD agreement, 1e-5 for SIMD/GPU and 1e-6 for GPU/GPU.
    fn default() -> Self {
        let scalar_vs_simd = 0.0;
        let simd_vs_gpu = 1e-5;
        let gpu_vs_gpu = 1e-6;
        Self { scalar_vs_simd, simd_vs_gpu, gpu_vs_gpu }
    }
}

impl BackendTolerance {
    /// Strict tolerance for exact comparisons
    #[must_use]
    pub const fn strict() -> Self {
        Self { scalar_vs_simd: 0.0, simd_vs_gpu: 0.0, gpu_vs_gpu: 0.0 }
    }

    /// Relaxed tolerance for approximate comparisons
    #[must_use]
    pub const fn relaxed() -> Self {
        Self { scalar_vs_simd: 1e-6, simd_vs_gpu: 1e-4, gpu_vs_gpu: 1e-5 }
    }

    /// Get tolerance for comparing two backends
    #[must_use]
    pub fn for_backends(&self, a: Backend, b: Backend) -> f32 {
        match (a, b) {
            (Backend::Scalar, Backend::Scalar) => 0.0,
            (
                Backend::Scalar,
                Backend::SSE2
                | Backend::AVX
                | Backend::AVX2
                | Backend::AVX512
                | Backend::NEON
                | Backend::WasmSIMD
                | Backend::Auto,
            )
            | (
                Backend::SSE2
                | Backend::AVX
                | Backend::AVX2
                | Backend::AVX512
                | Backend::NEON
                | Backend::WasmSIMD
                | Backend::Auto,
                Backend::Scalar,
            ) => self.scalar_vs_simd,
            (Backend::GPU, Backend::GPU) => self.gpu_vs_gpu,
            (
                Backend::GPU,
                Backend::Scalar
                | Backend::SSE2
                | Backend::AVX
                | Backend::AVX2
                | Backend::AVX512
                | Backend::NEON
                | Backend::WasmSIMD
                | Backend::Auto,
            )
            | (
                Backend::Scalar
                | Backend::SSE2
                | Backend::AVX
                | Backend::AVX2
                | Backend::AVX512
                | Backend::NEON
                | Backend::WasmSIMD
                | Backend::Auto,
                Backend::GPU,
            ) => self.simd_vs_gpu,
            // SIMD vs SIMD (all remaining non-Scalar, non-GPU combinations)
            (
                Backend::SSE2
                | Backend::AVX
                | Backend::AVX2
                | Backend::AVX512
                | Backend::NEON
                | Backend::WasmSIMD
                | Backend::Auto,
                Backend::SSE2
                | Backend::AVX
                | Backend::AVX2
                | Backend::AVX512
                | Backend::NEON
                | Backend::WasmSIMD
                | Backend::Auto,
            ) => self.scalar_vs_simd,
        }
    }
}

/// Poka-Yoke: Type-safe backend selection
///
/// Holds the two size thresholds that drive [`BackendSelector::select_for_size`].
/// The fields are private: they are set through [`BackendSelector::new`] or
/// [`Default`] and read back through the accessor methods.
#[derive(Debug, Clone)]
pub struct BackendSelector {
    /// Minimum size for GPU offload (default: 100,000)
    gpu_threshold: usize,
    /// Minimum size for parallel execution (default: 1,000)
    parallel_threshold: usize,
}

impl Default for BackendSelector {
    /// Uses a GPU threshold of 100,000 and a parallel threshold of 1,000.
    fn default() -> Self {
        Self { gpu_threshold: 100_000, parallel_threshold: 1_000 }
    }
}

impl BackendSelector {
    /// Create a new backend selector with custom thresholds
    ///
    /// The thresholds are stored as given; no ordering between them is enforced.
    #[must_use]
    pub const fn new(gpu_threshold: usize, parallel_threshold: usize) -> Self {
        Self { gpu_threshold, parallel_threshold }
    }

    /// Get the GPU threshold
    #[must_use]
    pub const fn gpu_threshold(&self) -> usize {
        self.gpu_threshold
    }

    /// Get the parallel threshold
    #[must_use]
    pub const fn parallel_threshold(&self) -> usize {
        self.parallel_threshold
    }

    /// Select backend based on input size
    ///
    /// # Decision Logic (TRUENO-SPEC-012)
    ///
    /// With the default thresholds:
    ///
    /// - N < 1,000: Pure SIMD (no parallelization overhead)
    /// - 1,000 <= N < 100,000: SIMD + Parallel (Rayon)
    /// - N >= 100,000: GPU (if available), else SIMD + Parallel
    ///
    /// The parallel threshold is checked first, so any size below it yields
    /// [`BackendCategory::SimdOnly`] regardless of the GPU threshold.
    #[must_use]
    pub fn select_for_size(&self, size: usize, gpu_available: bool) -> BackendCategory {
        if size < self.parallel_threshold {
            BackendCategory::SimdOnly
        } else if size < self.gpu_threshold {
            BackendCategory::SimdParallel
        } else if gpu_available {
            BackendCategory::Gpu
        } else {
            BackendCategory::SimdParallel // Graceful fallback
        }
    }

    /// Check if size is at GPU threshold boundary (for testing)
    ///
    /// Returns `true` when `size` equals the GPU threshold or the value just
    /// below it. A threshold of 0 has no value below it, so only `size == 0`
    /// matches in that case.
    #[must_use]
    pub fn is_at_gpu_boundary(&self, size: usize) -> bool {
        Self::is_at_boundary(self.gpu_threshold, size)
    }

    /// Check if size is at parallel threshold boundary (for testing)
    ///
    /// Returns `true` when `size` equals the parallel threshold or the value
    /// just below it. A threshold of 0 has no value below it, so only
    /// `size == 0` matches in that case.
    #[must_use]
    pub fn is_at_parallel_boundary(&self, size: usize) -> bool {
        Self::is_at_boundary(self.parallel_threshold, size)
    }

    /// Shared boundary test: `size` is `threshold` or `threshold - 1`.
    ///
    /// `checked_sub` avoids the underflow that a plain `threshold - 1` hits
    /// when the threshold is 0 (panic in debug builds, wrap-around to
    /// `usize::MAX` in release builds).
    fn is_at_boundary(threshold: usize, size: usize) -> bool {
        size == threshold || threshold.checked_sub(1) == Some(size)
    }
}

/// Backend category for selection result
///
/// The size ranges quoted on the variants correspond to the default
/// thresholds of [`BackendSelector`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BackendCategory {
    /// Pure SIMD (N < 1,000)
    SimdOnly,
    /// SIMD with parallel execution (1,000 <= N < 100,000)
    SimdParallel,
    /// GPU compute (N >= 100,000)
    Gpu,
}

/// Simulation test configuration
///
/// One scheduled unit of work: a backend/input-size pairing for a given
/// cycle, together with the seed assigned to it.
#[derive(Debug, Clone)]
pub struct SimulationTest {
    /// Backend to test
    pub backend: Backend,
    /// Input size
    pub input_size: usize,
    /// Test cycle number
    pub cycle: u32,
    /// Seed for deterministic RNG
    pub seed: u64,
}

/// Heijunka: Balanced test distribution across backends and sizes
///
/// Implements Toyota Production System's Heijunka principle:
/// level the workload to reduce waste and variability.
///
/// The schedule is a FIFO queue of [`SimulationTest`] entries that is filled
/// once at construction and drained from the front.
#[derive(Debug)]
pub struct HeijunkaScheduler {
    /// Pending tests, in the order they will be handed out
    queue: VecDeque<SimulationTest>,
    /// Backends the schedule was built for
    backends: Vec<Backend>,
}

impl HeijunkaScheduler {
    /// Create a leveled test schedule
    ///
    /// Queues one [`SimulationTest`] for every combination of input size,
    /// backend and cycle in `0..cycles_per_backend`. Each test's seed is
    /// derived from `master_seed` and its own (backend, size, cycle) triple.
    ///
    /// Tests are emitted size by size; within a size, backend by backend in
    /// the order given; within a backend, by ascending cycle number. If any
    /// of the inputs is empty (or `cycles_per_backend` is 0) the schedule is
    /// empty.
    #[must_use]
    pub fn new(
        backends: Vec<Backend>,
        input_sizes: Vec<usize>,
        cycles_per_backend: u32,
        master_seed: u64,
    ) -> Self {
        let mut queue = VecDeque::new();

        for &input_size in &input_sizes {
            for &backend in &backends {
                // `extend` reserves room for the whole cycle range in one go.
                queue.extend((0..cycles_per_backend).map(|cycle| SimulationTest {
                    backend,
                    input_size,
                    cycle,
                    seed: compute_seed(backend, input_size, cycle, master_seed),
                }));
            }
        }

        // `backends` is owned and no longer borrowed here, so move it instead
        // of cloning.
        Self { queue, backends }
    }

    /// Get the next test from the queue
    ///
    /// Removes and returns the test at the front, or `None` once the
    /// schedule is exhausted.
    pub fn next_test(&mut self) -> Option<SimulationTest> {
        self.queue.pop_front()
    }

    /// Get remaining test count
    #[must_use]
    pub fn remaining(&self) -> usize {
        self.queue.len()
    }

    /// Get backends being tested
    ///
    /// This is the list passed to [`HeijunkaScheduler::new`]; it does not
    /// shrink as tests are consumed.
    #[must_use]
    pub fn backends(&self) -> &[Backend] {
        &self.backends
    }

    /// Check if schedule is empty
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.queue.is_empty()
    }
}

/// Derive a per-test seed from the master seed and the test's coordinates
///
/// Each of `backend` (its enum discriminant), `size` and `cycle` is multiplied
/// by its own odd 64-bit constant and the products are added to `master_seed`.
/// All arithmetic wraps, so the function never panics on overflow and always
/// returns the same value for the same inputs.
pub(crate) fn compute_seed(backend: Backend, size: usize, cycle: u32, master_seed: u64) -> u64 {
    const BACKEND_MULTIPLIER: u64 = 0x9E37_79B9_7F4A_7C15;
    const SIZE_MULTIPLIER: u64 = 0x6A09_E667_BB67_AE85;
    const CYCLE_MULTIPLIER: u64 = 0x3C6E_F372_FE94_F82B;

    let contributions = [
        (backend as u64).wrapping_mul(BACKEND_MULTIPLIER),
        (size as u64).wrapping_mul(SIZE_MULTIPLIER),
        u64::from(cycle).wrapping_mul(CYCLE_MULTIPLIER),
    ];

    contributions
        .iter()
        .fold(master_seed, |acc, term| acc.wrapping_add(*term))
}

/// Simulation test configuration builder
///
/// The type parameter `S` is a marker recording the builder's state. A
/// builder starts as `SimTestConfigBuilder<NeedsSeed>`; calling `seed`
/// converts it to `SimTestConfigBuilder<Ready>`, which is the only state
/// that offers the remaining setters and `build`.
///
/// Note that the derived `Debug` and `Clone` impls carry `S: Debug` and
/// `S: Clone` bounds respectively.
#[derive(Debug, Clone)]
pub struct SimTestConfigBuilder<S> {
    // Master seed; holds a placeholder of 0 until `seed` is called.
    seed: u64,
    // Tolerance configuration copied into the built config.
    tolerance: BackendTolerance,
    // Backends copied into the built config.
    backends: Vec<Backend>,
    // Input sizes copied into the built config.
    input_sizes: Vec<usize>,
    // Number of test cycles copied into the built config.
    cycles: u32,
    // Zero-sized marker carrying the state type `S`.
    _state: PhantomData<S>,
}

/// Builder state: seed not set
///
/// Derives `Debug`, `Clone` and `Copy` so that the derived `Debug`/`Clone`
/// impls on `SimTestConfigBuilder<S>` (which require `S: Debug` / `S: Clone`)
/// are actually available in this state.
#[derive(Debug, Clone, Copy)]
pub struct NeedsSeed;
/// Builder state: ready to build
///
/// Derives `Debug`, `Clone` and `Copy` so that the derived `Debug`/`Clone`
/// impls on `SimTestConfigBuilder<S>` (which require `S: Debug` / `S: Clone`)
/// are actually available in this state.
#[derive(Debug, Clone, Copy)]
pub struct Ready;

impl Default for SimTestConfigBuilder<NeedsSeed> {
    fn default() -> Self {
        Self::new()
    }
}

impl SimTestConfigBuilder<NeedsSeed> {
    /// Start a builder pre-filled with defaults
    ///
    /// Defaults: default tolerance, backends `[Scalar, AVX2]`, input sizes
    /// `[100, 1_000, 10_000, 100_000]` and 10 cycles. The seed field holds a
    /// placeholder of 0 until [`seed`](Self::seed) is called.
    #[must_use]
    pub fn new() -> Self {
        let backends = vec![Backend::Scalar, Backend::AVX2];
        let input_sizes = vec![100, 1_000, 10_000, 100_000];

        Self {
            seed: 0,
            tolerance: BackendTolerance::default(),
            backends,
            input_sizes,
            cycles: 10,
            _state: PhantomData,
        }
    }

    /// Supply the master seed (required) and move to the `Ready` state
    ///
    /// Every other setting is carried over unchanged.
    #[must_use]
    pub fn seed(self, seed: u64) -> SimTestConfigBuilder<Ready> {
        let Self { tolerance, backends, input_sizes, cycles, .. } = self;

        SimTestConfigBuilder {
            seed,
            tolerance,
            backends,
            input_sizes,
            cycles,
            _state: PhantomData,
        }
    }
}

impl SimTestConfigBuilder<Ready> {
    /// Replace the tolerance configuration
    #[must_use]
    pub fn tolerance(self, tolerance: BackendTolerance) -> Self {
        Self { tolerance, ..self }
    }

    /// Replace the list of backends to test
    #[must_use]
    pub fn backends(self, backends: Vec<Backend>) -> Self {
        Self { backends, ..self }
    }

    /// Replace the list of input sizes to test
    #[must_use]
    pub fn input_sizes(self, sizes: Vec<usize>) -> Self {
        Self { input_sizes: sizes, ..self }
    }

    /// Replace the number of test cycles
    #[must_use]
    pub fn cycles(self, cycles: u32) -> Self {
        Self { cycles, ..self }
    }

    /// Finish building and return the configuration
    ///
    /// Moves every setting into a [`SimTestConfig`]; only the state marker is
    /// dropped.
    #[must_use]
    pub fn build(self) -> SimTestConfig {
        let Self { seed, tolerance, backends, input_sizes, cycles, _state: _ } = self;

        SimTestConfig { seed, tolerance, backends, input_sizes, cycles }
    }
}

/// Simulation test configuration
///
/// Produced by `SimTestConfigBuilder::build`; obtain a builder via
/// `SimTestConfig::builder`. All fields are public and can be read or
/// adjusted directly after building.
#[derive(Debug, Clone)]
pub struct SimTestConfig {
    /// Master seed for deterministic RNG
    pub seed: u64,
    /// Backend tolerance configuration
    pub tolerance: BackendTolerance,
    /// Backends to test
    pub backends: Vec<Backend>,
    /// Input sizes to test
    pub input_sizes: Vec<usize>,
    /// Number of test cycles
    pub cycles: u32,
}

impl SimTestConfig {
    /// Start building a configuration
    ///
    /// The returned builder is in the `NeedsSeed` state, so a seed has to be
    /// supplied before `build` becomes available.
    #[must_use]
    pub fn builder() -> SimTestConfigBuilder<NeedsSeed> {
        SimTestConfigBuilder::<NeedsSeed>::new()
    }

    /// Create a Heijunka scheduler from this config
    ///
    /// The backend and input-size lists are cloned, so the config stays
    /// usable afterwards. `cycles` is passed as the per-backend cycle count
    /// and `seed` as the master seed.
    #[must_use]
    pub fn create_scheduler(&self) -> HeijunkaScheduler {
        let Self { backends, input_sizes, cycles, seed, .. } = self;

        HeijunkaScheduler::new(backends.clone(), input_sizes.clone(), *cycles, *seed)
    }
}

#[cfg(test)]
mod tests;

#[cfg(test)]
mod proptests;