trueno/lib.rs
1// ============================================================================
2// Development-phase lint allows - to be addressed incrementally
3// ============================================================================
4// Allow manual_div_ceil - clearer for block calculations
5#![allow(clippy::manual_div_ceil)]
6// Allow manual_is_multiple_of - clearer alignment checks
7#![allow(clippy::manual_is_multiple_of)]
8// Allow needless_range_loop - index access is clearer in some SIMD algorithms
9#![allow(clippy::needless_range_loop)]
10// Allow empty line after doc comments - formatting preference
11#![allow(clippy::empty_line_after_doc_comments)]
12// Allow similar names - semantic distinction is clear
13#![allow(clippy::similar_names)]
14// Allow many single char names - standard math/matrix notation
15#![allow(clippy::many_single_char_names)]
16// Allow too many arguments - SIMD/compute APIs require many parameters
17#![allow(clippy::too_many_arguments)]
18// Allow type complexity - complex SIMD types
19#![allow(clippy::type_complexity)]
20// Allow macro metavars in unsafe - necessary for SIMD dispatch macros
21#![allow(clippy::macro_metavars_in_unsafe)]
22// Allow missing panics doc - will be added incrementally
23#![allow(clippy::missing_panics_doc)]
24// Allow uninit_vec - intentional pattern for perf-critical paths where
25// every element is SET (not accumulated) before any read. Each use has
26// a SAFETY comment documenting the write-before-read invariant.
27#![allow(clippy::uninit_vec)]
28// Allow missing errors doc - will be added incrementally
29#![allow(clippy::missing_errors_doc)]
30// Allow missing safety doc - will be added incrementally
31#![allow(clippy::missing_safety_doc)]
32// Allow excessive precision - SIMD math constants need specific precision
33#![allow(clippy::excessive_precision)]
34// Allow unnecessary cast - clearer type annotations in some cases
35#![allow(clippy::unnecessary_cast)]
36// Allow cast_possible_truncation - handled in SIMD code
37#![allow(clippy::cast_possible_truncation)]
38// Allow cast_sign_loss - handled in SIMD code
39#![allow(clippy::cast_sign_loss)]
40// Allow cast_precision_loss - handled in SIMD code
41#![allow(clippy::cast_precision_loss)]
42// Allow large stack arrays - SIMD/GPU test data and proptest expansions
43#![allow(clippy::large_stack_arrays)]
44// Allow unwrap/float_cmp in test code — safe in assertions, banned in production
45#![cfg_attr(test, allow(clippy::disallowed_methods, clippy::float_cmp))]
46
47//! Trueno: Multi-Target High-Performance Compute Library
48//!
49//! **Trueno** (Spanish: "thunder") provides unified, high-performance compute primitives
50//! across three execution targets:
51//!
52//! 1. **CPU SIMD** - x86 (SSE2/AVX/AVX2/AVX-512), ARM (NEON), WASM (SIMD128)
53//! 2. **GPU** - Vulkan/Metal/DX12/WebGPU via `wgpu`
54//! 3. **WebAssembly** - Portable SIMD128 for browser/edge deployment
55//!
56//! # Design Principles
57//!
58//! - **Write once, optimize everywhere**: Single algorithm, multiple backends
59//! - **Runtime dispatch**: Auto-select best implementation based on CPU features
60//! - **Zero unsafe in public API**: Safety via type system, `unsafe` isolated in backends
61//! - **Benchmarked performance**: Every optimization must prove ≥10% speedup
62//! - **Extreme TDD**: >90% test coverage, mutation testing, property-based tests
63//!
64//! # Quick Start
65//!
66//! ```rust
67//! use trueno::Vector;
68//!
69//! let a = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0]);
70//! let b = Vector::from_slice(&[5.0, 6.0, 7.0, 8.0]);
71//!
72//! // Auto-selects best backend (AVX2/GPU/WASM)
73//! let result = a.add(&b).unwrap();
74//! assert_eq!(result.as_slice(), &[6.0, 8.0, 10.0, 12.0]);
75//! ```
76
77// Contract assertions from YAML (pv codegen)
78#[macro_use]
79#[allow(unused_macros)]
80mod generated_contracts;
81
82// Fallback macros for contracts not yet in codegen
// No-op precondition stub for `add`: evaluates a borrow of each argument so
// the call site keeps its variables "used" until pv codegen provides the real check.
macro_rules! contract_pre_add {
    () => {{}};
    ($($arg:expr),+ $(,)?) => {{
        $(let _ = &$arg;)+
    }};
}
// No-op precondition stub for `gemv`: evaluates a borrow of each argument so
// the call site keeps its variables "used" until pv codegen provides the real check.
macro_rules! contract_pre_gemv {
    () => {{}};
    ($($arg:expr),+ $(,)?) => {{
        $(let _ = &$arg;)+
    }};
}
85
86pub mod activations;
87pub mod backends;
88pub mod blis;
89pub mod brick;
90pub mod chaos;
91pub mod contracts;
92pub mod eigen;
93pub mod error;
94pub mod hardware;
95pub mod hash;
96pub mod inference;
97pub mod matrix;
98pub mod monitor;
99pub mod simulation;
100pub mod tiling;
101pub mod tuner;
102pub mod vector;
103
104// Canonical scalar activation functions (UCBD §4, trueno #103)
105pub use activations::{
106 f16_to_f32, f32_to_f16, gelu_scalar, relu_scalar, sigmoid_scalar, silu_scalar, tanh_scalar,
107};
108pub use eigen::SymmetricEigen;
109pub use error::{Result, TruenoError};
110pub use hash::{hash_bytes, hash_key, hash_keys_batch, hash_keys_batch_with_backend};
111pub use matrix::Matrix;
112pub use monitor::{
113 cuda_monitor_available, GpuBackend, GpuClockMetrics, GpuDeviceInfo, GpuMemoryMetrics,
114 GpuMetrics, GpuMonitor, GpuPcieMetrics, GpuPowerMetrics, GpuThermalMetrics, GpuUtilization,
115 GpuVendor, MonitorConfig, MonitorError,
116};
117#[cfg(feature = "cuda-monitor")]
118pub use monitor::{enumerate_cuda_devices, query_cuda_device_info, query_cuda_memory};
119pub use vector::Vector;
120
121// ComputeBrick exports
122pub use brick::{
123 fnv1a_f32_checksum,
124 AddOp,
125 AssertionResult,
126 AttentionOp,
127 // QUANT-Q5K: Q5_K and Q6_K quantization formats (llama.cpp compatible)
128 BlockQ5K,
129 BlockQ6K,
130 BrickBottleneck,
131 BrickCategory,
132 BrickError,
133 // PAR-200: BrickProfiler v2 types
134 BrickId,
135 BrickIdTimer,
136 BrickLayer,
137 BrickProfiler,
138 BrickSample,
139 BrickStats,
140 BrickTimer,
141 BrickVerification,
142 ByteBudget,
143 CategoryStats,
144 ComputeAssertion,
145 ComputeBackend,
146 ComputeBrick,
147 ComputeOp,
148 DivergenceInfo,
149 DotOp,
150 DotQ5KOp,
151 DotQ6KOp,
152 EdgeType,
153 ExecutionEdge,
154 ExecutionGraph,
155 ExecutionNode,
156 // PAR-201: Execution path graph types
157 ExecutionNodeId,
158 FusedGateUpOp,
159 FusedGateUpWeights,
160 FusedQKVOp,
161 FusedQKVWeights,
162 // CORRECTNESS-011: Divergence detection types
163 KernelChecksum,
164 MatmulOp,
165 PtxRegistry,
166 SoftmaxOp,
167 SyncMode,
168 // TILING-SPEC-001: Tile-level profiling types
169 TileLevel,
170 TileStats,
171 TileTimer,
172 TokenBudget,
173 TokenResult,
174};
175
176// Hardware capability exports (PMAT-447)
177pub use hardware::{
178 default_hardware_path, Bottleneck, CpuCapability, GpuBackend as HardwareGpuBackend,
179 GpuCapability, HardwareCapability, RooflineParams, SimdWidth,
180};
181
182// ML Tuner exports (T-TUNER-003 through T-TUNER-007, GH#80-84)
183pub use tuner::{
184 BottleneckClass, BottleneckPrediction, BrickTuner, ConceptDriftStatus, ExperimentSuggestion,
185 FeatureExtractor, KernelClassifier, KernelRecommendation, KernelType, QuantType, RunConfig,
186 ThroughputPrediction, ThroughputRegressor, TrainingSample, TrainingStats, TunerDataCollector,
187 TunerError, TunerFeatures, TunerRecommendation, UserFeedback,
188};
189
190// Tiling Compute Blocks exports (TILING-SPEC-001)
191pub use tiling::{
192 optimal_prefetch_distance, pack_a_index, pack_b_index, swizzle_index, PackingLayout,
193 PrefetchLocality, TcbGeometry, TcbIndexCalculator, TcbLevel, TiledQ4KMatvec, TilingBackend,
194 TilingConfig, TilingError, TilingStats, Q4K_SUPERBLOCK_BYTES, Q4K_SUPERBLOCK_SIZE,
195};
196
/// Execution target for compute dispatch.
///
/// Identifies which instruction set (or accelerator) an operation runs on.
/// Use [`Backend::select_best`] for runtime auto-detection.
///
/// Variant order is stable; do not reorder (discriminants would change).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Backend {
    /// Portable scalar fallback, no SIMD required.
    Scalar,
    /// SSE2, the x86_64 baseline instruction set.
    SSE2,
    /// AVX, 256-bit vectors.
    AVX,
    /// AVX2, 256-bit vectors with FMA.
    AVX2,
    /// AVX-512, 512-bit vectors.
    AVX512,
    /// ARM NEON.
    NEON,
    /// WebAssembly SIMD128.
    WasmSIMD,
    /// GPU compute via wgpu.
    GPU,
    /// Auto-select the best available target.
    Auto,
}
219
220impl Backend {
221 /// Select the best available backend for the current platform
222 ///
223 /// This is a convenience wrapper around `select_best_available_backend()`
224 pub fn select_best() -> Self {
225 select_best_available_backend()
226 }
227}
228
/// Coarse cost class of an operation, used to gate GPU dispatch.
///
/// Ordered so that a more expensive class compares greater:
/// `Low < Medium < High`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum OpComplexity {
    /// Simple element-wise ops (add, mul) - prefer SIMD unless very large.
    Low = 0,
    /// Moderate ops (dot, reduce) - GPU beneficial at 100K+ elements.
    Medium = 1,
    /// Complex ops (matmul, convolution) - GPU beneficial at 10K+ elements.
    High = 2,
}
239
/// Operation type for SIMD backend selection
///
/// Classifies operations by arithmetic intensity (ops per byte of memory
/// traffic), per the AVX-512 performance analysis in AVX512_ANALYSIS.md.
/// The class drives operation-aware backend selection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OperationType {
    /// Memory-bound operations (add, sub, mul, scale, div)
    ///
    /// Arithmetic intensity < 1 op/byte: memory bandwidth is the bottleneck,
    /// so wider SIMD lanes do not help. Prefer AVX2 over AVX-512.
    ///
    /// AVX-512 performance: 0.67-1.20x scalar (often slower!)
    /// AVX2 performance: 1.0-1.2x scalar
    MemoryBound,

    /// Compute-bound operations (dot, max, min, argmax, argmin)
    ///
    /// Arithmetic intensity > 1 op/byte: ALU throughput dominates, so the
    /// wider AVX-512 lanes pay off.
    ///
    /// AVX-512 performance: 7-14x scalar (validated)
    /// AVX2 performance: 4-12x scalar (validated)
    ComputeBound,

    /// Mixed operations (fma, sqrt, exp, sigmoid, activations)
    ///
    /// Performance depends on data size and hardware; default to AVX2 for
    /// safety or apply size-based heuristics.
    Mixed,
}
270
271/// Detect best SIMD backend for x86/x86_64 platforms
272///
273/// **IMPORTANT**: Prefers AVX2 over AVX-512 by default based on performance analysis.
274///
275/// AVX-512 is **NOT** universally faster - it causes 10-33% slowdown for memory-bound
276/// operations (add, mul, sub) due to memory bandwidth bottleneck and thermal throttling.
277/// See AVX512_ANALYSIS.md for detailed benchmarking results.
278///
279/// For operation-specific backend selection, use `select_backend_for_operation()`.
280#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
281fn detect_x86_backend() -> Backend {
282 // Prefer AVX2 over AVX-512 for safety (AVX-512 causes regressions for memory-bound ops)
283 if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
284 return Backend::AVX2;
285 }
286 // Note: AVX-512 is intentionally NOT checked here
287 // Use select_backend_for_operation(OperationType::ComputeBound) for AVX-512
288 if is_x86_feature_detected!("avx") {
289 return Backend::AVX;
290 }
291 if is_x86_feature_detected!("sse2") {
292 return Backend::SSE2;
293 }
294 Backend::Scalar
295}
296
/// Detect best SIMD backend for ARM platforms
///
/// NEON availability is a compile-time target feature on ARM, so this is
/// resolved with `cfg!` rather than runtime probing.
#[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
fn detect_arm_backend() -> Backend {
    if cfg!(target_feature = "neon") {
        Backend::NEON
    } else {
        Backend::Scalar
    }
}
309
/// Detect best SIMD backend for WebAssembly
///
/// SIMD128 availability is a compile-time target feature on wasm32, so this
/// is resolved with `cfg!` rather than runtime probing.
#[cfg(target_arch = "wasm32")]
fn detect_wasm_backend() -> Backend {
    if cfg!(target_feature = "simd128") {
        Backend::WasmSIMD
    } else {
        Backend::Scalar
    }
}
322
323/// Select the best available backend for the current platform
324///
325/// This function performs runtime CPU feature detection and selects the most
326/// optimized backend available. The selection follows this priority:
327///
328/// **x86/x86_64**:
329/// 1. AVX-512 (if `avx512f` feature detected)
330/// 2. AVX2 (if `avx2` and `fma` features detected)
331/// 3. AVX (if `avx` feature detected)
332/// 4. SSE2 (baseline for x86_64)
333/// 5. Scalar (fallback)
334///
335/// **ARM**:
336/// 1. NEON (if available)
337/// 2. Scalar (fallback)
338///
339/// **WASM**: SIMD128 (if available), else Scalar
340///
341/// **Other platforms**: Scalar
342///
343/// # Returns
344///
345/// The most optimized backend available on this CPU/platform
346///
347/// # Examples
348///
349/// ```
350/// use trueno::select_best_available_backend;
351///
352/// let backend = select_best_available_backend();
353/// println!("Using backend: {:?}", backend);
354/// ```
355pub fn select_best_available_backend() -> Backend {
356 // Cache backend selection using OnceLock to avoid repeated CPU feature detection
357 // This eliminates 3-5% overhead from calling is_x86_feature_detected!() repeatedly
358 static BEST_BACKEND: std::sync::OnceLock<Backend> = std::sync::OnceLock::new();
359
360 *BEST_BACKEND.get_or_init(|| {
361 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
362 {
363 detect_x86_backend()
364 }
365
366 #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
367 {
368 detect_arm_backend()
369 }
370
371 #[cfg(target_arch = "wasm32")]
372 {
373 detect_wasm_backend()
374 }
375
376 #[cfg(not(any(
377 target_arch = "x86_64",
378 target_arch = "x86",
379 target_arch = "aarch64",
380 target_arch = "arm",
381 target_arch = "wasm32"
382 )))]
383 {
384 Backend::Scalar
385 }
386 })
387}
388
389/// Select the optimal backend for a specific operation type
390///
391/// This function considers the memory vs compute characteristics of operations
392/// to select the backend that will provide the best performance. Based on
393/// comprehensive benchmarking (see AVX512_ANALYSIS.md), AVX-512 is avoided
394/// for memory-bound operations where it causes 10-33% performance degradation.
395///
396/// # Operation Classification
397///
398/// - **MemoryBound**: add, sub, mul, div, scale, abs, clamp, lerp, relu
399/// - Prefer AVX2 (1.0-1.2x scalar) over AVX-512 (0.67-1.20x scalar)
400/// - Memory bandwidth bottleneck limits wider SIMD benefit
401///
402/// - **ComputeBound**: dot, max, min, argmax, argmin, norm_l1, norm_l2, norm_linf
403/// - Prefer AVX-512 (7-14x scalar) over AVX2 (4-12x scalar)
404/// - High arithmetic intensity benefits from wider SIMD
405///
406/// - **Mixed**: fma, sqrt, exp, ln, sigmoid, tanh, gelu, swish
407/// - Default to AVX2 for safety (avoids AVX-512 thermal throttling)
408/// - Size-based heuristics could improve this in future
409///
410/// # Backend Selection Priority
411///
412/// **For MemoryBound operations**:
413/// 1. AVX2 (if available) - BEST for memory-bound
414/// 2. SSE2 (x86_64 baseline)
415/// 3. AVX-512 (AVOIDED - causes slowdown)
416/// 4. NEON (ARM)
417/// 5. WASM SIMD128
418/// 6. Scalar (fallback)
419///
420/// **For ComputeBound operations**:
421/// 1. AVX-512 (if available) - BEST for compute-bound
422/// 2. AVX2
423/// 3. SSE2
424/// 4. NEON (ARM)
425/// 5. WASM SIMD128
426/// 6. Scalar (fallback)
427///
428/// # Arguments
429///
430/// * `op_type` - The type of operation being performed
431///
432/// # Returns
433///
434/// The optimal backend for the given operation type
435///
436/// # Examples
437///
438/// ```
439/// use trueno::{select_backend_for_operation, OperationType};
440///
441/// // Memory-bound operation - prefers AVX2 over AVX-512
442/// let backend = select_backend_for_operation(OperationType::MemoryBound);
443///
444/// // Compute-bound operation - uses AVX-512 if available
445/// let backend = select_backend_for_operation(OperationType::ComputeBound);
446/// ```
447///
448/// # Performance Impact
449///
450/// Using operation-aware backend selection fixes performance regressions:
451/// - mul with AVX-512: 0.67x → 1.0x (use AVX2 instead)
452/// - sub with AVX-512: 0.87x → 1.0x (use AVX2 instead)
453/// - dot with AVX-512: 7.89x (keep AVX-512)
454pub fn select_backend_for_operation(op_type: OperationType) -> Backend {
455 // Allow unused on non-x86 architectures
456 let _ = &op_type;
457
458 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
459 {
460 select_x86_backend_for_operation(op_type)
461 }
462
463 #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
464 {
465 detect_arm_backend()
466 }
467
468 #[cfg(target_arch = "wasm32")]
469 {
470 detect_wasm_backend()
471 }
472
473 #[cfg(not(any(
474 target_arch = "x86_64",
475 target_arch = "x86",
476 target_arch = "aarch64",
477 target_arch = "arm",
478 target_arch = "wasm32"
479 )))]
480 {
481 Backend::Scalar
482 }
483}
484
485/// Select the best x86 backend based on operation type and available features.
486///
487/// Separated from `select_backend_for_operation` to reduce cyclomatic complexity.
488#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
489fn select_x86_backend_for_operation(op_type: OperationType) -> Backend {
490 use std::arch::is_x86_feature_detected;
491
492 // Check for AVX-512 (only for compute-bound operations)
493 let use_avx512 = op_type == OperationType::ComputeBound && is_x86_feature_detected!("avx512f");
494 if use_avx512 {
495 return Backend::AVX512;
496 }
497
498 // AVX2 with FMA is preferred for most operations
499 if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
500 return Backend::AVX2;
501 }
502
503 // Fallback chain: AVX -> SSE2 -> Scalar
504 if is_x86_feature_detected!("avx") {
505 return Backend::AVX;
506 }
507 if is_x86_feature_detected!("sse2") {
508 return Backend::SSE2;
509 }
510
511 Backend::Scalar
512}
513
514#[cfg(test)]
515mod contract_tests;
516
517#[cfg(test)]
518mod contract_tests_image;
519
520#[cfg(test)]
521mod contract_tests_linalg;
522
523#[cfg(test)]
524mod tests;