trueno/lib.rs
1// ============================================================================
2// Development-phase lint allows - to be addressed incrementally
3// ============================================================================
4// Allow manual_div_ceil - clearer for block calculations
5#![allow(clippy::manual_div_ceil)]
6// Allow manual_is_multiple_of - clearer alignment checks
7#![allow(clippy::manual_is_multiple_of)]
8// Allow needless_range_loop - index access is clearer in some SIMD algorithms
9#![allow(clippy::needless_range_loop)]
10// Allow empty line after doc comments - formatting preference
11#![allow(clippy::empty_line_after_doc_comments)]
12// Allow similar names - semantic distinction is clear
13#![allow(clippy::similar_names)]
14// Allow many single char names - standard math/matrix notation
15#![allow(clippy::many_single_char_names)]
16// Allow too many arguments - SIMD/compute APIs require many parameters
17#![allow(clippy::too_many_arguments)]
18// Allow type complexity - complex SIMD types
19#![allow(clippy::type_complexity)]
20// Allow macro metavars in unsafe - necessary for SIMD dispatch macros
21#![allow(clippy::macro_metavars_in_unsafe)]
22// Allow missing panics doc - will be added incrementally
23#![allow(clippy::missing_panics_doc)]
24// Allow uninit_vec - intentional pattern for perf-critical paths where
25// every element is SET (not accumulated) before any read. Each use has
26// a SAFETY comment documenting the write-before-read invariant.
27#![allow(clippy::uninit_vec)]
28// Allow missing errors doc - will be added incrementally
29#![allow(clippy::missing_errors_doc)]
30// Allow missing safety doc - will be added incrementally
31#![allow(clippy::missing_safety_doc)]
32// Allow excessive precision - SIMD math constants need specific precision
33#![allow(clippy::excessive_precision)]
34// Allow unnecessary cast - clearer type annotations in some cases
35#![allow(clippy::unnecessary_cast)]
36// Allow cast_possible_truncation - handled in SIMD code
37#![allow(clippy::cast_possible_truncation)]
38// Allow cast_sign_loss - handled in SIMD code
39#![allow(clippy::cast_sign_loss)]
40// Allow cast_precision_loss - handled in SIMD code
41#![allow(clippy::cast_precision_loss)]
42// Allow large stack arrays - SIMD/GPU test data and proptest expansions
43#![allow(clippy::large_stack_arrays)]
44// Allow unwrap/float_cmp in test code — safe in assertions, banned in production
45#![cfg_attr(test, allow(clippy::disallowed_methods, clippy::float_cmp))]
46
47//! Trueno: Multi-Target High-Performance Compute Library
48//!
49//! **Trueno** (Spanish: "thunder") provides unified, high-performance compute primitives
50//! across three execution targets:
51//!
52//! 1. **CPU SIMD** - x86 (SSE2/AVX/AVX2/AVX-512), ARM (NEON), WASM (SIMD128)
53//! 2. **GPU** - Vulkan/Metal/DX12/WebGPU via `wgpu`
54//! 3. **WebAssembly** - Portable SIMD128 for browser/edge deployment
55//!
56//! # Design Principles
57//!
58//! - **Write once, optimize everywhere**: Single algorithm, multiple backends
59//! - **Runtime dispatch**: Auto-select best implementation based on CPU features
60//! - **Zero unsafe in public API**: Safety via type system, `unsafe` isolated in backends
61//! - **Benchmarked performance**: Every optimization must prove ≥10% speedup
62//! - **Extreme TDD**: >90% test coverage, mutation testing, property-based tests
63//!
64//! # Quick Start
65//!
66//! ```rust
67//! use trueno::Vector;
68//!
69//! let a = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0]);
70//! let b = Vector::from_slice(&[5.0, 6.0, 7.0, 8.0]);
71//!
72//! // Auto-selects best backend (AVX2/GPU/WASM)
73//! let result = a.add(&b).unwrap();
74//! assert_eq!(result.as_slice(), &[6.0, 8.0, 10.0, 12.0]);
75//! ```
76
77// Contract assertions from YAML (pv codegen)
78#[macro_use]
79#[allow(unused_macros)]
80mod generated_contracts;
81
82// Fallback macros for contracts not yet in codegen
// No-op precondition stub for `add`: evaluates a borrow of each argument so
// the call site keeps its variables "used" until pv codegen provides the real check.
macro_rules! contract_pre_add {
    () => {{}};
    ($($arg:expr),+ $(,)?) => {{
        $(let _ = &$arg;)+
    }};
}
// No-op precondition stub for `gemv`: evaluates a borrow of each argument so
// the call site keeps its variables "used" until pv codegen provides the real check.
macro_rules! contract_pre_gemv {
    () => {{}};
    ($($arg:expr),+ $(,)?) => {{
        $(let _ = &$arg;)+
    }};
}
85
86pub mod activations;
87pub mod backends;
88pub mod blis;
89pub mod brick;
90pub mod chaos;
91pub mod contracts;
92pub mod eigen;
93pub mod error;
94pub mod hardware;
95pub mod hash;
96pub mod inference;
97pub mod matrix;
98pub mod monitor;
99pub mod simulation;
100pub mod tiling;
101pub mod tuner;
102pub mod vector;
103
104// Canonical scalar activation functions (UCBD §4, trueno #103)
105pub use activations::{
106 f16_to_f32, f32_to_f16, gelu_scalar, relu_scalar, sigmoid_scalar, silu_scalar, tanh_scalar,
107};
108pub use eigen::SymmetricEigen;
109pub use error::{Result, TruenoError};
110pub use hash::{hash_bytes, hash_key, hash_keys_batch, hash_keys_batch_with_backend};
111pub use matrix::Matrix;
112pub use monitor::{
113 cuda_monitor_available, GpuBackend, GpuClockMetrics, GpuDeviceInfo, GpuMemoryMetrics,
114 GpuMetrics, GpuMonitor, GpuPcieMetrics, GpuPowerMetrics, GpuThermalMetrics, GpuUtilization,
115 GpuVendor, MonitorConfig, MonitorError,
116};
117#[cfg(feature = "cuda-monitor")]
118pub use monitor::{enumerate_cuda_devices, query_cuda_device_info, query_cuda_memory};
119pub use vector::Vector;
120
121// ComputeBrick exports
122pub use brick::{
123 fnv1a_f32_checksum,
124 AddOp,
125 AssertionResult,
126 AttentionOp,
127 // QUANT-Q5K: Q5_K and Q6_K quantization formats (llama.cpp compatible)
128 BlockQ5K,
129 BlockQ6K,
130 BrickBottleneck,
131 BrickCategory,
132 BrickError,
133 // PAR-200: BrickProfiler v2 types
134 BrickId,
135 BrickIdTimer,
136 BrickLayer,
137 BrickProfiler,
138 BrickSample,
139 BrickStats,
140 BrickTimer,
141 BrickVerification,
142 ByteBudget,
143 CategoryStats,
144 ComputeAssertion,
145 ComputeBackend,
146 ComputeBrick,
147 ComputeOp,
148 DivergenceInfo,
149 DotOp,
150 DotQ5KOp,
151 DotQ6KOp,
152 EdgeType,
153 ExecutionEdge,
154 ExecutionGraph,
155 ExecutionNode,
156 // PAR-201: Execution path graph types
157 ExecutionNodeId,
158 FusedGateUpOp,
159 FusedGateUpWeights,
160 FusedQKVOp,
161 FusedQKVWeights,
162 // CORRECTNESS-011: Divergence detection types
163 KernelChecksum,
164 MatmulOp,
165 PtxRegistry,
166 SoftmaxOp,
167 SyncMode,
168 // TILING-SPEC-001: Tile-level profiling types
169 TileLevel,
170 TileStats,
171 TileTimer,
172 TokenBudget,
173 TokenResult,
174};
175
176// Hardware capability exports (PMAT-447)
177pub use hardware::{
178 default_hardware_path, Bottleneck, CpuCapability, GpuBackend as HardwareGpuBackend,
179 GpuCapability, HardwareCapability, RooflineParams, SimdWidth,
180};
181
182// ML Tuner exports (T-TUNER-003 through T-TUNER-007, GH#80-84)
183pub use tuner::{
184 BottleneckClass, BottleneckPrediction, BrickTuner, ConceptDriftStatus, ExperimentSuggestion,
185 FeatureExtractor, KernelClassifier, KernelRecommendation, KernelType, QuantType, RunConfig,
186 ThroughputPrediction, ThroughputRegressor, TrainingSample, TrainingStats, TunerDataCollector,
187 TunerError, TunerFeatures, TunerRecommendation, UserFeedback,
188};
189
190// Tiling Compute Blocks exports (TILING-SPEC-001)
191pub use tiling::{
192 optimal_prefetch_distance, pack_a_index, pack_b_index, swizzle_index, PackingLayout,
193 PrefetchLocality, TcbGeometry, TcbIndexCalculator, TcbLevel, TiledQ4KMatvec, TilingBackend,
194 TilingConfig, TilingError, TilingStats, Q4K_SUPERBLOCK_BYTES, Q4K_SUPERBLOCK_SIZE,
195};
196
/// Execution target for compute dispatch.
///
/// Identifies which instruction set (or accelerator) an operation runs on.
/// Use [`Backend::select_best`] for runtime auto-detection.
///
/// Variant order is stable; do not reorder (discriminants would change).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Backend {
    /// Portable scalar fallback, no SIMD required.
    Scalar,
    /// SSE2, the x86_64 baseline instruction set.
    SSE2,
    /// AVX, 256-bit vectors.
    AVX,
    /// AVX2, 256-bit vectors with FMA.
    AVX2,
    /// AVX-512, 512-bit vectors.
    AVX512,
    /// ARM NEON.
    NEON,
    /// WebAssembly SIMD128.
    WasmSIMD,
    /// GPU compute via wgpu.
    GPU,
    /// Auto-select the best available target.
    Auto,
}
219
220impl Backend {
221 /// Select the best available backend for the current platform
222 ///
223 /// This is a convenience wrapper around `select_best_available_backend()`
224 pub fn select_best() -> Self {
225 select_best_available_backend()
226 }
227}
228
/// Coarse cost class of an operation, used to gate GPU dispatch.
///
/// Ordered so that a more expensive class compares greater:
/// `Low < Medium < High`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum OpComplexity {
    /// Simple element-wise ops (add, mul) - prefer SIMD unless very large.
    Low = 0,
    /// Moderate ops (dot, reduce) - GPU beneficial at 100K+ elements.
    Medium = 1,
    /// Complex ops (matmul, convolution) - GPU beneficial at 10K+ elements.
    High = 2,
}
239
/// Operation type for SIMD backend selection
///
/// Classifies operations by arithmetic intensity (ops per byte of memory
/// traffic), per the AVX-512 performance analysis in AVX512_ANALYSIS.md.
/// The class drives operation-aware backend selection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OperationType {
    /// Memory-bound operations (add, sub, mul, scale, div)
    ///
    /// Arithmetic intensity < 1 op/byte: memory bandwidth is the bottleneck,
    /// so wider SIMD lanes do not help. Prefer AVX2 over AVX-512.
    ///
    /// AVX-512 performance: 0.67-1.20x scalar (often slower!)
    /// AVX2 performance: 1.0-1.2x scalar
    MemoryBound,

    /// Compute-bound operations (dot, max, min, argmax, argmin)
    ///
    /// Arithmetic intensity > 1 op/byte: ALU throughput dominates, so the
    /// wider AVX-512 lanes pay off.
    ///
    /// AVX-512 performance: 7-14x scalar (validated)
    /// AVX2 performance: 4-12x scalar (validated)
    ComputeBound,

    /// Mixed operations (fma, sqrt, exp, sigmoid, activations)
    ///
    /// Performance depends on data size and hardware; default to AVX2 for
    /// safety or apply size-based heuristics.
    Mixed,
}
270
271/// Detect best SIMD backend for x86/x86_64 platforms
272///
273/// **IMPORTANT**: Prefers AVX2 over AVX-512 by default based on performance analysis.
274///
275/// AVX-512 is **NOT** universally faster - it causes 10-33% slowdown for memory-bound
276/// operations (add, mul, sub) due to memory bandwidth bottleneck and thermal throttling.
277/// See AVX512_ANALYSIS.md for detailed benchmarking results.
278///
279/// For operation-specific backend selection, use `select_backend_for_operation()`.
280#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
281fn detect_x86_backend() -> Backend {
282 // Prefer AVX2 over AVX-512 for safety (AVX-512 causes regressions for memory-bound ops)
283 if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
284 return Backend::AVX2;
285 }
286 // Note: AVX-512 is intentionally NOT checked here
287 // Use select_backend_for_operation(OperationType::ComputeBound) for AVX-512
288 if is_x86_feature_detected!("avx") {
289 return Backend::AVX;
290 }
291 if is_x86_feature_detected!("sse2") {
292 return Backend::SSE2;
293 }
294 Backend::Scalar
295}
296
/// Detect best SIMD backend for ARM platforms
///
/// NEON availability is a compile-time target feature on ARM, so this is
/// resolved with `cfg!` rather than runtime probing.
#[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
fn detect_arm_backend() -> Backend {
    if cfg!(target_feature = "neon") {
        Backend::NEON
    } else {
        Backend::Scalar
    }
}
309
/// Detect best SIMD backend for WebAssembly
///
/// SIMD128 availability is a compile-time target feature on wasm32, so this
/// is resolved with `cfg!` rather than runtime probing.
#[cfg(target_arch = "wasm32")]
fn detect_wasm_backend() -> Backend {
    if cfg!(target_feature = "simd128") {
        Backend::WasmSIMD
    } else {
        Backend::Scalar
    }
}
322
323/// Select the best available backend for the current platform
324///
325/// This function performs runtime CPU feature detection and selects the most
326/// optimized backend available. The selection follows this priority:
327///
328/// **x86/x86_64**:
329/// 1. AVX-512 (if `avx512f` feature detected)
330/// 2. AVX2 (if `avx2` and `fma` features detected)
331/// 3. AVX (if `avx` feature detected)
332/// 4. SSE2 (baseline for x86_64)
333/// 5. Scalar (fallback)
334///
335/// **ARM**:
336/// 1. NEON (if available)
337/// 2. Scalar (fallback)
338///
339/// **WASM**: SIMD128 (if available), else Scalar
340///
341/// **Other platforms**: Scalar
342///
343/// # Returns
344///
345/// The most optimized backend available on this CPU/platform
346///
347/// # Examples
348///
349/// ```
350/// use trueno::select_best_available_backend;
351///
352/// let backend = select_best_available_backend();
353/// println!("Using backend: {:?}", backend);
354/// ```
355pub fn select_best_available_backend() -> Backend {
356 // Cache backend selection using OnceLock to avoid repeated CPU feature detection
357 // This eliminates 3-5% overhead from calling is_x86_feature_detected!() repeatedly
358 static BEST_BACKEND: std::sync::OnceLock<Backend> = std::sync::OnceLock::new();
359
360 *BEST_BACKEND.get_or_init(|| {
361 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
362 {
363 detect_x86_backend()
364 }
365
366 #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
367 {
368 detect_arm_backend()
369 }
370
371 #[cfg(target_arch = "wasm32")]
372 {
373 detect_wasm_backend()
374 }
375
376 #[cfg(not(any(
377 target_arch = "x86_64",
378 target_arch = "x86",
379 target_arch = "aarch64",
380 target_arch = "arm",
381 target_arch = "wasm32"
382 )))]
383 {
384 Backend::Scalar
385 }
386 })
387}
388
389/// Select the optimal backend for a specific operation type
390///
391/// This function considers the memory vs compute characteristics of operations
392/// to select the backend that will provide the best performance. Based on
393/// comprehensive benchmarking (see AVX512_ANALYSIS.md), AVX-512 is avoided
394/// for memory-bound operations where it causes 10-33% performance degradation.
395///
396/// # Operation Classification
397///
398/// - **MemoryBound**: add, sub, mul, div, scale, abs, clamp, lerp, relu
399/// - Prefer AVX2 (1.0-1.2x scalar) over AVX-512 (0.67-1.20x scalar)
400/// - Memory bandwidth bottleneck limits wider SIMD benefit
401///
402/// - **ComputeBound**: dot, max, min, argmax, argmin, norm_l1, norm_l2, norm_linf
403/// - Prefer AVX-512 (7-14x scalar) over AVX2 (4-12x scalar)
404/// - High arithmetic intensity benefits from wider SIMD
405///
406/// - **Mixed**: fma, sqrt, exp, ln, sigmoid, tanh, gelu, swish
407/// - Default to AVX2 for safety (avoids AVX-512 thermal throttling)
408/// - Size-based heuristics could improve this in future
409///
410/// # Backend Selection Priority
411///
412/// **For MemoryBound operations**:
413/// 1. AVX2 (if available) - BEST for memory-bound
414/// 2. SSE2 (x86_64 baseline)
415/// 3. AVX-512 (AVOIDED - causes slowdown)
416/// 4. NEON (ARM)
417/// 5. WASM SIMD128
418/// 6. Scalar (fallback)
419///
420/// **For ComputeBound operations**:
421/// 1. AVX-512 (if available) - BEST for compute-bound
422/// 2. AVX2
423/// 3. SSE2
424/// 4. NEON (ARM)
425/// 5. WASM SIMD128
426/// 6. Scalar (fallback)
427///
428/// # Arguments
429///
430/// * `op_type` - The type of operation being performed
431///
432/// # Returns
433///
434/// The optimal backend for the given operation type
435///
436/// # Examples
437///
438/// ```
439/// use trueno::{select_backend_for_operation, OperationType};
440///
441/// // Memory-bound operation - prefers AVX2 over AVX-512
442/// let backend = select_backend_for_operation(OperationType::MemoryBound);
443///
444/// // Compute-bound operation - uses AVX-512 if available
445/// let backend = select_backend_for_operation(OperationType::ComputeBound);
446/// ```
447///
448/// # Performance Impact
449///
450/// Using operation-aware backend selection fixes performance regressions:
451/// - mul with AVX-512: 0.67x → 1.0x (use AVX2 instead)
452/// - sub with AVX-512: 0.87x → 1.0x (use AVX2 instead)
453/// - dot with AVX-512: 7.89x (keep AVX-512)
454pub fn select_backend_for_operation(op_type: OperationType) -> Backend {
455 // Allow unused on non-x86 architectures
456 let _ = &op_type;
457
458 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
459 {
460 select_x86_backend_for_operation(op_type)
461 }
462
463 #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
464 {
465 detect_arm_backend()
466 }
467
468 #[cfg(target_arch = "wasm32")]
469 {
470 detect_wasm_backend()
471 }
472
473 #[cfg(not(any(
474 target_arch = "x86_64",
475 target_arch = "x86",
476 target_arch = "aarch64",
477 target_arch = "arm",
478 target_arch = "wasm32"
479 )))]
480 {
481 Backend::Scalar
482 }
483}
484
485/// Select the best x86 backend based on operation type and available features.
486///
487/// Separated from `select_backend_for_operation` to reduce cyclomatic complexity.
488#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
489fn select_x86_backend_for_operation(op_type: OperationType) -> Backend {
490 use std::arch::is_x86_feature_detected;
491
492 // Check for AVX-512 (only for compute-bound operations)
493 let use_avx512 = op_type == OperationType::ComputeBound && is_x86_feature_detected!("avx512f");
494 if use_avx512 {
495 return Backend::AVX512;
496 }
497
498 // AVX2 with FMA is preferred for most operations
499 if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
500 return Backend::AVX2;
501 }
502
503 // Fallback chain: AVX -> SSE2 -> Scalar
504 if is_x86_feature_detected!("avx") {
505 return Backend::AVX;
506 }
507 if is_x86_feature_detected!("sse2") {
508 return Backend::SSE2;
509 }
510
511 Backend::Scalar
512}
513
514#[cfg(test)]
515mod contract_tests;
516
517#[cfg(test)]
518mod contract_tests_image;
519
520#[cfg(test)]
521mod contract_tests_linalg;
522
523#[cfg(test)]
524mod tests;