Skip to main content

structured_zstd/
cpu_kernel.rs

1//! CPU kernel dispatch — single detect+match at the dispatch site,
2//! propagated through the inner pipeline as a generic parameter so
3//! leaf hot-path code monomorphises against the chosen kernel.
4//!
5//! See issue #247 for the architecture rationale: per-subsystem
6//! dispatch scatters the choice across HUF / FSE / SIMD-copy
7//! independently and pays the cost N times per call. Lifting the
8//! dispatch to the outermost feasible call site collapses it to one
9//! detect there; the inner leaf-hot-path ops then route through
10//! `K::method` calls on the chosen kernel zero-sized type.
11//!
12//! Current wiring (as of #247 Part 2): the only active dispatch site
13//! is `decoding::literals_section_decoder::decompress_literals`,
14//! which `match`es `detect_cpu_kernel()` and routes into per-K
15//! `decompress_literals_*` `#[target_feature]` wrappers. The full
16//! pipeline-wide propagation envisioned in the issue (FrameDecoder /
17//! FrameCompressor entry, sequence executor, match copy) is
18//! incremental; subsequent tiers extend the dispatch surface without
19//! changing this trait or the kernel ZSTs.
20//!
21//! Structure code (block loop, FCS check, offset history, repeat
22//! semantics) stays single-impl and only carries `K` as a phantom on
23//! the outer function. Monomorphisation specialises ONLY the bodies
24//! that actually differ per ISA — `mask_lower_bits`, `huf_burst`,
25//! `copy_chunk`, etc.
26
27#[cfg(feature = "std")]
28use std::sync::OnceLock;
29
/// Leaf hot-path operations whose generated code differs per ISA.
///
/// Every implementor in this module is a zero-sized marker type, and
/// the `Copy + 'static` bound keeps it that cheap to pass around. All
/// methods are associated functions (no `self` receiver), so generic
/// code names the kernel purely as a type parameter `K` and calls
/// `K::method(..)`.
///
/// Add a method here ONLY when its codegen genuinely differs per
/// kernel (BMI2 intrinsic vs scalar shift, AVX2 256-bit move vs SSE2
/// 128-bit move, ...). Operations that have one canonical
/// implementation must NOT live on this trait — they stay on the
/// existing decoder / encoder types.
// Why `pub` instead of `pub(crate)`: `BitReaderReversed` is generic
// over `K: CpuKernel = ScalarKernel` and is re-exported through the
// `bench_internals`-gated `testing` module. Under that feature every
// type named in `BitReaderReversed`'s bounds (this trait and the
// default kernel) has to be at least as visible as the reader itself,
// or rustc rejects the build with `private_bounds` /
// `private_interfaces`. Stable crate users still see a narrow surface:
// nothing outside `bench_internals` constructs a non-Scalar kernel
// directly.
pub trait CpuKernel
where
    Self: Copy + 'static,
{
    /// Keep the low `n` bits of `value` and zero every bit above them.
    ///
    /// The FSE bitstream hot path calls this 3× per decoded sequence.
    /// On BMI2-capable hardware it maps to a single `_bzhi_u64`
    /// instruction; elsewhere it is a scalar `u64::MAX >> (64 - n)`
    /// shift followed by an AND.
    ///
    /// Precondition: `n <= 64`. `n == 0` yields 0. The result for
    /// `n > 64` is unspecified and callers MUST uphold the bound.
    /// The test-only `mask_lower_bits` helper in
    /// `bit_reader_reverse.rs` debug-asserts the bound for its unit
    /// tests. Production callers (FSE / HUF hot paths) derive `n`
    /// from `accuracy_log` / `max_num_bits`, which the per-stream
    /// table builders pin to `n <= MAX_*_BITS` at construction time,
    /// so no per-call assert runs.
    fn mask_lower_bits(value: u64, n: u8) -> u64;
}
65
66/// Scalar fallback — portable, no SIMD or BMI2 intrinsics. Selected
67/// when no x86 or aarch64 feature is detected at runtime.
68#[derive(Copy, Clone, Default)]
69pub struct ScalarKernel;
70
71impl CpuKernel for ScalarKernel {
72    #[inline(always)]
73    fn mask_lower_bits(value: u64, n: u8) -> u64 {
74        // `checked_shr` returns `None` for shift counts >= 64, which
75        // happens exactly when `n == 0` (`64 - 0 = 64`). Mapping
76        // both that case and the invalid `n > 64` underflow to 0
77        // gives the mathematically-correct empty mask for n=0 and
78        // a safe-ish fallback for the invalid range.
79        let mask = u64::MAX
80            .checked_shr(64u32.wrapping_sub(n as u32))
81            .unwrap_or(0);
82        value & mask
83    }
84}
85
86// The SSE2 tier exists in `CpuKernelTag` (it carries the 128-bit copy-chunk
87// choice for the unified copy dispatch) but needs no `CpuKernel` ZST yet: the
88// only trait method, `mask_lower_bits`, has no SSE2-specific form (SSE2 has no
89// bit-extract), so the Sse2 tag routes through the scalar bodies for the
90// FSE/HUF paths. A dedicated `Sse2Kernel` lands when `copy_chunk` moves onto
91// the trait.
92
/// x86_64 kernel for CPUs that expose BMI2 but lack the AVX2 SIMD
/// width needed for the Avx2 kernel: `mask_lower_bits` lowers to
/// `_bzhi_u64`. It is the stepping stone between the Sse2 and Avx2
/// tiers — hardware with BMI2 and no AVX2 is rare in practice, but the
/// tier matches the donor's gating.
#[cfg(all(target_arch = "x86_64", feature = "kernel_bmi2"))]
#[derive(Clone, Copy, Default)]
pub(crate) struct Bmi2Kernel;

#[cfg(all(target_arch = "x86_64", feature = "kernel_bmi2"))]
impl CpuKernel for Bmi2Kernel {
    #[inline(always)]
    fn mask_lower_bits(value: u64, n: u8) -> u64 {
        // SAFETY: the only route to this kernel type is a
        // `match detect_cpu_kernel() { CpuKernelTag::Bmi2 => ... }`
        // arm at a decoder entry site, and that arm runs only after
        // `detect_cpu_kernel` has confirmed BMI2 on the running CPU.
        unsafe { mask_lower_bits_bmi2_impl(value, n) }
    }
}
114
/// x86_64 kernel for the AVX2 + BMI2 combination (the x86-64-v3
/// baseline). This is the common modern x86 case: most CPUs released
/// since 2013 (Haswell) provide both. Masking goes through
/// `_bzhi_u64`; future trait methods are expected to use AVX2 256-bit
/// moves for `copy_chunk` and pext for the HUF burst.
#[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
#[derive(Clone, Copy, Default)]
pub(crate) struct Avx2Kernel;

#[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
impl CpuKernel for Avx2Kernel {
    #[inline(always)]
    fn mask_lower_bits(value: u64, n: u8) -> u64 {
        // SAFETY: dispatch selects `Avx2Kernel` only after runtime
        // detection has confirmed AVX2 *and* BMI2, so `_bzhi_u64` is
        // callable on this CPU.
        unsafe { mask_lower_bits_bmi2_impl(value, n) }
    }
}
132
/// x86_64 kernel for CPUs with AVX-512 VBMI2 on top of AVX2 and BMI2.
/// The tier exists so a faster HUF burst inner loop can be attached
/// to it; `mask_lower_bits` is identical to the Avx2 kernel's
/// (`_bzhi_u64`).
#[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
#[derive(Clone, Copy, Default)]
pub(crate) struct Vbmi2Kernel;

#[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
impl CpuKernel for Vbmi2Kernel {
    #[inline(always)]
    fn mask_lower_bits(value: u64, n: u8) -> u64 {
        // SAFETY: same precondition as `Avx2Kernel` — BMI2 has been
        // confirmed at runtime before this kernel is ever
        // instantiated.
        unsafe { mask_lower_bits_bmi2_impl(value, n) }
    }
}
150
/// Baseline aarch64 kernel, for every aarch64 target that exposes NEON
/// (effectively universal on the supported targets).
///
/// `#[allow(dead_code)]`: this is scaffolding for the future aarch64
/// dispatch arm in `decompress_literals` /
/// `decode_and_execute_sequences`. The struct and its trait impl land
/// first so the dispatch wiring can follow incrementally without
/// churning the `CpuKernel` surface; until that arm exists the type is
/// reachable only as a phantom.
#[cfg(all(target_arch = "aarch64", feature = "kernel_neon"))]
#[allow(dead_code)]
#[derive(Clone, Copy, Default)]
pub(crate) struct NeonKernel;

#[cfg(all(target_arch = "aarch64", feature = "kernel_neon"))]
impl CpuKernel for NeonKernel {
    #[inline(always)]
    fn mask_lower_bits(value: u64, n: u8) -> u64 {
        // aarch64 has no BMI2 equivalent that beats the scalar
        // shift-and-mask sequence for this op, so this forwards to the
        // Scalar kernel's body. Other trait methods (huf_burst,
        // copy_chunk) will diverge once they land.
        <ScalarKernel as CpuKernel>::mask_lower_bits(value, n)
    }
}
175
/// aarch64 SVE kernel. Variable-vector-length SVE extends NEON and is
/// the intended home for HUF burst / SIMD copy on SVE-capable cores
/// (e.g. Graviton3). The mask op is identical to NEON / Scalar.
///
/// `#[allow(dead_code)]`: same scaffolding rationale as `NeonKernel`.
#[cfg(all(target_arch = "aarch64", feature = "kernel_sve"))]
#[allow(dead_code)]
#[derive(Clone, Copy, Default)]
pub(crate) struct SveKernel;

#[cfg(all(target_arch = "aarch64", feature = "kernel_sve"))]
impl CpuKernel for SveKernel {
    #[inline(always)]
    fn mask_lower_bits(value: u64, n: u8) -> u64 {
        // No SVE-specific form for this op: forward to the scalar body.
        <ScalarKernel as CpuKernel>::mask_lower_bits(value, n)
    }
}
193
/// Single `#[target_feature(enable = "bmi2")]` wrapper around the
/// `_bzhi_u64` intrinsic: returns `value` with every bit at position
/// `n` and above cleared.
///
/// Lifted to a free function so each kernel impl that needs the BMI2
/// path (Bmi2 / Avx2 / Vbmi2) calls the same shared body. With
/// `#[inline]` LLVM inlines the call into any caller that itself has
/// BMI2 in scope; outside that scope the target_feature boundary is
/// preserved.
///
/// The `cfg` gate lists every kernel feature whose impl calls this
/// helper. Gating on `kernel_bmi2` alone would leave `Avx2Kernel` /
/// `Vbmi2Kernel` referencing an undefined function in a build that
/// enables `kernel_avx2` or `kernel_vbmi2` without `kernel_bmi2`.
///
/// # Safety
///
/// The running CPU must support BMI2. Callers uphold this by selecting
/// a kernel whose `CpuKernelTag` was resolved only after BMI2 was
/// reported available (`is_x86_feature_detected!("bmi2")` at runtime,
/// or `target_feature = "bmi2"` at compile time on `no_std`).
#[cfg(all(
    target_arch = "x86_64",
    any(
        feature = "kernel_bmi2",
        feature = "kernel_avx2",
        feature = "kernel_vbmi2"
    ),
))]
#[target_feature(enable = "bmi2")]
#[inline]
unsafe fn mask_lower_bits_bmi2_impl(value: u64, n: u8) -> u64 {
    // The intrinsic is called directly: this function already carries
    // `#[target_feature(enable = "bmi2")]` and is itself `unsafe`, so
    // no inner `unsafe { ... }` block is needed.
    core::arch::x86_64::_bzhi_u64(value, u32::from(n))
}
212
213/// Pure boolean-input variant of the x86 kernel-tag selection. Both the
214/// `std` runtime-detect path and the `no_std` compile-time-cfg path
215/// route through this helper so the precedence rules stay in one place
216/// (and are unit-testable without runtime CPUID).
217///
218/// The VBMI2 tier requires every AVX-512 sub-feature it touches AND the
219/// AVX2 baseline — VBMI2 kernels mix VBMI2-only intrinsics with AVX2
220/// 256-bit moves, so the dispatch must be conditioned on `has_avx2` too.
221/// Likewise the Avx2 tier requires both AVX2 and BMI2.
222#[cfg(target_arch = "x86_64")]
223#[inline(always)]
224// Params go unused when the matching `kernel_*` feature is disabled (the
225// rung that consumes them is `#[cfg]`-ed out); they are still passed by the
226// detect callers. Silence the conditional unused-variable warning rather
227// than thread per-feature `_`-prefixes through the signature.
228#[allow(unused_variables)]
229const fn select_x86_kernel(
230    has_avx512vbmi2: bool,
231    has_avx512f: bool,
232    has_avx512vl: bool,
233    has_avx512bw: bool,
234    has_bmi2: bool,
235    has_avx2: bool,
236    has_sse2: bool,
237) -> CpuKernelTag {
238    #[cfg(feature = "kernel_vbmi2")]
239    if has_avx512vbmi2 && has_avx512f && has_avx512vl && has_avx512bw && has_bmi2 && has_avx2 {
240        return CpuKernelTag::Vbmi2;
241    }
242    #[cfg(feature = "kernel_avx2")]
243    if has_avx2 && has_bmi2 {
244        return CpuKernelTag::Avx2;
245    }
246    #[cfg(feature = "kernel_bmi2")]
247    if has_bmi2 {
248        return CpuKernelTag::Bmi2;
249    }
250    #[cfg(feature = "kernel_sse2")]
251    if has_sse2 {
252        return CpuKernelTag::Sse2;
253    }
254    CpuKernelTag::Scalar
255}
256
/// Runtime-resolved (and cached) identifier of the kernel tier to use.
///
/// The tag is plain data; the matching `CpuKernel` implementation
/// (`ScalarKernel` / `Bmi2Kernel` / `Avx2Kernel` / `Vbmi2Kernel` /
/// `NeonKernel` / `SveKernel`) is chosen at the dispatch site by a
/// `match` on this tag that branches into the per-K
/// `target_feature`-wrapped specialisation. Today the only such site
/// is `decoding::literals_section_decoder::decompress_literals`;
/// pipeline-wide dispatch (FrameDecoder / FrameCompressor entry,
/// sequence executor, match copy) lands incrementally in follow-up
/// tiers.
///
/// Each non-scalar variant exists only on its own architecture and
/// only when the corresponding `kernel_*` feature is enabled.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum CpuKernelTag {
    Scalar,
    #[cfg(all(target_arch = "x86_64", feature = "kernel_sse2"))]
    Sse2,
    #[cfg(all(target_arch = "x86_64", feature = "kernel_bmi2"))]
    Bmi2,
    #[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
    Avx2,
    #[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
    Vbmi2,
    #[cfg(all(target_arch = "aarch64", feature = "kernel_neon"))]
    Neon,
    // `Sve` has two possible constructors and each needs something
    // extra to be reachable: runtime detection through
    // `std::arch::is_aarch64_feature_detected!` (hence `feature =
    // "std"`), or compile-time `target_feature = "sve"` from RUSTFLAGS.
    // With neither, nothing can build the variant and any `match` arm
    // naming it would warn as dead — so the variant is compiled out.
    #[cfg(all(
        target_arch = "aarch64",
        feature = "kernel_sve",
        any(feature = "std", target_feature = "sve"),
    ))]
    Sve,
}
290
/// Return the best CPU kernel tag for the current process, probing the
/// CPU at most once.
///
/// The first call runs `detect_cpu_kernel_uncached` and stores the
/// result in a process-wide `OnceLock`; every later call just copies
/// the stored tag out. Std-only — `no_std` builds use the compile-time
/// variant of this function instead.
#[cfg(feature = "std")]
pub(crate) fn detect_cpu_kernel() -> CpuKernelTag {
    static RESOLVED_TAG: OnceLock<CpuKernelTag> = OnceLock::new();
    let tag: &CpuKernelTag = RESOLVED_TAG.get_or_init(detect_cpu_kernel_uncached);
    *tag
}
300
/// Probe the running CPU and return the best kernel tag this build
/// supports. Called once by `detect_cpu_kernel`, which caches the
/// result.
///
/// * x86_64: feeds runtime `is_x86_feature_detected!` results into
///   `select_x86_kernel`, which owns the tier precedence.
/// * aarch64: prefers SVE, then NEON, each only when its `kernel_*`
///   feature is compiled in.
/// * any other architecture: `Scalar`.
#[cfg(feature = "std")]
fn detect_cpu_kernel_uncached() -> CpuKernelTag {
    #[cfg(target_arch = "x86_64")]
    {
        use std::arch::is_x86_feature_detected;
        // Each probe is enabled when ANY compiled-in rung of
        // `select_x86_kernel` reads it. The Avx2 rung needs
        // `has_bmi2`, and the Vbmi2 rung needs both `has_bmi2` and
        // `has_avx2`, so gating those probes on `kernel_bmi2` /
        // `kernel_avx2` alone would make the higher tiers unselectable
        // in builds that enable them without the lower ones.
        //
        // `cfg!(...)` const-folds, so the `&&` still short-circuits
        // away the runtime `is_x86_feature_detected!` call (and its
        // CPUID/cache traffic) for probes no enabled rung consumes.
        const PROBE_AVX512: bool = cfg!(feature = "kernel_vbmi2");
        const PROBE_BMI2: bool = cfg!(any(
            feature = "kernel_bmi2",
            feature = "kernel_avx2",
            feature = "kernel_vbmi2"
        ));
        const PROBE_AVX2: bool = cfg!(any(feature = "kernel_avx2", feature = "kernel_vbmi2"));
        const PROBE_SSE2: bool = cfg!(feature = "kernel_sse2");
        return select_x86_kernel(
            PROBE_AVX512 && is_x86_feature_detected!("avx512vbmi2"),
            PROBE_AVX512 && is_x86_feature_detected!("avx512f"),
            PROBE_AVX512 && is_x86_feature_detected!("avx512vl"),
            PROBE_AVX512 && is_x86_feature_detected!("avx512bw"),
            PROBE_BMI2 && is_x86_feature_detected!("bmi2"),
            PROBE_AVX2 && is_x86_feature_detected!("avx2"),
            PROBE_SSE2 && is_x86_feature_detected!("sse2"),
        );
    }
    #[cfg(target_arch = "aarch64")]
    {
        #[cfg(any(feature = "kernel_sve", feature = "kernel_neon"))]
        use std::arch::is_aarch64_feature_detected;
        #[cfg(feature = "kernel_sve")]
        if is_aarch64_feature_detected!("sve") {
            return CpuKernelTag::Sve;
        }
        #[cfg(feature = "kernel_neon")]
        if is_aarch64_feature_detected!("neon") {
            return CpuKernelTag::Neon;
        }
        return CpuKernelTag::Scalar;
    }
    // Reached only on architectures with no block above; on x86_64 and
    // aarch64 the blocks always return first.
    #[allow(unreachable_code)]
    CpuKernelTag::Scalar
}
337
338/// no-std variant: rely on compile-time `target_feature` flags
339/// instead of runtime detection. Resolves to the most-capable kernel
340/// that the build target supports.
341#[cfg(not(feature = "std"))]
342pub(crate) fn detect_cpu_kernel() -> CpuKernelTag {
343    #[cfg(target_arch = "x86_64")]
344    {
345        // Route through the same const-fn precedence helper as the
346        // `feature = "std"` path. `cfg!(target_feature = ...)`
347        // returns a compile-time bool that constant-folds through
348        // `select_x86_kernel`, so the runtime call has the same
349        // codegen as the previous hand-written #[cfg] chain.
350        return select_x86_kernel(
351            cfg!(target_feature = "avx512vbmi2"),
352            cfg!(target_feature = "avx512f"),
353            cfg!(target_feature = "avx512vl"),
354            cfg!(target_feature = "avx512bw"),
355            cfg!(target_feature = "bmi2"),
356            cfg!(target_feature = "avx2"),
357            cfg!(target_feature = "sse2"),
358        );
359    }
360    #[cfg(target_arch = "aarch64")]
361    {
362        #[cfg(all(feature = "kernel_sve", target_feature = "sve"))]
363        {
364            return CpuKernelTag::Sve;
365        }
366        #[cfg(all(feature = "kernel_neon", target_feature = "neon"))]
367        {
368            return CpuKernelTag::Neon;
369        }
370    }
371    #[allow(unreachable_code)]
372    CpuKernelTag::Scalar
373}
374
375impl CpuKernelTag {
376    /// Stable lowercase diagnostic name for this tier (used by
377    /// [`active_cpu_kernel_name`] and the bench/dashboard reporting). Pure
378    /// mapping over the tag, so every arm is exercisable in tests regardless
379    /// of which tier the running CPU actually resolves to.
380    pub(crate) fn name(self) -> &'static str {
381        match self {
382            CpuKernelTag::Scalar => "scalar",
383            #[cfg(all(target_arch = "x86_64", feature = "kernel_sse2"))]
384            CpuKernelTag::Sse2 => "sse2",
385            #[cfg(all(target_arch = "x86_64", feature = "kernel_bmi2"))]
386            CpuKernelTag::Bmi2 => "bmi2",
387            #[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
388            CpuKernelTag::Avx2 => "avx2",
389            #[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
390            CpuKernelTag::Vbmi2 => "vbmi2",
391            #[cfg(all(target_arch = "aarch64", feature = "kernel_neon"))]
392            CpuKernelTag::Neon => "neon",
393            #[cfg(all(
394                target_arch = "aarch64",
395                feature = "kernel_sve",
396                any(feature = "std", target_feature = "sve"),
397            ))]
398            CpuKernelTag::Sve => "sve",
399        }
400    }
401}
402
403/// Name of the CPU kernel tier this process selected for the entropy /
404/// sequence hot paths: decode (literals + FSE sequence decode) and encode
405/// (entropy) share this dispatch (see #247). Returned as a stable lowercase
406/// string for diagnostics and benchmark/dashboard reporting; the value is
407/// what the runtime CPU-feature detection (or compile-time `target_feature`
408/// on `no_std`) actually resolves to on this machine, so a dashboard can
409/// attribute a measurement to the kernel that produced it.
410pub fn active_cpu_kernel_name() -> &'static str {
411    detect_cpu_kernel().name()
412}
413
/// Unit tests for the scalar mask, the BMI2-backed kernels, the x86
/// tier-selection precedence, and the diagnostic tier names.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn scalar_mask_lower_bits_zero_n_returns_zero() {
        assert_eq!(ScalarKernel::mask_lower_bits(0xDEADBEEF, 0), 0);
    }

    #[test]
    fn scalar_mask_lower_bits_full_64_returns_full_value() {
        assert_eq!(
            ScalarKernel::mask_lower_bits(0xFFFF_FFFF_FFFF_FFFF, 64),
            0xFFFF_FFFF_FFFF_FFFF
        );
    }

    #[test]
    fn scalar_mask_lower_bits_mid_keeps_low_n_bits() {
        // n=8: keep low 8 bits, zero the rest
        assert_eq!(ScalarKernel::mask_lower_bits(0xDEAD_BEEF, 8), 0xEF);
        assert_eq!(
            ScalarKernel::mask_lower_bits(0x0102_0304_0506_0708, 16),
            0x0708
        );
    }

    /// Shared body for the BMI2-backed kernel parity tests: for every
    /// in-contract `n`, kernel `K` must agree with the scalar
    /// reference. Returns early (test passes vacuously) when the host
    /// lacks BMI2, because calling a `_bzhi_u64`-backed kernel there
    /// would SIGILL.
    ///
    /// Gated on `std` because the `is_x86_feature_detected!` guard
    /// needs runtime detection.
    #[cfg(all(
        target_arch = "x86_64",
        feature = "std",
        any(
            feature = "kernel_bmi2",
            feature = "kernel_avx2",
            feature = "kernel_vbmi2"
        ),
    ))]
    fn assert_bmi2_backed_kernel_matches_scalar<K: CpuKernel>() {
        if !std::arch::is_x86_feature_detected!("bmi2") {
            return;
        }
        let v = 0x1234_5678_9ABC_DEF0u64;
        for n in 0..=64u8 {
            assert_eq!(
                K::mask_lower_bits(v, n),
                ScalarKernel::mask_lower_bits(v, n),
                "mismatch at n={n}"
            );
        }
    }

    #[cfg(all(target_arch = "x86_64", feature = "std", feature = "kernel_bmi2"))]
    #[test]
    fn bmi2_mask_lower_bits_matches_scalar_on_bmi2_hw() {
        assert_bmi2_backed_kernel_matches_scalar::<Bmi2Kernel>();
    }

    // Gated on `std` AND `kernel_avx2`: without `std` there is no
    // runtime feature detection to guard the call, and `Avx2Kernel`
    // itself is `#[cfg(feature = "kernel_avx2")]`, so a trimmed build
    // with `kernel_avx2` off would fail to compile against the
    // undefined type.
    #[cfg(all(target_arch = "x86_64", feature = "std", feature = "kernel_avx2"))]
    #[test]
    fn avx2_mask_lower_bits_matches_scalar_on_bmi2_hw() {
        assert_bmi2_backed_kernel_matches_scalar::<Avx2Kernel>();
    }

    // `Vbmi2Kernel::mask_lower_bits` only issues `_bzhi_u64`, so a
    // BMI2-capable host is enough to exercise it.
    #[cfg(all(target_arch = "x86_64", feature = "std", feature = "kernel_vbmi2"))]
    #[test]
    fn vbmi2_mask_lower_bits_matches_scalar_on_bmi2_hw() {
        assert_bmi2_backed_kernel_matches_scalar::<Vbmi2Kernel>();
    }

    /// Regression: a CPU advertising AVX-512 VBMI2 but NOT AVX2 (the
    /// AMD64 baseline allows this combination at the spec level) was
    /// previously selected as `Vbmi2`, which would SIGILL on the
    /// first AVX2-mixed VBMI2 kernel invocation. The selection must
    /// fall through to Scalar (or a non-AVX tier) in that case.
    #[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
    #[test]
    fn select_x86_kernel_vbmi2_without_avx2_does_not_pick_vbmi2() {
        let tag = select_x86_kernel(
            /* avx512vbmi2 */ true, /* avx512f */ true, /* avx512vl */ true,
            /* avx512bw */ true, /* bmi2 */ true, /* avx2 */ false,
            /* sse2 */ true,
        );
        assert_ne!(
            tag,
            CpuKernelTag::Vbmi2,
            "selecting Vbmi2 without AVX2 would call AVX2 instructions and SIGILL"
        );
    }

    /// Sanity: when every flag is present the selector returns Vbmi2.
    #[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
    #[test]
    fn select_x86_kernel_full_x86_v4_picks_vbmi2() {
        let tag = select_x86_kernel(true, true, true, true, true, true, true);
        assert_eq!(tag, CpuKernelTag::Vbmi2);
    }

    /// Sanity: AVX2 + BMI2 without AVX-512 → Avx2.
    #[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
    #[test]
    fn select_x86_kernel_avx2_baseline_picks_avx2() {
        let tag = select_x86_kernel(false, false, false, false, true, true, true);
        assert_eq!(tag, CpuKernelTag::Avx2);
    }

    /// AVX2 without BMI2 must not resolve to Avx2: `Avx2Kernel` masks
    /// through `_bzhi_u64`, which would SIGILL on such a CPU.
    #[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
    #[test]
    fn select_x86_kernel_avx2_without_bmi2_does_not_pick_avx2() {
        let tag = select_x86_kernel(
            /* avx512vbmi2 */ false, /* avx512f */ false, /* avx512vl */ false,
            /* avx512bw */ false, /* bmi2 */ false, /* avx2 */ true,
            /* sse2 */ true,
        );
        assert_ne!(
            tag,
            CpuKernelTag::Avx2,
            "selecting Avx2 without BMI2 would execute _bzhi_u64 and SIGILL"
        );
    }

    /// BMI2 without AVX2 → Bmi2, the stepping-stone tier.
    #[cfg(all(target_arch = "x86_64", feature = "kernel_bmi2"))]
    #[test]
    fn select_x86_kernel_bmi2_without_avx2_picks_bmi2() {
        let tag = select_x86_kernel(false, false, false, false, true, false, true);
        assert_eq!(tag, CpuKernelTag::Bmi2);
    }

    /// SSE2-only (no BMI2/AVX2) → Sse2, the x86_64 floor above Scalar.
    #[cfg(all(target_arch = "x86_64", feature = "kernel_sse2"))]
    #[test]
    fn select_x86_kernel_sse2_only_picks_sse2() {
        let tag = select_x86_kernel(false, false, false, false, false, false, true);
        assert_eq!(tag, CpuKernelTag::Sse2);
    }

    /// No SIMD flags at all → Scalar (off-x86_64 / pre-SSE2 x86).
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn select_x86_kernel_no_features_picks_scalar() {
        let tag = select_x86_kernel(false, false, false, false, false, false, false);
        assert_eq!(tag, CpuKernelTag::Scalar);
    }

    #[test]
    fn detect_returns_consistent_tag() {
        let first = detect_cpu_kernel();
        let second = detect_cpu_kernel();
        assert_eq!(
            first, second,
            "cached detect must return same tag on repeated calls"
        );
    }

    #[test]
    fn active_kernel_name_is_known_lowercase_tier() {
        // The diagnostic name must be one of the stable lowercase tier
        // strings the dashboard parses, and must match whatever tier
        // detection resolves to on this host (no `unknown` / empty leak).
        const KNOWN: &[&str] = &["scalar", "sse2", "bmi2", "avx2", "vbmi2", "neon", "sve"];
        let name = active_cpu_kernel_name();
        assert!(
            KNOWN.contains(&name),
            "active kernel name {name:?} is not a recognised tier"
        );
        assert_eq!(
            name,
            name.to_ascii_lowercase(),
            "tier name must be lowercase for stable dashboard parsing"
        );
    }

    #[test]
    fn every_kernel_tag_maps_to_its_lowercase_name() {
        // `active_cpu_kernel_name` only exercises whichever arm the running
        // CPU resolves to, so map each constructible tag directly to cover
        // every branch on this build's feature set.
        assert_eq!(CpuKernelTag::Scalar.name(), "scalar");
        #[cfg(all(target_arch = "x86_64", feature = "kernel_sse2"))]
        assert_eq!(CpuKernelTag::Sse2.name(), "sse2");
        #[cfg(all(target_arch = "x86_64", feature = "kernel_bmi2"))]
        assert_eq!(CpuKernelTag::Bmi2.name(), "bmi2");
        #[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
        assert_eq!(CpuKernelTag::Avx2.name(), "avx2");
        #[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
        assert_eq!(CpuKernelTag::Vbmi2.name(), "vbmi2");
        #[cfg(all(target_arch = "aarch64", feature = "kernel_neon"))]
        assert_eq!(CpuKernelTag::Neon.name(), "neon");
        #[cfg(all(
            target_arch = "aarch64",
            feature = "kernel_sve",
            any(feature = "std", target_feature = "sve"),
        ))]
        assert_eq!(CpuKernelTag::Sve.name(), "sve");
    }

    #[test]
    fn active_kernel_name_is_stable_across_calls() {
        // Backed by the cached `detect_cpu_kernel`, so repeated calls must
        // return the identical static string.
        assert_eq!(active_cpu_kernel_name(), active_cpu_kernel_name());
    }
}