structured_zstd/cpu_kernel.rs
1//! CPU kernel dispatch — single detect+match at the dispatch site,
2//! propagated through the inner pipeline as a generic parameter so
3//! leaf hot-path code monomorphises against the chosen kernel.
4//!
5//! See issue #247 for the architecture rationale: per-subsystem
6//! dispatch scatters the choice across HUF / FSE / SIMD-copy
7//! independently and pays the cost N times per call. Lifting the
8//! dispatch to the outermost feasible call site collapses it to one
9//! detect there; the inner leaf-hot-path ops then route through
10//! `K::method` calls on the chosen kernel zero-sized type.
11//!
12//! Current wiring (as of #247 Part 2): the only active dispatch site
13//! is `decoding::literals_section_decoder::decompress_literals`,
14//! which `match`es `detect_cpu_kernel()` and routes into per-K
15//! `decompress_literals_*` `#[target_feature]` wrappers. The full
16//! pipeline-wide propagation envisioned in the issue (FrameDecoder /
17//! FrameCompressor entry, sequence executor, match copy) is
18//! incremental; subsequent tiers extend the dispatch surface without
19//! changing this trait or the kernel ZSTs.
20//!
21//! Structure code (block loop, FCS check, offset history, repeat
22//! semantics) stays single-impl and only carries `K` as a phantom on
23//! the outer function. Monomorphisation specialises ONLY the bodies
24//! that actually differ per ISA — `mask_lower_bits`, `huf_burst`,
25//! `copy_chunk`, etc.
26
27#[cfg(feature = "std")]
28use std::sync::OnceLock;
29
/// Trait covering the leaf hot-path operations whose bodies differ
/// per ISA. Implementations are ZSTs; the trait requires `Copy`, and
/// every kernel in this file additionally derives `Default`, so a
/// kernel value can be constructed at each call site without runtime
/// cost. All methods are associated functions (no `self`), so the
/// kernel choice is carried purely at the type level.
///
/// New methods land here ONLY when their codegen genuinely differs
/// per kernel (BMI2 intrinsic vs scalar shift, AVX2 256-bit move vs
/// SSE2 128-bit move, etc.). Structure ops that have one canonical
/// implementation must NOT be on this trait — they stay on the
/// existing decoder / encoder types.
// Public (rather than `pub(crate)`) because `BitReaderReversed` is
// generic over `K: CpuKernel = ScalarKernel` and is re-exported via
// the `bench_internals`-gated `testing` module; under that feature
// the visibility of every type that appears in `BitReaderReversed`'s
// bounds (the trait + the default kernel) must match the type's own
// visibility, otherwise rustc rejects with `private_bounds` /
// `private_interfaces`. The trait surface stays narrow on stable
// crate users: nothing outside `bench_internals` constructs a
// non-Scalar kernel directly.
pub trait CpuKernel: Copy + 'static {
    /// Keep the low `n` bits of `value` and zero every bit above
    /// them. The FSE bitstream hot path fires this 3× per decoded
    /// sequence; on BMI2-capable hardware this maps to a single
    /// `_bzhi_u64` instruction, otherwise to a scalar
    /// `u64::MAX >> (64 - n)` shift + mask.
    ///
    /// Precondition: `n <= 64`. Behaviour for `n == 0` is "return 0";
    /// behaviour for `n > 64` is unspecified — callers MUST uphold
    /// the bound, and the kernels are not guaranteed to agree with
    /// each other outside it (the scalar kernel returns 0 there; the
    /// BMI2-backed kernels return whatever `_bzhi_u64` yields). The
    /// test-only `mask_lower_bits` helper in
    /// `bit_reader_reverse.rs` debug-asserts the bound for its
    /// unit tests, but production callers (FSE / HUF hot paths)
    /// derive `n` from `accuracy_log` / `max_num_bits` which the
    /// per-stream table builders pin to `n <= MAX_*_BITS` at
    /// construction time; no per-call wrapper assert runs.
    fn mask_lower_bits(value: u64, n: u8) -> u64;
}
65
66/// Scalar fallback — portable, no SIMD or BMI2 intrinsics. Selected
67/// when no x86 or aarch64 feature is detected at runtime.
68#[derive(Copy, Clone, Default)]
69pub struct ScalarKernel;
70
71impl CpuKernel for ScalarKernel {
72 #[inline(always)]
73 fn mask_lower_bits(value: u64, n: u8) -> u64 {
74 // `checked_shr` returns `None` for shift counts >= 64, which
75 // happens exactly when `n == 0` (`64 - 0 = 64`). Mapping
76 // both that case and the invalid `n > 64` underflow to 0
77 // gives the mathematically-correct empty mask for n=0 and
78 // a safe-ish fallback for the invalid range.
79 let mask = u64::MAX
80 .checked_shr(64u32.wrapping_sub(n as u32))
81 .unwrap_or(0);
82 value & mask
83 }
84}
85
86// The SSE2 tier exists in `CpuKernelTag` (it carries the 128-bit copy-chunk
87// choice for the unified copy dispatch) but needs no `CpuKernel` ZST yet: the
88// only trait method, `mask_lower_bits`, has no SSE2-specific form (SSE2 has no
89// bit-extract), so the Sse2 tag routes through the scalar bodies for the
90// FSE/HUF paths. A dedicated `Sse2Kernel` lands when `copy_chunk` moves onto
91// the trait.
92
/// x86_64 kernel for CPUs that offer BMI2 without the AVX2 width needed
/// for the Avx2 tier. `mask_lower_bits` goes through `_bzhi_u64`. Sits
/// between the Sse2 and Avx2 tiers; the hardware combination is rare in
/// practice but matches donor's gating.
#[cfg(all(target_arch = "x86_64", feature = "kernel_bmi2"))]
#[derive(Clone, Copy, Default)]
pub(crate) struct Bmi2Kernel;

#[cfg(all(target_arch = "x86_64", feature = "kernel_bmi2"))]
impl CpuKernel for Bmi2Kernel {
    #[inline(always)]
    fn mask_lower_bits(value: u64, n: u8) -> u64 {
        // SAFETY: the only route to this kernel is a
        // `match detect_cpu_kernel() { CpuKernelTag::Bmi2 => ... }` arm
        // at a decoder entry site, and that tag is produced only once
        // `detect_cpu_kernel` has confirmed the running CPU supports
        // BMI2.
        unsafe { mask_lower_bits_bmi2_impl(value, n) }
    }
}
114
/// x86_64 kernel for CPUs with both AVX2 and BMI2 (the x86-64-v3
/// baseline, i.e. the common modern x86 case). The mask op is the shared
/// `_bzhi_u64` body; AVX2-specific methods (256-bit `copy_chunk`, pext
/// for HUF burst) are planned but not on the trait yet.
#[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
#[derive(Clone, Copy, Default)]
pub(crate) struct Avx2Kernel;

#[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
impl CpuKernel for Avx2Kernel {
    #[inline(always)]
    fn mask_lower_bits(value: u64, n: u8) -> u64 {
        // SAFETY: this kernel is chosen only after runtime detection
        // reported AVX2 *and* BMI2, so `_bzhi_u64` may be executed.
        unsafe { mask_lower_bits_bmi2_impl(value, n) }
    }
}
132
/// x86_64 kernel for CPUs with AVX-512 VBMI2 on top of AVX2 + BMI2. The
/// tier exists for a faster HUF burst inner loop; for `mask_lower_bits`
/// it behaves exactly like the Avx2 kernel and uses the same BMI2 body.
#[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
#[derive(Clone, Copy, Default)]
pub(crate) struct Vbmi2Kernel;

#[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
impl CpuKernel for Vbmi2Kernel {
    #[inline(always)]
    fn mask_lower_bits(value: u64, n: u8) -> u64 {
        // SAFETY: identical precondition to Avx2Kernel — runtime
        // detection confirmed BMI2 before this kernel could be
        // instantiated.
        unsafe { mask_lower_bits_bmi2_impl(value, n) }
    }
}
150
/// Baseline aarch64 kernel for hardware that exposes NEON (effectively
/// universal on the supported targets).
///
/// `#[allow(dead_code)]`: this is scaffolding for the upcoming aarch64
/// dispatch arm in `decompress_literals` /
/// `decode_and_execute_sequences`. The type and its trait impl land
/// ahead of the wiring so the dispatch can be added later without
/// touching the `CpuKernel` surface; until then it is reachable only as
/// a phantom.
#[cfg(all(target_arch = "aarch64", feature = "kernel_neon"))]
#[allow(dead_code)]
#[derive(Clone, Copy, Default)]
pub(crate) struct NeonKernel;

#[cfg(all(target_arch = "aarch64", feature = "kernel_neon"))]
impl CpuKernel for NeonKernel {
    #[inline(always)]
    fn mask_lower_bits(value: u64, n: u8) -> u64 {
        // Nothing on aarch64 beats the plain shift-and-mask for this op,
        // so defer to the scalar body. Other trait methods (huf_burst,
        // copy_chunk) are expected to diverge once they exist.
        <ScalarKernel as CpuKernel>::mask_lower_bits(value, n)
    }
}
175
/// aarch64 kernel for hardware with SVE, the variable-vector-length
/// extension intended here for HUF burst / SIMD copy. Its mask op is the
/// same as the NEON and Scalar kernels.
///
/// `#[allow(dead_code)]`: scaffolding, for the same reason as
/// `NeonKernel`.
#[cfg(all(target_arch = "aarch64", feature = "kernel_sve"))]
#[allow(dead_code)]
#[derive(Clone, Copy, Default)]
pub(crate) struct SveKernel;

#[cfg(all(target_arch = "aarch64", feature = "kernel_sve"))]
impl CpuKernel for SveKernel {
    #[inline(always)]
    fn mask_lower_bits(value: u64, n: u8) -> u64 {
        // No SVE-specific form of this op; reuse the scalar body.
        <ScalarKernel as CpuKernel>::mask_lower_bits(value, n)
    }
}
193
/// Single `#[target_feature(enable = "bmi2")]` wrapper around the
/// `_bzhi_u64` intrinsic. Lifted to a free function so each kernel
/// impl that needs the BMI2 path (Bmi2 / Avx2 / Vbmi2) calls the
/// same shared body. With `#[inline]` LLVM inlines the call into
/// any caller that itself has BMI2 in scope; outside that scope the
/// target_feature boundary is preserved.
///
/// Compiled whenever ANY of the three BMI2-consuming kernel features is
/// enabled, not just `kernel_bmi2`: `Avx2Kernel` and `Vbmi2Kernel` call
/// this function too, so gating it on `kernel_bmi2` alone would leave a
/// `kernel_avx2`-only or `kernel_vbmi2`-only build with an unresolved
/// name.
///
/// # Safety
///
/// The running CPU must support BMI2. Callers uphold this by selecting
/// a kernel only after its `CpuKernelTag` was resolved with BMI2
/// reported as available.
#[cfg(all(
    target_arch = "x86_64",
    any(
        feature = "kernel_bmi2",
        feature = "kernel_avx2",
        feature = "kernel_vbmi2",
    ),
))]
#[target_feature(enable = "bmi2")]
#[inline]
unsafe fn mask_lower_bits_bmi2_impl(value: u64, n: u8) -> u64 {
    // The intrinsic call is permitted directly inside a function
    // already annotated `#[target_feature(enable = "bmi2")]` — no
    // `unsafe { ... }` block needed (the function-level `unsafe`
    // already covers it). `u32::from` is the lossless widening of the
    // bit count to the intrinsic's index type.
    core::arch::x86_64::_bzhi_u64(value, u32::from(n))
}
212
213/// Pure boolean-input variant of the x86 kernel-tag selection. Both the
214/// `std` runtime-detect path and the `no_std` compile-time-cfg path
215/// route through this helper so the precedence rules stay in one place
216/// (and are unit-testable without runtime CPUID).
217///
218/// The VBMI2 tier requires every AVX-512 sub-feature it touches AND the
219/// AVX2 baseline — VBMI2 kernels mix VBMI2-only intrinsics with AVX2
220/// 256-bit moves, so the dispatch must be conditioned on `has_avx2` too.
221/// Likewise the Avx2 tier requires both AVX2 and BMI2.
222#[cfg(target_arch = "x86_64")]
223#[inline(always)]
224// Params go unused when the matching `kernel_*` feature is disabled (the
225// rung that consumes them is `#[cfg]`-ed out); they are still passed by the
226// detect callers. Silence the conditional unused-variable warning rather
227// than thread per-feature `_`-prefixes through the signature.
228#[allow(unused_variables)]
229const fn select_x86_kernel(
230 has_avx512vbmi2: bool,
231 has_avx512f: bool,
232 has_avx512vl: bool,
233 has_avx512bw: bool,
234 has_bmi2: bool,
235 has_avx2: bool,
236 has_sse2: bool,
237) -> CpuKernelTag {
238 #[cfg(feature = "kernel_vbmi2")]
239 if has_avx512vbmi2 && has_avx512f && has_avx512vl && has_avx512bw && has_bmi2 && has_avx2 {
240 return CpuKernelTag::Vbmi2;
241 }
242 #[cfg(feature = "kernel_avx2")]
243 if has_avx2 && has_bmi2 {
244 return CpuKernelTag::Avx2;
245 }
246 #[cfg(feature = "kernel_bmi2")]
247 if has_bmi2 {
248 return CpuKernelTag::Bmi2;
249 }
250 #[cfg(feature = "kernel_sse2")]
251 if has_sse2 {
252 return CpuKernelTag::Sse2;
253 }
254 CpuKernelTag::Scalar
255}
256
/// Tag naming the kernel tier that `detect_cpu_kernel` resolved (cached
/// runtime detection under `std`, compile-time `target_feature` flags
/// otherwise). The actual `CpuKernel` impl
/// (`ScalarKernel` / `Bmi2Kernel` / `Avx2Kernel` / `Vbmi2Kernel` /
/// `NeonKernel` / `SveKernel`) is constructed at the dispatch site —
/// currently only `decoding::literals_section_decoder::decompress_literals`
/// — via a `match` on this tag that branches into the per-K
/// `target_feature`-wrapped specialisation. Pipeline-wide dispatch
/// (FrameDecoder / FrameCompressor entry, sequence executor, match
/// copy) lands incrementally in follow-up tiers.
///
/// Every variant except `Scalar` exists only when its target
/// architecture and `kernel_*` feature are both enabled, so a `match`
/// over the tag must repeat the same `#[cfg]` on each arm.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub(crate) enum CpuKernelTag {
    /// Portable fallback; always present.
    Scalar,
    /// x86_64 SSE2 tier.
    #[cfg(all(target_arch = "x86_64", feature = "kernel_sse2"))]
    Sse2,
    /// x86_64 BMI2 tier (BMI2 available, Avx2 tier not selected).
    #[cfg(all(target_arch = "x86_64", feature = "kernel_bmi2"))]
    Bmi2,
    /// x86_64 AVX2 + BMI2 tier.
    #[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
    Avx2,
    /// x86_64 AVX-512 VBMI2 tier (also requires AVX2 + BMI2).
    #[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
    Vbmi2,
    /// aarch64 NEON tier.
    #[cfg(all(target_arch = "aarch64", feature = "kernel_neon"))]
    Neon,
    // Both constructors of `Sve` need a reachable feature: runtime
    // detection via `std::arch::is_aarch64_feature_detected!` (so
    // `feature = "std"`) or compile-time `target_feature = "sve"` in
    // RUSTFLAGS. Without either, the variant is unreachable and a
    // `match` arm referencing it warns as dead.
    /// aarch64 SVE tier.
    #[cfg(all(
        target_arch = "aarch64",
        feature = "kernel_sve",
        any(feature = "std", target_feature = "sve"),
    ))]
    Sve,
}
290
/// Returns the best CPU kernel tag available to the current process.
///
/// The first call runs CPU-feature detection and stores the result in a
/// process-wide `OnceLock`; later calls read the stored tag and never
/// re-run detection. Only built with `std` — no-std targets get the
/// compile-time variant below, which resolves at build time.
#[cfg(feature = "std")]
pub(crate) fn detect_cpu_kernel() -> CpuKernelTag {
    static RESOLVED_TAG: OnceLock<CpuKernelTag> = OnceLock::new();
    let tag: &CpuKernelTag = RESOLVED_TAG.get_or_init(detect_cpu_kernel_uncached);
    *tag
}
300
/// Runs the actual runtime CPU-feature probes and maps them to a
/// `CpuKernelTag`. Called once through the `OnceLock` in
/// `detect_cpu_kernel`.
///
/// * x86_64: probes are forwarded to `select_x86_kernel`, which owns the
///   tier precedence.
/// * aarch64: SVE is preferred over NEON; each probe exists only when
///   its `kernel_*` feature is enabled.
/// * anything else: `Scalar`.
#[cfg(feature = "std")]
fn detect_cpu_kernel_uncached() -> CpuKernelTag {
    #[cfg(target_arch = "x86_64")]
    {
        use std::arch::is_x86_feature_detected;
        // Gate each probe on the tiers that CONSUME it: `cfg!(...)`
        // const-folds, so the `&&` short-circuits away the runtime
        // `is_x86_feature_detected!` call (and its CPUID/cache traffic)
        // when no enabled tier reads the flag. A flag can feed several
        // rungs of `select_x86_kernel` — BMI2 is required by the Bmi2,
        // Avx2 and Vbmi2 rungs, AVX2 by the Avx2 and Vbmi2 rungs — so
        // the gate is the union of those features. Gating BMI2 on
        // `kernel_bmi2` alone would make the Avx2 / Vbmi2 rungs
        // unreachable in a build that enables them without
        // `kernel_bmi2`.
        let want_vbmi2 = cfg!(feature = "kernel_vbmi2");
        let want_avx2 = want_vbmi2 || cfg!(feature = "kernel_avx2");
        let want_bmi2 = want_avx2 || cfg!(feature = "kernel_bmi2");
        let want_sse2 = cfg!(feature = "kernel_sse2");
        return select_x86_kernel(
            want_vbmi2 && is_x86_feature_detected!("avx512vbmi2"),
            want_vbmi2 && is_x86_feature_detected!("avx512f"),
            want_vbmi2 && is_x86_feature_detected!("avx512vl"),
            want_vbmi2 && is_x86_feature_detected!("avx512bw"),
            want_bmi2 && is_x86_feature_detected!("bmi2"),
            want_avx2 && is_x86_feature_detected!("avx2"),
            want_sse2 && is_x86_feature_detected!("sse2"),
        );
    }
    #[cfg(target_arch = "aarch64")]
    {
        #[cfg(any(feature = "kernel_sve", feature = "kernel_neon"))]
        use std::arch::is_aarch64_feature_detected;
        #[cfg(feature = "kernel_sve")]
        if is_aarch64_feature_detected!("sve") {
            return CpuKernelTag::Sve;
        }
        #[cfg(feature = "kernel_neon")]
        if is_aarch64_feature_detected!("neon") {
            return CpuKernelTag::Neon;
        }
        return CpuKernelTag::Scalar;
    }
    // Reached only on architectures with no dedicated block above; on
    // x86_64 / aarch64 the blocks return first, hence the allow.
    #[allow(unreachable_code)]
    CpuKernelTag::Scalar
}
337
338/// no-std variant: rely on compile-time `target_feature` flags
339/// instead of runtime detection. Resolves to the most-capable kernel
340/// that the build target supports.
341#[cfg(not(feature = "std"))]
342pub(crate) fn detect_cpu_kernel() -> CpuKernelTag {
343 #[cfg(target_arch = "x86_64")]
344 {
345 // Route through the same const-fn precedence helper as the
346 // `feature = "std"` path. `cfg!(target_feature = ...)`
347 // returns a compile-time bool that constant-folds through
348 // `select_x86_kernel`, so the runtime call has the same
349 // codegen as the previous hand-written #[cfg] chain.
350 return select_x86_kernel(
351 cfg!(target_feature = "avx512vbmi2"),
352 cfg!(target_feature = "avx512f"),
353 cfg!(target_feature = "avx512vl"),
354 cfg!(target_feature = "avx512bw"),
355 cfg!(target_feature = "bmi2"),
356 cfg!(target_feature = "avx2"),
357 cfg!(target_feature = "sse2"),
358 );
359 }
360 #[cfg(target_arch = "aarch64")]
361 {
362 #[cfg(all(feature = "kernel_sve", target_feature = "sve"))]
363 {
364 return CpuKernelTag::Sve;
365 }
366 #[cfg(all(feature = "kernel_neon", target_feature = "neon"))]
367 {
368 return CpuKernelTag::Neon;
369 }
370 }
371 #[allow(unreachable_code)]
372 CpuKernelTag::Scalar
373}
374
375impl CpuKernelTag {
376 /// Stable lowercase diagnostic name for this tier (used by
377 /// [`active_cpu_kernel_name`] and the bench/dashboard reporting). Pure
378 /// mapping over the tag, so every arm is exercisable in tests regardless
379 /// of which tier the running CPU actually resolves to.
380 pub(crate) fn name(self) -> &'static str {
381 match self {
382 CpuKernelTag::Scalar => "scalar",
383 #[cfg(all(target_arch = "x86_64", feature = "kernel_sse2"))]
384 CpuKernelTag::Sse2 => "sse2",
385 #[cfg(all(target_arch = "x86_64", feature = "kernel_bmi2"))]
386 CpuKernelTag::Bmi2 => "bmi2",
387 #[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
388 CpuKernelTag::Avx2 => "avx2",
389 #[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
390 CpuKernelTag::Vbmi2 => "vbmi2",
391 #[cfg(all(target_arch = "aarch64", feature = "kernel_neon"))]
392 CpuKernelTag::Neon => "neon",
393 #[cfg(all(
394 target_arch = "aarch64",
395 feature = "kernel_sve",
396 any(feature = "std", target_feature = "sve"),
397 ))]
398 CpuKernelTag::Sve => "sve",
399 }
400 }
401}
402
403/// Name of the CPU kernel tier this process selected for the entropy /
404/// sequence hot paths: decode (literals + FSE sequence decode) and encode
405/// (entropy) share this dispatch (see #247). Returned as a stable lowercase
406/// string for diagnostics and benchmark/dashboard reporting; the value is
407/// what the runtime CPU-feature detection (or compile-time `target_feature`
408/// on `no_std`) actually resolves to on this machine, so a dashboard can
409/// attribute a measurement to the kernel that produced it.
410pub fn active_cpu_kernel_name() -> &'static str {
411 detect_cpu_kernel().name()
412}
413
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn scalar_mask_lower_bits_zero_n_returns_zero() {
        let masked = ScalarKernel::mask_lower_bits(0xDEADBEEF, 0);
        assert_eq!(masked, 0);
    }

    #[test]
    fn scalar_mask_lower_bits_full_64_returns_full_value() {
        let all_ones = 0xFFFF_FFFF_FFFF_FFFFu64;
        assert_eq!(ScalarKernel::mask_lower_bits(all_ones, 64), all_ones);
    }

    #[test]
    fn scalar_mask_lower_bits_mid_keeps_low_n_bits() {
        // (input, n, expected): only the low `n` bits survive.
        let cases: [(u64, u8, u64); 2] = [
            (0xDEAD_BEEF, 8, 0xEF),
            (0x0102_0304_0506_0708, 16, 0x0708),
        ];
        for (input, width, expected) in cases {
            assert_eq!(ScalarKernel::mask_lower_bits(input, width), expected);
        }
    }

    // Requires `std` AND `kernel_avx2`. Without `std` there is no runtime
    // feature detection, so the `is_x86_feature_detected!` guard could not
    // protect the `Avx2Kernel::mask_lower_bits` call and the test would
    // SIGILL on a non-BMI2 CPU. Without `kernel_avx2` the `Avx2Kernel` type
    // does not exist at all, so a `std`-only trimmed build would fail to
    // compile this test.
    #[cfg(all(target_arch = "x86_64", feature = "std", feature = "kernel_avx2"))]
    #[test]
    fn avx2_mask_lower_bits_matches_scalar_on_bmi2_hw() {
        // Dispatch would never hand out Avx2Kernel on hardware without
        // BMI2, so the comparison only makes sense when BMI2 is present.
        if std::arch::is_x86_feature_detected!("bmi2") {
            const SAMPLE: u64 = 0x1234_5678_9ABC_DEF0;
            (0..=64u8).for_each(|n| {
                assert_eq!(
                    Avx2Kernel::mask_lower_bits(SAMPLE, n),
                    ScalarKernel::mask_lower_bits(SAMPLE, n),
                    "mismatch at n={}",
                    n
                );
            });
        }
    }

    /// Regression: flags describing AVX-512 VBMI2 WITHOUT AVX2 (a
    /// combination the AMD64 baseline permits at the spec level) used to
    /// resolve to `Vbmi2`, which would SIGILL on the first AVX2-mixed
    /// VBMI2 kernel call. Selection has to fall through to Scalar (or
    /// another non-AVX tier) instead.
    #[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
    #[test]
    fn select_x86_kernel_vbmi2_without_avx2_does_not_pick_vbmi2() {
        let (avx512vbmi2, avx512f, avx512vl, avx512bw) = (true, true, true, true);
        let (bmi2, avx2, sse2) = (true, false, true);
        let tag = select_x86_kernel(avx512vbmi2, avx512f, avx512vl, avx512bw, bmi2, avx2, sse2);
        assert_ne!(
            tag,
            CpuKernelTag::Vbmi2,
            "selecting Vbmi2 without AVX2 would call AVX2 instructions and SIGILL"
        );
    }

    /// Sanity: with every flag set the selector lands on Vbmi2.
    #[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
    #[test]
    fn select_x86_kernel_full_x86_v4_picks_vbmi2() {
        let on = true;
        assert_eq!(
            select_x86_kernel(on, on, on, on, on, on, on),
            CpuKernelTag::Vbmi2
        );
    }

    /// Sanity: AVX2 + BMI2 with no AVX-512 flags → Avx2.
    #[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
    #[test]
    fn select_x86_kernel_avx2_baseline_picks_avx2() {
        let (on, off) = (true, false);
        assert_eq!(
            select_x86_kernel(off, off, off, off, on, on, on),
            CpuKernelTag::Avx2
        );
    }

    /// Only SSE2 set (no BMI2/AVX2) → Sse2, the x86_64 floor above Scalar.
    #[cfg(all(target_arch = "x86_64", feature = "kernel_sse2"))]
    #[test]
    fn select_x86_kernel_sse2_only_picks_sse2() {
        let (on, off) = (true, false);
        assert_eq!(
            select_x86_kernel(off, off, off, off, off, off, on),
            CpuKernelTag::Sse2
        );
    }

    /// Every flag cleared → Scalar (off-x86_64 / pre-SSE2 x86).
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn select_x86_kernel_no_features_picks_scalar() {
        let off = false;
        assert_eq!(
            select_x86_kernel(off, off, off, off, off, off, off),
            CpuKernelTag::Scalar
        );
    }

    #[test]
    fn detect_returns_consistent_tag() {
        let (first, second) = (detect_cpu_kernel(), detect_cpu_kernel());
        assert_eq!(
            first, second,
            "cached detect must return same tag on repeated calls"
        );
    }

    #[test]
    fn active_kernel_name_is_known_lowercase_tier() {
        // The reported name has to be one of the stable lowercase tier
        // strings the dashboard parses, and has to be whatever detection
        // resolves to on this host (no `unknown` / empty leak).
        const KNOWN: &[&str] = &["scalar", "sse2", "bmi2", "avx2", "vbmi2", "neon", "sve"];
        let name = active_cpu_kernel_name();
        let recognised = KNOWN.contains(&name);
        assert!(
            recognised,
            "active kernel name {name:?} is not a recognised tier"
        );
        let lowered = name.to_ascii_lowercase();
        assert_eq!(
            name, lowered,
            "tier name must be lowercase for stable dashboard parsing"
        );
    }

    #[test]
    fn every_kernel_tag_maps_to_its_lowercase_name() {
        // `active_cpu_kernel_name` reaches only the arm the host CPU
        // resolves to; mapping each constructible tag directly covers
        // every branch available in this build's feature set.
        assert_eq!(CpuKernelTag::Scalar.name(), "scalar");
        #[cfg(all(target_arch = "x86_64", feature = "kernel_sse2"))]
        assert_eq!(CpuKernelTag::Sse2.name(), "sse2");
        #[cfg(all(target_arch = "x86_64", feature = "kernel_bmi2"))]
        assert_eq!(CpuKernelTag::Bmi2.name(), "bmi2");
        #[cfg(all(target_arch = "x86_64", feature = "kernel_avx2"))]
        assert_eq!(CpuKernelTag::Avx2.name(), "avx2");
        #[cfg(all(target_arch = "x86_64", feature = "kernel_vbmi2"))]
        assert_eq!(CpuKernelTag::Vbmi2.name(), "vbmi2");
        #[cfg(all(target_arch = "aarch64", feature = "kernel_neon"))]
        assert_eq!(CpuKernelTag::Neon.name(), "neon");
        #[cfg(all(
            target_arch = "aarch64",
            feature = "kernel_sve",
            any(feature = "std", target_feature = "sve"),
        ))]
        assert_eq!(CpuKernelTag::Sve.name(), "sve");
    }

    #[test]
    fn active_kernel_name_is_stable_across_calls() {
        // The name comes from the cached `detect_cpu_kernel`, so two
        // calls must hand back the identical static string.
        let (first, second) = (active_cpu_kernel_name(), active_cpu_kernel_name());
        assert_eq!(first, second);
    }
}