colorthief-dataset 0.1.0

Static xkcd color-hierarchy table with pre-computed LAB used by `colorthief` for human-vocabulary color naming.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
//! Nearest-neighbor lookup against the xkcd LAB palette.
//!
//! Public entry point is [`crate::Color::nearest_to`]; this module
//! owns the actual scan over the 949-entry palette plus per-arch SIMD
//! specialisation, hidden behind a single internal [`nearest_idx`]
//! dispatcher.
//!
//! # Backends
//!
//! - [`scalar`] — always compiled, the reference implementation.
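//! - [`aarch64_neon`] — `cfg(all(target_arch = "aarch64",
//!   target_feature = "neon"))`, 4 entries/iter via 128-bit NEON.
//!   Compile-time gated; every aarch64 target except
//!   `aarch64-unknown-none-softfloat` has NEON in its default feature
//!   set.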
//! - [`x86_sse41`] — `cfg(target_arch = "x86_64")`, 4 entries/iter via
//!   128-bit SSE4.1. Runtime feature-detected (`std`-only).
//! - [`x86_avx2`] — `cfg(target_arch = "x86_64")`, 8 entries/iter via
//!   256-bit AVX2. Runtime feature-detected (`std`-only).
//! - [`x86_avx512`] — `cfg(target_arch = "x86_64")`, 16 entries/iter
//!   via 512-bit AVX-512F. Runtime feature-detected (`std`-only).
//!   Requires Rust 1.89+ for stable `_mm512_*` intrinsics; the
//!   workspace MSRV is 1.95.
//! - [`wasm_simd128`] — `cfg(all(target_arch = "wasm32",
//!   target_feature = "simd128"))`, 4 entries/iter via WASM SIMD128.
//!   Compile-time gated.
//!
//! # Dispatch
//!
//! - On aarch64 with `target_feature = "neon"` → `aarch64_neon`
//!   (compile-time).
//! - On x86_64 with `feature = "std"` → runtime detection picks the
//!   highest-tier available (`avx512f` > `avx2` > `sse4.1` > `scalar`).
//!   On `no_std` x86_64 we fall through to scalar — runtime detection
//!   needs `std`.
//! - On wasm32 with `target_feature = "simd128"` → `wasm_simd128`
//!   (compile-time).
//! - Else → scalar.
//!
//! Pattern mirrors the colconv project's `src/row/arch/` layout.
//!
//! # Bit-parity contract
//!
//! Every backend evaluates the squared distance with the same
//! associativity (`(dl² + da²) + db²`) and uses plain mul/add (no
//! FMA), so they produce bit-identical `f32` results on the same
//! inputs. The grid-parity tests in this module enforce this against
//! a representative RGB grid for every backend reachable on the
//! current target.

use crate::{
  Color,
  generated::{COLORS, LABS_A, LABS_B, LABS_C, LABS_L},
};

pub(crate) mod scalar;

/// CIEDE2000 — scalar-only on every target. See [`ciede2000`] for why
/// SIMD isn't worth pursuing here. A NEON attempt was benchmarked
/// against the scalar baseline on 2026-05-03 and regressed by ~35%
/// (115.9 µs vs 85.9 µs / query) — the transcendental-heavy formula
/// can't usefully parallelise, so we keep the scalar path.
pub(crate) mod ciede2000;

/// CIEDE2000 candidate-set LUT — gated on `feature = "lut"` (default
/// on). The LUT is pre-computed at xtask codegen time: each of the
/// 32³ cells stores the small set of palette indices that are the
/// CIEDE2000-nearest at *some* RGB inside the cell's 8×8×8 box. At
/// runtime, the cell lookup is O(1) and the candidate scan is bounded
/// by the per-cell max (10 in the current palette), which collapses
/// the per-query CIEDE2000 cost from ~71 µs (full scan over 949) to a
/// few hundred ns. Provably exact at u8 RGB resolution.
#[cfg(feature = "lut")]
pub(crate) mod ciede2000_lut;

/// CIE94 (Delta E 94) — scalar reference. The SIMD-friendly formula
/// (no `atan2` / `sin` / `cos` / `exp`; only `sqrt` + arithmetic) has
/// per-arch backends below mirroring Delta E 76.
pub(crate) mod cie94;

// `target_feature = "neon"` (not just `target_arch = "aarch64"`):
// `aarch64-unknown-none-softfloat` is a Tier-2 target with
// `target_arch = "aarch64"` but no `target_feature = "neon"`, and
// calling a `#[target_feature(enable = "neon")]` fn there is UB per
// the Rust reference. Other aarch64 targets (-linux-gnu, -apple-darwin,
// -unknown-none, etc.) all have NEON in the default feature set, so
// this gate excludes only the softfloat variant.
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
pub(crate) mod cie94_aarch64_neon;

// CIE94 x86_64 backends. The modules are declared on every x86_64
// build, but `nearest_cie94` only calls into them after the matching
// `is_x86_feature_detected!` check succeeds (which needs
// `feature = "std"`).

// CIE94 via SSE4.1 — lowest SIMD tier of the x86_64 cascade.
#[cfg(target_arch = "x86_64")]
pub(crate) mod cie94_x86_sse41;

// CIE94 via AVX2 — middle tier of the x86_64 cascade.
#[cfg(target_arch = "x86_64")]
pub(crate) mod cie94_x86_avx2;

// CIE94 via AVX-512F — top tier of the x86_64 cascade.
#[cfg(target_arch = "x86_64")]
pub(crate) mod cie94_x86_avx512;

// CIE94 via WASM SIMD128. Compile-time gated: the module only exists
// when the build enables `target_feature = "simd128"`.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
pub(crate) mod cie94_wasm_simd128;

// See the comment on `cie94_aarch64_neon` above for why we gate on
// `target_feature = "neon"` rather than just `target_arch = "aarch64"`.
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
pub(crate) mod aarch64_neon;

// Delta E 76 x86_64 backends. The modules are declared on every
// x86_64 build, but `nearest_idx` only calls into them after the
// matching `is_x86_feature_detected!` check succeeds (which needs
// `feature = "std"`).

// Delta E 76 via SSE4.1 — lowest SIMD tier of the x86_64 cascade.
#[cfg(target_arch = "x86_64")]
pub(crate) mod x86_sse41;

// Delta E 76 via AVX2 — middle tier of the x86_64 cascade.
#[cfg(target_arch = "x86_64")]
pub(crate) mod x86_avx2;

// Delta E 76 via AVX-512F — top tier of the x86_64 cascade.
#[cfg(target_arch = "x86_64")]
pub(crate) mod x86_avx512;

// Delta E 76 via WASM SIMD128. Compile-time gated: the module only
// exists when the build enables `target_feature = "simd128"`.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
pub(crate) mod wasm_simd128;

/// Delta E 76 nearest-neighbor dispatcher.
///
/// Routes `query` (a LAB triple) to the fastest backend usable on the
/// current target and returns the index into [`COLORS`] of the palette
/// entry with the smallest squared Euclidean LAB distance. The square
/// root is skipped on purpose: it is monotonic, so it cannot change
/// which entry wins.
///
/// # Backend selection
///
/// 1. aarch64 with `target_feature = "neon"` → `aarch64_neon`
///    (compile-time).
/// 2. wasm32 with `target_feature = "simd128"` → `wasm_simd128`
///    (compile-time).
/// 3. x86_64 with `feature = "std"` → runtime cascade
///    AVX-512F → AVX2 → SSE4.1.
/// 4. Everything else (including `no_std` x86_64 and x86_64 hosts
///    where no SIMD feature is detected) → `scalar`.
///
/// # Coverage cfg flags
///
/// Following colconv's coverage strategy, each flag pushes the
/// dispatcher down to a lower tier so every branch can be exercised
/// even on a host that would naturally take the top one:
///
/// - `--cfg colorthief_force_scalar` — skip all SIMD backends and call
///   the scalar reference unconditionally.
/// - `--cfg colorthief_disable_avx512` — on x86_64, never take the
///   AVX-512F tier; dispatch continues at AVX2 (or lower).
/// - `--cfg colorthief_disable_avx2` — on x86_64, never take the AVX2
///   tier; dispatch continues at SSE4.1 (or scalar when SSE4.1 is not
///   detected either). Combine with `colorthief_disable_avx512` to
///   force the SSE4.1 path.
///
/// The flags are registered under the workspace's
/// `[workspace.lints.rust] unexpected_cfgs.check-cfg`, so setting them
/// through `RUSTFLAGS` does not trigger the unexpected-cfgs lint.
///
/// # Lint allowances
///
/// - `#[allow(unsafe_code)]`: the x86 backends are `unsafe fn`
///   (required by `#[target_feature]`) and are called here behind
///   `is_x86_feature_detected!` guards. The aarch64 and WASM backends
///   expose safe wrappers and would not need it.
/// - `#[allow(unreachable_code)]`: the compile-time branches `return`
///   unconditionally, which makes the trailing scalar call unreachable
///   on those targets. The trailing call still serves x86_64 (nothing
///   detected), `no_std` x86_64, every other architecture, and
///   `colorthief_force_scalar` builds.
#[allow(unsafe_code)]
#[allow(unreachable_code)]
#[inline]
pub(crate) fn nearest_idx(query: [f32; 3]) -> usize {
  // aarch64 NEON. The gate is `target_feature = "neon"`, not merely
  // `target_arch`: `aarch64-unknown-none-softfloat` (Tier 2) is an
  // aarch64 target without NEON, and invoking a
  // `#[target_feature(enable = "neon")]` fn there would be UB per the
  // Rust reference. That target must reach the scalar fallback.
  #[cfg(all(
    target_arch = "aarch64",
    target_feature = "neon",
    not(colorthief_force_scalar)
  ))]
  {
    return aarch64_neon::nearest_idx(query);
  }

  // WASM SIMD128. The backend module itself only exists when the
  // build enables `target_feature = "simd128"`.
  #[cfg(all(
    target_arch = "wasm32",
    target_feature = "simd128",
    not(colorthief_force_scalar)
  ))]
  {
    return wasm_simd128::nearest_idx(query);
  }

  // x86_64 + std: walk the tiers from widest to narrowest and take
  // the first one the running CPU reports. `is_x86_feature_detected!`
  // caches its answer in an atomic, so the per-call cost is a single
  // relaxed load.
  #[cfg(all(target_arch = "x86_64", feature = "std", not(colorthief_force_scalar)))]
  {
    // Compile-time coverage toggles; both are `true` in normal builds.
    let avx512_allowed = !cfg!(colorthief_disable_avx512);
    let avx2_allowed = !cfg!(colorthief_disable_avx2);

    if avx512_allowed && std::is_x86_feature_detected!("avx512f") {
      // SAFETY: AVX-512F was detected on the line above, and
      // `x86_avx512::nearest_idx` is declared with
      // `#[target_feature(enable = "avx512f")]`.
      return unsafe { x86_avx512::nearest_idx(query) };
    }
    if avx2_allowed && std::is_x86_feature_detected!("avx2") {
      // SAFETY: AVX2 was detected on the line above.
      return unsafe { x86_avx2::nearest_idx(query) };
    }
    if std::is_x86_feature_detected!("sse4.1") {
      // SAFETY: SSE4.1 was detected on the line above.
      return unsafe { x86_sse41::nearest_idx(query) };
    }
  }

  // No SIMD tier applied: use the scalar reference implementation.
  scalar::nearest_idx(query)
}

/// Resolves `query` to its Delta E 76 nearest palette entry.
///
/// Thin adapter over [`nearest_idx`] that turns the winning index into
/// the corresponding [`COLORS`] entry; this is what
/// [`crate::Color::nearest_to`] calls.
#[inline]
pub(crate) fn nearest(query: [f32; 3]) -> &'static Color {
  let winner = nearest_idx(query);
  COLORS[winner]
}

/// CIEDE2000 nearest-neighbor (LUT build) — the shared dispatcher
/// behind [`crate::Color::nearest_to_ciede2000`] and
/// [`crate::Color::nearest_to_ciede2000_exact`].
///
/// This variant is compiled when `feature = "lut"` is enabled (the
/// default). It converts `rgb` to LAB and resolves the winner through
/// the candidate-set LUT in [`ciede2000_lut`], which is provably exact
/// at u8 RGB resolution and costs a few hundred ns per query. Without
/// the feature, the sibling definition runs the full-scan reference
/// [`ciede2000::nearest_idx`] instead (~71 µs/query, also provably
/// exact).
///
/// The Delta E 76 prefilter at `K = 96` is deliberately **not** a
/// production path: the exhaustive 256³ sweep in
/// `tests/parity_exhaustive.rs::parity_ciede2000_prefilter_vs_exact_256_grid`
/// found 2283 divergences from the full scan, so the prefilter cannot
/// claim strict exactness. It is kept only as a benchmark baseline.
#[cfg(feature = "lut")]
#[inline]
pub(crate) fn nearest_ciede2000(rgb: [u8; 3]) -> &'static Color {
  // The LUT is keyed by the raw RGB cell; the LAB value is what the
  // per-cell candidate scan compares against.
  let lab = crate::rgb_to_lab(rgb);
  let winner = ciede2000_lut::nearest_idx(rgb, lab);
  COLORS[winner]
}

/// CIEDE2000 nearest-neighbor (no-LUT build).
///
/// Compiled only when `feature = "lut"` is disabled. Converts `rgb` to
/// LAB and resolves the winner with the full-scan reference
/// [`ciede2000::nearest_idx`].
#[cfg(not(feature = "lut"))]
#[inline]
pub(crate) fn nearest_ciede2000(rgb: [u8; 3]) -> &'static Color {
  let lab = crate::rgb_to_lab(rgb);
  let winner = ciede2000::nearest_idx(lab);
  COLORS[winner]
}

/// CIE94 (Delta E 94) nearest-neighbor lookup.
///
/// Uses the same SIMD dispatch cascade as [`nearest_idx`] does for
/// Delta E 76 and honours the same coverage cfg flags
/// (`colorthief_force_scalar`, `colorthief_disable_avx512`,
/// `colorthief_disable_avx2`). Returns the winning [`COLORS`] entry.
#[inline]
pub(crate) fn nearest_cie94(query: [f32; 3]) -> &'static Color {
  COLORS[nearest_cie94_idx(query)]
}

/// Index-returning core of [`nearest_cie94`]: selects the CIE94
/// backend for the current target and returns the winning index into
/// [`COLORS`].
///
/// `#[allow(unsafe_code)]` covers the guarded calls into the x86
/// backends; `#[allow(unreachable_code)]` covers the trailing scalar
/// call on targets whose compile-time branch always returns.
#[allow(unsafe_code)]
#[allow(unreachable_code)]
#[inline]
fn nearest_cie94_idx(query: [f32; 3]) -> usize {
  // aarch64 NEON. Gated on `target_feature = "neon"` rather than just
  // `target_arch` so aarch64 targets built without NEON fall through
  // to the scalar reference.
  #[cfg(all(
    target_arch = "aarch64",
    target_feature = "neon",
    not(colorthief_force_scalar)
  ))]
  {
    return cie94_aarch64_neon::nearest_idx(query);
  }

  // WASM SIMD128 (compile-time gated).
  #[cfg(all(
    target_arch = "wasm32",
    target_feature = "simd128",
    not(colorthief_force_scalar)
  ))]
  {
    return cie94_wasm_simd128::nearest_idx(query);
  }

  // x86_64 runtime cascade: AVX-512F → AVX2 → SSE4.1. Requires
  // `feature = "std"` because `is_x86_feature_detected!` lives in
  // `std`; `no_std` x86_64 builds fall through to scalar.
  #[cfg(all(target_arch = "x86_64", feature = "std", not(colorthief_force_scalar)))]
  {
    // Compile-time coverage toggles; both are `true` in normal builds.
    let avx512_allowed = !cfg!(colorthief_disable_avx512);
    let avx2_allowed = !cfg!(colorthief_disable_avx2);

    if avx512_allowed && std::is_x86_feature_detected!("avx512f") {
      // SAFETY: AVX-512F was detected on the line above.
      return unsafe { cie94_x86_avx512::nearest_idx(query) };
    }
    if avx2_allowed && std::is_x86_feature_detected!("avx2") {
      // SAFETY: AVX2 was detected on the line above.
      return unsafe { cie94_x86_avx2::nearest_idx(query) };
    }
    if std::is_x86_feature_detected!("sse4.1") {
      // SAFETY: SSE4.1 was detected on the line above.
      return unsafe { cie94_x86_sse41::nearest_idx(query) };
    }
  }

  // No SIMD tier applied: use the scalar CIE94 reference.
  cie94::nearest_idx(query)
}

#[cfg(test)]
#[allow(unsafe_code)]
mod tests {
  use super::*;

  /// Iterate the standard parity grid: 17 samples per channel
  /// (`0, 16, 32, …, 240, 255`), i.e. 17³ = 4913 RGB points. Reused
  /// across every backend's parity test.
  ///
  /// Samples are spaced 16 apart; the 17th step would land on 256,
  /// which does not fit in a `u8`, so it is clamped to 255. Including
  /// that endpoint matters: without it the grid would hold only
  /// 16³ = 4096 points and never exercise a fully saturated channel
  /// (e.g. pure white).
  ///
  /// Points are produced in `r`-major, then `g`, then `b` order.
  ///
  /// Gated on `feature = "std"` to match the parity tests below — they
  /// all need `Vec` to collect mismatches, which requires `alloc` (and
  /// the test harness itself needs std).
  ///
  /// `#[allow(dead_code)]` because the helper has no consumers on
  /// targets without any of our SIMD arch matches (e.g. `s390x`,
  /// `i686`, `powerpc64`, `riscv64gc` in the miri matrix, or aarch64
  /// without `target_feature = "neon"` like `aarch64-unknown-none-softfloat`).
  /// On those targets every parity test is cfg-gated out and this
  /// function would otherwise trip `-Dwarnings`.
  #[cfg(feature = "std")]
  #[allow(dead_code)]
  fn parity_grid() -> impl Iterator<Item = [u8; 3]> {
    // One channel axis: multiples of 16 from 0 through 256, with the
    // out-of-range final step clamped to 255. The cast cannot
    // truncate because of the preceding `min`.
    fn axis() -> impl Iterator<Item = u8> {
      (0..=256u32).step_by(16).map(|v| v.min(255) as u8)
    }

    axis().flat_map(|r| axis().flat_map(move |g| axis().map(move |b| [r, g, b])))
  }

  /// The struct-of-arrays LAB tables must mirror the array-of-structs
  /// [`COLORS`] table entry for entry: same length, and `LABS_L[i]`,
  /// `LABS_A[i]`, `LABS_B[i]` equal to the three components of
  /// `COLORS[i].lab()`. Pins the xtask invariant that the SoA write
  /// order matches the const emission order.
  #[test]
  fn soa_lab_arrays_align_with_aos_colors() {
    let palette_len = COLORS.len();
    assert_eq!(LABS_L.len(), palette_len);
    assert_eq!(LABS_A.len(), palette_len);
    assert_eq!(LABS_B.len(), palette_len);

    for (i, color) in COLORS.iter().enumerate() {
      let expected = color.lab();
      assert_eq!(LABS_L[i], expected[0], "L mismatch at index {i}");
      assert_eq!(LABS_A[i], expected[1], "a mismatch at index {i}");
      assert_eq!(LABS_B[i], expected[2], "b mismatch at index {i}");
    }
  }

  /// Delta E 76 parity: aarch64 NEON backend vs. the scalar reference
  /// over the shared parity grid.
  ///
  /// Requires `feature = "std"` for `Vec` and the test harness; under
  /// `--no-default-features --features alloc` the test is skipped
  /// (the standard test runner requires std).
  #[test]
  #[cfg_attr(miri, ignore = "4913-query × 949-entry grid is too slow under miri")]
  #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "std"))]
  fn neon_and_scalar_agree_across_grid() {
    // Keep only the grid points where the two backends disagree.
    let disagreements: Vec<_> = parity_grid()
      .filter_map(|rgb| {
        let lab = crate::rgb_to_lab(rgb);
        let expected = scalar::nearest_idx(lab);
        let actual = aarch64_neon::nearest_idx(lab);
        (expected != actual).then_some((rgb, expected, actual))
      })
      .collect();

    let shown = disagreements.len().min(5);
    assert!(
      disagreements.is_empty(),
      "{} scalar/NEON mismatches across the 17³ grid; first few: {:?}",
      disagreements.len(),
      &disagreements[..shown]
    );
  }

  /// Delta E 76 parity: x86 SSE4.1 backend vs. the scalar reference
  /// over the shared parity grid. Skips (with a note on stderr) when
  /// the host running the test binary does not report SSE4.1.
  #[test]
  #[cfg_attr(miri, ignore = "4913-query × 949-entry grid is too slow under miri")]
  #[cfg(all(target_arch = "x86_64", feature = "std"))]
  fn sse41_and_scalar_agree_across_grid() {
    let supported = std::is_x86_feature_detected!("sse4.1");
    if !supported {
      eprintln!("skipping: SSE4.1 not detected on this host");
      return;
    }

    // Keep only the grid points where the two backends disagree.
    let disagreements: Vec<_> = parity_grid()
      .filter_map(|rgb| {
        let lab = crate::rgb_to_lab(rgb);
        let expected = scalar::nearest_idx(lab);
        // SAFETY: SSE4.1 support was confirmed by the guard above.
        let actual = unsafe { x86_sse41::nearest_idx(lab) };
        (expected != actual).then_some((rgb, expected, actual))
      })
      .collect();

    let shown = disagreements.len().min(5);
    assert!(
      disagreements.is_empty(),
      "{} scalar/SSE4.1 mismatches; first few: {:?}",
      disagreements.len(),
      &disagreements[..shown]
    );
  }

  /// Delta E 76 parity: x86 AVX-512F backend vs. the scalar reference
  /// over the shared parity grid. Skips (with a note on stderr) when
  /// the host does not report AVX-512F.
  #[test]
  #[cfg_attr(miri, ignore = "4913-query × 949-entry grid is too slow under miri")]
  #[cfg(all(target_arch = "x86_64", feature = "std"))]
  fn avx512_and_scalar_agree_across_grid() {
    let supported = std::is_x86_feature_detected!("avx512f");
    if !supported {
      eprintln!("skipping: AVX-512F not detected on this host");
      return;
    }

    // Keep only the grid points where the two backends disagree.
    let disagreements: Vec<_> = parity_grid()
      .filter_map(|rgb| {
        let lab = crate::rgb_to_lab(rgb);
        let expected = scalar::nearest_idx(lab);
        // SAFETY: AVX-512F support was confirmed by the guard above.
        let actual = unsafe { x86_avx512::nearest_idx(lab) };
        (expected != actual).then_some((rgb, expected, actual))
      })
      .collect();

    let shown = disagreements.len().min(5);
    assert!(
      disagreements.is_empty(),
      "{} scalar/AVX-512F mismatches; first few: {:?}",
      disagreements.len(),
      &disagreements[..shown]
    );
  }

  /// CIE94 parity: x86 AVX-512F backend vs. the scalar CIE94
  /// reference over the shared parity grid. Skips (with a note on
  /// stderr) when the host does not report AVX-512F.
  #[test]
  #[cfg_attr(miri, ignore = "4913-query × 949-entry grid is too slow under miri")]
  #[cfg(all(target_arch = "x86_64", feature = "std"))]
  fn cie94_avx512_and_scalar_agree_across_grid() {
    let supported = std::is_x86_feature_detected!("avx512f");
    if !supported {
      eprintln!("skipping: AVX-512F not detected on this host");
      return;
    }

    // Keep only the grid points where the two backends disagree.
    let disagreements: Vec<_> = parity_grid()
      .filter_map(|rgb| {
        let lab = crate::rgb_to_lab(rgb);
        let expected = cie94::nearest_idx(lab);
        // SAFETY: AVX-512F support was confirmed by the guard above.
        let actual = unsafe { cie94_x86_avx512::nearest_idx(lab) };
        (expected != actual).then_some((rgb, expected, actual))
      })
      .collect();

    let shown = disagreements.len().min(5);
    assert!(
      disagreements.is_empty(),
      "{} CIE94 scalar/AVX-512F mismatches; first few: {:?}",
      disagreements.len(),
      &disagreements[..shown]
    );
  }

  /// Delta E 76 parity: x86 AVX2 backend vs. the scalar reference over
  /// the shared parity grid. Skips (with a note on stderr) when the
  /// host does not report AVX2.
  #[test]
  #[cfg_attr(miri, ignore = "4913-query × 949-entry grid is too slow under miri")]
  #[cfg(all(target_arch = "x86_64", feature = "std"))]
  fn avx2_and_scalar_agree_across_grid() {
    let supported = std::is_x86_feature_detected!("avx2");
    if !supported {
      eprintln!("skipping: AVX2 not detected on this host");
      return;
    }

    // Keep only the grid points where the two backends disagree.
    let disagreements: Vec<_> = parity_grid()
      .filter_map(|rgb| {
        let lab = crate::rgb_to_lab(rgb);
        let expected = scalar::nearest_idx(lab);
        // SAFETY: AVX2 support was confirmed by the guard above.
        let actual = unsafe { x86_avx2::nearest_idx(lab) };
        (expected != actual).then_some((rgb, expected, actual))
      })
      .collect();

    let shown = disagreements.len().min(5);
    assert!(
      disagreements.is_empty(),
      "{} scalar/AVX2 mismatches; first few: {:?}",
      disagreements.len(),
      &disagreements[..shown]
    );
  }

  /// CIE94 parity: aarch64 NEON backend vs. the scalar CIE94 reference
  /// over the shared parity grid.
  #[test]
  #[cfg_attr(miri, ignore = "4913-query × 949-entry grid is too slow under miri")]
  #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "std"))]
  fn cie94_neon_and_scalar_agree_across_grid() {
    // Keep only the grid points where the two backends disagree.
    let disagreements: Vec<_> = parity_grid()
      .filter_map(|rgb| {
        let lab = crate::rgb_to_lab(rgb);
        let expected = cie94::nearest_idx(lab);
        let actual = cie94_aarch64_neon::nearest_idx(lab);
        (expected != actual).then_some((rgb, expected, actual))
      })
      .collect();

    let shown = disagreements.len().min(5);
    assert!(
      disagreements.is_empty(),
      "{} CIE94 scalar/NEON mismatches; first few: {:?}",
      disagreements.len(),
      &disagreements[..shown]
    );
  }

  /// CIE94 parity: x86 SSE4.1 backend vs. the scalar CIE94 reference
  /// over the shared parity grid. Skips (with a note on stderr) when
  /// the host does not report SSE4.1.
  #[test]
  #[cfg_attr(miri, ignore = "4913-query × 949-entry grid is too slow under miri")]
  #[cfg(all(target_arch = "x86_64", feature = "std"))]
  fn cie94_sse41_and_scalar_agree_across_grid() {
    let supported = std::is_x86_feature_detected!("sse4.1");
    if !supported {
      eprintln!("skipping: SSE4.1 not detected");
      return;
    }

    // Keep only the grid points where the two backends disagree.
    let disagreements: Vec<_> = parity_grid()
      .filter_map(|rgb| {
        let lab = crate::rgb_to_lab(rgb);
        let expected = cie94::nearest_idx(lab);
        // SAFETY: SSE4.1 support was confirmed by the guard above.
        let actual = unsafe { cie94_x86_sse41::nearest_idx(lab) };
        (expected != actual).then_some((rgb, expected, actual))
      })
      .collect();

    let shown = disagreements.len().min(5);
    assert!(
      disagreements.is_empty(),
      "{} CIE94 scalar/SSE4.1 mismatches; first few: {:?}",
      disagreements.len(),
      &disagreements[..shown]
    );
  }

  /// CIE94 parity: x86 AVX2 backend vs. the scalar CIE94 reference
  /// over the shared parity grid. Skips (with a note on stderr) when
  /// the host does not report AVX2.
  #[test]
  #[cfg_attr(miri, ignore = "4913-query × 949-entry grid is too slow under miri")]
  #[cfg(all(target_arch = "x86_64", feature = "std"))]
  fn cie94_avx2_and_scalar_agree_across_grid() {
    let supported = std::is_x86_feature_detected!("avx2");
    if !supported {
      eprintln!("skipping: AVX2 not detected");
      return;
    }

    // Keep only the grid points where the two backends disagree.
    let disagreements: Vec<_> = parity_grid()
      .filter_map(|rgb| {
        let lab = crate::rgb_to_lab(rgb);
        let expected = cie94::nearest_idx(lab);
        // SAFETY: AVX2 support was confirmed by the guard above.
        let actual = unsafe { cie94_x86_avx2::nearest_idx(lab) };
        (expected != actual).then_some((rgb, expected, actual))
      })
      .collect();

    let shown = disagreements.len().min(5);
    assert!(
      disagreements.is_empty(),
      "{} CIE94 scalar/AVX2 mismatches; first few: {:?}",
      disagreements.len(),
      &disagreements[..shown]
    );
  }

  /// CIE94 parity: WASM SIMD128 backend vs. the scalar CIE94 reference
  /// over the shared parity grid.
  #[test]
  #[cfg_attr(miri, ignore = "4913-query × 949-entry grid is too slow under miri")]
  #[cfg(all(target_arch = "wasm32", target_feature = "simd128", feature = "std"))]
  fn cie94_wasm_simd128_and_scalar_agree_across_grid() {
    // Keep only the grid points where the two backends disagree.
    let disagreements: Vec<_> = parity_grid()
      .filter_map(|rgb| {
        let lab = crate::rgb_to_lab(rgb);
        let expected = cie94::nearest_idx(lab);
        let actual = cie94_wasm_simd128::nearest_idx(lab);
        (expected != actual).then_some((rgb, expected, actual))
      })
      .collect();

    let shown = disagreements.len().min(5);
    assert!(
      disagreements.is_empty(),
      "{} CIE94 scalar/WASM SIMD128 mismatches; first few: {:?}",
      disagreements.len(),
      &disagreements[..shown]
    );
  }

  /// Delta E 76 parity: WASM SIMD128 backend vs. the scalar reference
  /// over the shared parity grid.
  #[test]
  #[cfg_attr(miri, ignore = "4913-query × 949-entry grid is too slow under miri")]
  #[cfg(all(target_arch = "wasm32", target_feature = "simd128", feature = "std"))]
  fn wasm_simd128_and_scalar_agree_across_grid() {
    // Keep only the grid points where the two backends disagree.
    let disagreements: Vec<_> = parity_grid()
      .filter_map(|rgb| {
        let lab = crate::rgb_to_lab(rgb);
        let expected = scalar::nearest_idx(lab);
        let actual = wasm_simd128::nearest_idx(lab);
        (expected != actual).then_some((rgb, expected, actual))
      })
      .collect();

    let shown = disagreements.len().min(5);
    assert!(
      disagreements.is_empty(),
      "{} scalar/WASM SIMD128 mismatches; first few: {:?}",
      disagreements.len(),
      &disagreements[..shown]
    );
  }
}