Skip to main content

hamming_bitwise_fast/
array.rs

1//! Fixed-size array APIs for bitwise Hamming distance.
2//!
3//! Use this module when the vector size is known at compile time (e.g., 1024-bit
4//! embeddings stored as `[u8; 128]`). This is faster than the equivalent
5//! [`slice`](crate::slice) API.
6
7// ============================================================================
8// PERFORMANCE INVARIANT: AVX-512 Gather Avoidance (load-bearing without LTO)
9// ============================================================================
10//
11// `batch()` iterates over `&[[u8; N]]` — contiguous memory. With AVX-512
12// target features available, LLVM can transform that loop to use VPGATHERQQ
13// gather instructions, which are 2–10x slower than contiguous VMOVDQU64 loads
14// (each element fetched separately, cache locality destroyed, no prefetcher
15// win). The asm! barrier on `target` below forces LLVM to use the simple
16// load-xor-popcount form instead.
17//
18// Whether the barrier matters depends on LTO:
19//
20//   - With LTO + multiversion (the recommended config): LLVM inlines across
21//     the multiversion dispatch boundary, sees `N` is a compile-time constant,
22//     unrolls the inner loop, and never emits gathers in the first place.
23//     The barrier is a verified no-op here — assembly is identical with and
24//     without it.
25//
26//   - Without LTO + multiversion: each multiversion specialization is a
27//     separate translation unit. LLVM can't see N, falls back to outer-loop
28//     vectorization, and emits VPGATHERQQ across iterations. Measured: 112
29//     such instructions per benchmark binary, ~4x slower than the barriered
30//     form on Zen 5. The barrier is the difference between fast and slow here.
31//
32// The barrier is kept unconditionally as defense for users who don't enable
33// LTO (and as insurance against future LLVM versions changing the heuristic
34// under LTO too). It has no measurable cost under LTO.
35//
36// Why not `black_box`? Both prevent the gather, but `black_box` compiles to
37// a stack store + reload (~5-cycle store-forwarding penalty per iteration).
38// Under LTO + AVX-512 that penalty is ~7x slower than the asm! barrier
39// (gather_demo: black_box = 2.85µs vs asm_barrier = 410ns at 64B).
40//
41// On non-x86 (ARM etc.), no barrier is needed — gather instructions don't
42// exist on those architectures, and `opaque_ptr` is a plain identity.
43//
44// Verify: inspect AVX-512 assembly under CARGO_PROFILE_BENCH_LTO=false for
45//         absence of VPGATHERQQ in the asm-barriered batch loop.
46// Proof:  benches/batch_input_type.rs `gather_demo` (no_barrier / black_box /
47//         asm_barrier A/B/C comparison).
48// ============================================================================
49
50/// Make a pointer opaque to LLVM's stride analysis without store-forwarding.
51///
52/// On x86, uses `asm!` with `nomem` + `nostack` — the pointer stays in a
53/// register but LLVM treats it as a new, unknown value (preventing the
54/// outer-loop gather vectorization LLVM otherwise picks under no-LTO
55/// multiversion builds). On non-x86, returns the pointer unchanged since
56/// gather instructions don't exist on those architectures.
57///
58/// # Safety
59///
60/// The pointer must be valid. The asm block is a no-op (empty template),
61/// so the returned pointer is identical to the input.
62#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
63#[inline(always)]
64#[allow(clippy::pointers_in_nomem_asm_block)]
65unsafe fn opaque_ptr<T>(mut ptr: *const T) -> *const T {
66    core::arch::asm!("/* {0} */", inout(reg) ptr, options(nomem, nostack, preserves_flags));
67    ptr
68}
69
70// ============================================================================
71// Public API
72// ============================================================================
73
74/// Compute the bitwise Hamming distance between two fixed-size byte arrays.
75///
76/// This is the recommended API when the vector size is known at compile time.
77/// It is faster than [`slice::distance`](crate::slice::distance).
78///
79/// # Example
80///
81/// ```
82/// use hamming_bitwise_fast::array;
83///
84/// let a: [u8; 128] = [0x12; 128];  // 1024-bit
85/// let b: [u8; 128] = [0xFE; 128];
86/// let distance = array::distance(&a, &b);
87/// ```
88#[cfg_attr(
89    all(
90        feature = "multiversion_x86",
91        any(target_arch = "x86", target_arch = "x86_64")
92    ),
93    multiversion::multiversion(targets(
94        "x86_64+avx512vpopcntdq+avx512vl+popcnt",
95        "x86_64+avx512bw+avx512vl+popcnt",
96        "x86_64+avx2+popcnt",
97        "x86_64+sse4.2+popcnt",
98        "x86+avx2+popcnt",
99        "x86+sse4.2+popcnt",
100    ))
101)]
102#[inline(always)]
103pub fn distance<const N: usize>(a: &[u8; N], b: &[u8; N]) -> u32 {
104    crate::distance_impl(a, b)
105}
106
107/// Compute Hamming distance from one source to many targets (one-to-many).
108///
109/// Faster than calling [`distance`] in a loop for one-to-many comparisons.
110///
111/// # Panics
112///
113/// Panics if `out.len() != targets.len()`.
114///
115/// # Example
116///
117/// ```
118/// use hamming_bitwise_fast::array;
119///
120/// let source: [u8; 128] = [0; 128];
121/// let targets = vec![[1u8; 128], [2u8; 128], [3u8; 128]];
122/// let mut distances = vec![0u32; 3];  // pre-allocate and reuse
123///
124/// array::batch(&source, &targets, &mut distances);
125/// ```
126#[cfg_attr(
127    all(
128        feature = "multiversion_x86",
129        any(target_arch = "x86", target_arch = "x86_64")
130    ),
131    multiversion::multiversion(targets(
132        "x86_64+avx512vpopcntdq+avx512vl+popcnt",
133        "x86_64+avx512bw+avx512vl+popcnt",
134        "x86_64+avx2+popcnt",
135        "x86_64+sse4.2+popcnt",
136        "x86+avx2+popcnt",
137        "x86+sse4.2+popcnt",
138    ))
139)]
140#[inline(always)]
141pub fn batch<const N: usize>(source: &[u8; N], targets: &[[u8; N]], out: &mut [u32]) {
142    assert_eq!(targets.len(), out.len());
143
144    for (target, dist) in targets.iter().zip(out.iter_mut()) {
145        // Gather avoidance for no-LTO multiversion builds. With LTO this line
146        // is a verified no-op; without LTO it prevents LLVM from emitting
147        // VPGATHERQQ across iterations (~4x slowdown). See the module-level
148        // PERFORMANCE INVARIANT block.
149        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
150        let target = unsafe { &*opaque_ptr(target as *const [u8; N]) };
151        *dist = crate::distance_impl(source, target);
152    }
153}