1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
//! SIMD-optimized hypervector operations.
//!
//! Provides platform-specific SIMD implementations for bind operations:
//! - x86/x86_64: SSE (128-bit) and AVX2 (256-bit) with runtime detection
//! - aarch64: NEON (128-bit)
//!
//! Also provides optimized Hamming distance calculation.
/// Optimized Hamming distance calculation using unrolled loop.
///
/// This implementation uses a 4x unrolled loop with independent accumulators
/// to break the serial dependency chain of popcount operations, maximizing
/// Instruction-Level Parallelism (ILP). It operates on 64-bit words to avoid
/// the overhead of 128-bit operations on many architectures.
#[inline]
pub(crate) fn hamming_distance_optimized(lhs: &[u128; 80], rhs: &[u128; 80]) -> u32 {
let distance: u32;
// SAFETY: Transmuting to u64 pointers is safe because u128 is 16-byte aligned
// and u64 is 8-byte aligned. Array size 80 * u128 is 160 * u64.
unsafe {
let lptr = lhs.as_ptr() as *const u64;
let rptr = rhs.as_ptr() as *const u64;
// Use multiple independent accumulators to break the serial dependency chain.
// This allows the CPU to utilize multiple execution ports for ILP.
let mut s0 = 0;
let mut s1 = 0;
let mut s2 = 0;
let mut s3 = 0;
// Unroll for better port utilization and pipelining
for i in (0..160).step_by(4) {
s0 += (*lptr.add(i) ^ *rptr.add(i)).count_ones();
s1 += (*lptr.add(i + 1) ^ *rptr.add(i + 1)).count_ones();
s2 += (*lptr.add(i + 2) ^ *rptr.add(i + 2)).count_ones();
s3 += (*lptr.add(i + 3) ^ *rptr.add(i + 3)).count_ones();
}
distance = (s0 + s1) + (s2 + s3);
}
distance
}
/// SSE-optimized bind (128-bit XOR).
#[cfg(all(
not(target_arch = "wasm32"),
any(target_arch = "x86_64", target_arch = "x86")
))]
#[inline]
pub(crate) fn bind_simd_x86(lhs: &[u128; 80], rhs: &[u128; 80]) -> [u128; 80] {
#[cfg(target_arch = "x86")]
use std::arch::x86::{__m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_xor_si128};
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_xor_si128};
let mut out = [0u128; 80];
for i in 0..80 {
// SAFETY: `u128` is 16-byte aligned, matching `__m128i` requirements.
// Array indexing is within bounds (0..80).
unsafe {
let a = _mm_loadu_si128((&lhs[i] as *const u128).cast::<__m128i>());
let b = _mm_loadu_si128((&rhs[i] as *const u128).cast::<__m128i>());
let x = _mm_xor_si128(a, b);
_mm_storeu_si128((&mut out[i] as *mut u128).cast::<__m128i>(), x);
}
}
out
}
/// AVX2-optimized bind (256-bit XOR, processes 2 words per instruction).
/// Uses runtime feature detection to dispatch when AVX2 is available.
#[cfg(all(not(target_arch = "wasm32"), target_arch = "x86_64"))]
#[inline]
#[target_feature(enable = "avx2")]
/// # Safety
/// This function is unsafe because it uses AVX2 intrinsics. The caller must ensure that
/// AVX2 is supported by the CPU at runtime.
pub(crate) unsafe fn bind_simd_avx2(lhs: &[u128; 80], rhs: &[u128; 80]) -> [u128; 80] {
use std::arch::x86_64::{__m256i, _mm256_loadu_si256, _mm256_storeu_si256, _mm256_xor_si256};
let mut out = [0u128; 80];
// Process pairs of u128s (32 bytes per AVX2 instruction)
for i in (0..80).step_by(2) {
// SAFETY: AVX2 requires 32-byte alignment; u128 array is 16-byte aligned.
// Using unaligned loads (_mm256_loadu_si256) handles this safely.
// Pointer arithmetic and array access are within bounds (80 elements).
unsafe {
let ptr_lhs = lhs.as_ptr().add(i) as *const __m256i;
let ptr_rhs = rhs.as_ptr().add(i) as *const __m256i;
let ptr_out = out.as_mut_ptr().add(i) as *mut __m256i;
let a = _mm256_loadu_si256(ptr_lhs);
let b = _mm256_loadu_si256(ptr_rhs);
let x = _mm256_xor_si256(a, b);
_mm256_storeu_si256(ptr_out, x);
}
}
out
}
/// ARM NEON-optimized bind (128-bit XOR).
/// Uses uint64x2_t to process each 128-bit word as two 64-bit halves.
/// NEON is always available on aarch64.
#[cfg(all(not(target_arch = "wasm32"), target_arch = "aarch64"))]
#[inline]
#[target_feature(enable = "neon")]
/// # Safety
/// This function is unsafe because it uses NEON intrinsics. The caller must ensure that
/// NEON is supported by the CPU (always true for aarch64).
pub(crate) unsafe fn bind_simd_neon(lhs: &[u128; 80], rhs: &[u128; 80]) -> [u128; 80] {
use std::arch::aarch64::{veorq_u64, vld1q_u64, vst1q_u64};
let mut out = [0u128; 80];
for i in 0..80 {
// SAFETY: u128 is 16-byte aligned; we cast to *const u64 which is correct
// for vld1q_u64. The pointer arithmetic is within bounds (80 words).
// All unsafe operations are in an explicit unsafe block as required by
// #[target_feature(enable = "neon")].
unsafe {
let lhs_ptr = lhs.as_ptr().add(i) as *const u64;
let rhs_ptr = rhs.as_ptr().add(i) as *const u64;
let out_ptr = out.as_mut_ptr().add(i) as *mut u64;
let a = vld1q_u64(lhs_ptr);
let b = vld1q_u64(rhs_ptr);
let x = veorq_u64(a, b);
vst1q_u64(out_ptr, x);
}
}
out
}