// velesdb_core/simd_explicit.rs

1//! Explicit SIMD optimizations using the `wide` crate for portable vectorization.
2//!
3//! This module provides SIMD-accelerated implementations of vector operations
4//! that explicitly use SIMD instructions rather than relying on auto-vectorization.
5//!
6//! # Performance Goals
7//!
8//! - `dot_product_simd`: Target ≥10% faster than auto-vectorized version
9//! - `cosine_similarity_simd`: Single-pass fused computation with SIMD
10//! - `euclidean_distance_simd`: Vectorized squared difference accumulation
11//!
12//! # Architecture Support
13//!
14//! The `wide` crate (v0.7+) automatically uses optimal SIMD for each platform:
15//!
16//! | Platform | SIMD Instructions | Performance |
17//! |----------|-------------------|-------------|
18//! | **`x86_64`** | AVX2/SSE4.1/SSE2 | ~41ns (768D) |
19//! | **`aarch64`** (M1/M2/RPi) | NEON | ~50ns (768D) |
20//! | **WASM** | SIMD128 | ~80ns (768D) |
21//! | **Fallback** | Scalar | ~150ns (768D) |
22//!
23//! No code changes needed - `wide` detects CPU features at runtime.
24
25use wide::f32x8;
26
27/// Computes dot product using explicit SIMD (8-wide f32 lanes).
28///
29/// # Algorithm
30///
31/// Processes 8 floats per iteration using SIMD multiply-accumulate,
32/// then reduces horizontally.
33///
34/// # Panics
35///
36/// Panics if vectors have different lengths.
37///
38/// # Example
39///
40/// ```
41/// use velesdb_core::simd_explicit::dot_product_simd;
42///
43/// let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
44/// let b = vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
45/// let result = dot_product_simd(&a, &b);
46/// assert!((result - 36.0).abs() < 1e-5);
47/// ```
48#[inline]
49#[must_use]
50pub fn dot_product_simd(a: &[f32], b: &[f32]) -> f32 {
51    assert_eq!(a.len(), b.len(), "Vector dimensions must match");
52
53    let len = a.len();
54    let simd_len = len / 8;
55    let remainder = len % 8;
56
57    let mut sum = f32x8::ZERO;
58
59    // Process 8 elements at a time using FMA (fused multiply-add)
60    // FMA provides better precision and can be faster on modern CPUs
61    for i in 0..simd_len {
62        let offset = i * 8;
63        let va = f32x8::from(&a[offset..offset + 8]);
64        let vb = f32x8::from(&b[offset..offset + 8]);
65        sum = va.mul_add(vb, sum); // FMA: sum = (va * vb) + sum
66    }
67
68    // Horizontal sum of SIMD lanes
69    let mut result = sum.reduce_add();
70
71    // Handle remainder
72    let base = simd_len * 8;
73    for i in 0..remainder {
74        result += a[base + i] * b[base + i];
75    }
76
77    result
78}
79
80/// Computes euclidean distance using explicit SIMD.
81///
82/// # Algorithm
83///
84/// Computes `sqrt(sum((a[i] - b[i])²))` using SIMD for the squared differences.
85///
86/// # Panics
87///
88/// Panics if vectors have different lengths.
89#[inline]
90#[must_use]
91pub fn euclidean_distance_simd(a: &[f32], b: &[f32]) -> f32 {
92    squared_l2_distance_simd(a, b).sqrt()
93}
94
95/// Computes squared L2 distance using explicit SIMD.
96///
97/// Avoids the sqrt for comparison purposes (faster when only ranking matters).
98///
99/// # Panics
100///
101/// Panics if vectors have different lengths.
102#[inline]
103#[must_use]
104pub fn squared_l2_distance_simd(a: &[f32], b: &[f32]) -> f32 {
105    assert_eq!(a.len(), b.len(), "Vector dimensions must match");
106
107    let len = a.len();
108    let simd_len = len / 8;
109    let remainder = len % 8;
110
111    let mut sum = f32x8::ZERO;
112
113    for i in 0..simd_len {
114        let offset = i * 8;
115        let va = f32x8::from(&a[offset..offset + 8]);
116        let vb = f32x8::from(&b[offset..offset + 8]);
117        let diff = va - vb;
118        sum = diff.mul_add(diff, sum); // FMA: sum = (diff * diff) + sum
119    }
120
121    let mut result = sum.reduce_add();
122
123    let base = simd_len * 8;
124    for i in 0..remainder {
125        let diff = a[base + i] - b[base + i];
126        result += diff * diff;
127    }
128
129    result
130}
131
132/// Computes cosine similarity using explicit SIMD with fused dot+norms.
133///
134/// # Algorithm
135///
136/// Single-pass computation of dot(a,b), norm(a)², norm(b)² using SIMD,
137/// then: `dot / (sqrt(norm_a) * sqrt(norm_b))`
138///
139/// # Panics
140///
141/// Panics if vectors have different lengths.
142#[inline]
143#[must_use]
144#[allow(clippy::similar_names)]
145pub fn cosine_similarity_simd(a: &[f32], b: &[f32]) -> f32 {
146    assert_eq!(a.len(), b.len(), "Vector dimensions must match");
147
148    let len = a.len();
149    let simd_len = len / 8;
150    let remainder = len % 8;
151
152    let mut dot_sum = f32x8::ZERO;
153    let mut norm_a_sum = f32x8::ZERO;
154    let mut norm_b_sum = f32x8::ZERO;
155
156    // FMA for all three accumulations - better precision and potentially faster
157    for i in 0..simd_len {
158        let offset = i * 8;
159        let va = f32x8::from(&a[offset..offset + 8]);
160        let vb = f32x8::from(&b[offset..offset + 8]);
161
162        dot_sum = va.mul_add(vb, dot_sum);
163        norm_a_sum = va.mul_add(va, norm_a_sum);
164        norm_b_sum = vb.mul_add(vb, norm_b_sum);
165    }
166
167    let mut dot = dot_sum.reduce_add();
168    let mut norm_a_sq = norm_a_sum.reduce_add();
169    let mut norm_b_sq = norm_b_sum.reduce_add();
170
171    // Handle remainder
172    let base = simd_len * 8;
173    for i in 0..remainder {
174        let ai = a[base + i];
175        let bi = b[base + i];
176        dot += ai * bi;
177        norm_a_sq += ai * ai;
178        norm_b_sq += bi * bi;
179    }
180
181    let norm_a = norm_a_sq.sqrt();
182    let norm_b = norm_b_sq.sqrt();
183
184    if norm_a == 0.0 || norm_b == 0.0 {
185        return 0.0;
186    }
187
188    dot / (norm_a * norm_b)
189}
190
191/// Computes the L2 norm (magnitude) of a vector using SIMD.
192#[inline]
193#[must_use]
194pub fn norm_simd(v: &[f32]) -> f32 {
195    let len = v.len();
196    let simd_len = len / 8;
197    let remainder = len % 8;
198
199    let mut sum = f32x8::ZERO;
200
201    for i in 0..simd_len {
202        let offset = i * 8;
203        let vv = f32x8::from(&v[offset..offset + 8]);
204        sum = vv.mul_add(vv, sum); // FMA: sum = (vv * vv) + sum
205    }
206
207    let mut result = sum.reduce_add();
208
209    let base = simd_len * 8;
210    for i in 0..remainder {
211        result += v[base + i] * v[base + i];
212    }
213
214    result.sqrt()
215}
216
/// Computes Hamming distance for f32 binary vectors.
///
/// Values > 0.5 are treated as 1, else 0. Counts differing positions.
///
/// For packed binary data, use `hamming_distance_binary` which is ~50x faster.
///
/// # Panics
///
/// Panics if vectors have different lengths.
#[inline]
#[must_use]
pub fn hamming_distance_simd(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len(), "Vector dimensions must match");

    // Threshold each component to a bit and count positions that disagree.
    // The zip-based loop is bounds-check free and vectorizes well.
    let differing = a
        .iter()
        .zip(b.iter())
        .filter(|&(&x, &y)| (x > 0.5) != (y > 0.5))
        .count();

    #[allow(clippy::cast_precision_loss)]
    {
        differing as f32
    }
}
264
/// Computes Hamming distance for packed binary vectors (u64 chunks).
///
/// Uses POPCNT for massive speedup on binary data. Each u64 contains 64 bits.
/// This is ~50x faster than f32-based Hamming for large binary vectors.
///
/// # Arguments
///
/// * `a` - First packed binary vector
/// * `b` - Second packed binary vector
///
/// # Returns
///
/// Number of differing bits.
///
/// # Panics
///
/// Panics if vectors have different lengths.
#[inline]
#[must_use]
pub fn hamming_distance_binary(a: &[u64], b: &[u64]) -> u32 {
    assert_eq!(a.len(), b.len(), "Vector dimensions must match");

    // XOR leaves exactly the differing bits set; count_ones is POPCNT.
    let mut total = 0u32;
    for (&x, &y) in a.iter().zip(b.iter()) {
        total += (x ^ y).count_ones();
    }
    total
}
293
/// Computes Hamming distance for packed binary vectors with 8-wide unrolling.
///
/// Optimized version with explicit 8-wide loop unrolling for maximum throughput.
/// Use this for large vectors (>= 64 u64 elements).
///
/// # Panics
///
/// Panics if vectors have different lengths.
#[inline]
#[must_use]
pub fn hamming_distance_binary_fast(a: &[u64], b: &[u64]) -> u32 {
    assert_eq!(a.len(), b.len(), "Vector dimensions must match");

    let mut lhs_chunks = a.chunks_exact(8);
    let mut rhs_chunks = b.chunks_exact(8);

    // Four independent accumulators let the CPU overlap POPCNT dependency
    // chains (instruction-level parallelism).
    let mut acc = [0u32; 4];

    for (lhs, rhs) in (&mut lhs_chunks).zip(&mut rhs_chunks) {
        for lane in 0..8 {
            acc[lane % 4] += (lhs[lane] ^ rhs[lane]).count_ones();
        }
    }

    // Tail (< 8 elements) goes into a single accumulator.
    for (&x, &y) in lhs_chunks.remainder().iter().zip(rhs_chunks.remainder()) {
        acc[0] += (x ^ y).count_ones();
    }

    acc.iter().sum()
}
337
/// Computes Jaccard similarity for f32 binary vectors.
///
/// Values > 0.5 are treated as set members. Returns intersection/union.
///
/// # Panics
///
/// Panics if vectors have different lengths.
#[inline]
#[must_use]
pub fn jaccard_similarity_simd(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len(), "Vector dimensions must match");

    let mut intersection = 0u32;
    let mut union = 0u32;

    // Single pass: threshold both components and tally set membership.
    for (&x, &y) in a.iter().zip(b.iter()) {
        let in_a = x > 0.5;
        let in_b = y > 0.5;
        intersection += u32::from(in_a && in_b);
        union += u32::from(in_a || in_b);
    }

    // Two empty sets are defined as identical.
    if union == 0 {
        return 1.0;
    }

    #[allow(clippy::cast_precision_loss)]
    {
        intersection as f32 / union as f32
    }
}
386
387/// Normalizes a vector in-place using SIMD.
388#[inline]
389pub fn normalize_inplace_simd(v: &mut [f32]) {
390    let norm = norm_simd(v);
391
392    if norm == 0.0 {
393        return;
394    }
395
396    let inv_norm = 1.0 / norm;
397    let inv_norm_simd = f32x8::splat(inv_norm);
398
399    let len = v.len();
400    let simd_len = len / 8;
401    let remainder = len % 8;
402
403    for i in 0..simd_len {
404        let offset = i * 8;
405        let vv = f32x8::from(&v[offset..offset + 8]);
406        let normalized = vv * inv_norm_simd;
407        let arr: [f32; 8] = normalized.into();
408        v[offset..offset + 8].copy_from_slice(&arr);
409    }
410
411    let base = simd_len * 8;
412    for i in 0..remainder {
413        v[base + i] *= inv_norm;
414    }
415}