1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
//! NEON SIMD implementations for ARM64 (EPIC-054 US-001).
//!
//! This module provides NEON-optimized distance calculations for aarch64 targets.
//! Performance is comparable to x86_64 AVX2 (≥95% parity).
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;
/// NEON-optimized dot product for `f32` slices.
///
/// Accumulates four lanes per iteration with `vfmaq_f32` (fused multiply-add),
/// reduces the lanes with `vaddvq_f32`, and finally adds the up to three
/// trailing elements with scalar arithmetic. Returns `0.0` for empty input.
///
/// # Panics
/// Panics if `a` and `b` differ in length. The check runs in every build
/// profile because the vector loop reads both slices through raw pointers at
/// offsets derived from `a.len()`; a shorter `b` would otherwise be read out
/// of bounds.
///
/// # Safety
/// Requires aarch64 target with NEON support.
///
/// # Performance
/// - Uses vfmaq_f32 (fused multiply-add)
/// - Processes 4 elements per iteration
/// - ~3-4x faster than scalar on M1/M2 (figure carried over from the original
///   notes; not re-measured here)
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline]
#[must_use]
pub unsafe fn dot_product_neon(a: &[f32], b: &[f32]) -> f32 {
    // Not a debug assertion: the raw-pointer loads below are only in bounds
    // for `b` when both slices have the same length.
    assert_eq!(
        a.len(),
        b.len(),
        "dot_product_neon: slice lengths must match"
    );
    let len = a.len();
    if len == 0 {
        return 0.0;
    }
    let chunks = len / 4;

    // Main SIMD loop: one 4-lane accumulator.
    let mut sum = vdupq_n_f32(0.0);
    for i in 0..chunks {
        let offset = i * 4;
        // SAFETY: NEON loads require in-bounds pointers.
        // - Condition 1: `offset + 4 <= chunks * 4 <= len`, so four elements
        //   starting at `offset` lie inside `a`.
        // - Condition 2: `b.len() == a.len()` (asserted at function entry in
        //   all build profiles), so the same range lies inside `b`.
        let va = vld1q_f32(a.as_ptr().add(offset));
        let vb = vld1q_f32(b.as_ptr().add(offset));
        sum = vfmaq_f32(sum, va, vb); // sum += va * vb
    }

    // Horizontal sum of the SIMD register.
    let mut result = vaddvq_f32(sum);

    // Scalar tail for lengths not divisible by 4. The products are summed
    // first and then added to `result`, so rounding matches a single
    // `result += p0 + p1 + p2` expression.
    let base = chunks * 4;
    match len % 4 {
        3 => {
            result += a[base] * b[base] + a[base + 1] * b[base + 1] + a[base + 2] * b[base + 2];
        }
        2 => {
            result += a[base] * b[base] + a[base + 1] * b[base + 1];
        }
        1 => {
            result += a[base] * b[base];
        }
        _ => {}
    }
    result
}
/// NEON-optimized squared Euclidean distance for `f32` slices.
///
/// Computes the sum of `(a[i] - b[i])^2`. Four lanes are processed per
/// iteration (`vsubq_f32` followed by `vfmaq_f32`), the lanes are reduced
/// with `vaddvq_f32`, and the up to three trailing elements are handled with
/// scalar arithmetic. Returns `0.0` for empty input.
///
/// # Panics
/// Panics if `a` and `b` differ in length. The check runs in every build
/// profile because the vector loop reads both slices through raw pointers at
/// offsets derived from `a.len()`; a shorter `b` would otherwise be read out
/// of bounds.
///
/// # Safety
/// Requires aarch64 target with NEON support.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline]
#[must_use]
pub unsafe fn euclidean_squared_neon(a: &[f32], b: &[f32]) -> f32 {
    // Not a debug assertion: the raw-pointer loads below are only in bounds
    // for `b` when both slices have the same length.
    assert_eq!(
        a.len(),
        b.len(),
        "euclidean_squared_neon: slice lengths must match"
    );
    let len = a.len();
    if len == 0 {
        return 0.0;
    }
    let chunks = len / 4;

    let mut sum = vdupq_n_f32(0.0);
    for i in 0..chunks {
        let offset = i * 4;
        // SAFETY: NEON loads require in-bounds pointers.
        // - Condition 1: `offset + 4 <= chunks * 4 <= len`, so four elements
        //   starting at `offset` lie inside `a`.
        // - Condition 2: `b.len() == a.len()` (asserted at function entry in
        //   all build profiles), so the same range lies inside `b`.
        let va = vld1q_f32(a.as_ptr().add(offset));
        let vb = vld1q_f32(b.as_ptr().add(offset));
        let diff = vsubq_f32(va, vb);
        sum = vfmaq_f32(sum, diff, diff); // sum += diff * diff
    }

    let mut result = vaddvq_f32(sum);

    // Scalar tail for lengths not divisible by 4. The squares are summed
    // first and then added to `result`, keeping the rounding order of a
    // single `result += d0 * d0 + d1 * d1 + d2 * d2` expression.
    let base = chunks * 4;
    match len % 4 {
        3 => {
            let d0 = a[base] - b[base];
            let d1 = a[base + 1] - b[base + 1];
            let d2 = a[base + 2] - b[base + 2];
            result += d0 * d0 + d1 * d1 + d2 * d2;
        }
        2 => {
            let d0 = a[base] - b[base];
            let d1 = a[base + 1] - b[base + 1];
            result += d0 * d0 + d1 * d1;
        }
        1 => {
            let d = a[base] - b[base];
            result += d * d;
        }
        _ => {}
    }
    result
}
/// Euclidean (L2) distance between two `f32` slices, NEON-accelerated.
///
/// This is the square root of the value produced by
/// [`euclidean_squared_neon`]; all slice traversal, including any length
/// validation, happens in that function.
///
/// # Safety
/// Requires aarch64 target with NEON support.
/// `a` and `b` are expected to have equal length; this function performs no
/// check of its own and inherits that requirement from
/// [`euclidean_squared_neon`].
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline]
#[must_use]
pub unsafe fn euclidean_neon(a: &[f32], b: &[f32]) -> f32 {
    f32::sqrt(euclidean_squared_neon(a, b))
}
/// Cosine similarity of two `f32` slices, built on the NEON dot product.
///
/// Evaluates `dot(a, b) / (|a| * |b|)`, where each norm is the square root of
/// the slice's dot product with itself. If either norm is exactly `0.0` the
/// function returns `0.0` instead of dividing by zero.
///
/// # Safety
/// Requires aarch64 target with NEON support.
/// `a` and `b` are expected to have equal length; this function performs no
/// check of its own and inherits that requirement from [`dot_product_neon`].
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline]
#[must_use]
pub unsafe fn cosine_neon(a: &[f32], b: &[f32]) -> f32 {
    let numerator = dot_product_neon(a, b);
    let magnitude_a = dot_product_neon(a, a).sqrt();
    let magnitude_b = dot_product_neon(b, b).sqrt();

    // A zero-magnitude vector has no direction: report 0.0 rather than
    // producing NaN/inf from a division by zero.
    if magnitude_a == 0.0 || magnitude_b == 0.0 {
        return 0.0;
    }
    numerator / (magnitude_a * magnitude_b)
}
/// NEON-optimized cosine similarity for pre-normalized vectors.
///
/// Returns `dot_product_neon(a, b)` unchanged. For unit-length inputs the
/// dot product equals the cosine similarity, so no norms are computed.
/// Normalization is not verified here: for inputs that are not unit length
/// the returned value is simply their dot product.
///
/// # Safety
/// Requires aarch64 target with NEON support.
/// Vectors must be pre-normalized to unit length for the result to be a
/// cosine similarity.
/// `a` and `b` are expected to have equal length; this function performs no
/// check of its own and inherits that requirement from `dot_product_neon`.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline]
#[must_use]
pub unsafe fn cosine_normalized_neon(a: &[f32], b: &[f32]) -> f32 {
// For normalized vectors, cosine = dot product (both norms are 1, so the
// usual division is a no-op and is skipped).
dot_product_neon(a, b)
}
// =============================================================================
// Wrapper functions for dispatch (safe API)
// =============================================================================
/// Safe wrapper for dot product NEON.
///
/// Validates the slice lengths and then delegates to [`dot_product_neon`].
///
/// # Panics
/// Panics if `a` and `b` differ in length, in every build profile.
#[cfg(target_arch = "aarch64")]
#[inline]
#[must_use]
pub fn dot_product_neon_safe(a: &[f32], b: &[f32]) -> f32 {
    // Enforce the kernel's equal-length precondition here, unconditionally,
    // so that this safe function can never lead to out-of-bounds vector loads.
    assert_eq!(
        a.len(),
        b.len(),
        "dot_product_neon_safe: slice lengths must match"
    );
    // SAFETY: Calling `dot_product_neon` requires NEON target support and
    // equal-length slices.
    // - Condition 1: This function is compiled only for `target_arch = "aarch64"`.
    // - Condition 2: AArch64 guarantees NEON availability.
    // - Condition 3: Slice lengths were asserted equal above.
    unsafe { dot_product_neon(a, b) }
}
/// Safe wrapper for euclidean NEON.
///
/// Validates the slice lengths and then delegates to [`euclidean_neon`].
///
/// # Panics
/// Panics if `a` and `b` differ in length, in every build profile.
#[cfg(target_arch = "aarch64")]
#[inline]
#[must_use]
pub fn euclidean_neon_safe(a: &[f32], b: &[f32]) -> f32 {
    // Enforce the kernel's equal-length precondition here, unconditionally,
    // so that this safe function can never lead to out-of-bounds vector loads.
    assert_eq!(
        a.len(),
        b.len(),
        "euclidean_neon_safe: slice lengths must match"
    );
    // SAFETY: Calling `euclidean_neon` requires NEON target support and
    // equal-length slices.
    // - Condition 1: This function is compiled only for `target_arch = "aarch64"`.
    // - Condition 2: AArch64 guarantees NEON availability.
    // - Condition 3: Slice lengths were asserted equal above.
    unsafe { euclidean_neon(a, b) }
}
/// Safe wrapper for cosine NEON.
///
/// Validates the slice lengths and then delegates to [`cosine_neon`].
///
/// # Panics
/// Panics if `a` and `b` differ in length, in every build profile.
#[cfg(target_arch = "aarch64")]
#[inline]
#[must_use]
pub fn cosine_neon_safe(a: &[f32], b: &[f32]) -> f32 {
    // Enforce the kernel's equal-length precondition here, unconditionally,
    // so that this safe function can never lead to out-of-bounds vector loads.
    assert_eq!(
        a.len(),
        b.len(),
        "cosine_neon_safe: slice lengths must match"
    );
    // SAFETY: Calling `cosine_neon` requires NEON target support and
    // equal-length slices.
    // - Condition 1: This function is compiled only for `target_arch = "aarch64"`.
    // - Condition 2: AArch64 guarantees NEON availability.
    // - Condition 3: Slice lengths were asserted equal above.
    unsafe { cosine_neon(a, b) }
}
/// Safe wrapper for cosine normalized NEON.
///
/// Validates the slice lengths and then delegates to
/// [`cosine_normalized_neon`]. Whether the inputs are actually unit length is
/// not checked; for other inputs the result is their plain dot product.
///
/// # Panics
/// Panics if `a` and `b` differ in length, in every build profile.
#[cfg(target_arch = "aarch64")]
#[inline]
#[must_use]
pub fn cosine_normalized_neon_safe(a: &[f32], b: &[f32]) -> f32 {
    // Enforce the kernel's equal-length precondition here, unconditionally,
    // so that this safe function can never lead to out-of-bounds vector loads.
    assert_eq!(
        a.len(),
        b.len(),
        "cosine_normalized_neon_safe: slice lengths must match"
    );
    // SAFETY: Calling `cosine_normalized_neon` requires NEON target support
    // and equal-length slices.
    // - Condition 1: This function is compiled only for `target_arch = "aarch64"`.
    // - Condition 2: AArch64 guarantees NEON availability.
    // - Condition 3: Slice lengths were asserted equal above.
    unsafe { cosine_normalized_neon(a, b) }
}
// =============================================================================
// Tests
// =============================================================================
/// Unit tests for the NEON kernels, exercised through the safe wrappers.
///
/// Expected values are chosen to be exactly representable in `f32` wherever
/// possible so the assertions do not depend on accumulation order.
#[cfg(all(test, target_arch = "aarch64"))]
mod tests {
    use super::*;

    #[test]
    fn test_dot_product_neon_basic() {
        let a = vec![1.0f32, 2.0, 3.0, 4.0];
        let b = vec![1.0f32, 1.0, 1.0, 1.0];
        let result = dot_product_neon_safe(&a, &b);
        assert!((result - 10.0).abs() < 1e-5);
    }

    #[test]
    fn test_dot_product_neon_empty() {
        let a: Vec<f32> = vec![];
        let b: Vec<f32> = vec![];
        let result = dot_product_neon_safe(&a, &b);
        assert!((result - 0.0).abs() < 1e-5);
    }

    #[test]
    fn test_dot_product_neon_non_aligned() {
        // 7 elements - not divisible by 4
        let a = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
        let b = vec![1.0f32, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
        let result = dot_product_neon_safe(&a, &b);
        assert!((result - 28.0).abs() < 1e-5);
    }

    #[test]
    fn test_dot_product_neon_all_remainders() {
        // Lengths 1..=9 hit every `len % 4` case, both with and without a
        // preceding full SIMD chunk. a = [1..=n], b = [2; n] => n * (n + 1).
        for n in 1usize..=9 {
            let a: Vec<f32> = (1..=n).map(|v| v as f32).collect();
            let b = vec![2.0f32; n];
            let result = dot_product_neon_safe(&a, &b);
            let expected = (n * (n + 1)) as f32;
            assert!(
                (result - expected).abs() < 1e-5,
                "len {}: got {}, expected {}",
                n,
                result,
                expected
            );
        }
    }

    #[test]
    fn test_euclidean_neon_basic() {
        let a = vec![0.0f32, 0.0, 0.0, 0.0];
        let b = vec![3.0f32, 4.0, 0.0, 0.0];
        let result = euclidean_neon_safe(&a, &b);
        assert!((result - 5.0).abs() < 1e-5);
    }

    #[test]
    fn test_euclidean_neon_empty() {
        let a: Vec<f32> = vec![];
        let b: Vec<f32> = vec![];
        let result = euclidean_neon_safe(&a, &b);
        assert!(result.abs() < 1e-5);
    }

    #[test]
    fn test_euclidean_neon_remainders() {
        // The non-zero differences sit entirely in the scalar tail
        // (1, 2 and 3 trailing elements respectively).
        let cases: [(&[f32], f32); 3] = [
            (&[0.0, 0.0, 0.0, 0.0, 5.0], 5.0),
            (&[0.0, 0.0, 0.0, 0.0, 3.0, 4.0], 5.0),
            (&[0.0, 0.0, 0.0, 0.0, 2.0, 3.0, 6.0], 7.0),
        ];
        for &(b, expected) in cases.iter() {
            let a = vec![0.0f32; b.len()];
            let result = euclidean_neon_safe(&a, b);
            assert!(
                (result - expected).abs() < 1e-5,
                "len {}: got {}, expected {}",
                b.len(),
                result,
                expected
            );
        }
    }

    #[test]
    fn test_cosine_neon_identical() {
        let a = vec![1.0f32, 2.0, 3.0, 4.0];
        let result = cosine_neon_safe(&a, &a);
        assert!((result - 1.0).abs() < 1e-5);
    }

    #[test]
    fn test_cosine_neon_orthogonal() {
        let a = vec![1.0f32, 0.0, 0.0, 0.0];
        let b = vec![0.0f32, 1.0, 0.0, 0.0];
        let result = cosine_neon_safe(&a, &b);
        assert!(result.abs() < 1e-5);
    }

    #[test]
    fn test_cosine_neon_opposite() {
        let a = vec![1.0f32, 2.0, 3.0, 4.0];
        let b = vec![-1.0f32, -2.0, -3.0, -4.0];
        let result = cosine_neon_safe(&a, &b);
        assert!((result + 1.0).abs() < 1e-5);
    }

    #[test]
    fn test_cosine_neon_zero_vector() {
        // A zero norm takes the early-out branch instead of dividing by zero.
        let a = vec![0.0f32; 5];
        let b = vec![1.0f32, 2.0, 3.0, 4.0, 5.0];
        assert_eq!(cosine_neon_safe(&a, &b), 0.0);
        assert_eq!(cosine_neon_safe(&b, &a), 0.0);
    }

    #[test]
    fn test_cosine_normalized_neon_unit_vectors() {
        // Both inputs are unit length: |(0.6, 0.8)| = 1.
        let a = vec![1.0f32, 0.0, 0.0, 0.0];
        let b = vec![0.6f32, 0.8, 0.0, 0.0];
        let result = cosine_normalized_neon_safe(&a, &b);
        assert!((result - 0.6).abs() < 1e-5);
    }

    #[test]
    fn test_dot_product_neon_768d() {
        // Test with typical embedding dimension
        let a: Vec<f32> = (0..768).map(|i| (i as f32) * 0.001).collect();
        let b: Vec<f32> = (0..768).map(|i| (i as f32) * 0.002).collect();
        let neon_result = dot_product_neon_safe(&a, &b);
        // Compare with scalar
        let scalar_result: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
        assert!(
            (neon_result - scalar_result).abs() < 1e-3,
            "NEON: {}, Scalar: {}",
            neon_result,
            scalar_result
        );
    }
}