// ruvector_memopt/accel/simd.rs
use super::cpu::CpuCapabilities;
4
5pub struct SimdOptimizer {
7 caps: CpuCapabilities,
8}
9
10impl SimdOptimizer {
11 pub fn new() -> Self {
12 Self { caps: CpuCapabilities::detect() }
13 }
14
15 #[cfg(target_arch = "x86_64")]
17 pub fn euclidean_distance(&self, a: &[f32], b: &[f32]) -> f32 {
18 if a.len() != b.len() { return f32::MAX; }
19
20 if self.caps.has_avx2 && a.len() >= 8 {
21 unsafe { self.euclidean_distance_avx2(a, b) }
23 } else {
24 self.euclidean_distance_scalar(a, b)
25 }
26 }
27
28 #[cfg(not(target_arch = "x86_64"))]
29 pub fn euclidean_distance(&self, a: &[f32], b: &[f32]) -> f32 {
30 self.euclidean_distance_scalar(a, b)
31 }
32
33 fn euclidean_distance_scalar(&self, a: &[f32], b: &[f32]) -> f32 {
34 a.iter().zip(b.iter())
35 .map(|(x, y)| (x - y).powi(2))
36 .sum::<f32>()
37 .sqrt()
38 }
39
40 #[cfg(target_arch = "x86_64")]
41 #[target_feature(enable = "avx2")]
42 unsafe fn euclidean_distance_avx2(&self, a: &[f32], b: &[f32]) -> f32 {
43 use std::arch::x86_64::*;
44
45 let len = a.len();
46 let chunks = len / 8;
47 let mut sum = _mm256_setzero_ps();
48
49 for i in 0..chunks {
50 let offset = i * 8;
51 let va = _mm256_loadu_ps(a.as_ptr().add(offset));
52 let vb = _mm256_loadu_ps(b.as_ptr().add(offset));
53 let diff = _mm256_sub_ps(va, vb);
54 let sq = _mm256_mul_ps(diff, diff);
55 sum = _mm256_add_ps(sum, sq);
56 }
57
58 let high = _mm256_extractf128_ps(sum, 1);
60 let low = _mm256_castps256_ps128(sum);
61 let sum128 = _mm_add_ps(low, high);
62 let sum64 = _mm_add_ps(sum128, _mm_movehl_ps(sum128, sum128));
63 let sum32 = _mm_add_ss(sum64, _mm_shuffle_ps(sum64, sum64, 1));
64
65 let mut result = _mm_cvtss_f32(sum32);
66
67 for i in (chunks * 8)..len {
69 let diff = a[i] - b[i];
70 result += diff * diff;
71 }
72
73 result.sqrt()
74 }
75
76 pub fn dot_product(&self, a: &[f32], b: &[f32]) -> f32 {
78 if a.len() != b.len() { return 0.0; }
79
80 #[cfg(target_arch = "x86_64")]
81 if self.caps.has_avx2 && a.len() >= 8 {
82 return unsafe { self.dot_product_avx2(a, b) };
83 }
84
85 self.dot_product_scalar(a, b)
86 }
87
88 fn dot_product_scalar(&self, a: &[f32], b: &[f32]) -> f32 {
89 a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
90 }
91
92 #[cfg(target_arch = "x86_64")]
93 #[target_feature(enable = "avx2", enable = "fma")]
94 unsafe fn dot_product_avx2(&self, a: &[f32], b: &[f32]) -> f32 {
95 use std::arch::x86_64::*;
96
97 let len = a.len().min(b.len());
98 let chunks = len / 8;
99 let mut sum = _mm256_setzero_ps();
100
101 for i in 0..chunks {
102 let offset = i * 8;
103 let va = _mm256_loadu_ps(a.as_ptr().add(offset));
104 let vb = _mm256_loadu_ps(b.as_ptr().add(offset));
105 sum = _mm256_fmadd_ps(va, vb, sum);
106 }
107
108 let high = _mm256_extractf128_ps(sum, 1);
110 let low = _mm256_castps256_ps128(sum);
111 let sum128 = _mm_add_ps(low, high);
112 let sum64 = _mm_add_ps(sum128, _mm_movehl_ps(sum128, sum128));
113 let sum32 = _mm_add_ss(sum64, _mm_shuffle_ps(sum64, sum64, 1));
114
115 let mut result = _mm_cvtss_f32(sum32);
116
117 for i in (chunks * 8)..len {
118 result += a[i] * b[i];
119 }
120
121 result
122 }
123
124 pub fn batch_distances(&self, query: &[f32], vectors: &[Vec<f32>]) -> Vec<f32> {
126 vectors.iter()
127 .map(|v| self.euclidean_distance(query, v))
128 .collect()
129 }
130
131 pub fn benchmark(&self, dim: usize, iterations: usize) -> (f64, f64, f64) {
133 use std::time::Instant;
134 use std::hint::black_box;
135
136 let a: Vec<f32> = (0..dim).map(|i| i as f32 * 0.1).collect();
137 let b: Vec<f32> = (0..dim).map(|i| (dim - i) as f32 * 0.1).collect();
138
139 for _ in 0..100 {
141 let _ = black_box(self.euclidean_distance_scalar(black_box(&a), black_box(&b)));
142 }
143
144 let start = Instant::now();
146 for _ in 0..iterations {
147 let _ = black_box(self.euclidean_distance_scalar(black_box(&a), black_box(&b)));
148 }
149 let scalar_time = start.elapsed().as_secs_f64();
150
151 let start = Instant::now();
153 for _ in 0..iterations {
154 let _ = black_box(self.euclidean_distance(black_box(&a), black_box(&b)));
155 }
156 let simd_time = start.elapsed().as_secs_f64();
157
158 let speedup = if simd_time > 0.0 { scalar_time / simd_time } else { 1.0 };
160 (scalar_time, simd_time, speedup)
161 }
162
163 pub fn capabilities(&self) -> &CpuCapabilities { &self.caps }
164}
165
166impl Default for SimdOptimizer {
167 fn default() -> Self { Self::new() }
168}
169
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance used when comparing `f32` results.
    const EPS: f32 = 1e-3;

    #[test]
    fn test_euclidean_distance() {
        let opt = SimdOptimizer::new();
        let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let b = a.clone();
        assert!(opt.euclidean_distance(&a, &b).abs() < EPS);
    }

    #[test]
    fn test_euclidean_distance_with_remainder() {
        // 11 elements: one full group of 8 plus a 3-element tail, and a
        // non-zero expected distance.
        let opt = SimdOptimizer::new();
        let a: Vec<f32> = (0..11).map(|i| i as f32).collect();
        let b = vec![0.0_f32; 11];
        // 0^2 + 1^2 + ... + 10^2 = 385
        let expected = 385.0_f32.sqrt();
        assert!((opt.euclidean_distance(&a, &b) - expected).abs() < EPS);
    }

    #[test]
    fn test_dot_product() {
        let opt = SimdOptimizer::new();
        let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let b = vec![1.0_f32; 8];
        assert!((opt.dot_product(&a, &b) - 36.0).abs() < EPS);
    }

    #[test]
    fn test_dot_product_with_remainder() {
        let opt = SimdOptimizer::new();
        let a: Vec<f32> = (1..=11).map(|i| i as f32).collect();
        let b = vec![1.0_f32; 11];
        // 1 + 2 + ... + 11 = 66
        assert!((opt.dot_product(&a, &b) - 66.0).abs() < EPS);
    }

    #[test]
    fn test_dot_product_length_mismatch() {
        // Mismatched lengths are reported as 0.0 rather than panicking.
        let opt = SimdOptimizer::new();
        let a = vec![1.0_f32; 16];
        let b = vec![1.0_f32; 9];
        assert_eq!(opt.dot_product(&a, &b), 0.0);
    }

    #[test]
    fn test_batch_distances() {
        let opt = SimdOptimizer::new();
        let query = vec![0.0_f32; 8];
        let vectors = vec![vec![0.0_f32; 8], vec![1.0_f32; 8]];
        let distances = opt.batch_distances(&query, &vectors);
        assert_eq!(distances.len(), 2);
        assert!(distances[0].abs() < EPS);
        assert!((distances[1] - 8.0_f32.sqrt()).abs() < EPS);
    }
}