anno/backends/inference/binary_embeddings.rs
1//! Binary embeddings for fast candidate blocking via Hamming distance.
2//!
3//! Research note: binary embeddings are for blocking, not primary retrieval.
4//! The sign-rank limitation means they cannot represent all similarity functions.
5//! Use as a pre-filter only.
6
7// =============================================================================
8
9/// Binary hash for fast approximate nearest neighbor search.
10///
11/// # Research Background
12///
13/// Binary embeddings enable sub-linear search via Hamming distance. Key insight
14/// from our research synthesis: **binary embeddings are for blocking, not primary
15/// retrieval**. The sign-rank limitation means they cannot represent all similarity
16/// relationships, but they excel at fast candidate filtering.
17///
18/// # Two-Stage Retrieval Pattern
19///
20/// ```text
21/// Query → [Binary Hash] → Hamming Filter (fast) → Candidates
22/// ↓
23/// [Dense Similarity]
24/// ↓
25/// Final Results
26/// ```
27///
28/// # Example
29///
30/// ```rust
31/// use anno::backends::inference::BinaryHash;
32///
33/// // Create hashes from embeddings
34/// let hash1 = BinaryHash::from_embedding(&[0.1, -0.2, 0.3, -0.4, 0.5, -0.6, 0.7, -0.8]);
35/// let hash2 = BinaryHash::from_embedding(&[0.15, -0.25, 0.35, -0.45, 0.55, -0.65, 0.75, -0.85]);
36///
37/// // Similar embeddings → low Hamming distance
38/// assert!(hash1.hamming_distance(&hash2) < 2);
39/// ```
40#[derive(Debug, Clone, PartialEq, Eq, Hash)]
41pub struct BinaryHash {
42 /// Packed bits (each u64 holds 64 bits)
43 pub bits: Vec<u64>,
44 /// Original dimension (number of bits)
45 pub dim: usize,
46}
47
48impl BinaryHash {
49 /// Create from a dense embedding using sign function.
50 ///
51 /// Each positive value → 1, each negative/zero value → 0.
52 #[must_use]
53 pub fn from_embedding(embedding: &[f32]) -> Self {
54 let dim = embedding.len();
55 let num_u64s = dim.div_ceil(64);
56 let mut bits = vec![0u64; num_u64s];
57
58 for (i, &val) in embedding.iter().enumerate() {
59 if val > 0.0 {
60 let word_idx = i / 64;
61 let bit_idx = i % 64;
62 bits[word_idx] |= 1u64 << bit_idx;
63 }
64 }
65
66 Self { bits, dim }
67 }
68
69 /// Create from a dense f64 embedding.
70 #[must_use]
71 pub fn from_embedding_f64(embedding: &[f64]) -> Self {
72 let dim = embedding.len();
73 let num_u64s = dim.div_ceil(64);
74 let mut bits = vec![0u64; num_u64s];
75
76 for (i, &val) in embedding.iter().enumerate() {
77 if val > 0.0 {
78 let word_idx = i / 64;
79 let bit_idx = i % 64;
80 bits[word_idx] |= 1u64 << bit_idx;
81 }
82 }
83
84 Self { bits, dim }
85 }
86
87 /// Compute Hamming distance (number of differing bits).
88 ///
89 /// Uses POPCNT instruction when available for hardware acceleration.
90 #[must_use]
91 pub fn hamming_distance(&self, other: &Self) -> u32 {
92 self.bits
93 .iter()
94 .zip(other.bits.iter())
95 .map(|(a, b)| (a ^ b).count_ones())
96 .sum()
97 }
98
99 /// Compute normalized Hamming distance (0.0 to 1.0).
100 #[must_use]
101 pub fn hamming_distance_normalized(&self, other: &Self) -> f64 {
102 if self.dim == 0 {
103 return 0.0;
104 }
105 self.hamming_distance(other) as f64 / self.dim as f64
106 }
107
108 /// Convert Hamming distance to approximate cosine similarity.
109 ///
110 /// Based on the relationship: cos(θ) ≈ 1 - 2 * (hamming_distance / dim)
111 /// This is an approximation valid for random hyperplane hashing.
112 #[must_use]
113 pub fn approximate_cosine(&self, other: &Self) -> f64 {
114 1.0 - 2.0 * self.hamming_distance_normalized(other)
115 }
116}
117
118/// Blocker using binary embeddings for fast candidate filtering.
119///
120/// # Usage Pattern
121///
122/// 1. Pre-compute binary hashes for all entities in your KB
123/// 2. At query time, hash the query embedding
124/// 3. Find candidates within Hamming distance threshold
125/// 4. Run dense similarity only on candidates
126///
127/// # Example
128///
129/// ```rust
130/// use anno::backends::inference::{BinaryBlocker, BinaryHash};
131///
132/// let mut blocker = BinaryBlocker::new(8); // 8-bit Hamming threshold
133///
134/// // Add entities to the index
135/// let hash1 = BinaryHash::from_embedding(&vec![0.1; 768]);
136/// let hash2 = BinaryHash::from_embedding(&vec![-0.1; 768]);
137/// blocker.add(0, hash1);
138/// blocker.add(1, hash2);
139///
140/// // Query
141/// let query = BinaryHash::from_embedding(&vec![0.1; 768]);
142/// let candidates = blocker.query(&query);
143/// assert!(candidates.contains(&0)); // Similar to hash1
144/// ```
145#[derive(Debug, Clone)]
146pub struct BinaryBlocker {
147 /// Hamming distance threshold for candidates
148 pub threshold: u32,
149 /// Index of hashes by ID
150 index: Vec<(usize, BinaryHash)>,
151}
152
153impl BinaryBlocker {
154 /// Create a new blocker with the given threshold.
155 #[must_use]
156 pub fn new(threshold: u32) -> Self {
157 Self {
158 threshold,
159 index: Vec::new(),
160 }
161 }
162
163 /// Add an entity to the index.
164 pub fn add(&mut self, id: usize, hash: BinaryHash) {
165 self.index.push((id, hash));
166 }
167
168 /// Add multiple entities.
169 pub fn add_batch(&mut self, entries: impl IntoIterator<Item = (usize, BinaryHash)>) {
170 self.index.extend(entries);
171 }
172
173 /// Find candidate IDs within Hamming distance threshold.
174 #[must_use]
175 pub fn query(&self, query: &BinaryHash) -> Vec<usize> {
176 self.index
177 .iter()
178 .filter(|(_, hash)| hash.hamming_distance(query) <= self.threshold)
179 .map(|(id, _)| *id)
180 .collect()
181 }
182
183 /// Find candidates with their distances.
184 #[must_use]
185 pub fn query_with_distance(&self, query: &BinaryHash) -> Vec<(usize, u32)> {
186 self.index
187 .iter()
188 .map(|(id, hash)| (*id, hash.hamming_distance(query)))
189 .filter(|(_, dist)| *dist <= self.threshold)
190 .collect()
191 }
192
193 /// Number of entries in the index.
194 #[must_use]
195 pub fn len(&self) -> usize {
196 self.index.len()
197 }
198
199 /// Check if index is empty.
200 #[must_use]
201 pub fn is_empty(&self) -> bool {
202 self.index.is_empty()
203 }
204
205 /// Clear the index.
206 pub fn clear(&mut self) {
207 self.index.clear();
208 }
209}
210
211/// Recommended two-stage retrieval using binary blocking + dense reranking.
212///
213/// # Research Context
214///
215/// This implements the pattern identified in our research synthesis:
216/// - Stage 1: Binary blocking for O(n) candidate filtering
217/// - Stage 2: Dense similarity for accurate ranking
218///
219/// The key insight is that binary embeddings have fundamental limitations
220/// (sign-rank theorem) but excel at fast filtering.
221///
222/// # Arguments
223///
224/// * `query_embedding` - Dense query embedding
225/// * `candidate_embeddings` - Dense embeddings of all candidates
226/// * `binary_threshold` - Hamming distance threshold for blocking
227/// * `top_k` - Number of final results to return
228///
229/// # Returns
230///
231/// Vector of (candidate_index, similarity_score) pairs, sorted by score descending.
232#[must_use]
233pub fn two_stage_retrieval(
234 query_embedding: &[f32],
235 candidate_embeddings: &[Vec<f32>],
236 binary_threshold: u32,
237 top_k: usize,
238) -> Vec<(usize, f32)> {
239 // Stage 1: Binary blocking
240 let query_hash = BinaryHash::from_embedding(query_embedding);
241
242 let candidate_hashes: Vec<BinaryHash> = candidate_embeddings
243 .iter()
244 .map(|e| BinaryHash::from_embedding(e))
245 .collect();
246
247 let mut blocker = BinaryBlocker::new(binary_threshold);
248 for (i, hash) in candidate_hashes.into_iter().enumerate() {
249 blocker.add(i, hash);
250 }
251
252 let candidates = blocker.query(&query_hash);
253
254 // Stage 2: Dense similarity on candidates only
255 // Performance: Pre-allocate scored vec with known size
256 let mut scored: Vec<(usize, f32)> = Vec::with_capacity(candidates.len());
257 scored.extend(candidates.into_iter().map(|idx| {
258 let sim = cosine_similarity_f32(query_embedding, &candidate_embeddings[idx]);
259 (idx, sim)
260 }));
261
262 // Performance: Use unstable sort (we don't need stable sort here)
263 // Sort by similarity descending
264 scored.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
265 scored.truncate(top_k);
266 scored
267}
268
269/// Compute cosine similarity between two f32 vectors.
270#[must_use]
271pub fn cosine_similarity_f32(a: &[f32], b: &[f32]) -> f32 {
272 if a.len() != b.len() || a.is_empty() {
273 return 0.0;
274 }
275
276 let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
277 let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
278 let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
279
280 if norm_a == 0.0 || norm_b == 0.0 {
281 return 0.0;
282 }
283
284 dot / (norm_a * norm_b)
285}