// anno/backends/inference/binary_embeddings.rs
//! Binary embeddings for fast candidate blocking via Hamming distance.
//!
//! Research note: binary embeddings are for blocking, not primary retrieval.
//! The sign-rank limitation means they cannot represent all similarity functions.
//! Use as a pre-filter only.

// =============================================================================

/// Binary hash for fast approximate nearest neighbor search.
///
/// # Research Background
///
/// Binary embeddings enable sub-linear search via Hamming distance. Key insight
/// from our research synthesis: **binary embeddings are for blocking, not primary
/// retrieval**. The sign-rank limitation means they cannot represent all similarity
/// relationships, but they excel at fast candidate filtering.
///
/// # Two-Stage Retrieval Pattern
///
/// ```text
/// Query → [Binary Hash] → Hamming Filter (fast) → Candidates
///                                                      ↓
///                                              [Dense Similarity]
///                                                      ↓
///                                               Final Results
/// ```
///
/// # Example
///
/// ```rust
/// use anno::backends::inference::BinaryHash;
///
/// // Create hashes from embeddings
/// let hash1 = BinaryHash::from_embedding(&[0.1, -0.2, 0.3, -0.4, 0.5, -0.6, 0.7, -0.8]);
/// let hash2 = BinaryHash::from_embedding(&[0.15, -0.25, 0.35, -0.45, 0.55, -0.65, 0.75, -0.85]);
///
/// // Similar embeddings → low Hamming distance
/// assert!(hash1.hamming_distance(&hash2) < 2);
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct BinaryHash {
    /// Packed sign bits, little-endian within each word: bit `i` of the
    /// embedding lives at word `i / 64`, bit position `i % 64`.
    pub bits: Vec<u64>,
    /// Original dimension (number of meaningful bits). The trailing bits of
    /// the last word beyond `dim` are always zero.
    pub dim: usize,
}
47
48impl BinaryHash {
49    /// Create from a dense embedding using sign function.
50    ///
51    /// Each positive value → 1, each negative/zero value → 0.
52    #[must_use]
53    pub fn from_embedding(embedding: &[f32]) -> Self {
54        let dim = embedding.len();
55        let num_u64s = dim.div_ceil(64);
56        let mut bits = vec![0u64; num_u64s];
57
58        for (i, &val) in embedding.iter().enumerate() {
59            if val > 0.0 {
60                let word_idx = i / 64;
61                let bit_idx = i % 64;
62                bits[word_idx] |= 1u64 << bit_idx;
63            }
64        }
65
66        Self { bits, dim }
67    }
68
69    /// Create from a dense f64 embedding.
70    #[must_use]
71    pub fn from_embedding_f64(embedding: &[f64]) -> Self {
72        let dim = embedding.len();
73        let num_u64s = dim.div_ceil(64);
74        let mut bits = vec![0u64; num_u64s];
75
76        for (i, &val) in embedding.iter().enumerate() {
77            if val > 0.0 {
78                let word_idx = i / 64;
79                let bit_idx = i % 64;
80                bits[word_idx] |= 1u64 << bit_idx;
81            }
82        }
83
84        Self { bits, dim }
85    }
86
87    /// Compute Hamming distance (number of differing bits).
88    ///
89    /// Uses POPCNT instruction when available for hardware acceleration.
90    #[must_use]
91    pub fn hamming_distance(&self, other: &Self) -> u32 {
92        self.bits
93            .iter()
94            .zip(other.bits.iter())
95            .map(|(a, b)| (a ^ b).count_ones())
96            .sum()
97    }
98
99    /// Compute normalized Hamming distance (0.0 to 1.0).
100    #[must_use]
101    pub fn hamming_distance_normalized(&self, other: &Self) -> f64 {
102        if self.dim == 0 {
103            return 0.0;
104        }
105        self.hamming_distance(other) as f64 / self.dim as f64
106    }
107
108    /// Convert Hamming distance to approximate cosine similarity.
109    ///
110    /// Based on the relationship: cos(θ) ≈ 1 - 2 * (hamming_distance / dim)
111    /// This is an approximation valid for random hyperplane hashing.
112    #[must_use]
113    pub fn approximate_cosine(&self, other: &Self) -> f64 {
114        1.0 - 2.0 * self.hamming_distance_normalized(other)
115    }
116}
117
/// Blocker using binary embeddings for fast candidate filtering.
///
/// # Usage Pattern
///
/// 1. Pre-compute binary hashes for all entities in your KB
/// 2. At query time, hash the query embedding
/// 3. Find candidates within Hamming distance threshold
/// 4. Run dense similarity only on candidates
///
/// # Example
///
/// ```rust
/// use anno::backends::inference::{BinaryBlocker, BinaryHash};
///
/// let mut blocker = BinaryBlocker::new(8); // 8-bit Hamming threshold
///
/// // Add entities to the index
/// let hash1 = BinaryHash::from_embedding(&vec![0.1; 768]);
/// let hash2 = BinaryHash::from_embedding(&vec![-0.1; 768]);
/// blocker.add(0, hash1);
/// blocker.add(1, hash2);
///
/// // Query
/// let query = BinaryHash::from_embedding(&vec![0.1; 768]);
/// let candidates = blocker.query(&query);
/// assert!(candidates.contains(&0)); // Similar to hash1
/// ```
#[derive(Debug, Clone)]
pub struct BinaryBlocker {
    /// Hamming distance threshold: entries at distance <= threshold are candidates.
    pub threshold: u32,
    /// Flat (id, hash) index, scanned linearly at query time; IDs need not be
    /// unique or contiguous — duplicates are simply stored (and returned) twice.
    index: Vec<(usize, BinaryHash)>,
}
152
153impl BinaryBlocker {
154    /// Create a new blocker with the given threshold.
155    #[must_use]
156    pub fn new(threshold: u32) -> Self {
157        Self {
158            threshold,
159            index: Vec::new(),
160        }
161    }
162
163    /// Add an entity to the index.
164    pub fn add(&mut self, id: usize, hash: BinaryHash) {
165        self.index.push((id, hash));
166    }
167
168    /// Add multiple entities.
169    pub fn add_batch(&mut self, entries: impl IntoIterator<Item = (usize, BinaryHash)>) {
170        self.index.extend(entries);
171    }
172
173    /// Find candidate IDs within Hamming distance threshold.
174    #[must_use]
175    pub fn query(&self, query: &BinaryHash) -> Vec<usize> {
176        self.index
177            .iter()
178            .filter(|(_, hash)| hash.hamming_distance(query) <= self.threshold)
179            .map(|(id, _)| *id)
180            .collect()
181    }
182
183    /// Find candidates with their distances.
184    #[must_use]
185    pub fn query_with_distance(&self, query: &BinaryHash) -> Vec<(usize, u32)> {
186        self.index
187            .iter()
188            .map(|(id, hash)| (*id, hash.hamming_distance(query)))
189            .filter(|(_, dist)| *dist <= self.threshold)
190            .collect()
191    }
192
193    /// Number of entries in the index.
194    #[must_use]
195    pub fn len(&self) -> usize {
196        self.index.len()
197    }
198
199    /// Check if index is empty.
200    #[must_use]
201    pub fn is_empty(&self) -> bool {
202        self.index.is_empty()
203    }
204
205    /// Clear the index.
206    pub fn clear(&mut self) {
207        self.index.clear();
208    }
209}
210
211/// Recommended two-stage retrieval using binary blocking + dense reranking.
212///
213/// # Research Context
214///
215/// This implements the pattern identified in our research synthesis:
216/// - Stage 1: Binary blocking for O(n) candidate filtering
217/// - Stage 2: Dense similarity for accurate ranking
218///
219/// The key insight is that binary embeddings have fundamental limitations
220/// (sign-rank theorem) but excel at fast filtering.
221///
222/// # Arguments
223///
224/// * `query_embedding` - Dense query embedding
225/// * `candidate_embeddings` - Dense embeddings of all candidates
226/// * `binary_threshold` - Hamming distance threshold for blocking
227/// * `top_k` - Number of final results to return
228///
229/// # Returns
230///
231/// Vector of (candidate_index, similarity_score) pairs, sorted by score descending.
232#[must_use]
233pub fn two_stage_retrieval(
234    query_embedding: &[f32],
235    candidate_embeddings: &[Vec<f32>],
236    binary_threshold: u32,
237    top_k: usize,
238) -> Vec<(usize, f32)> {
239    // Stage 1: Binary blocking
240    let query_hash = BinaryHash::from_embedding(query_embedding);
241
242    let candidate_hashes: Vec<BinaryHash> = candidate_embeddings
243        .iter()
244        .map(|e| BinaryHash::from_embedding(e))
245        .collect();
246
247    let mut blocker = BinaryBlocker::new(binary_threshold);
248    for (i, hash) in candidate_hashes.into_iter().enumerate() {
249        blocker.add(i, hash);
250    }
251
252    let candidates = blocker.query(&query_hash);
253
254    // Stage 2: Dense similarity on candidates only
255    // Performance: Pre-allocate scored vec with known size
256    let mut scored: Vec<(usize, f32)> = Vec::with_capacity(candidates.len());
257    scored.extend(candidates.into_iter().map(|idx| {
258        let sim = cosine_similarity_f32(query_embedding, &candidate_embeddings[idx]);
259        (idx, sim)
260    }));
261
262    // Performance: Use unstable sort (we don't need stable sort here)
263    // Sort by similarity descending
264    scored.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
265    scored.truncate(top_k);
266    scored
267}
268
/// Compute cosine similarity between two f32 vectors.
///
/// Returns 0.0 when the slices differ in length, are empty, or either vector
/// has zero magnitude (so the result is always well-defined, never NaN from a
/// zero denominator).
#[must_use]
pub fn cosine_similarity_f32(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    // Single pass: accumulate the dot product and both squared norms together.
    // Each accumulator sums in the same element order as separate loops would,
    // so the floating-point results are unchanged.
    let mut dot = 0.0f32;
    let mut sq_a = 0.0f32;
    let mut sq_b = 0.0f32;
    for (&x, &y) in a.iter().zip(b.iter()) {
        dot += x * y;
        sq_a += x * x;
        sq_b += y * y;
    }

    let norm_a = sq_a.sqrt();
    let norm_b = sq_b.sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}