embeddenator_vsa/vsa.rs
//! Vector Symbolic Architecture (VSA) Implementation
//!
//! Sparse ternary vector representation with algebraic operations:
//! - Bundle (⊕): pairwise conflict-cancel superposition (see `bundle_sum_many` for an
//!   order-independent variant)
//! - Bind (⊙): element-wise multiplicative composition (self-inverse)
//! - Cosine similarity for retrieval
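//!
//! A minimal usage sketch (doc-test style, assuming the crate is exposed as
//! `embeddenator`, as in the examples below):
//!
//! ```
//! use embeddenator::SparseVec;
//!
//! let a = SparseVec::random();
//! let b = SparseVec::random();
//!
//! // Superposition stays similar to both inputs.
//! let sum = a.bundle(&b);
//! assert!(a.cosine(&sum) > 0.0);
//! assert!(b.cosine(&sum) > 0.0);
//!
//! // Binding restricts support to indices shared by both operands.
//! let bound = a.bind(&b);
//! assert!(bound.pos.len() + bound.neg.len() <= a.pos.len() + a.neg.len());
//! ```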

use rand::seq::SliceRandom;
use rand::SeedableRng;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};

#[cfg(feature = "bt-phase-2")]
use crate::ternary_vec::PackedTritVec;

#[cfg(feature = "bt-phase-2")]
use std::cell::RefCell;

/// Dimension of VSA vectors
pub const DIM: usize = 10000;

#[cfg(feature = "bt-phase-2")]
thread_local! {
    // Reused packed buffers for hot paths. Using TLS keeps this allocation
    // amortized while remaining safe in multi-threaded contexts.
    static PACKED_SCRATCH_A: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
    static PACKED_SCRATCH_B: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
    static PACKED_SCRATCH_OUT: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
}

/// Configuration for reversible VSA encoding/decoding operations
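///
/// A minimal construction sketch (the field values below are illustrative, not tuned
/// recommendations):
///
/// ```
/// use embeddenator::ReversibleVSAConfig;
///
/// let cfg = ReversibleVSAConfig {
///     block_size: 128,      // bytes per encoded block
///     max_path_depth: 8,    // caps the path-derived permutation shift
///     base_shift: 750,      // permutation stride between blocks
///     target_sparsity: 150, // non-zero budget enforced via thinning
/// };
/// assert_eq!(cfg.block_size, 128);
/// ```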
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ReversibleVSAConfig {
    /// Block size for chunked encoding (must be power of 2 for efficiency)
    pub block_size: usize,
    /// Maximum path depth for hierarchical encoding
    pub max_path_depth: usize,
    /// Base permutation shift for path-based encoding
    pub base_shift: usize,
    /// Target sparsity level for operations (number of non-zero elements)
    pub target_sparsity: usize,
}

impl Default for ReversibleVSAConfig {
    fn default() -> Self {
        ReversibleVSAConfig {
            block_size: 256, // 256-byte blocks
            max_path_depth: 10,
            base_shift: 1000,
            target_sparsity: 200, // Default sparsity level
        }
    }
}

impl ReversibleVSAConfig {
    /// Create config optimized for small data blocks
    pub fn small_blocks() -> Self {
        ReversibleVSAConfig {
            block_size: 64,
            max_path_depth: 5,
            base_shift: 500,
            target_sparsity: 100,
        }
    }

    /// Create config optimized for large data blocks
    pub fn large_blocks() -> Self {
        ReversibleVSAConfig {
            block_size: 1024,
            max_path_depth: 20,
            base_shift: 2000,
            target_sparsity: 400,
        }
    }
}

/// Sparse ternary vector with positive and negative indices
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SparseVec {
    /// Indices with +1 value
    pub pos: Vec<usize>,
    /// Indices with -1 value
    pub neg: Vec<usize>,
}

impl Default for SparseVec {
    fn default() -> Self {
        Self::new()
    }
}

impl SparseVec {
    #[inline]
    fn nnz(&self) -> usize {
        self.pos.len() + self.neg.len()
    }

    fn intersection_count_sorted(a: &[usize], b: &[usize]) -> usize {
        let mut i = 0usize;
        let mut j = 0usize;
        let mut count = 0usize;
        while i < a.len() && j < b.len() {
            match a[i].cmp(&b[j]) {
                std::cmp::Ordering::Less => i += 1,
                std::cmp::Ordering::Greater => j += 1,
                std::cmp::Ordering::Equal => {
                    count += 1;
                    i += 1;
                    j += 1;
                }
            }
        }
        count
    }

    fn union_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
        let mut out = Vec::with_capacity(a.len() + b.len());
        let mut i = 0usize;
        let mut j = 0usize;

        while i < a.len() && j < b.len() {
            match a[i].cmp(&b[j]) {
                std::cmp::Ordering::Less => {
                    out.push(a[i]);
                    i += 1;
                }
                std::cmp::Ordering::Greater => {
                    out.push(b[j]);
                    j += 1;
                }
                std::cmp::Ordering::Equal => {
                    out.push(a[i]);
                    i += 1;
                    j += 1;
                }
            }
        }

        if i < a.len() {
            out.extend_from_slice(&a[i..]);
        }
        if j < b.len() {
            out.extend_from_slice(&b[j..]);
        }

        out
    }

    fn difference_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
        let mut out = Vec::with_capacity(a.len());
        let mut i = 0usize;
        let mut j = 0usize;

        while i < a.len() && j < b.len() {
            match a[i].cmp(&b[j]) {
                std::cmp::Ordering::Less => {
                    out.push(a[i]);
                    i += 1;
                }
                std::cmp::Ordering::Greater => j += 1,
                std::cmp::Ordering::Equal => {
                    i += 1;
                    j += 1;
                }
            }
        }

        if i < a.len() {
            out.extend_from_slice(&a[i..]);
        }

        out
    }

    /// Create an empty sparse vector
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator::SparseVec;
    ///
    /// let vec = SparseVec::new();
    /// assert!(vec.pos.is_empty());
    /// assert!(vec.neg.is_empty());
    /// ```
    pub fn new() -> Self {
        SparseVec {
            pos: Vec::new(),
            neg: Vec::new(),
        }
    }

    /// Generate a random sparse vector with ~1% of indices set to +1 and ~1% set to -1
    /// (about 2% of dimensions are non-zero overall)
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator::SparseVec;
    ///
    /// let vec = SparseVec::random();
    /// // Approximately 100 positive and 100 negative entries out of DIM = 10000
    /// assert!(vec.pos.len() > 0);
    /// assert!(vec.neg.len() > 0);
    /// ```
    pub fn random() -> Self {
        let mut rng = rand::thread_rng();
        let sparsity = DIM / 100; // 1% of DIM per polarity

        // Generate random indices without replacement
        let mut indices: Vec<usize> = (0..DIM).collect();
        indices.shuffle(&mut rng);

        let mut pos: Vec<_> = indices[..sparsity].to_vec();
        let mut neg: Vec<_> = indices[sparsity..sparsity * 2].to_vec();

        pos.sort_unstable();
        neg.sort_unstable();

        SparseVec { pos, neg }
    }

    /// Encode data into a reversible sparse vector using block-based mapping
    ///
    /// This method implements hierarchical encoding with path-based permutations
    /// for lossless data recovery. The encoding process:
    /// 1. Splits data into blocks of configurable size
    /// 2. Applies path-based permutations to each block
    /// 3. Combines blocks using hierarchical bundling
    ///
    /// # Arguments
    /// * `data` - The data to encode
    /// * `config` - Configuration for encoding parameters
    /// * `path` - Optional path string for hierarchical encoding (affects permutation)
    ///
    /// # Returns
    /// A SparseVec that can be decoded back to the original data
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator::{SparseVec, ReversibleVSAConfig};
    ///
    /// let data = b"hello world";
    /// let config = ReversibleVSAConfig::default();
    /// let encoded = SparseVec::encode_data(data, &config, None);
    ///
    /// // encoded vector contains reversible representation of the data
    /// assert!(!encoded.pos.is_empty() || !encoded.neg.is_empty());
    /// ```
    pub fn encode_data(data: &[u8], config: &ReversibleVSAConfig, path: Option<&str>) -> Self {
        if data.is_empty() {
            return SparseVec::new();
        }

        // Calculate path-based shift for hierarchical encoding
        let path_shift = if let Some(path_str) = path {
            // Use path hash to determine shift, constrained to prevent overflow
            let mut hasher = Sha256::new();
            hasher.update(path_str.as_bytes());
            let hash = hasher.finalize();
            // SAFETY: SHA256 always produces 32 bytes, first 4 bytes are always valid
            let hash_bytes: [u8; 4] = hash[0..4]
                .try_into()
                .expect("SHA256 hash is always at least 4 bytes");
            let path_hash = u32::from_le_bytes(hash_bytes) as usize;
            (path_hash % config.max_path_depth) * config.base_shift
        } else {
            0
        };
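
        // With the default config (max_path_depth = 10, base_shift = 1000), path_shift is
        // therefore one of {0, 1000, ..., 9000}; distinct paths tend to land their blocks in
        // different index regions before the modulo-DIM wraparound.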

        // Split data into blocks
        let mut blocks = Vec::new();
        for chunk in data.chunks(config.block_size) {
            blocks.push(chunk);
        }

        // Encode each block with position-based permutation
        let mut encoded_blocks = Vec::new();
        for (i, block) in blocks.iter().enumerate() {
            let block_shift = path_shift + (i * config.base_shift / blocks.len().max(1));
            let block_vec = Self::encode_block(block, block_shift);
            encoded_blocks.push(block_vec);
        }

        // Combine blocks hierarchically
        if encoded_blocks.is_empty() {
            SparseVec::new()
        } else if encoded_blocks.len() == 1 {
            // SAFETY: we just checked len() == 1, so next() must return Some
            encoded_blocks
                .into_iter()
                .next()
                .expect("encoded_blocks has exactly one element")
        } else {
            // Hierarchical bundling (currently a left-to-right fold over the block vectors)
            Self::hierarchical_bundle(&encoded_blocks)
        }
    }

    /// Decode data from a reversible sparse vector
    ///
    /// Reverses the encoding process to recover the original data.
    /// Requires the same configuration and path used during encoding.
    ///
    /// # Arguments
    /// * `config` - Same configuration used for encoding
    /// * `path` - Same path string used for encoding
    /// * `expected_size` - Expected size of the decoded data (for validation)
    ///
    /// # Returns
    /// The original data bytes (may need correction layer for 100% fidelity)
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use embeddenator::{SparseVec, ReversibleVSAConfig};
    ///
    /// let data = b"hello world";
    /// let config = ReversibleVSAConfig::default();
    /// let encoded = SparseVec::encode_data(data, &config, None);
    /// let decoded = encoded.decode_data(&config, None, data.len());
    ///
    /// // Note: For 100% fidelity, use CorrectionStore with EmbrFS
    /// // Raw decode may have minor differences that corrections compensate for
    /// ```
    pub fn decode_data(
        &self,
        config: &ReversibleVSAConfig,
        path: Option<&str>,
        expected_size: usize,
    ) -> Vec<u8> {
        if self.pos.is_empty() && self.neg.is_empty() {
            return Vec::new();
        }

        if expected_size == 0 {
            return Vec::new();
        }

        // Calculate path-based shift (same as encoding)
        let path_shift = if let Some(path_str) = path {
            let mut hasher = Sha256::new();
            hasher.update(path_str.as_bytes());
            let hash = hasher.finalize();
            // SAFETY: SHA256 always produces 32 bytes, first 4 bytes are always valid
            let hash_bytes: [u8; 4] = hash[0..4]
                .try_into()
                .expect("SHA256 hash is always at least 4 bytes");
            let path_hash = u32::from_le_bytes(hash_bytes) as usize;
            (path_hash % config.max_path_depth) * config.base_shift
        } else {
            0
        };

        // Estimate number of blocks based on expected size
        let estimated_blocks = (expected_size + config.block_size - 1) / config.block_size;

        // For single block case
        if estimated_blocks <= 1 {
            return Self::decode_block(self, path_shift, expected_size);
        }

        // For multiple blocks, we need to factorize the hierarchical bundle.
        // This is a simplified approach - in practice, we'd need more sophisticated
        // factorization to separate the blocks.
        let mut result = Vec::new();

        // For now, attempt to decode as much as possible.
        // This is a placeholder for the full hierarchical decoding.
        for i in 0..estimated_blocks {
            let block_shift = path_shift + (i * config.base_shift / estimated_blocks.max(1));
            let remaining = expected_size.saturating_sub(result.len());
            if remaining == 0 {
                break;
            }
            let max_len = remaining.min(config.block_size);
            let block_data = Self::decode_block(self, block_shift, max_len);
            if block_data.is_empty() {
                break;
            }
            result.extend(block_data);
            if result.len() >= expected_size {
                break;
            }
        }

        // Truncate to expected size
        result.truncate(expected_size);
        result
    }

    /// Encode a single block of data with position-based permutation
    fn encode_block(data: &[u8], shift: usize) -> SparseVec {
        if data.is_empty() {
            return SparseVec::new();
        }

        // Map data bytes to vector indices using the permuted mapping
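        // Illustrative mapping (assuming shift = 0): byte 0x41 at offset i = 3 has its high
        // bit clear, so it contributes +1 at index (3 + 0x41) % DIM = 68; byte 0xC1 at the
        // same offset has the high bit set and contributes -1 at (3 + (0xC1 & 0x7F)) % DIM = 68.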
        let mut pos = Vec::new();
        let mut neg = Vec::new();

        for (i, &byte) in data.iter().enumerate() {
            let base_idx = (i + shift) % DIM;

            // Use byte value to determine polarity and offset
            if byte & 0x80 != 0 {
                // High bit set -> negative
                neg.push((base_idx + (byte & 0x7F) as usize) % DIM);
            } else {
                // High bit clear -> positive
                pos.push((base_idx + byte as usize) % DIM);
            }
        }

        pos.sort_unstable();
        pos.dedup();
        neg.sort_unstable();
        neg.dedup();

        SparseVec { pos, neg }
    }

    /// Decode a single block of data
    fn decode_block(encoded: &SparseVec, shift: usize, max_len: usize) -> Vec<u8> {
        if max_len == 0 {
            return Vec::new();
        }

        let mut result = Vec::with_capacity(max_len);

        // Reconstruct data by reversing the permutation.
        // Note: `pos` and `neg` are kept sorted, so membership can be checked via binary search.
        for i in 0..max_len {
            let base_idx = (i + shift) % DIM;

            // Look for indices that map back to this position
            let mut found_byte = None;
            for offset in 0..128u8 {
                let test_idx = (base_idx + offset as usize) % DIM;

                if encoded.pos.binary_search(&test_idx).is_ok() {
                    found_byte = Some(offset);
                    break;
                } else if encoded.neg.binary_search(&test_idx).is_ok() {
                    found_byte = Some(offset | 0x80);
                    break;
                }
            }

            if let Some(byte) = found_byte {
                result.push(byte);
            } else {
                // No more data found
                break;
            }
        }

        result
    }

    /// Combine multiple vectors by bundling them together (left-to-right fold)
    fn hierarchical_bundle(vectors: &[SparseVec]) -> SparseVec {
        if vectors.is_empty() {
            return SparseVec::new();
        }
        if vectors.len() == 1 {
            return vectors[0].clone();
        }

        // Sequential left fold; each step applies the pairwise conflict-cancel bundle.
        let mut result = vectors[0].clone();
        for vec in &vectors[1..] {
            result = result.bundle(vec);
        }
        result
    }

    /// Generate a deterministic sparse vector from data using SHA256 seed
    /// DEPRECATED: Use encode_data() for new code
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator::SparseVec;
    ///
    /// let data = b"hello world";
    /// let vec1 = SparseVec::from_data(data);
    /// let vec2 = SparseVec::from_data(data);
    ///
    /// // Same input produces same vector (deterministic)
    /// assert_eq!(vec1.pos, vec2.pos);
    /// assert_eq!(vec1.neg, vec2.neg);
    /// ```
    #[deprecated(since = "0.2.0", note = "Use encode_data() for reversible encoding")]
    pub fn from_data(data: &[u8]) -> Self {
        let mut hasher = Sha256::new();
        hasher.update(data);
        let hash = hasher.finalize();

        // SAFETY: SHA256 always produces exactly 32 bytes
        let seed: [u8; 32] = hash[..32]
            .try_into()
            .expect("SHA256 output is always 32 bytes");
        let mut rng = rand::rngs::StdRng::from_seed(seed);

        let mut indices: Vec<usize> = (0..DIM).collect();
        indices.shuffle(&mut rng);

        let sparsity = DIM / 100;
        let mut pos = indices[..sparsity].to_vec();
        let mut neg = indices[sparsity..sparsity * 2].to_vec();

        pos.sort_unstable();
        neg.sort_unstable();

        SparseVec { pos, neg }
    }

    /// Bundle operation with optional thinning: pairwise conflict-cancel superposition (A ⊕ B)
    ///
    /// See `bundle()` for the merge semantics. When a config is provided and the result
    /// exceeds `config.target_sparsity`, the result is thinned via `thin()`.
    ///
    /// # Arguments
    /// * `other` - The vector to bundle with self
    /// * `config` - Optional ReversibleVSAConfig for controlling sparsity via thinning
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator::{SparseVec, ReversibleVSAConfig};
    ///
    /// let vec1 = SparseVec::from_data(b"data1");
    /// let vec2 = SparseVec::from_data(b"data2");
    /// let config = ReversibleVSAConfig::default();
    /// let bundled = vec1.bundle_with_config(&vec2, Some(&config));
    ///
    /// // Bundled vector contains superposition of both inputs
    /// // Should be similar to both original vectors
    /// let sim1 = vec1.cosine(&bundled);
    /// let sim2 = vec2.cosine(&bundled);
    /// assert!(sim1 > 0.3);
    /// assert!(sim2 > 0.3);
    /// ```
    pub fn bundle_with_config(
        &self,
        other: &SparseVec,
        config: Option<&ReversibleVSAConfig>,
    ) -> SparseVec {
        let mut result = self.bundle(other);

        // Apply thinning if config provided and result exceeds target sparsity
        if let Some(cfg) = config {
            let current_count = result.pos.len() + result.neg.len();
            if current_count > cfg.target_sparsity {
                result = result.thin(cfg.target_sparsity);
            }
        }

        result
    }

    /// Bundle operation: pairwise conflict-cancel superposition (A ⊕ B)
    ///
    /// This is a fast, commutative merge for two vectors:
    /// - same sign => keep
    /// - opposite signs => cancel to 0
    /// - sign vs 0 => keep sign
    ///
    /// Note: While this is well-defined for two vectors, repeated application across 3+
    /// vectors is generally **not associative** because early cancellation/thresholding can
    /// discard multiplicity information. Use `bundle_sum_many()` when order independence
    /// matters.
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator::SparseVec;
    ///
    /// let vec1 = SparseVec::from_data(b"data1");
    /// let vec2 = SparseVec::from_data(b"data2");
    /// let bundled = vec1.bundle(&vec2);
    ///
    /// // Bundled vector contains superposition of both inputs
    /// // Should be similar to both original vectors
    /// let sim1 = vec1.cosine(&bundled);
    /// let sim2 = vec2.cosine(&bundled);
    /// assert!(sim1 > 0.3);
    /// assert!(sim2 > 0.3);
    /// ```
    pub fn bundle(&self, other: &SparseVec) -> SparseVec {
        // Optional ternary-native fast path (migration gate).
        // This is primarily intended for cases where vectors become dense enough
        // that packed word-wise operations are competitive.
        #[cfg(feature = "bt-phase-2")]
        {
            // Converting SparseVec -> PackedTritVec is O(DIM) for allocation/zeroing, plus O(nnz)
            // for setting lanes. Only attempt the packed path when total density is high enough,
            // and avoid converting an extremely sparse operand.
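            // With DIM = 10000 these gates require total_nnz > 2500 and min(nnz_a, nnz_b) > 312
            // before the packed conversion is attempted.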
            let a_nnz = self.nnz();
            let b_nnz = other.nnz();
            let total = a_nnz + b_nnz;
            if total > DIM / 4 {
                let min_nnz = a_nnz.min(b_nnz);
                if min_nnz > DIM / 32 {
                    return PACKED_SCRATCH_A.with(|a_cell| {
                        PACKED_SCRATCH_B.with(|b_cell| {
                            PACKED_SCRATCH_OUT.with(|out_cell| {
                                let mut a = a_cell.borrow_mut();
                                let mut b = b_cell.borrow_mut();
                                let mut out = out_cell.borrow_mut();

                                a.fill_from_sparsevec(self, DIM);
                                b.fill_from_sparsevec(other, DIM);
                                a.bundle_into(&b, &mut out);
                                out.to_sparsevec()
                            })
                        })
                    });
                }
            }
        }

        // Majority voting for two sparse ternary vectors:
        // - Same sign => keep
        // - Opposite signs => cancel to 0
        // - Sign vs 0 => keep sign
        // This can be expressed via sorted set differences/unions.
        let pos_a = Self::difference_sorted(&self.pos, &other.neg);
        let pos_b = Self::difference_sorted(&other.pos, &self.neg);
        let neg_a = Self::difference_sorted(&self.neg, &other.pos);
        let neg_b = Self::difference_sorted(&other.neg, &self.pos);

        let pos = Self::union_sorted(&pos_a, &pos_b);
        let neg = Self::union_sorted(&neg_a, &neg_b);

        SparseVec { pos, neg }
    }

    /// Associative bundle over many vectors: sums contributions per index, then thresholds to sign.
    /// This is order-independent because all contributions are accumulated before applying sign.
    /// Complexity: O(K log K) where K is total non-zero entries across inputs.
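    ///
    /// # Examples
    ///
    /// A minimal sketch showing order independence (the inputs here are random vectors, so
    /// the exact contents vary run to run):
    ///
    /// ```
    /// use embeddenator::SparseVec;
    ///
    /// let a = SparseVec::random();
    /// let b = SparseVec::random();
    /// let c = SparseVec::random();
    ///
    /// let abc = SparseVec::bundle_sum_many([&a, &b, &c]);
    /// let cba = SparseVec::bundle_sum_many([&c, &b, &a]);
    /// assert_eq!(abc.pos, cba.pos);
    /// assert_eq!(abc.neg, cba.neg);
    /// ```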
    pub fn bundle_sum_many<'a, I>(vectors: I) -> SparseVec
    where
        I: IntoIterator<Item = &'a SparseVec>,
    {
        let mut contributions: Vec<(usize, i32)> = Vec::new();

        for vec in vectors {
            contributions.extend(vec.pos.iter().map(|&idx| (idx, 1i32)));
            contributions.extend(vec.neg.iter().map(|&idx| (idx, -1i32)));
        }

        if contributions.is_empty() {
            return SparseVec {
                pos: Vec::new(),
                neg: Vec::new(),
            };
        }

        contributions.sort_unstable_by_key(|(idx, _)| *idx);

        let mut pos = Vec::new();
        let mut neg = Vec::new();

        let mut iter = contributions.into_iter();
        // SAFETY: we checked contributions.is_empty() above and returned early if empty
        let (mut current_idx, mut acc) = iter
            .next()
            .expect("contributions is non-empty after early return check");

        for (idx, value) in iter {
            if idx == current_idx {
                acc += value;
            } else {
                if acc > 0 {
                    pos.push(current_idx);
                } else if acc < 0 {
                    neg.push(current_idx);
                }
                current_idx = idx;
                acc = value;
            }
        }

        if acc > 0 {
            pos.push(current_idx);
        } else if acc < 0 {
            neg.push(current_idx);
        }

        SparseVec { pos, neg }
    }

    /// Hybrid bundle: choose a fast pairwise fold for very sparse regimes (to preserve sparsity),
    /// otherwise use the associative sum-then-threshold path (order-independent, more faithful to majority).
    ///
    /// Heuristic: estimate the expected overlap/collision count assuming uniform hashing into `DIM`.
    /// If the expected number of colliding dimensions is below a small budget, use pairwise
    /// `bundle`; else use `bundle_sum_many`.
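    ///
    /// # Examples
    ///
    /// A minimal sketch; which internal path is taken is a heuristic detail, so only broad
    /// properties of the result are checked here:
    ///
    /// ```
    /// use embeddenator::SparseVec;
    ///
    /// let a = SparseVec::random();
    /// let b = SparseVec::random();
    /// let c = SparseVec::random();
    ///
    /// let bundled = SparseVec::bundle_hybrid_many([&a, &b, &c]);
    /// assert!(a.cosine(&bundled) > 0.0);
    /// assert!(c.cosine(&bundled) > 0.0);
    /// ```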
    pub fn bundle_hybrid_many<'a, I>(vectors: I) -> SparseVec
    where
        I: IntoIterator<Item = &'a SparseVec>,
    {
        let collected: Vec<&'a SparseVec> = vectors.into_iter().collect();
        if collected.is_empty() {
            return SparseVec {
                pos: Vec::new(),
                neg: Vec::new(),
            };
        }

        if collected.len() == 1 {
            return collected[0].clone();
        }

        if collected.len() == 2 {
            return collected[0].bundle(collected[1]);
        }

        let total_nnz: usize = collected
            .iter()
            .map(|v| v.pos.len() + v.neg.len())
            .sum();

        if total_nnz == 0 {
            return SparseVec {
                pos: Vec::new(),
                neg: Vec::new(),
            };
        }

        // Constant-time overlap/collision risk estimate.
        // Model: each non-zero lands uniformly in DIM dimensions.
        // Let λ = total_nnz / DIM be expected hits per dimension.
        // Then P(K>=2) ≈ 1 - e^{-λ}(1+λ) for Poisson(λ).
        // If expected number of colliding dimensions is tiny, pairwise fold is effectively safe
        // (and faster). Otherwise, use associative accumulation to avoid order sensitivity.
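        //
        // Worked example: with DIM = 10000 and three inputs of ~200 non-zeros each
        // (total_nnz = 600), λ = 0.06 and the expected number of colliding dimensions is ≈ 17,
        // which is under the budget of 32 below, so the pairwise fold is chosen.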
        let lambda = total_nnz as f64 / DIM as f64;
        let p_ge_2 = 1.0 - (-lambda).exp() * (1.0 + lambda);
        let expected_colliding_dims = p_ge_2 * DIM as f64;

        // Budget: allow a small number of potentially order-sensitive dimensions.
        // Tune via benchmarks; this is conservative for integrity.
        let collision_budget_dims = 32.0;

        if expected_colliding_dims <= collision_budget_dims {
            let mut iter = collected.into_iter();
            // SAFETY: collected.len() > 1 was verified above, so the iterator yields at least
            // one element.
            let mut acc = iter
                .next()
                .expect("bundle_hybrid_many reached the fold path with a non-empty collection")
                .clone();
            for v in iter {
                acc = acc.bundle(v);
            }
            return acc;
        }

        SparseVec::bundle_sum_many(collected)
    }

    /// Bind operation: element-wise multiplicative composition (A ⊙ B)
    /// Performs element-wise multiplication over the shared support. Self-inverse: A ⊙ A ≈ I
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator::SparseVec;
    ///
    /// let vec = SparseVec::from_data(b"test");
    /// let bound = vec.bind(&vec);
    ///
    /// // Binding a vector with itself squares every sign to +1 over its support
    /// // (the self-inverse property).
    /// let identity = SparseVec::from_data(b"identity");
    /// let sim = bound.cosine(&identity);
    /// // Similarity to an unrelated vector simply stays within valid bounds.
    /// assert!(sim >= -1.0 && sim <= 1.0);
    /// ```
    pub fn bind(&self, other: &SparseVec) -> SparseVec {
        #[cfg(feature = "bt-phase-2")]
        {
            // Packed bind is only worthwhile when both operands are dense enough.
            // Using a short-circuiting check avoids paying extra overhead for sparse workloads.
            let a_nnz = self.nnz();
            if a_nnz > DIM / 4 {
                let b_nnz = other.nnz();
                if b_nnz > DIM / 4 {
                    return PACKED_SCRATCH_A.with(|a_cell| {
                        PACKED_SCRATCH_B.with(|b_cell| {
                            PACKED_SCRATCH_OUT.with(|out_cell| {
                                let mut a = a_cell.borrow_mut();
                                let mut b = b_cell.borrow_mut();
                                let mut out = out_cell.borrow_mut();

                                a.fill_from_sparsevec(self, DIM);
                                b.fill_from_sparsevec(other, DIM);
                                a.bind_into(&b, &mut out);
                                out.to_sparsevec()
                            })
                        })
                    });
                }
            }
        }

        // Sparse bind is element-wise multiplication restricted to non-zero support.
        // We can compute this in a single merge-join over the signed supports, keeping
        // outputs sorted without any post-sort.
        let mut result_pos = Vec::new();
        let mut result_neg = Vec::new();

        let mut a_pos = 0usize;
        let mut a_neg = 0usize;
        let mut b_pos = 0usize;
        let mut b_neg = 0usize;

        loop {
            let next_a = match (self.pos.get(a_pos), self.neg.get(a_neg)) {
                (Some(&p), Some(&n)) => {
                    if p < n {
                        Some((p, 1i8, true))
                    } else {
                        Some((n, -1i8, false))
                    }
                }
                (Some(&p), None) => Some((p, 1i8, true)),
                (None, Some(&n)) => Some((n, -1i8, false)),
                (None, None) => None,
            };

            let next_b = match (other.pos.get(b_pos), other.neg.get(b_neg)) {
                (Some(&p), Some(&n)) => {
                    if p < n {
                        Some((p, 1i8, true))
                    } else {
                        Some((n, -1i8, false))
                    }
                }
                (Some(&p), None) => Some((p, 1i8, true)),
                (None, Some(&n)) => Some((n, -1i8, false)),
                (None, None) => None,
            };

            let Some((idx_a, sign_a, a_is_pos)) = next_a else {
                break;
            };
            let Some((idx_b, sign_b, b_is_pos)) = next_b else {
                break;
            };

            match idx_a.cmp(&idx_b) {
                std::cmp::Ordering::Less => {
                    if a_is_pos {
                        a_pos += 1;
                    } else {
                        a_neg += 1;
                    }
                }
                std::cmp::Ordering::Greater => {
                    if b_is_pos {
                        b_pos += 1;
                    } else {
                        b_neg += 1;
                    }
                }
                std::cmp::Ordering::Equal => {
                    let prod = sign_a * sign_b;
                    if prod == 1 {
                        result_pos.push(idx_a);
                    } else {
                        result_neg.push(idx_a);
                    }

                    if a_is_pos {
                        a_pos += 1;
                    } else {
                        a_neg += 1;
                    }
                    if b_is_pos {
                        b_pos += 1;
                    } else {
                        b_neg += 1;
                    }
                }
            }
        }

        SparseVec {
            pos: result_pos,
            neg: result_neg,
        }
    }

    /// Calculate cosine similarity between two sparse vectors
    /// Returns value in [-1, 1] where 1 is identical, 0 is orthogonal
    ///
    /// When the `simd` feature is enabled, this will automatically use
    /// AVX2 (x86_64) or NEON (aarch64) acceleration if available.
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator::SparseVec;
    ///
    /// let vec1 = SparseVec::from_data(b"hello");
    /// let vec2 = SparseVec::from_data(b"hello");
    /// let vec3 = SparseVec::from_data(b"world");
    ///
    /// // Identical data produces identical vectors
    /// assert!((vec1.cosine(&vec2) - 1.0).abs() < 0.01);
    ///
    /// // Different data produces low similarity
    /// let sim = vec1.cosine(&vec3);
    /// assert!(sim < 0.3);
    /// ```
    pub fn cosine(&self, other: &SparseVec) -> f64 {
        #[cfg(feature = "simd")]
        {
            return crate::simd_cosine::cosine_simd(self, other);
        }

        #[cfg(not(feature = "simd"))]
        {
            // Original implementation remains as fallback
            self.cosine_scalar(other)
        }
    }

    /// Scalar (non-SIMD) cosine similarity implementation.
    ///
    /// This is the original implementation and serves as the baseline
    /// for SIMD optimizations. It's also used when SIMD is not available.
    #[inline]
    pub fn cosine_scalar(&self, other: &SparseVec) -> f64 {
        #[cfg(feature = "bt-phase-2")]
        {
            // Only use packed cosine when total density is high enough to amortize conversion,
            // and avoid converting an extremely sparse operand.
            let a_nnz = self.nnz();
            let b_nnz = other.nnz();
            let total = a_nnz + b_nnz;
            if total > DIM / 4 {
                let min_nnz = a_nnz.min(b_nnz);
                if min_nnz > DIM / 32 {
                    let dot = PACKED_SCRATCH_A.with(|a_cell| {
                        PACKED_SCRATCH_B.with(|b_cell| {
                            let mut a = a_cell.borrow_mut();
                            let mut b = b_cell.borrow_mut();
                            a.fill_from_sparsevec(self, DIM);
                            b.fill_from_sparsevec(other, DIM);
                            a.dot(&b)
                        })
                    }) as f64;
                    let self_norm = a_nnz as f64;
                    let other_norm = b_nnz as f64;
                    if self_norm == 0.0 || other_norm == 0.0 {
                        return 0.0;
                    }
                    return dot / (self_norm.sqrt() * other_norm.sqrt());
                }
            }
        }

        // Sparse ternary dot product:
        // +1 when signs match, -1 when signs oppose.
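        // Every non-zero entry has magnitude 1, so ||v|| = sqrt(nnz) and the similarity
        // reduces to dot / (sqrt(nnz_a) * sqrt(nnz_b)).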
        let pp = Self::intersection_count_sorted(&self.pos, &other.pos) as i32;
        let nn = Self::intersection_count_sorted(&self.neg, &other.neg) as i32;
        let pn = Self::intersection_count_sorted(&self.pos, &other.neg) as i32;
        let np = Self::intersection_count_sorted(&self.neg, &other.pos) as i32;
        let dot = (pp + nn) - (pn + np);

        let self_norm = self.nnz() as f64;
        let other_norm = other.nnz() as f64;

        if self_norm == 0.0 || other_norm == 0.0 {
            return 0.0;
        }

        dot as f64 / (self_norm.sqrt() * other_norm.sqrt())
    }

    /// Apply cyclic permutation to vector indices
    /// Used for encoding sequence order in hierarchical structures
    ///
    /// # Arguments
    /// * `shift` - Number of positions to shift indices cyclically
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator::SparseVec;
    ///
    /// let vec = SparseVec::from_data(b"test");
    /// let permuted = vec.permute(100);
    ///
    /// // Permuted vector should have different indices but same structure
    /// assert_eq!(vec.pos.len(), permuted.pos.len());
    /// assert_eq!(vec.neg.len(), permuted.neg.len());
    /// ```
    pub fn permute(&self, shift: usize) -> SparseVec {
        let permute_index = |idx: usize| (idx + shift) % DIM;

        let pos: Vec<usize> = self.pos.iter().map(|&idx| permute_index(idx)).collect();
        let neg: Vec<usize> = self.neg.iter().map(|&idx| permute_index(idx)).collect();

        // Indices must remain sorted for efficient operations
        let mut pos = pos;
        let mut neg = neg;
        pos.sort_unstable();
        neg.sort_unstable();

        SparseVec { pos, neg }
    }

    /// Apply inverse cyclic permutation to vector indices
    /// Decodes sequence order by reversing the permutation shift
    ///
    /// # Arguments
    /// * `shift` - Number of positions to reverse shift indices cyclically
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator::SparseVec;
    ///
    /// let vec = SparseVec::from_data(b"test");
    /// let permuted = vec.permute(100);
    /// let recovered = permuted.inverse_permute(100);
    ///
    /// // Round-trip should recover original vector
    /// assert_eq!(vec.pos, recovered.pos);
    /// assert_eq!(vec.neg, recovered.neg);
    /// ```
    pub fn inverse_permute(&self, shift: usize) -> SparseVec {
        let inverse_permute_index = |idx: usize| (idx + DIM - (shift % DIM)) % DIM;

        let pos: Vec<usize> = self.pos.iter().map(|&idx| inverse_permute_index(idx)).collect();
        let neg: Vec<usize> = self.neg.iter().map(|&idx| inverse_permute_index(idx)).collect();

        // Indices must remain sorted for efficient operations
        let mut pos = pos;
        let mut neg = neg;
        pos.sort_unstable();
        neg.sort_unstable();

        SparseVec { pos, neg }
    }

    /// Context-Dependent Thinning Algorithm
    ///
    /// Thinning controls vector sparsity during bundle operations to prevent
    /// unchecked density growth that degrades VSA performance. The algorithm:
    ///
    /// 1. Compute the current non-zero count: pos.len() + neg.len()
    /// 2. If the current count is at or below `target_non_zero`, return unchanged
    /// 3. Otherwise, randomly sample indices to reduce to the target count
    /// 4. Preserve the pos/neg ratio to maintain signal polarity balance
    /// 5. Use deterministic seeding for reproducible results
    ///
    /// Edge Cases:
    /// - Empty vector: return unchanged
    /// - target_non_zero = 0: return empty vector (not recommended)
    /// - target_non_zero >= current: return clone
    /// - Single polarity vectors: preserve polarity distribution
    ///
    /// Performance: O(n log n) due to sorting, where n = target_non_zero
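    ///
    /// # Examples
    ///
    /// A minimal sketch of the sparsity cap (a random vector has ~200 non-zero entries):
    ///
    /// ```
    /// use embeddenator::SparseVec;
    ///
    /// let dense = SparseVec::random();
    /// let thinned = dense.thin(50);
    /// assert!(thinned.pos.len() + thinned.neg.len() <= 50);
    ///
    /// // Vectors already at or below the target are returned unchanged.
    /// let same = thinned.thin(1_000);
    /// assert_eq!(same.pos, thinned.pos);
    /// assert_eq!(same.neg, thinned.neg);
    /// ```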
    pub fn thin(&self, target_non_zero: usize) -> SparseVec {
        let current_count = self.pos.len() + self.neg.len();

        // Edge case: already at or below target
        if current_count <= target_non_zero {
            return self.clone();
        }

        // Edge case: target is zero
        if target_non_zero == 0 {
            return SparseVec::new();
        }

        // Calculate how many to keep from each polarity
        let pos_ratio = self.pos.len() as f32 / current_count as f32;
        let target_pos = (target_non_zero as f32 * pos_ratio).round() as usize;
        let target_neg = target_non_zero - target_pos;

        // Randomly sample indices using a deterministic seed derived from the pos/neg counts
        // and the requested target (not the full vector contents)
        let mut seed = [0u8; 32];
        seed[0..4].copy_from_slice(&(self.pos.len() as u32).to_le_bytes());
        seed[4..8].copy_from_slice(&(self.neg.len() as u32).to_le_bytes());
        seed[8..12].copy_from_slice(&(target_non_zero as u32).to_le_bytes());
        // Remaining seed bytes stay zero for deterministic seeding
        let mut rng = rand::rngs::StdRng::from_seed(seed);

        // Sample positive indices
        let mut sampled_pos: Vec<usize> = self.pos.clone();
        sampled_pos.shuffle(&mut rng);
        sampled_pos.truncate(target_pos);
        sampled_pos.sort_unstable();

        // Sample negative indices
        let mut sampled_neg: Vec<usize> = self.neg.clone();
        sampled_neg.shuffle(&mut rng);
        sampled_neg.truncate(target_neg);
        sampled_neg.sort_unstable();

        SparseVec {
            pos: sampled_pos,
            neg: sampled_neg,
        }
    }
}
1094}