embeddenator_vsa/vsa.rs
1//! Vector Symbolic Architecture (VSA) Implementation
2//!
3//! Sparse ternary vector representation with algebraic operations:
//! - Bundle (⊕): superposition (pairwise conflict-cancel; see `bundle_sum_many` for an order-independent merge of many vectors)
//! - Bind (⊙): element-wise multiplicative composition (self-inverse)
6//! - Cosine similarity for retrieval
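//!
//! # Example
//!
//! A minimal end-to-end sketch using the public API of this module (thresholds are
//! illustrative, not guarantees):
//!
//! ```
//! use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
//!
//! let config = ReversibleVSAConfig::default();
//! let a = SparseVec::encode_data(b"alpha", &config, None);
//! let b = SparseVec::encode_data(b"beta", &config, None);
//!
//! // Bundling keeps the result similar to both inputs.
//! let bundled = a.bundle(&b);
//! assert!(a.cosine(&bundled) > 0.0);
//! assert!(b.cosine(&bundled) > 0.0);
//!
//! // Binding composes vectors multiplicatively over their shared support.
//! let bound = a.bind(&b);
//! assert!(bound.cosine(&a).abs() <= 1.0);
//! ```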
7
8use rand::seq::SliceRandom;
9use rand::SeedableRng;
10use serde::{Deserialize, Serialize};
11use sha2::{Digest, Sha256};
12
13use crate::ternary_vec::PackedTritVec;
14
15use std::cell::RefCell;
16
17/// Dimension of VSA vectors
18pub const DIM: usize = 10000;
19
20thread_local! {
21 // Reused packed buffers for hot paths. Using TLS keeps this allocation
22 // amortized while remaining safe in multi-threaded contexts.
23 static PACKED_SCRATCH_A: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
24 static PACKED_SCRATCH_B: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
25 static PACKED_SCRATCH_OUT: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
26}
27
28/// Configuration for reversible VSA encoding/decoding operations
29#[derive(Clone, Debug, Serialize, Deserialize)]
30pub struct ReversibleVSAConfig {
    /// Block size for chunked encoding (must be a power of two for efficiency)
32 pub block_size: usize,
33 /// Maximum path depth for hierarchical encoding
34 pub max_path_depth: usize,
35 /// Base permutation shift for path-based encoding
36 pub base_shift: usize,
37 /// Target sparsity level for operations (number of non-zero elements)
38 pub target_sparsity: usize,
39}
40
41impl Default for ReversibleVSAConfig {
42 fn default() -> Self {
43 ReversibleVSAConfig {
44 block_size: 256, // 256-byte blocks
45 max_path_depth: 10,
46 base_shift: 1000,
47 target_sparsity: 200, // Default sparsity level
48 }
49 }
50}
51
52impl ReversibleVSAConfig {
53 /// Create config optimized for small data blocks
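    ///
    /// # Examples
    ///
    /// Illustrative usage with a short payload (preset values are those shown in the
    /// function body below):
    ///
    /// ```
    /// use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
    ///
    /// let config = ReversibleVSAConfig::small_blocks();
    /// assert_eq!(config.block_size, 64);
    ///
    /// let encoded = SparseVec::encode_data(b"small payload", &config, None);
    /// assert!(!encoded.pos.is_empty() || !encoded.neg.is_empty());
    /// ```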
54 pub fn small_blocks() -> Self {
55 ReversibleVSAConfig {
56 block_size: 64,
57 max_path_depth: 5,
58 base_shift: 500,
59 target_sparsity: 100,
60 }
61 }
62
63 /// Create config optimized for large data blocks
64 pub fn large_blocks() -> Self {
65 ReversibleVSAConfig {
66 block_size: 1024,
67 max_path_depth: 20,
68 base_shift: 2000,
69 target_sparsity: 400,
70 }
71 }
72}
73
74/// Sparse ternary vector with positive and negative indices
75#[derive(Clone, Debug, Serialize, Deserialize)]
76pub struct SparseVec {
77 /// Indices with +1 value
78 pub pos: Vec<usize>,
79 /// Indices with -1 value
80 pub neg: Vec<usize>,
81}
82
83impl Default for SparseVec {
84 fn default() -> Self {
85 Self::new()
86 }
87}
88
89impl SparseVec {
90 #[inline]
91 fn nnz(&self) -> usize {
92 self.pos.len() + self.neg.len()
93 }
94
95 fn intersection_count_sorted(a: &[usize], b: &[usize]) -> usize {
96 let mut i = 0usize;
97 let mut j = 0usize;
98 let mut count = 0usize;
99 while i < a.len() && j < b.len() {
100 match a[i].cmp(&b[j]) {
101 std::cmp::Ordering::Less => i += 1,
102 std::cmp::Ordering::Greater => j += 1,
103 std::cmp::Ordering::Equal => {
104 count += 1;
105 i += 1;
106 j += 1;
107 }
108 }
109 }
110 count
111 }
112
113 fn union_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
114 let mut out = Vec::with_capacity(a.len() + b.len());
115 let mut i = 0usize;
116 let mut j = 0usize;
117
118 while i < a.len() && j < b.len() {
119 match a[i].cmp(&b[j]) {
120 std::cmp::Ordering::Less => {
121 out.push(a[i]);
122 i += 1;
123 }
124 std::cmp::Ordering::Greater => {
125 out.push(b[j]);
126 j += 1;
127 }
128 std::cmp::Ordering::Equal => {
129 out.push(a[i]);
130 i += 1;
131 j += 1;
132 }
133 }
134 }
135
136 if i < a.len() {
137 out.extend_from_slice(&a[i..]);
138 }
139 if j < b.len() {
140 out.extend_from_slice(&b[j..]);
141 }
142
143 out
144 }
145
146 fn difference_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
147 let mut out = Vec::with_capacity(a.len());
148 let mut i = 0usize;
149 let mut j = 0usize;
150
151 while i < a.len() && j < b.len() {
152 match a[i].cmp(&b[j]) {
153 std::cmp::Ordering::Less => {
154 out.push(a[i]);
155 i += 1;
156 }
157 std::cmp::Ordering::Greater => j += 1,
158 std::cmp::Ordering::Equal => {
159 i += 1;
160 j += 1;
161 }
162 }
163 }
164
165 if i < a.len() {
166 out.extend_from_slice(&a[i..]);
167 }
168
169 out
170 }
171
172 fn intersection_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
173 let mut out = Vec::with_capacity(a.len().min(b.len()));
174 let mut i = 0usize;
175 let mut j = 0usize;
176
177 while i < a.len() && j < b.len() {
178 match a[i].cmp(&b[j]) {
179 std::cmp::Ordering::Less => i += 1,
180 std::cmp::Ordering::Greater => j += 1,
181 std::cmp::Ordering::Equal => {
182 out.push(a[i]);
183 i += 1;
184 j += 1;
185 }
186 }
187 }
188
189 out
190 }
191 /// Create an empty sparse vector
192 ///
193 /// # Examples
194 ///
195 /// ```
196 /// use embeddenator_vsa::SparseVec;
197 ///
198 /// let vec = SparseVec::new();
199 /// assert!(vec.pos.is_empty());
200 /// assert!(vec.neg.is_empty());
201 /// ```
202 pub fn new() -> Self {
203 SparseVec {
204 pos: Vec::new(),
205 neg: Vec::new(),
206 }
207 }
208
    /// Generate a random sparse vector with ~1% positive and ~1% negative entries (~2% total density)
210 ///
211 /// # Examples
212 ///
213 /// ```
214 /// use embeddenator_vsa::SparseVec;
215 ///
216 /// let vec = SparseVec::random();
    /// // The vector gets DIM/100 = 100 positive and 100 negative indices (~2% total density)
218 /// assert!(vec.pos.len() > 0);
219 /// assert!(vec.neg.len() > 0);
220 /// ```
221 pub fn random() -> Self {
222 let mut rng = rand::rng();
        let sparsity = DIM / 100; // 1% of dimensions per polarity
224
225 // Generate random indices without replacement
226 let mut indices: Vec<usize> = (0..DIM).collect();
227 indices.shuffle(&mut rng);
228
229 let mut pos: Vec<_> = indices[..sparsity].to_vec();
230 let mut neg: Vec<_> = indices[sparsity..sparsity * 2].to_vec();
231
232 pos.sort_unstable();
233 neg.sort_unstable();
234
235 SparseVec { pos, neg }
236 }
237
238 /// Encode data into a reversible sparse vector using block-based mapping
239 ///
240 /// This method implements hierarchical encoding with path-based permutations
241 /// for lossless data recovery. The encoding process:
242 /// 1. Splits data into blocks of configurable size
243 /// 2. Applies path-based permutations to each block
244 /// 3. Combines blocks using hierarchical bundling
245 ///
246 /// # Arguments
247 /// * `data` - The data to encode
248 /// * `config` - Configuration for encoding parameters
249 /// * `path` - Optional path string for hierarchical encoding (affects permutation)
250 ///
251 /// # Returns
252 /// A SparseVec that can be decoded back to the original data
253 ///
254 /// # Examples
255 ///
256 /// ```
257 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
258 ///
259 /// let data = b"hello world";
260 /// let config = ReversibleVSAConfig::default();
261 /// let encoded = SparseVec::encode_data(data, &config, None);
262 ///
263 /// // encoded vector contains reversible representation of the data
264 /// assert!(!encoded.pos.is_empty() || !encoded.neg.is_empty());
265 /// ```
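    ///
    /// Encoding is deterministic for a given (data, config, path) triple; the optional
    /// path only shifts the underlying index permutation:
    ///
    /// ```
    /// use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
    ///
    /// let config = ReversibleVSAConfig::default();
    /// let a1 = SparseVec::encode_data(b"same bytes", &config, Some("dir/a.txt"));
    /// let a2 = SparseVec::encode_data(b"same bytes", &config, Some("dir/a.txt"));
    /// assert_eq!(a1.pos, a2.pos);
    /// assert_eq!(a1.neg, a2.neg);
    /// ```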
266 pub fn encode_data(data: &[u8], config: &ReversibleVSAConfig, path: Option<&str>) -> Self {
267 if data.is_empty() {
268 return SparseVec::new();
269 }
270
271 // Calculate path-based shift for hierarchical encoding
272 let path_shift = if let Some(path_str) = path {
273 // Use path hash to determine shift, constrained to prevent overflow
274 let mut hasher = Sha256::new();
275 hasher.update(path_str.as_bytes());
276 let hash = hasher.finalize();
277 // SAFETY: SHA256 always produces 32 bytes, first 4 bytes are always valid
278 let hash_bytes: [u8; 4] = hash[0..4]
279 .try_into()
280 .expect("SHA256 hash must produce at least 4 bytes");
281 let path_hash = u32::from_le_bytes(hash_bytes) as usize;
282 (path_hash % config.max_path_depth) * config.base_shift
283 } else {
284 0
285 };
286
287 // Split data into blocks
288 let mut blocks = Vec::new();
289 for chunk in data.chunks(config.block_size) {
290 blocks.push(chunk);
291 }
292
293 // Encode each block with position-based permutation
294 let mut encoded_blocks = Vec::new();
295 for (i, block) in blocks.iter().enumerate() {
296 let block_shift = path_shift + (i * config.base_shift / blocks.len().max(1));
297 let block_vec = Self::encode_block(block, block_shift);
298 encoded_blocks.push(block_vec);
299 }
300
301 // Combine blocks hierarchically
302 if encoded_blocks.is_empty() {
303 SparseVec::new()
304 } else if encoded_blocks.len() == 1 {
305 // SAFETY: we just checked len() == 1, so next() must return Some
306 encoded_blocks
307 .into_iter()
308 .next()
309 .expect("encoded_blocks must have exactly one element")
310 } else {
            // Combine the block vectors with a sequential pairwise bundle (see hierarchical_bundle)
312 Self::hierarchical_bundle(&encoded_blocks)
313 }
314 }
315
316 /// Decode data from a reversible sparse vector
317 ///
318 /// Reverses the encoding process to recover the original data.
319 /// Requires the same configuration and path used during encoding.
320 ///
321 /// # Arguments
322 /// * `config` - Same configuration used for encoding
323 /// * `path` - Same path string used for encoding
324 /// * `expected_size` - Expected size of the decoded data (for validation)
325 ///
326 /// # Returns
    /// The original data bytes (may need a correction layer for 100% fidelity)
328 ///
329 /// # Examples
330 ///
331 /// ```no_run
332 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
333 ///
334 /// let data = b"hello world";
335 /// let config = ReversibleVSAConfig::default();
336 /// let encoded = SparseVec::encode_data(data, &config, None);
337 /// let decoded = encoded.decode_data(&config, None, data.len());
338 ///
339 /// // Note: For 100% fidelity, use CorrectionStore with EmbrFS
340 /// // Raw decode may have minor differences that corrections compensate for
341 /// ```
342 pub fn decode_data(
343 &self,
344 config: &ReversibleVSAConfig,
345 path: Option<&str>,
346 expected_size: usize,
347 ) -> Vec<u8> {
348 if self.pos.is_empty() && self.neg.is_empty() {
349 return Vec::new();
350 }
351
352 if expected_size == 0 {
353 return Vec::new();
354 }
355
356 // Calculate path-based shift (same as encoding)
357 let path_shift = if let Some(path_str) = path {
358 let mut hasher = Sha256::new();
359 hasher.update(path_str.as_bytes());
360 let hash = hasher.finalize();
361 // SAFETY: SHA256 always produces 32 bytes, first 4 bytes are always valid
362 let hash_bytes: [u8; 4] = hash[0..4]
363 .try_into()
364 .expect("SHA256 hash must produce at least 4 bytes");
365 let path_hash = u32::from_le_bytes(hash_bytes) as usize;
366 (path_hash % config.max_path_depth) * config.base_shift
367 } else {
368 0
369 };
370
371 // Estimate number of blocks based on expected size
372 let estimated_blocks = expected_size.div_ceil(config.block_size);
373
374 // For single block case
375 if estimated_blocks <= 1 {
376 return Self::decode_block(self, path_shift, expected_size);
377 }
378
379 // =======================================================================
380 // MULTI-BLOCK DECODING LIMITATIONS (v0.21.0)
381 // =======================================================================
382 //
383 // Current Status: EXPERIMENTAL / SIMPLIFIED IMPLEMENTATION
384 //
385 // The multi-block decoding uses a simplified linear approach without proper
386 // hierarchical factorization. This has the following implications:
387 //
388 // 1. **Information Loss**: When multiple blocks are bundled hierarchically,
389 // separating them requires VSA factorization techniques (e.g., HRR unbinding,
390 // MAP estimation) that are not yet implemented.
391 //
392 // 2. **Use Case**: Works reliably for:
393 // - Single-block data (data.len() <= config.block_size)
394 // - Data with clear block boundaries
395 //
396 // 3. **Known Limitation**: Multi-block data (> block_size) may have reduced
397 // fidelity. For 100% accuracy, use the Codebook's residual mechanism or
398 // external correction stores.
399 //
400 // 4. **Future Work**: Full hierarchical decoding is planned for v1.1.0.
401 // See IMPLEMENTATION_PLAN.md Task 4.2 for details.
402 //
403 // =======================================================================
404
405 // For multiple blocks, we need to factorize the hierarchical bundle
406 // This is a simplified approach - in practice, we'd need more sophisticated
407 // factorization to separate the blocks
408 let mut result = Vec::new();
409
410 // Simplified best-effort linear decoding for multi-block data.
411 // WARNING: This path trades accuracy for speed and may:
412 // - Lose information across blocks, and/or
413 // - Mis-assign tokens between adjacent blocks.
414 // For applications that require 100% fidelity, prefer the Codebook's
415 // residual mechanism or external correction stores instead of relying
416 // solely on this multi-block decoder.
417 for i in 0..estimated_blocks {
418 let block_shift = path_shift + (i * config.base_shift / estimated_blocks.max(1));
419 let remaining = expected_size.saturating_sub(result.len());
420 if remaining == 0 {
421 break;
422 }
423 let max_len = remaining.min(config.block_size);
424 let block_data = Self::decode_block(self, block_shift, max_len);
425 if block_data.is_empty() {
426 break;
427 }
428 result.extend(block_data);
429 if result.len() >= expected_size {
430 break;
431 }
432 }
433
434 // Truncate to expected size
435 result.truncate(expected_size);
436 result
437 }
438
439 /// Encode a single block of data with position-based permutation
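    ///
    /// Worked example (illustrative, with `shift = 0`): byte `0x41` at position 2
    /// contributes +1 at index `(2 + 0x41) % DIM = 67`. A byte with the high bit set
    /// uses only its low 7 bits and contributes -1 instead, so `0xC2` at position 1
    /// also lands on index 67 with the opposite sign. Such conflicts are cancelled to
    /// 0 by the disjointness check at the end of this function.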
440 fn encode_block(data: &[u8], shift: usize) -> SparseVec {
441 if data.is_empty() {
442 return SparseVec::new();
443 }
444
445 // Map data bytes to vector indices using the permuted mapping
446 let mut pos = Vec::new();
447 let mut neg = Vec::new();
448
449 for (i, &byte) in data.iter().enumerate() {
450 let base_idx = (i + shift) % DIM;
451
452 // Use byte value to determine polarity and offset
453 if byte & 0x80 != 0 {
454 // High bit set -> negative
455 neg.push((base_idx + (byte & 0x7F) as usize) % DIM);
456 } else {
457 // High bit clear -> positive
458 pos.push((base_idx + byte as usize) % DIM);
459 }
460 }
461
462 pos.sort_unstable();
463 pos.dedup();
464 neg.sort_unstable();
465 neg.dedup();
466
467 // CRITICAL FIX: Ensure pos and neg are disjoint
468 // If an index appears in both pos and neg, it means we have conflicting
469 // encodings. Remove from both to treat as 0 (cancel out).
470 let overlap = Self::intersection_sorted(&pos, &neg);
471 if !overlap.is_empty() {
472 pos = Self::difference_sorted(&pos, &overlap);
473 neg = Self::difference_sorted(&neg, &overlap);
474 }
475
476 SparseVec { pos, neg }
477 }
478
479 /// Decode a single block of data
480 fn decode_block(encoded: &SparseVec, shift: usize, max_len: usize) -> Vec<u8> {
481 if max_len == 0 {
482 return Vec::new();
483 }
484
485 let mut result = Vec::with_capacity(max_len);
486
487 // Reconstruct data by reversing the permutation.
488 // Note: `pos` and `neg` are kept sorted, so membership can be checked via binary search.
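        // Because the first matching offset wins, an index produced by a neighboring
        // position can be picked up here instead. The raw decode is therefore
        // best-effort; higher layers (e.g. correction stores) compensate for mismatches.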
489 for i in 0..max_len {
490 let base_idx = (i + shift) % DIM;
491
492 // Look for indices that map back to this position
493 let mut found_byte = None;
494 for offset in 0..128u8 {
495 let test_idx = (base_idx + offset as usize) % DIM;
496
497 if encoded.pos.binary_search(&test_idx).is_ok() {
498 found_byte = Some(offset);
499 break;
500 } else if encoded.neg.binary_search(&test_idx).is_ok() {
501 found_byte = Some(offset | 0x80);
502 break;
503 }
504 }
505
506 if let Some(byte) = found_byte {
507 result.push(byte);
508 } else {
509 // No more data found
510 break;
511 }
512 }
513
514 result
515 }
516
    /// Combine multiple vectors by repeated pairwise bundling
    /// (currently a sequential left fold rather than a true binary tree)
518 fn hierarchical_bundle(vectors: &[SparseVec]) -> SparseVec {
519 if vectors.is_empty() {
520 return SparseVec::new();
521 }
522 if vectors.len() == 1 {
523 return vectors[0].clone();
524 }
525
        // Sequential left fold of pairwise bundles
527 let mut result = vectors[0].clone();
528 for vec in &vectors[1..] {
529 result = result.bundle(vec);
530 }
531 result
532 }
533
534 /// Generate a deterministic sparse vector from data using SHA256 seed
535 /// DEPRECATED: Use encode_data() for new code
536 ///
537 /// # Examples
538 ///
539 /// ```
540 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
541 ///
542 /// let data = b"hello world";
543 /// let config = ReversibleVSAConfig::default();
544 /// let vec1 = SparseVec::encode_data(data, &config, None);
545 /// let vec2 = SparseVec::encode_data(data, &config, None);
546 ///
547 /// // Same input produces same vector (deterministic)
548 /// assert_eq!(vec1.pos, vec2.pos);
549 /// assert_eq!(vec1.neg, vec2.neg);
550 /// ```
551 #[deprecated(since = "0.2.0", note = "Use encode_data() for reversible encoding")]
552 pub fn from_data(data: &[u8]) -> Self {
553 let mut hasher = Sha256::new();
554 hasher.update(data);
555 let hash = hasher.finalize();
556
557 // SAFETY: SHA256 always produces exactly 32 bytes
558 let seed: [u8; 32] = hash[..32]
559 .try_into()
560 .expect("SHA256 must produce exactly 32 bytes");
561 let mut rng = rand::rngs::StdRng::from_seed(seed);
562
563 let mut indices: Vec<usize> = (0..DIM).collect();
564 indices.shuffle(&mut rng);
565
566 let sparsity = DIM / 100;
567 let mut pos = indices[..sparsity].to_vec();
568 let mut neg = indices[sparsity..sparsity * 2].to_vec();
569
570 pos.sort_unstable();
571 neg.sort_unstable();
572
573 SparseVec { pos, neg }
574 }
575
576 /// Bundle operation: pairwise conflict-cancel superposition (A ⊕ B)
577 ///
578 /// This is a fast, commutative merge for two vectors:
579 /// - same sign => keep
580 /// - opposite signs => cancel to 0
581 /// - sign vs 0 => keep sign
582 ///
583 /// Note: While this is well-defined for two vectors, repeated application across 3+
584 /// vectors is generally **not associative** because early cancellation/thresholding can
585 /// discard multiplicity information.
586 ///
587 /// # Arguments
588 /// * `other` - The vector to bundle with self
589 /// * `config` - Optional ReversibleVSAConfig for controlling sparsity via thinning
590 ///
591 /// # Examples
592 ///
593 /// ```
594 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
595 ///
596 /// let config = ReversibleVSAConfig::default();
597 /// let vec1 = SparseVec::encode_data(b"data1", &config, None);
598 /// let vec2 = SparseVec::encode_data(b"data2", &config, None);
599 /// let bundled = vec1.bundle_with_config(&vec2, Some(&config));
600 ///
601 /// // Bundled vector contains superposition of both inputs
602 /// // Should be similar to both original vectors
603 /// let sim1 = vec1.cosine(&bundled);
604 /// let sim2 = vec2.cosine(&bundled);
605 /// assert!(sim1 > 0.3);
606 /// assert!(sim2 > 0.3);
607 /// ```
608 pub fn bundle_with_config(
609 &self,
610 other: &SparseVec,
611 config: Option<&ReversibleVSAConfig>,
612 ) -> SparseVec {
613 let mut result = self.bundle(other);
614
615 // Apply thinning if config provided and result exceeds target sparsity
616 if let Some(cfg) = config {
617 let current_count = result.pos.len() + result.neg.len();
618 if current_count > cfg.target_sparsity {
619 result = result.thin(cfg.target_sparsity);
620 }
621 }
622
623 result
624 }
625
    /// Bundle operation: pairwise conflict-cancel superposition (A ⊕ B)
    ///
    /// Element-wise rule: matching signs are kept, opposing signs cancel to 0, and a
    /// sign paired with 0 is kept. See `bundle_with_config()` for a wrapper that
    /// additionally applies thinning via `ReversibleVSAConfig`.
630 ///
631 /// # Examples
632 ///
633 /// ```
634 /// use embeddenator_vsa::SparseVec;
635 ///
636 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
637 /// let vec1 = SparseVec::encode_data(b"data1", &config, None);
638 /// let vec2 = SparseVec::encode_data(b"data2", &config, None);
639 /// let bundled = vec1.bundle(&vec2);
640 ///
641 /// // Bundled vector contains superposition of both inputs
642 /// // Should be similar to both original vectors
643 /// let sim1 = vec1.cosine(&bundled);
644 /// let sim2 = vec2.cosine(&bundled);
645 /// assert!(sim1 > 0.3);
646 /// assert!(sim2 > 0.3);
647 /// ```
648 pub fn bundle(&self, other: &SparseVec) -> SparseVec {
649 // Optional ternary-native fast path (migration gate).
650 // This is primarily intended for cases where vectors become dense enough
651 // that packed word-wise operations are competitive.
652 {
653 // Converting SparseVec -> PackedTritVec is O(DIM) for allocation/zeroing, plus O(nnz)
654 // for setting lanes. Only attempt the packed path when total density is high enough,
655 // and avoid converting an extremely sparse operand.
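            // With DIM = 10_000 the gates below work out to: combined support above
            // 2_500 non-zeros, and the sparser operand above 312 non-zeros.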
656 let a_nnz = self.nnz();
657 let b_nnz = other.nnz();
658 let total = a_nnz + b_nnz;
659 if total > DIM / 4 {
660 let min_nnz = a_nnz.min(b_nnz);
661 if min_nnz > DIM / 32 {
662 return PACKED_SCRATCH_A.with(|a_cell| {
663 PACKED_SCRATCH_B.with(|b_cell| {
664 PACKED_SCRATCH_OUT.with(|out_cell| {
665 let mut a = a_cell.borrow_mut();
666 let mut b = b_cell.borrow_mut();
667 let mut out = out_cell.borrow_mut();
668
669 a.fill_from_sparsevec(self, DIM);
670 b.fill_from_sparsevec(other, DIM);
671 a.bundle_into(&b, &mut out);
672 out.to_sparsevec()
673 })
674 })
675 });
676 }
677 }
678 }
679
680 // Majority voting for two sparse ternary vectors:
681 // - Same sign => keep
682 // - Opposite signs => cancel to 0
683 // - Sign vs 0 => keep sign
684 // This can be expressed via sorted set differences/unions.
685 let pos_a = Self::difference_sorted(&self.pos, &other.neg);
686 let pos_b = Self::difference_sorted(&other.pos, &self.neg);
687 let neg_a = Self::difference_sorted(&self.neg, &other.pos);
688 let neg_b = Self::difference_sorted(&other.neg, &self.pos);
689
690 let mut pos = Self::union_sorted(&pos_a, &pos_b);
691 let mut neg = Self::union_sorted(&neg_a, &neg_b);
692
        // Defensive check: keep pos and neg disjoint. With well-formed (disjoint) inputs
        // this cannot happen, but it guards against operands whose pos/neg sets already
        // overlap (e.g. vectors built outside the usual constructors).
696 let overlap = Self::intersection_sorted(&pos, &neg);
697 if !overlap.is_empty() {
698 pos = Self::difference_sorted(&pos, &overlap);
699 neg = Self::difference_sorted(&neg, &overlap);
700 }
701
702 SparseVec { pos, neg }
703 }
704
705 /// Associative bundle over many vectors: sums contributions per index, then thresholds to sign.
706 /// This is order-independent because all contributions are accumulated before applying sign.
707 /// Complexity: O(K log K) where K is total non-zero entries across inputs.
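    ///
    /// # Examples
    ///
    /// Order independence, a minimal sketch using `encode_data` inputs:
    ///
    /// ```
    /// use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
    ///
    /// let config = ReversibleVSAConfig::default();
    /// let a = SparseVec::encode_data(b"alpha", &config, None);
    /// let b = SparseVec::encode_data(b"beta", &config, None);
    /// let c = SparseVec::encode_data(b"gamma", &config, None);
    ///
    /// let forward = SparseVec::bundle_sum_many([&a, &b, &c]);
    /// let reverse = SparseVec::bundle_sum_many([&c, &b, &a]);
    /// assert_eq!(forward.pos, reverse.pos);
    /// assert_eq!(forward.neg, reverse.neg);
    /// ```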
708 pub fn bundle_sum_many<'a, I>(vectors: I) -> SparseVec
709 where
710 I: IntoIterator<Item = &'a SparseVec>,
711 {
712 let mut contributions: Vec<(usize, i32)> = Vec::new();
713
714 for vec in vectors {
715 contributions.extend(vec.pos.iter().map(|&idx| (idx, 1i32)));
716 contributions.extend(vec.neg.iter().map(|&idx| (idx, -1i32)));
717 }
718
719 if contributions.is_empty() {
720 return SparseVec {
721 pos: Vec::new(),
722 neg: Vec::new(),
723 };
724 }
725
726 contributions.sort_unstable_by_key(|(idx, _)| *idx);
727
728 let mut pos = Vec::new();
729 let mut neg = Vec::new();
730
731 let mut iter = contributions.into_iter();
732 // SAFETY: we checked contributions.is_empty() above and returned early if empty
733 let (mut current_idx, mut acc) = iter
734 .next()
735 .expect("contributions must be non-empty after early return check");
736
737 for (idx, value) in iter {
738 if idx == current_idx {
739 acc += value;
740 } else {
741 if acc > 0 {
742 pos.push(current_idx);
743 } else if acc < 0 {
744 neg.push(current_idx);
745 }
746 current_idx = idx;
747 acc = value;
748 }
749 }
750
751 if acc > 0 {
752 pos.push(current_idx);
753 } else if acc < 0 {
754 neg.push(current_idx);
755 }
756
757 SparseVec { pos, neg }
758 }
759
760 /// Hybrid bundle: choose a fast pairwise fold for very sparse regimes (to preserve sparsity),
761 /// otherwise use the associative sum-then-threshold path (order-independent, more faithful to majority).
762 ///
763 /// Heuristic: estimate expected overlap/collision count assuming uniform hashing into `DIM`.
764 /// If expected colliding dimensions is below a small budget, use pairwise `bundle`; else use
765 /// `bundle_sum_many`.
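    ///
    /// # Examples
    ///
    /// Illustrative only; with a handful of very sparse inputs the heuristic picks the
    /// pairwise fold, and the result stays similar to each input:
    ///
    /// ```
    /// use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
    ///
    /// let config = ReversibleVSAConfig::default();
    /// let a = SparseVec::encode_data(b"alpha", &config, None);
    /// let b = SparseVec::encode_data(b"beta", &config, None);
    /// let c = SparseVec::encode_data(b"gamma", &config, None);
    ///
    /// let bundled = SparseVec::bundle_hybrid_many([&a, &b, &c]);
    /// assert!(a.cosine(&bundled) > 0.3);
    /// assert!(c.cosine(&bundled) > 0.3);
    /// ```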
766 pub fn bundle_hybrid_many<'a, I>(vectors: I) -> SparseVec
767 where
768 I: IntoIterator<Item = &'a SparseVec>,
769 {
770 let collected: Vec<&'a SparseVec> = vectors.into_iter().collect();
771 if collected.is_empty() {
772 return SparseVec {
773 pos: Vec::new(),
774 neg: Vec::new(),
775 };
776 }
777
778 if collected.len() == 1 {
779 return collected[0].clone();
780 }
781
782 if collected.len() == 2 {
783 return collected[0].bundle(collected[1]);
784 }
785
786 let total_nnz: usize = collected.iter().map(|v| v.pos.len() + v.neg.len()).sum();
787
788 if total_nnz == 0 {
789 return SparseVec {
790 pos: Vec::new(),
791 neg: Vec::new(),
792 };
793 }
794
795 // Constant-time overlap/collision risk estimate.
796 // Model: each non-zero lands uniformly in DIM dimensions.
797 // Let λ = total_nnz / DIM be expected hits per dimension.
798 // Then P(K>=2) ≈ 1 - e^{-λ}(1+λ) for Poisson(λ).
799 // If expected number of colliding dimensions is tiny, pairwise fold is effectively safe
800 // (and faster). Otherwise, use associative accumulation to avoid order sensitivity.
801 let lambda = total_nnz as f64 / DIM as f64;
802 let p_ge_2 = 1.0 - (-lambda).exp() * (1.0 + lambda);
803 let expected_colliding_dims = p_ge_2 * DIM as f64;
804
805 // Budget: allow a small number of potentially order-sensitive dimensions.
806 // Tune via benchmarks; this is conservative for integrity.
807 let collision_budget_dims = 32.0;
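        // Worked example (assumed sizes): three vectors of ~200 non-zeros each give
        // total_nnz = 600, λ = 0.06, P(K>=2) ≈ 0.0017, i.e. ~17 expected colliding
        // dimensions -> pairwise fold. Twenty such vectors give λ = 0.4 and ~616
        // expected colliding dimensions -> sum-then-threshold path.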
808
809 if expected_colliding_dims <= collision_budget_dims {
810 let mut iter = collected.into_iter();
            // `collected` is non-empty here (the empty case returned early above),
            // so the first element always exists.
            let mut acc = iter
                .next()
                .expect("collected must be non-empty after early return check")
815 .clone();
816 for v in iter {
817 acc = acc.bundle(v);
818 }
819 return acc;
820 }
821
822 SparseVec::bundle_sum_many(collected)
823 }
824
    /// Bind operation: multiplicative composition (A ⊙ B)
    /// Element-wise (Hadamard) product over the shared support; commutative and
    /// self-inverse: A ⊙ A ≈ I
827 ///
828 /// # Examples
829 ///
830 /// ```
831 /// use embeddenator_vsa::SparseVec;
832 ///
833 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
834 /// let vec = SparseVec::encode_data(b"test", &config, None);
835 /// let bound = vec.bind(&vec);
836 ///
    /// // Binding a vector with itself yields +1 on its entire support (self-inverse property)
838 /// let identity = SparseVec::encode_data(b"identity", &config, None);
839 /// let sim = bound.cosine(&identity);
    /// // Similarity with an unrelated vector simply stays within the valid cosine range
841 /// assert!(sim >= -1.0 && sim <= 1.0);
842 /// ```
843 pub fn bind(&self, other: &SparseVec) -> SparseVec {
844 {
845 // Packed bind is only worthwhile when both operands are dense enough.
846 // Using a short-circuiting check avoids paying extra overhead for sparse workloads.
847 let a_nnz = self.nnz();
848 if a_nnz > DIM / 4 {
849 let b_nnz = other.nnz();
850 if b_nnz > DIM / 4 {
851 return PACKED_SCRATCH_A.with(|a_cell| {
852 PACKED_SCRATCH_B.with(|b_cell| {
853 PACKED_SCRATCH_OUT.with(|out_cell| {
854 let mut a = a_cell.borrow_mut();
855 let mut b = b_cell.borrow_mut();
856 let mut out = out_cell.borrow_mut();
857
858 a.fill_from_sparsevec(self, DIM);
859 b.fill_from_sparsevec(other, DIM);
860 a.bind_into(&b, &mut out);
861 out.to_sparsevec()
862 })
863 })
864 });
865 }
866 }
867 }
868
869 // Sparse bind is element-wise multiplication restricted to non-zero support.
870 // We can compute this in a single merge-join over the signed supports, keeping
871 // outputs sorted without any post-sort.
872 let mut result_pos = Vec::new();
873 let mut result_neg = Vec::new();
874
875 let mut a_pos = 0usize;
876 let mut a_neg = 0usize;
877 let mut b_pos = 0usize;
878 let mut b_neg = 0usize;
879
880 loop {
881 let next_a = match (self.pos.get(a_pos), self.neg.get(a_neg)) {
882 (Some(&p), Some(&n)) => {
883 if p < n {
884 Some((p, 1i8, true))
885 } else {
886 Some((n, -1i8, false))
887 }
888 }
889 (Some(&p), None) => Some((p, 1i8, true)),
890 (None, Some(&n)) => Some((n, -1i8, false)),
891 (None, None) => None,
892 };
893
894 let next_b = match (other.pos.get(b_pos), other.neg.get(b_neg)) {
895 (Some(&p), Some(&n)) => {
896 if p < n {
897 Some((p, 1i8, true))
898 } else {
899 Some((n, -1i8, false))
900 }
901 }
902 (Some(&p), None) => Some((p, 1i8, true)),
903 (None, Some(&n)) => Some((n, -1i8, false)),
904 (None, None) => None,
905 };
906
907 let Some((idx_a, sign_a, a_is_pos)) = next_a else {
908 break;
909 };
910 let Some((idx_b, sign_b, b_is_pos)) = next_b else {
911 break;
912 };
913
914 match idx_a.cmp(&idx_b) {
915 std::cmp::Ordering::Less => {
916 if a_is_pos {
917 a_pos += 1;
918 } else {
919 a_neg += 1;
920 }
921 }
922 std::cmp::Ordering::Greater => {
923 if b_is_pos {
924 b_pos += 1;
925 } else {
926 b_neg += 1;
927 }
928 }
929 std::cmp::Ordering::Equal => {
930 let prod = sign_a * sign_b;
931 if prod == 1 {
932 result_pos.push(idx_a);
933 } else {
934 result_neg.push(idx_a);
935 }
936
937 if a_is_pos {
938 a_pos += 1;
939 } else {
940 a_neg += 1;
941 }
942 if b_is_pos {
943 b_pos += 1;
944 } else {
945 b_neg += 1;
946 }
947 }
948 }
949 }
950
951 SparseVec {
952 pos: result_pos,
953 neg: result_neg,
954 }
955 }
956
957 /// Calculate cosine similarity between two sparse vectors
958 /// Returns value in [-1, 1] where 1 is identical, 0 is orthogonal
959 ///
960 /// When the `simd` feature is enabled, this will automatically use
961 /// AVX2 (x86_64) or NEON (aarch64) acceleration if available.
962 ///
963 /// # Examples
964 ///
965 /// ```
966 /// use embeddenator_vsa::SparseVec;
967 ///
968 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
969 /// let vec1 = SparseVec::encode_data(b"cat", &config, None);
970 /// let vec2 = SparseVec::encode_data(b"cat", &config, None);
971 /// let vec3 = SparseVec::encode_data(b"dog", &config, None);
972 ///
973 /// // Identical data produces identical vectors
974 /// assert!((vec1.cosine(&vec2) - 1.0).abs() < 0.01);
975 ///
976 /// // Different data produces low similarity
977 /// let sim = vec1.cosine(&vec3);
978 /// assert!(sim < 0.3);
979 /// ```
980 pub fn cosine(&self, other: &SparseVec) -> f64 {
981 #[cfg(feature = "simd")]
982 {
983 crate::simd_cosine::cosine_simd(self, other)
984 }
985
986 #[cfg(not(feature = "simd"))]
987 {
988 // Original implementation remains as fallback
989 self.cosine_scalar(other)
990 }
991 }
992
993 /// Scalar (non-SIMD) cosine similarity implementation.
994 ///
995 /// This is the original implementation and serves as the baseline
996 /// for SIMD optimizations. It's also used when SIMD is not available.
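    ///
    /// # Examples
    ///
    /// A minimal check of the two boundary cases (identical input and an empty vector):
    ///
    /// ```
    /// use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
    ///
    /// let config = ReversibleVSAConfig::default();
    /// let a = SparseVec::encode_data(b"same", &config, None);
    /// let b = SparseVec::encode_data(b"same", &config, None);
    /// assert!((a.cosine_scalar(&b) - 1.0).abs() < 1e-9);
    ///
    /// // An empty operand yields 0.0 by definition.
    /// assert_eq!(a.cosine_scalar(&SparseVec::new()), 0.0);
    /// ```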
997 #[inline]
998 pub fn cosine_scalar(&self, other: &SparseVec) -> f64 {
999 {
1000 // Only use packed cosine when total density is high enough to amortize conversion,
1001 // and avoid converting an extremely sparse operand.
1002 let a_nnz = self.nnz();
1003 let b_nnz = other.nnz();
1004 let total = a_nnz + b_nnz;
1005 if total > DIM / 4 {
1006 let min_nnz = a_nnz.min(b_nnz);
1007 if min_nnz > DIM / 32 {
1008 let dot = PACKED_SCRATCH_A.with(|a_cell| {
1009 PACKED_SCRATCH_B.with(|b_cell| {
1010 let mut a = a_cell.borrow_mut();
1011 let mut b = b_cell.borrow_mut();
1012 a.fill_from_sparsevec(self, DIM);
1013 b.fill_from_sparsevec(other, DIM);
1014 a.dot(&b)
1015 })
1016 }) as f64;
1017 let self_norm = a_nnz as f64;
1018 let other_norm = b_nnz as f64;
1019 if self_norm == 0.0 || other_norm == 0.0 {
1020 return 0.0;
1021 }
1022 return dot / (self_norm.sqrt() * other_norm.sqrt());
1023 }
1024 }
1025 }
1026
1027 // Sparse ternary dot product:
1028 // +1 when signs match, -1 when signs oppose.
1029 let pp = Self::intersection_count_sorted(&self.pos, &other.pos) as i32;
1030 let nn = Self::intersection_count_sorted(&self.neg, &other.neg) as i32;
1031 let pn = Self::intersection_count_sorted(&self.pos, &other.neg) as i32;
1032 let np = Self::intersection_count_sorted(&self.neg, &other.pos) as i32;
1033 let dot = (pp + nn) - (pn + np);
1034
1035 let self_norm = self.nnz() as f64;
1036 let other_norm = other.nnz() as f64;
1037
1038 if self_norm == 0.0 || other_norm == 0.0 {
1039 return 0.0;
1040 }
1041
1042 dot as f64 / (self_norm.sqrt() * other_norm.sqrt())
1043 }
1044
1045 /// Apply cyclic permutation to vector indices
1046 /// Used for encoding sequence order in hierarchical structures
1047 ///
1048 /// # Arguments
1049 /// * `shift` - Number of positions to shift indices cyclically
1050 ///
1051 /// # Examples
1052 ///
1053 /// ```
1054 /// use embeddenator_vsa::SparseVec;
1055 ///
1056 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
1057 /// let vec = SparseVec::encode_data(b"test", &config, None);
1058 /// let permuted = vec.permute(100);
1059 ///
1060 /// // Permuted vector should have different indices but same structure
1061 /// assert_eq!(vec.pos.len(), permuted.pos.len());
1062 /// assert_eq!(vec.neg.len(), permuted.neg.len());
1063 /// ```
1064 pub fn permute(&self, shift: usize) -> SparseVec {
1065 let permute_index = |idx: usize| (idx + shift) % DIM;
1066
1067 let pos: Vec<usize> = self.pos.iter().map(|&idx| permute_index(idx)).collect();
1068 let neg: Vec<usize> = self.neg.iter().map(|&idx| permute_index(idx)).collect();
1069
1070 // Indices must remain sorted for efficient operations
1071 let mut pos = pos;
1072 let mut neg = neg;
1073 pos.sort_unstable();
1074 neg.sort_unstable();
1075
1076 SparseVec { pos, neg }
1077 }
1078
1079 /// Apply inverse cyclic permutation to vector indices
1080 /// Decodes sequence order by reversing the permutation shift
1081 ///
1082 /// # Arguments
1083 /// * `shift` - Number of positions to reverse shift indices cyclically
1084 ///
1085 /// # Examples
1086 ///
1087 /// ```
1088 /// use embeddenator_vsa::SparseVec;
1089 ///
1090 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
1091 /// let vec = SparseVec::encode_data(b"test", &config, None);
1092 /// let permuted = vec.permute(100);
1093 /// let recovered = permuted.inverse_permute(100);
1094 ///
1095 /// // Round-trip should recover original vector
1096 /// assert_eq!(vec.pos, recovered.pos);
1097 /// assert_eq!(vec.neg, recovered.neg);
1098 /// ```
1099 pub fn inverse_permute(&self, shift: usize) -> SparseVec {
1100 let inverse_permute_index = |idx: usize| (idx + DIM - (shift % DIM)) % DIM;
1101
1102 let pos: Vec<usize> = self
1103 .pos
1104 .iter()
1105 .map(|&idx| inverse_permute_index(idx))
1106 .collect();
1107 let neg: Vec<usize> = self
1108 .neg
1109 .iter()
1110 .map(|&idx| inverse_permute_index(idx))
1111 .collect();
1112
1113 // Indices must remain sorted for efficient operations
1114 let mut pos = pos;
1115 let mut neg = neg;
1116 pos.sort_unstable();
1117 neg.sort_unstable();
1118
1119 SparseVec { pos, neg }
1120 }
1121
1122 /// Context-Dependent Thinning Algorithm
1123 ///
1124 /// Thinning controls vector sparsity during bundle operations to prevent
1125 /// exponential density growth that degrades VSA performance. The algorithm:
1126 ///
    /// 1. Count the current non-zero entries (pos.len() + neg.len())
    /// 2. If the count is already <= target_non_zero, return the vector unchanged
1129 /// 3. Otherwise, randomly sample indices to reduce to target count
1130 /// 4. Preserve pos/neg ratio to maintain signal polarity balance
1131 /// 5. Use deterministic seeding for reproducible results
1132 ///
1133 /// Edge Cases:
1134 /// - Empty vector: return unchanged
1135 /// - target_non_zero = 0: return empty vector (not recommended)
1136 /// - target_non_zero >= current: return clone
1137 /// - Single polarity vectors: preserve polarity distribution
1138 ///
    /// Performance: O(n + k log k), where n is the current non-zero count (clone + shuffle)
    /// and k = target_non_zero (sorting the kept indices)
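    ///
    /// # Examples
    ///
    /// A small sketch using a random vector (which starts with 200 non-zeros):
    ///
    /// ```
    /// use embeddenator_vsa::SparseVec;
    ///
    /// let dense = SparseVec::random();
    /// let thinned = dense.thin(50);
    /// assert!(thinned.pos.len() + thinned.neg.len() <= 50);
    ///
    /// // Thinning is deterministic for a given input and target.
    /// let again = dense.thin(50);
    /// assert_eq!(thinned.pos, again.pos);
    /// assert_eq!(thinned.neg, again.neg);
    /// ```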
1140 pub fn thin(&self, target_non_zero: usize) -> SparseVec {
1141 let current_count = self.pos.len() + self.neg.len();
1142
1143 // Edge case: already at or below target
1144 if current_count <= target_non_zero {
1145 return self.clone();
1146 }
1147
1148 // Edge case: target is zero
1149 if target_non_zero == 0 {
1150 return SparseVec::new();
1151 }
1152
1153 // Calculate how many to keep from each polarity
1154 let pos_ratio = self.pos.len() as f32 / current_count as f32;
1155 let target_pos = (target_non_zero as f32 * pos_ratio).round() as usize;
1156 let target_neg = target_non_zero - target_pos;
1157
        // Randomly sample indices using a deterministic seed derived from the pos/neg counts and the target
1159 let mut seed = [0u8; 32];
1160 seed[0..4].copy_from_slice(&(self.pos.len() as u32).to_le_bytes());
1161 seed[4..8].copy_from_slice(&(self.neg.len() as u32).to_le_bytes());
1162 seed[8..12].copy_from_slice(&(target_non_zero as u32).to_le_bytes());
1163 // Rest remains zero for deterministic seeding
1164 let mut rng = rand::rngs::StdRng::from_seed(seed);
1165
1166 // Sample positive indices
1167 let mut sampled_pos: Vec<usize> = self.pos.clone();
1168 sampled_pos.shuffle(&mut rng);
1169 sampled_pos.truncate(target_pos);
1170 sampled_pos.sort_unstable();
1171
1172 // Sample negative indices
1173 let mut sampled_neg: Vec<usize> = self.neg.clone();
1174 sampled_neg.shuffle(&mut rng);
1175 sampled_neg.truncate(target_neg);
1176 sampled_neg.sort_unstable();
1177
1178 SparseVec {
1179 pos: sampled_pos,
1180 neg: sampled_neg,
1181 }
1182 }
1183}