embeddenator_vsa/vsa.rs
1//! Vector Symbolic Architecture (VSA) Implementation
2//!
3//! Sparse ternary vector representation with algebraic operations:
//! - Bundle (⊕): superposition (pairwise conflict-cancel; see `bundle_sum_many` for an order-independent merge of many vectors)
//! - Bind (⊙): element-wise multiplicative composition (self-inverse)
6//! - Cosine similarity for retrieval
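//!
//! # Example
//!
//! A minimal end-to-end sketch using the public API of this module (thresholds are
//! illustrative, not guarantees):
//!
//! ```
//! use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
//!
//! let config = ReversibleVSAConfig::default();
//! let a = SparseVec::encode_data(b"alpha", &config, None);
//! let b = SparseVec::encode_data(b"beta", &config, None);
//!
//! // Bundling keeps the result similar to both inputs.
//! let bundled = a.bundle(&b);
//! assert!(a.cosine(&bundled) > 0.0);
//! assert!(b.cosine(&bundled) > 0.0);
//!
//! // Binding composes vectors multiplicatively over their shared support.
//! let bound = a.bind(&b);
//! assert!(bound.cosine(&a).abs() <= 1.0);
//! ```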
7
8use rand::seq::SliceRandom;
9use rand::SeedableRng;
10use serde::{Deserialize, Serialize};
11use sha2::{Digest, Sha256};
12
13use crate::ternary_vec::PackedTritVec;
14
15use std::cell::RefCell;
16
17/// Dimension of VSA vectors
18pub const DIM: usize = 10000;
19
20thread_local! {
21 // Reused packed buffers for hot paths. Using TLS keeps this allocation
22 // amortized while remaining safe in multi-threaded contexts.
23 static PACKED_SCRATCH_A: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
24 static PACKED_SCRATCH_B: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
25 static PACKED_SCRATCH_OUT: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
26}
27
28/// Configuration for reversible VSA encoding/decoding operations
29#[derive(Clone, Debug, Serialize, Deserialize)]
30pub struct ReversibleVSAConfig {
    /// Block size for chunked encoding (must be a power of two for efficiency)
32 pub block_size: usize,
33 /// Maximum path depth for hierarchical encoding
34 pub max_path_depth: usize,
35 /// Base permutation shift for path-based encoding
36 pub base_shift: usize,
37 /// Target sparsity level for operations (number of non-zero elements)
38 pub target_sparsity: usize,
39}
40
41impl Default for ReversibleVSAConfig {
42 fn default() -> Self {
43 ReversibleVSAConfig {
44 block_size: 256, // 256-byte blocks
45 max_path_depth: 10,
46 base_shift: 1000,
47 target_sparsity: 200, // Default sparsity level
48 }
49 }
50}
51
52impl ReversibleVSAConfig {
53 /// Create config optimized for small data blocks
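    ///
    /// # Examples
    ///
    /// Illustrative usage with a short payload (preset values are those shown in the
    /// function body below):
    ///
    /// ```
    /// use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
    ///
    /// let config = ReversibleVSAConfig::small_blocks();
    /// assert_eq!(config.block_size, 64);
    ///
    /// let encoded = SparseVec::encode_data(b"small payload", &config, None);
    /// assert!(!encoded.pos.is_empty() || !encoded.neg.is_empty());
    /// ```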
54 pub fn small_blocks() -> Self {
55 ReversibleVSAConfig {
56 block_size: 64,
57 max_path_depth: 5,
58 base_shift: 500,
59 target_sparsity: 100,
60 }
61 }
62
63 /// Create config optimized for large data blocks
64 pub fn large_blocks() -> Self {
65 ReversibleVSAConfig {
66 block_size: 1024,
67 max_path_depth: 20,
68 base_shift: 2000,
69 target_sparsity: 400,
70 }
71 }
72}
73
74/// Sparse ternary vector with positive and negative indices
75#[derive(Clone, Debug, Serialize, Deserialize)]
76pub struct SparseVec {
77 /// Indices with +1 value
78 pub pos: Vec<usize>,
79 /// Indices with -1 value
80 pub neg: Vec<usize>,
81}
82
83impl Default for SparseVec {
84 fn default() -> Self {
85 Self::new()
86 }
87}
88
89impl SparseVec {
90 #[inline]
91 fn nnz(&self) -> usize {
92 self.pos.len() + self.neg.len()
93 }
94
95 fn intersection_count_sorted(a: &[usize], b: &[usize]) -> usize {
96 let mut i = 0usize;
97 let mut j = 0usize;
98 let mut count = 0usize;
99 while i < a.len() && j < b.len() {
100 match a[i].cmp(&b[j]) {
101 std::cmp::Ordering::Less => i += 1,
102 std::cmp::Ordering::Greater => j += 1,
103 std::cmp::Ordering::Equal => {
104 count += 1;
105 i += 1;
106 j += 1;
107 }
108 }
109 }
110 count
111 }
112
113 fn union_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
114 let mut out = Vec::with_capacity(a.len() + b.len());
115 let mut i = 0usize;
116 let mut j = 0usize;
117
118 while i < a.len() && j < b.len() {
119 match a[i].cmp(&b[j]) {
120 std::cmp::Ordering::Less => {
121 out.push(a[i]);
122 i += 1;
123 }
124 std::cmp::Ordering::Greater => {
125 out.push(b[j]);
126 j += 1;
127 }
128 std::cmp::Ordering::Equal => {
129 out.push(a[i]);
130 i += 1;
131 j += 1;
132 }
133 }
134 }
135
136 if i < a.len() {
137 out.extend_from_slice(&a[i..]);
138 }
139 if j < b.len() {
140 out.extend_from_slice(&b[j..]);
141 }
142
143 out
144 }
145
146 fn difference_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
147 let mut out = Vec::with_capacity(a.len());
148 let mut i = 0usize;
149 let mut j = 0usize;
150
151 while i < a.len() && j < b.len() {
152 match a[i].cmp(&b[j]) {
153 std::cmp::Ordering::Less => {
154 out.push(a[i]);
155 i += 1;
156 }
157 std::cmp::Ordering::Greater => j += 1,
158 std::cmp::Ordering::Equal => {
159 i += 1;
160 j += 1;
161 }
162 }
163 }
164
165 if i < a.len() {
166 out.extend_from_slice(&a[i..]);
167 }
168
169 out
170 }
171
172 fn intersection_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
173 let mut out = Vec::with_capacity(a.len().min(b.len()));
174 let mut i = 0usize;
175 let mut j = 0usize;
176
177 while i < a.len() && j < b.len() {
178 match a[i].cmp(&b[j]) {
179 std::cmp::Ordering::Less => i += 1,
180 std::cmp::Ordering::Greater => j += 1,
181 std::cmp::Ordering::Equal => {
182 out.push(a[i]);
183 i += 1;
184 j += 1;
185 }
186 }
187 }
188
189 out
190 }
191 /// Create an empty sparse vector
192 ///
193 /// # Examples
194 ///
195 /// ```
196 /// use embeddenator_vsa::SparseVec;
197 ///
198 /// let vec = SparseVec::new();
199 /// assert!(vec.pos.is_empty());
200 /// assert!(vec.neg.is_empty());
201 /// ```
202 pub fn new() -> Self {
203 SparseVec {
204 pos: Vec::new(),
205 neg: Vec::new(),
206 }
207 }
208
    /// Generate a random sparse vector with ~1% positive and ~1% negative entries (~2% total density)
210 ///
211 /// # Examples
212 ///
213 /// ```
214 /// use embeddenator_vsa::SparseVec;
215 ///
216 /// let vec = SparseVec::random();
    /// // The vector gets DIM/100 = 100 positive and 100 negative indices (~2% total density)
218 /// assert!(vec.pos.len() > 0);
219 /// assert!(vec.neg.len() > 0);
220 /// ```
221 pub fn random() -> Self {
222 let mut rng = rand::rng();
        let sparsity = DIM / 100; // 1% of dimensions per polarity
224
225 // Generate random indices without replacement
226 let mut indices: Vec<usize> = (0..DIM).collect();
227 indices.shuffle(&mut rng);
228
229 let mut pos: Vec<_> = indices[..sparsity].to_vec();
230 let mut neg: Vec<_> = indices[sparsity..sparsity * 2].to_vec();
231
232 pos.sort_unstable();
233 neg.sort_unstable();
234
235 SparseVec { pos, neg }
236 }
237
238 /// Encode data into a reversible sparse vector using block-based mapping
239 ///
240 /// This method implements hierarchical encoding with path-based permutations
241 /// for lossless data recovery. The encoding process:
242 /// 1. Splits data into blocks of configurable size
243 /// 2. Applies path-based permutations to each block
244 /// 3. Combines blocks using hierarchical bundling
245 ///
246 /// # Arguments
247 /// * `data` - The data to encode
248 /// * `config` - Configuration for encoding parameters
249 /// * `path` - Optional path string for hierarchical encoding (affects permutation)
250 ///
251 /// # Returns
252 /// A SparseVec that can be decoded back to the original data
253 ///
254 /// # Examples
255 ///
256 /// ```
257 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
258 ///
259 /// let data = b"hello world";
260 /// let config = ReversibleVSAConfig::default();
261 /// let encoded = SparseVec::encode_data(data, &config, None);
262 ///
263 /// // encoded vector contains reversible representation of the data
264 /// assert!(!encoded.pos.is_empty() || !encoded.neg.is_empty());
265 /// ```
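    ///
    /// Encoding is deterministic for a given (data, config, path) triple; the optional
    /// path only shifts the underlying index permutation:
    ///
    /// ```
    /// use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
    ///
    /// let config = ReversibleVSAConfig::default();
    /// let a1 = SparseVec::encode_data(b"same bytes", &config, Some("dir/a.txt"));
    /// let a2 = SparseVec::encode_data(b"same bytes", &config, Some("dir/a.txt"));
    /// assert_eq!(a1.pos, a2.pos);
    /// assert_eq!(a1.neg, a2.neg);
    /// ```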
266 pub fn encode_data(data: &[u8], config: &ReversibleVSAConfig, path: Option<&str>) -> Self {
267 if data.is_empty() {
268 return SparseVec::new();
269 }
270
271 // Calculate path-based shift for hierarchical encoding
272 let path_shift = if let Some(path_str) = path {
273 // Use path hash to determine shift, constrained to prevent overflow
274 let mut hasher = Sha256::new();
275 hasher.update(path_str.as_bytes());
276 let hash = hasher.finalize();
277 // SAFETY: SHA256 always produces 32 bytes, first 4 bytes are always valid
278 let hash_bytes: [u8; 4] = hash[0..4]
279 .try_into()
280 .expect("SHA256 hash must produce at least 4 bytes");
281 let path_hash = u32::from_le_bytes(hash_bytes) as usize;
282 (path_hash % config.max_path_depth) * config.base_shift
283 } else {
284 0
285 };
286
287 // Split data into blocks
288 let mut blocks = Vec::new();
289 for chunk in data.chunks(config.block_size) {
290 blocks.push(chunk);
291 }
292
293 // Encode each block with position-based permutation
294 let mut encoded_blocks = Vec::new();
295 for (i, block) in blocks.iter().enumerate() {
296 let block_shift = path_shift + (i * config.base_shift / blocks.len().max(1));
297 let block_vec = Self::encode_block(block, block_shift);
298 encoded_blocks.push(block_vec);
299 }
300
301 // Combine blocks hierarchically
302 if encoded_blocks.is_empty() {
303 SparseVec::new()
304 } else if encoded_blocks.len() == 1 {
305 // SAFETY: we just checked len() == 1, so next() must return Some
306 encoded_blocks
307 .into_iter()
308 .next()
309 .expect("encoded_blocks must have exactly one element")
310 } else {
            // Combine the block vectors with a sequential pairwise bundle (see hierarchical_bundle)
312 Self::hierarchical_bundle(&encoded_blocks)
313 }
314 }
315
316 /// Decode data from a reversible sparse vector
317 ///
318 /// Reverses the encoding process to recover the original data.
319 /// Requires the same configuration and path used during encoding.
320 ///
321 /// # Arguments
322 /// * `config` - Same configuration used for encoding
323 /// * `path` - Same path string used for encoding
324 /// * `expected_size` - Expected size of the decoded data (for validation)
325 ///
326 /// # Returns
    /// The original data bytes (may need a correction layer for 100% fidelity)
328 ///
329 /// # Examples
330 ///
331 /// ```no_run
332 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
333 ///
334 /// let data = b"hello world";
335 /// let config = ReversibleVSAConfig::default();
336 /// let encoded = SparseVec::encode_data(data, &config, None);
337 /// let decoded = encoded.decode_data(&config, None, data.len());
338 ///
339 /// // Note: For 100% fidelity, use CorrectionStore with EmbrFS
340 /// // Raw decode may have minor differences that corrections compensate for
341 /// ```
342 pub fn decode_data(
343 &self,
344 config: &ReversibleVSAConfig,
345 path: Option<&str>,
346 expected_size: usize,
347 ) -> Vec<u8> {
348 if self.pos.is_empty() && self.neg.is_empty() {
349 return Vec::new();
350 }
351
352 if expected_size == 0 {
353 return Vec::new();
354 }
355
356 // Calculate path-based shift (same as encoding)
357 let path_shift = if let Some(path_str) = path {
358 let mut hasher = Sha256::new();
359 hasher.update(path_str.as_bytes());
360 let hash = hasher.finalize();
361 // SAFETY: SHA256 always produces 32 bytes, first 4 bytes are always valid
362 let hash_bytes: [u8; 4] = hash[0..4]
363 .try_into()
364 .expect("SHA256 hash must produce at least 4 bytes");
365 let path_hash = u32::from_le_bytes(hash_bytes) as usize;
366 (path_hash % config.max_path_depth) * config.base_shift
367 } else {
368 0
369 };
370
371 // Estimate number of blocks based on expected size
372 let estimated_blocks = expected_size.div_ceil(config.block_size);
373
374 // For single block case
375 if estimated_blocks <= 1 {
376 return Self::decode_block(self, path_shift, expected_size);
377 }
378
379 // =======================================================================
380 // MULTI-BLOCK DECODING LIMITATIONS (v0.21.0)
381 // =======================================================================
382 //
383 // Current Status: EXPERIMENTAL / SIMPLIFIED IMPLEMENTATION
384 //
385 // The multi-block decoding uses a simplified linear approach without proper
386 // hierarchical factorization. This has the following implications:
387 //
388 // 1. **Information Loss**: When multiple blocks are bundled hierarchically,
389 // separating them requires VSA factorization techniques (e.g., HRR unbinding,
390 // MAP estimation) that are not yet implemented.
391 //
392 // 2. **Use Case**: Works reliably for:
393 // - Single-block data (data.len() <= config.block_size)
394 // - Data with clear block boundaries
395 //
396 // 3. **Known Limitation**: Multi-block data (> block_size) may have reduced
397 // fidelity. For 100% accuracy, use the Codebook's residual mechanism or
398 // external correction stores.
399 //
400 // 4. **Future Work**: Full hierarchical decoding is planned for v1.1.0.
401 // See IMPLEMENTATION_PLAN.md Task 4.2 for details.
402 //
403 // =======================================================================
404
405 // For multiple blocks, we need to factorize the hierarchical bundle
406 // This is a simplified approach - in practice, we'd need more sophisticated
407 // factorization to separate the blocks
408 let mut result = Vec::new();
409
410 // Simplified best-effort linear decoding for multi-block data.
411 // WARNING: This path trades accuracy for speed and may:
412 // - Lose information across blocks, and/or
413 // - Mis-assign tokens between adjacent blocks.
414 // For applications that require 100% fidelity, prefer the Codebook's
415 // residual mechanism or external correction stores instead of relying
416 // solely on this multi-block decoder.
417 for i in 0..estimated_blocks {
418 let block_shift = path_shift + (i * config.base_shift / estimated_blocks.max(1));
419 let remaining = expected_size.saturating_sub(result.len());
420 if remaining == 0 {
421 break;
422 }
423 let max_len = remaining.min(config.block_size);
424 let block_data = Self::decode_block(self, block_shift, max_len);
425 if block_data.is_empty() {
426 break;
427 }
428 result.extend(block_data);
429 if result.len() >= expected_size {
430 break;
431 }
432 }
433
434 // Truncate to expected size
435 result.truncate(expected_size);
436 result
437 }
438
439 /// Encode a single block of data with position-based permutation
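    ///
    /// Worked example (illustrative, with `shift = 0`): byte `0x41` at position 2
    /// contributes +1 at index `(2 + 0x41) % DIM = 67`. A byte with the high bit set
    /// uses only its low 7 bits and contributes -1 instead, so `0xC2` at position 1
    /// also lands on index 67 with the opposite sign. Such conflicts are cancelled to
    /// 0 by the disjointness check at the end of this function.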
440 fn encode_block(data: &[u8], shift: usize) -> SparseVec {
441 if data.is_empty() {
442 return SparseVec::new();
443 }
444
445 // Map data bytes to vector indices using the permuted mapping
446 let mut pos = Vec::new();
447 let mut neg = Vec::new();
448
449 for (i, &byte) in data.iter().enumerate() {
450 let base_idx = (i + shift) % DIM;
451
452 // Use byte value to determine polarity and offset
453 if byte & 0x80 != 0 {
454 // High bit set -> negative
455 neg.push((base_idx + (byte & 0x7F) as usize) % DIM);
456 } else {
457 // High bit clear -> positive
458 pos.push((base_idx + byte as usize) % DIM);
459 }
460 }
461
462 pos.sort_unstable();
463 pos.dedup();
464 neg.sort_unstable();
465 neg.dedup();
466
467 // CRITICAL FIX: Ensure pos and neg are disjoint
468 // If an index appears in both pos and neg, it means we have conflicting
469 // encodings. Remove from both to treat as 0 (cancel out).
470 let overlap = Self::intersection_sorted(&pos, &neg);
471 if !overlap.is_empty() {
472 pos = Self::difference_sorted(&pos, &overlap);
473 neg = Self::difference_sorted(&neg, &overlap);
474 }
475
476 SparseVec { pos, neg }
477 }
478
479 /// Decode a single block of data
480 fn decode_block(encoded: &SparseVec, shift: usize, max_len: usize) -> Vec<u8> {
481 if max_len == 0 {
482 return Vec::new();
483 }
484
485 let mut result = Vec::with_capacity(max_len);
486
487 // Reconstruct data by reversing the permutation.
488 // Note: `pos` and `neg` are kept sorted, so membership can be checked via binary search.
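        // Because the first matching offset wins, an index produced by a neighboring
        // position can be picked up here instead. The raw decode is therefore
        // best-effort; higher layers (e.g. correction stores) compensate for mismatches.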
489 for i in 0..max_len {
490 let base_idx = (i + shift) % DIM;
491
492 // Look for indices that map back to this position
493 let mut found_byte = None;
494 for offset in 0..128u8 {
495 let test_idx = (base_idx + offset as usize) % DIM;
496
497 if encoded.pos.binary_search(&test_idx).is_ok() {
498 found_byte = Some(offset);
499 break;
500 } else if encoded.neg.binary_search(&test_idx).is_ok() {
501 found_byte = Some(offset | 0x80);
502 break;
503 }
504 }
505
506 if let Some(byte) = found_byte {
507 result.push(byte);
508 } else {
509 // No more data found
510 break;
511 }
512 }
513
514 result
515 }
516
    /// Combine multiple vectors by repeated pairwise bundling
    /// (currently a sequential left fold rather than a true binary tree)
518 fn hierarchical_bundle(vectors: &[SparseVec]) -> SparseVec {
519 if vectors.is_empty() {
520 return SparseVec::new();
521 }
522 if vectors.len() == 1 {
523 return vectors[0].clone();
524 }
525
        // Sequential left fold of pairwise bundles
527 let mut result = vectors[0].clone();
528 for vec in &vectors[1..] {
529 result = result.bundle(vec);
530 }
531 result
532 }
533
534 /// Generate a deterministic sparse vector from data using SHA256 seed
535 /// DEPRECATED: Use encode_data() for new code
536 ///
537 /// # Examples
538 ///
539 /// ```
540 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
541 ///
542 /// let data = b"hello world";
543 /// let config = ReversibleVSAConfig::default();
544 /// let vec1 = SparseVec::encode_data(data, &config, None);
545 /// let vec2 = SparseVec::encode_data(data, &config, None);
546 ///
547 /// // Same input produces same vector (deterministic)
548 /// assert_eq!(vec1.pos, vec2.pos);
549 /// assert_eq!(vec1.neg, vec2.neg);
550 /// ```
551 #[deprecated(since = "0.2.0", note = "Use encode_data() for reversible encoding")]
552 pub fn from_data(data: &[u8]) -> Self {
553 let mut hasher = Sha256::new();
554 hasher.update(data);
555 let hash = hasher.finalize();
556
557 // SAFETY: SHA256 always produces exactly 32 bytes
558 let seed: [u8; 32] = hash[..32]
559 .try_into()
560 .expect("SHA256 must produce exactly 32 bytes");
561 let mut rng = rand::rngs::StdRng::from_seed(seed);
562
563 let mut indices: Vec<usize> = (0..DIM).collect();
564 indices.shuffle(&mut rng);
565
566 let sparsity = DIM / 100;
567 let mut pos = indices[..sparsity].to_vec();
568 let mut neg = indices[sparsity..sparsity * 2].to_vec();
569
570 pos.sort_unstable();
571 neg.sort_unstable();
572
573 SparseVec { pos, neg }
574 }
575
576 /// Bundle operation: pairwise conflict-cancel superposition (A ⊕ B)
577 ///
578 /// This is a fast, commutative merge for two vectors:
579 /// - same sign => keep
580 /// - opposite signs => cancel to 0
581 /// - sign vs 0 => keep sign
582 ///
583 /// Note: While this is well-defined for two vectors, repeated application across 3+
584 /// vectors is generally **not associative** because early cancellation/thresholding can
585 /// discard multiplicity information.
586 ///
587 /// # Arguments
588 /// * `other` - The vector to bundle with self
589 /// * `config` - Optional ReversibleVSAConfig for controlling sparsity via thinning
590 ///
591 /// # Examples
592 ///
593 /// ```
594 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
595 ///
596 /// let config = ReversibleVSAConfig::default();
597 /// let vec1 = SparseVec::encode_data(b"data1", &config, None);
598 /// let vec2 = SparseVec::encode_data(b"data2", &config, None);
599 /// let bundled = vec1.bundle_with_config(&vec2, Some(&config));
600 ///
601 /// // Bundled vector contains superposition of both inputs
602 /// // Should be similar to both original vectors
603 /// let sim1 = vec1.cosine(&bundled);
604 /// let sim2 = vec2.cosine(&bundled);
605 /// assert!(sim1 > 0.3);
606 /// assert!(sim2 > 0.3);
607 /// ```
608 pub fn bundle_with_config(
609 &self,
610 other: &SparseVec,
611 config: Option<&ReversibleVSAConfig>,
612 ) -> SparseVec {
613 let mut result = self.bundle(other);
614
615 // Apply thinning if config provided and result exceeds target sparsity
616 if let Some(cfg) = config {
617 let current_count = result.pos.len() + result.neg.len();
618 if current_count > cfg.target_sparsity {
619 result = result.thin(cfg.target_sparsity);
620 }
621 }
622
623 result
624 }
625
    /// Bundle operation: pairwise conflict-cancel superposition (A ⊕ B)
    ///
    /// Element-wise rule: matching signs are kept, opposing signs cancel to 0, and a
    /// sign paired with 0 is kept. See `bundle_with_config()` for a wrapper that
    /// additionally applies thinning via `ReversibleVSAConfig`.
630 ///
631 /// # Examples
632 ///
633 /// ```
634 /// use embeddenator_vsa::SparseVec;
635 ///
636 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
637 /// let vec1 = SparseVec::encode_data(b"data1", &config, None);
638 /// let vec2 = SparseVec::encode_data(b"data2", &config, None);
639 /// let bundled = vec1.bundle(&vec2);
640 ///
641 /// // Bundled vector contains superposition of both inputs
642 /// // Should be similar to both original vectors
643 /// let sim1 = vec1.cosine(&bundled);
644 /// let sim2 = vec2.cosine(&bundled);
645 /// assert!(sim1 > 0.3);
646 /// assert!(sim2 > 0.3);
647 /// ```
648 pub fn bundle(&self, other: &SparseVec) -> SparseVec {
649 // Optional ternary-native fast path (migration gate).
650 // This is primarily intended for cases where vectors become dense enough
651 // that packed word-wise operations are competitive.
652 {
653 // Converting SparseVec -> PackedTritVec is O(DIM) for allocation/zeroing, plus O(nnz)
654 // for setting lanes. Only attempt the packed path when total density is high enough,
655 // and avoid converting an extremely sparse operand.
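            // With DIM = 10_000 the gates below work out to: combined support above
            // 2_500 non-zeros, and the sparser operand above 312 non-zeros.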
656 let a_nnz = self.nnz();
657 let b_nnz = other.nnz();
658 let total = a_nnz + b_nnz;
659 if total > DIM / 4 {
660 let min_nnz = a_nnz.min(b_nnz);
661 if min_nnz > DIM / 32 {
662 return PACKED_SCRATCH_A.with(|a_cell| {
663 PACKED_SCRATCH_B.with(|b_cell| {
664 PACKED_SCRATCH_OUT.with(|out_cell| {
665 let mut a = a_cell.borrow_mut();
666 let mut b = b_cell.borrow_mut();
667 let mut out = out_cell.borrow_mut();
668
669 a.fill_from_sparsevec(self, DIM);
670 b.fill_from_sparsevec(other, DIM);
671 a.bundle_into(&b, &mut out);
672 out.to_sparsevec()
673 })
674 })
675 });
676 }
677 }
678 }
679
680 // Majority voting for two sparse ternary vectors:
681 // - Same sign => keep
682 // - Opposite signs => cancel to 0
683 // - Sign vs 0 => keep sign
684 // This can be expressed via sorted set differences/unions.
685 let pos_a = Self::difference_sorted(&self.pos, &other.neg);
686 let pos_b = Self::difference_sorted(&other.pos, &self.neg);
687 let neg_a = Self::difference_sorted(&self.neg, &other.pos);
688 let neg_b = Self::difference_sorted(&other.neg, &self.pos);
689
690 let mut pos = Self::union_sorted(&pos_a, &pos_b);
691 let mut neg = Self::union_sorted(&neg_a, &neg_b);
692
        // Defensive check: keep pos and neg disjoint. With well-formed (disjoint) inputs
        // this cannot happen, but it guards against operands whose pos/neg sets already
        // overlap (e.g. vectors built outside the usual constructors).
696 let overlap = Self::intersection_sorted(&pos, &neg);
697 if !overlap.is_empty() {
698 pos = Self::difference_sorted(&pos, &overlap);
699 neg = Self::difference_sorted(&neg, &overlap);
700 }
701
702 SparseVec { pos, neg }
703 }
704
705 /// Associative bundle over many vectors: sums contributions per index, then thresholds to sign.
706 /// This is order-independent because all contributions are accumulated before applying sign.
707 /// Complexity: O(K log K) where K is total non-zero entries across inputs.
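    ///
    /// # Examples
    ///
    /// Order independence, a minimal sketch using `encode_data` inputs:
    ///
    /// ```
    /// use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
    ///
    /// let config = ReversibleVSAConfig::default();
    /// let a = SparseVec::encode_data(b"alpha", &config, None);
    /// let b = SparseVec::encode_data(b"beta", &config, None);
    /// let c = SparseVec::encode_data(b"gamma", &config, None);
    ///
    /// let forward = SparseVec::bundle_sum_many([&a, &b, &c]);
    /// let reverse = SparseVec::bundle_sum_many([&c, &b, &a]);
    /// assert_eq!(forward.pos, reverse.pos);
    /// assert_eq!(forward.neg, reverse.neg);
    /// ```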
708 pub fn bundle_sum_many<'a, I>(vectors: I) -> SparseVec
709 where
710 I: IntoIterator<Item = &'a SparseVec>,
711 {
712 let mut contributions: Vec<(usize, i32)> = Vec::new();
713
714 for vec in vectors {
715 contributions.extend(vec.pos.iter().map(|&idx| (idx, 1i32)));
716 contributions.extend(vec.neg.iter().map(|&idx| (idx, -1i32)));
717 }
718
719 if contributions.is_empty() {
720 return SparseVec {
721 pos: Vec::new(),
722 neg: Vec::new(),
723 };
724 }
725
726 contributions.sort_unstable_by_key(|(idx, _)| *idx);
727
728 let mut pos = Vec::new();
729 let mut neg = Vec::new();
730
731 let mut iter = contributions.into_iter();
732 // SAFETY: we checked contributions.is_empty() above and returned early if empty
733 let (mut current_idx, mut acc) = iter
734 .next()
735 .expect("contributions must be non-empty after early return check");
736
737 for (idx, value) in iter {
738 if idx == current_idx {
739 acc += value;
740 } else {
741 if acc > 0 {
742 pos.push(current_idx);
743 } else if acc < 0 {
744 neg.push(current_idx);
745 }
746 current_idx = idx;
747 acc = value;
748 }
749 }
750
751 if acc > 0 {
752 pos.push(current_idx);
753 } else if acc < 0 {
754 neg.push(current_idx);
755 }
756
757 SparseVec { pos, neg }
758 }
759
760 /// Hybrid bundle: choose a fast pairwise fold for very sparse regimes (to preserve sparsity),
761 /// otherwise use the associative sum-then-threshold path (order-independent, more faithful to majority).
762 ///
763 /// Heuristic: estimate expected overlap/collision count assuming uniform hashing into `DIM`.
764 /// If expected colliding dimensions is below a small budget, use pairwise `bundle`; else use
765 /// `bundle_sum_many`.
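    ///
    /// # Examples
    ///
    /// Illustrative only; with a handful of very sparse inputs the heuristic picks the
    /// pairwise fold, and the result stays similar to each input:
    ///
    /// ```
    /// use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
    ///
    /// let config = ReversibleVSAConfig::default();
    /// let a = SparseVec::encode_data(b"alpha", &config, None);
    /// let b = SparseVec::encode_data(b"beta", &config, None);
    /// let c = SparseVec::encode_data(b"gamma", &config, None);
    ///
    /// let bundled = SparseVec::bundle_hybrid_many([&a, &b, &c]);
    /// assert!(a.cosine(&bundled) > 0.3);
    /// assert!(c.cosine(&bundled) > 0.3);
    /// ```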
766 pub fn bundle_hybrid_many<'a, I>(vectors: I) -> SparseVec
767 where
768 I: IntoIterator<Item = &'a SparseVec>,
769 {
770 let collected: Vec<&'a SparseVec> = vectors.into_iter().collect();
771 if collected.is_empty() {
772 return SparseVec {
773 pos: Vec::new(),
774 neg: Vec::new(),
775 };
776 }
777
778 if collected.len() == 1 {
779 return collected[0].clone();
780 }
781
782 if collected.len() == 2 {
783 return collected[0].bundle(collected[1]);
784 }
785
786 let total_nnz: usize = collected.iter().map(|v| v.pos.len() + v.neg.len()).sum();
787
788 if total_nnz == 0 {
789 return SparseVec {
790 pos: Vec::new(),
791 neg: Vec::new(),
792 };
793 }
794
795 // Constant-time overlap/collision risk estimate.
796 // Model: each non-zero lands uniformly in DIM dimensions.
797 // Let λ = total_nnz / DIM be expected hits per dimension.
798 // Then P(K>=2) ≈ 1 - e^{-λ}(1+λ) for Poisson(λ).
799 // If expected number of colliding dimensions is tiny, pairwise fold is effectively safe
800 // (and faster). Otherwise, use associative accumulation to avoid order sensitivity.
801 let lambda = total_nnz as f64 / DIM as f64;
802 let p_ge_2 = 1.0 - (-lambda).exp() * (1.0 + lambda);
803 let expected_colliding_dims = p_ge_2 * DIM as f64;
804
805 // Budget: allow a small number of potentially order-sensitive dimensions.
806 // Tune via benchmarks; this is conservative for integrity.
807 let collision_budget_dims = 32.0;
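        // Worked example (assumed sizes): three vectors of ~200 non-zeros each give
        // total_nnz = 600, λ = 0.06, P(K>=2) ≈ 0.0017, i.e. ~17 expected colliding
        // dimensions -> pairwise fold. Twenty such vectors give λ = 0.4 and ~616
        // expected colliding dimensions -> sum-then-threshold path.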
808
809 if expected_colliding_dims <= collision_budget_dims {
810 let mut iter = collected.into_iter();
            // `collected` is non-empty here (the empty case returned early above),
            // so the first element always exists.
            let mut acc = iter
                .next()
                .expect("collected must be non-empty after early return check")
815 .clone();
816 for v in iter {
817 acc = acc.bundle(v);
818 }
819 return acc;
820 }
821
822 SparseVec::bundle_sum_many(collected)
823 }
824
    /// Bind operation: multiplicative composition (A ⊙ B)
    /// Element-wise (Hadamard) product over the shared support; commutative and
    /// self-inverse: A ⊙ A ≈ I
827 ///
828 /// # Examples
829 ///
830 /// ```
831 /// use embeddenator_vsa::SparseVec;
832 ///
833 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
834 /// let vec = SparseVec::encode_data(b"test", &config, None);
835 /// let bound = vec.bind(&vec);
836 ///
    /// // Binding a vector with itself yields +1 on its entire support (self-inverse property)
838 /// let identity = SparseVec::encode_data(b"identity", &config, None);
839 /// let sim = bound.cosine(&identity);
    /// // Similarity with an unrelated vector simply stays within the valid cosine range
841 /// assert!(sim >= -1.0 && sim <= 1.0);
842 /// ```
843 pub fn bind(&self, other: &SparseVec) -> SparseVec {
844 {
845 // Packed bind is only worthwhile when both operands are dense enough.
846 // Using a short-circuiting check avoids paying extra overhead for sparse workloads.
847 let a_nnz = self.nnz();
848 if a_nnz > DIM / 4 {
849 let b_nnz = other.nnz();
850 if b_nnz > DIM / 4 {
851 return PACKED_SCRATCH_A.with(|a_cell| {
852 PACKED_SCRATCH_B.with(|b_cell| {
853 PACKED_SCRATCH_OUT.with(|out_cell| {
854 let mut a = a_cell.borrow_mut();
855 let mut b = b_cell.borrow_mut();
856 let mut out = out_cell.borrow_mut();
857
858 a.fill_from_sparsevec(self, DIM);
859 b.fill_from_sparsevec(other, DIM);
860 a.bind_into(&b, &mut out);
861 out.to_sparsevec()
862 })
863 })
864 });
865 }
866 }
867 }
868
869 // Sparse bind is element-wise multiplication restricted to non-zero support.
870 // We can compute this in a single merge-join over the signed supports, keeping
871 // outputs sorted without any post-sort.
872 let mut result_pos = Vec::new();
873 let mut result_neg = Vec::new();
874
875 let mut a_pos = 0usize;
876 let mut a_neg = 0usize;
877 let mut b_pos = 0usize;
878 let mut b_neg = 0usize;
879
880 loop {
881 let next_a = match (self.pos.get(a_pos), self.neg.get(a_neg)) {
882 (Some(&p), Some(&n)) => {
883 if p < n {
884 Some((p, 1i8, true))
885 } else {
886 Some((n, -1i8, false))
887 }
888 }
889 (Some(&p), None) => Some((p, 1i8, true)),
890 (None, Some(&n)) => Some((n, -1i8, false)),
891 (None, None) => None,
892 };
893
894 let next_b = match (other.pos.get(b_pos), other.neg.get(b_neg)) {
895 (Some(&p), Some(&n)) => {
896 if p < n {
897 Some((p, 1i8, true))
898 } else {
899 Some((n, -1i8, false))
900 }
901 }
902 (Some(&p), None) => Some((p, 1i8, true)),
903 (None, Some(&n)) => Some((n, -1i8, false)),
904 (None, None) => None,
905 };
906
907 let Some((idx_a, sign_a, a_is_pos)) = next_a else {
908 break;
909 };
910 let Some((idx_b, sign_b, b_is_pos)) = next_b else {
911 break;
912 };
913
914 match idx_a.cmp(&idx_b) {
915 std::cmp::Ordering::Less => {
916 if a_is_pos {
917 a_pos += 1;
918 } else {
919 a_neg += 1;
920 }
921 }
922 std::cmp::Ordering::Greater => {
923 if b_is_pos {
924 b_pos += 1;
925 } else {
926 b_neg += 1;
927 }
928 }
929 std::cmp::Ordering::Equal => {
930 let prod = sign_a * sign_b;
931 if prod == 1 {
932 result_pos.push(idx_a);
933 } else {
934 result_neg.push(idx_a);
935 }
936
937 if a_is_pos {
938 a_pos += 1;
939 } else {
940 a_neg += 1;
941 }
942 if b_is_pos {
943 b_pos += 1;
944 } else {
945 b_neg += 1;
946 }
947 }
948 }
949 }
950
951 SparseVec {
952 pos: result_pos,
953 neg: result_neg,
954 }
955 }
956
957 /// Calculate cosine similarity between two sparse vectors
958 /// Returns value in [-1, 1] where 1 is identical, 0 is orthogonal
959 ///
960 /// When the `simd` feature is enabled, this will automatically use
961 /// AVX2 (x86_64) or NEON (aarch64) acceleration if available.
962 ///
963 /// # Examples
964 ///
965 /// ```
966 /// use embeddenator_vsa::SparseVec;
967 ///
968 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
969 /// let vec1 = SparseVec::encode_data(b"cat", &config, None);
970 /// let vec2 = SparseVec::encode_data(b"cat", &config, None);
971 /// let vec3 = SparseVec::encode_data(b"dog", &config, None);
972 ///
973 /// // Identical data produces identical vectors
974 /// assert!((vec1.cosine(&vec2) - 1.0).abs() < 0.01);
975 ///
976 /// // Different data produces low similarity
977 /// let sim = vec1.cosine(&vec3);
978 /// assert!(sim < 0.3);
979 /// ```
980 pub fn cosine(&self, other: &SparseVec) -> f64 {
981 #[cfg(feature = "simd")]
982 {
983 crate::simd_cosine::cosine_simd(self, other)
984 }
985
986 #[cfg(not(feature = "simd"))]
987 {
988 // Original implementation remains as fallback
989 self.cosine_scalar(other)
990 }
991 }
992
993 /// Scalar (non-SIMD) cosine similarity implementation.
994 ///
995 /// This is the original implementation and serves as the baseline
996 /// for SIMD optimizations. It's also used when SIMD is not available.
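    ///
    /// # Examples
    ///
    /// A minimal check of the two boundary cases (identical input and an empty vector):
    ///
    /// ```
    /// use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
    ///
    /// let config = ReversibleVSAConfig::default();
    /// let a = SparseVec::encode_data(b"same", &config, None);
    /// let b = SparseVec::encode_data(b"same", &config, None);
    /// assert!((a.cosine_scalar(&b) - 1.0).abs() < 1e-9);
    ///
    /// // An empty operand yields 0.0 by definition.
    /// assert_eq!(a.cosine_scalar(&SparseVec::new()), 0.0);
    /// ```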
997 #[inline]
998 pub fn cosine_scalar(&self, other: &SparseVec) -> f64 {
999 {
1000 // Only use packed cosine when total density is high enough to amortize conversion,
1001 // and avoid converting an extremely sparse operand.
1002 let a_nnz = self.nnz();
1003 let b_nnz = other.nnz();
1004 let total = a_nnz + b_nnz;
1005 if total > DIM / 4 {
1006 let min_nnz = a_nnz.min(b_nnz);
1007 if min_nnz > DIM / 32 {
1008 let dot = PACKED_SCRATCH_A.with(|a_cell| {
1009 PACKED_SCRATCH_B.with(|b_cell| {
1010 let mut a = a_cell.borrow_mut();
1011 let mut b = b_cell.borrow_mut();
1012 a.fill_from_sparsevec(self, DIM);
1013 b.fill_from_sparsevec(other, DIM);
1014 a.dot(&b)
1015 })
1016 }) as f64;
1017 let self_norm = a_nnz as f64;
1018 let other_norm = b_nnz as f64;
1019 if self_norm == 0.0 || other_norm == 0.0 {
1020 return 0.0;
1021 }
1022 return dot / (self_norm.sqrt() * other_norm.sqrt());
1023 }
1024 }
1025 }
1026
1027 // Sparse ternary dot product:
1028 // +1 when signs match, -1 when signs oppose.
1029 let pp = Self::intersection_count_sorted(&self.pos, &other.pos) as i32;
1030 let nn = Self::intersection_count_sorted(&self.neg, &other.neg) as i32;
1031 let pn = Self::intersection_count_sorted(&self.pos, &other.neg) as i32;
1032 let np = Self::intersection_count_sorted(&self.neg, &other.pos) as i32;
1033 let dot = (pp + nn) - (pn + np);
1034
1035 let self_norm = self.nnz() as f64;
1036 let other_norm = other.nnz() as f64;
1037
1038 if self_norm == 0.0 || other_norm == 0.0 {
1039 return 0.0;
1040 }
1041
1042 dot as f64 / (self_norm.sqrt() * other_norm.sqrt())
1043 }
1044
1045 /// Apply cyclic permutation to vector indices
1046 /// Used for encoding sequence order in hierarchical structures
1047 ///
1048 /// # Arguments
1049 /// * `shift` - Number of positions to shift indices cyclically
1050 ///
1051 /// # Examples
1052 ///
1053 /// ```
1054 /// use embeddenator_vsa::SparseVec;
1055 ///
1056 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
1057 /// let vec = SparseVec::encode_data(b"test", &config, None);
1058 /// let permuted = vec.permute(100);
1059 ///
1060 /// // Permuted vector should have different indices but same structure
1061 /// assert_eq!(vec.pos.len(), permuted.pos.len());
1062 /// assert_eq!(vec.neg.len(), permuted.neg.len());
1063 /// ```
1064 pub fn permute(&self, shift: usize) -> SparseVec {
1065 let permute_index = |idx: usize| (idx + shift) % DIM;
1066
1067 let pos: Vec<usize> = self.pos.iter().map(|&idx| permute_index(idx)).collect();
1068 let neg: Vec<usize> = self.neg.iter().map(|&idx| permute_index(idx)).collect();
1069
1070 // Indices must remain sorted for efficient operations
1071 let mut pos = pos;
1072 let mut neg = neg;
1073 pos.sort_unstable();
1074 neg.sort_unstable();
1075
1076 SparseVec { pos, neg }
1077 }
1078
1079 /// Apply inverse cyclic permutation to vector indices
1080 /// Decodes sequence order by reversing the permutation shift
1081 ///
1082 /// # Arguments
1083 /// * `shift` - Number of positions to reverse shift indices cyclically
1084 ///
1085 /// # Examples
1086 ///
1087 /// ```
1088 /// use embeddenator_vsa::SparseVec;
1089 ///
1090 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
1091 /// let vec = SparseVec::encode_data(b"test", &config, None);
1092 /// let permuted = vec.permute(100);
1093 /// let recovered = permuted.inverse_permute(100);
1094 ///
1095 /// // Round-trip should recover original vector
1096 /// assert_eq!(vec.pos, recovered.pos);
1097 /// assert_eq!(vec.neg, recovered.neg);
1098 /// ```
1099 pub fn inverse_permute(&self, shift: usize) -> SparseVec {
1100 let inverse_permute_index = |idx: usize| (idx + DIM - (shift % DIM)) % DIM;
1101
1102 let pos: Vec<usize> = self
1103 .pos
1104 .iter()
1105 .map(|&idx| inverse_permute_index(idx))
1106 .collect();
1107 let neg: Vec<usize> = self
1108 .neg
1109 .iter()
1110 .map(|&idx| inverse_permute_index(idx))
1111 .collect();
1112
1113 // Indices must remain sorted for efficient operations
1114 let mut pos = pos;
1115 let mut neg = neg;
1116 pos.sort_unstable();
1117 neg.sort_unstable();
1118
1119 SparseVec { pos, neg }
1120 }
1121
1122 /// Context-Dependent Thinning Algorithm
1123 ///
1124 /// Thinning controls vector sparsity during bundle operations to prevent
1125 /// exponential density growth that degrades VSA performance. The algorithm:
1126 ///
    /// 1. Count the current non-zero entries (pos.len() + neg.len())
    /// 2. If the count is already <= target_non_zero, return the vector unchanged
1129 /// 3. Otherwise, randomly sample indices to reduce to target count
1130 /// 4. Preserve pos/neg ratio to maintain signal polarity balance
1131 /// 5. Use deterministic seeding for reproducible results
1132 ///
1133 /// Edge Cases:
1134 /// - Empty vector: return unchanged
1135 /// - target_non_zero = 0: return empty vector (not recommended)
1136 /// - target_non_zero >= current: return clone
1137 /// - Single polarity vectors: preserve polarity distribution
1138 ///
    /// Performance: O(n + k log k), where n is the current non-zero count (clone + shuffle)
    /// and k = target_non_zero (sorting the kept indices)
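    ///
    /// # Examples
    ///
    /// A small sketch using a random vector (which starts with 200 non-zeros):
    ///
    /// ```
    /// use embeddenator_vsa::SparseVec;
    ///
    /// let dense = SparseVec::random();
    /// let thinned = dense.thin(50);
    /// assert!(thinned.pos.len() + thinned.neg.len() <= 50);
    ///
    /// // Thinning is deterministic for a given input and target.
    /// let again = dense.thin(50);
    /// assert_eq!(thinned.pos, again.pos);
    /// assert_eq!(thinned.neg, again.neg);
    /// ```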
1140 pub fn thin(&self, target_non_zero: usize) -> SparseVec {
1141 let current_count = self.pos.len() + self.neg.len();
1142
1143 // Edge case: already at or below target
1144 if current_count <= target_non_zero {
1145 return self.clone();
1146 }
1147
1148 // Edge case: target is zero
1149 if target_non_zero == 0 {
1150 return SparseVec::new();
1151 }
1152
1153 // Calculate how many to keep from each polarity
1154 let pos_ratio = self.pos.len() as f32 / current_count as f32;
1155 let target_pos = (target_non_zero as f32 * pos_ratio).round() as usize;
1156 let target_neg = target_non_zero - target_pos;
1157
        // Randomly sample indices using a deterministic seed derived from the pos/neg counts and the target
1159 let mut seed = [0u8; 32];
1160 seed[0..4].copy_from_slice(&(self.pos.len() as u32).to_le_bytes());
1161 seed[4..8].copy_from_slice(&(self.neg.len() as u32).to_le_bytes());
1162 seed[8..12].copy_from_slice(&(target_non_zero as u32).to_le_bytes());
1163 // Rest remains zero for deterministic seeding
1164 let mut rng = rand::rngs::StdRng::from_seed(seed);
1165
1166 // Sample positive indices
1167 let mut sampled_pos: Vec<usize> = self.pos.clone();
1168 sampled_pos.shuffle(&mut rng);
1169 sampled_pos.truncate(target_pos);
1170 sampled_pos.sort_unstable();
1171
1172 // Sample negative indices
1173 let mut sampled_neg: Vec<usize> = self.neg.clone();
1174 sampled_neg.shuffle(&mut rng);
1175 sampled_neg.truncate(target_neg);
1176 sampled_neg.sort_unstable();
1177
1178 SparseVec {
1179 pos: sampled_pos,
1180 neg: sampled_neg,
1181 }
1182 }
1183}