embeddenator_vsa/vsa.rs
//! Vector Symbolic Architecture (VSA) Implementation
//!
//! Sparse ternary vector representation with algebraic operations:
//! - Bundle (⊕): Superposition (pairwise conflict-cancel; `bundle_sum_many` provides an order-independent variant)
//! - Bind (⊙): Element-wise multiplication (commutative, self-inverse)
//! - Cosine similarity for retrieval

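//! # Example
//!
//! A minimal illustrative sketch of the core operations (bundle, bind, cosine):
//!
//! ```rust
//! use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
//!
//! let config = ReversibleVSAConfig::default();
//! let a = SparseVec::encode_data(b"alpha", &config, None);
//! let b = SparseVec::encode_data(b"beta", &config, None);
//!
//! // Bundling keeps the result similar to both inputs.
//! let ab = a.bundle(&b);
//! assert!(a.cosine(&ab) > 0.0);
//!
//! // Binding is self-inverse on the shared support.
//! let aa = a.bind(&a);
//! assert!(aa.cosine(&a) > 0.9);
//! ```
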
8use rand::seq::SliceRandom;
9use rand::SeedableRng;
10use serde::{Deserialize, Serialize};
11use sha2::{Digest, Sha256};
12
13use crate::ternary_vec::PackedTritVec;
14
15use std::cell::RefCell;
16
17/// Default dimension of VSA vectors (10,000 dimensions)
18pub const DIM: usize = 10000;
19
20/// Configuration for VSA vector operations
21///
22/// Controls the dimensionality and sparsity of vectors. This allows
23/// dynamic configuration of vector dimensions at runtime rather than
24/// being locked to the compile-time `DIM` constant.
25///
26/// # Example
27///
28/// ```rust
29/// use embeddenator_vsa::VsaConfig;
30///
31/// // Default 10,000 dimensions
32/// let default_config = VsaConfig::default();
33/// assert_eq!(default_config.dimension, 10_000);
34///
35/// // Custom configuration for smaller vectors
36/// let small_config = VsaConfig::new(1000).with_density(0.02); // 2% density
37///
38/// // Configuration for larger vectors
39/// let large_config = VsaConfig::new(100_000).with_density(0.005); // 0.5% density
40/// ```
41#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
42pub struct VsaConfig {
43 /// Number of dimensions in vectors
44 pub dimension: usize,
45 /// Target density of non-zero elements (0.0 to 1.0)
46 /// Default is 0.01 (1% density)
47 pub density: f64,
48 /// Minimum non-zero elements per vector
49 pub min_nonzero: usize,
50 /// Maximum non-zero elements per vector
51 pub max_nonzero: usize,
52 /// Auto-thinning threshold: automatically thin bundles when nnz exceeds this
53 /// Set to 0 to disable auto-thinning
54 pub auto_thin_threshold: usize,
55 /// Dynamic sparsity scaling mode
56 pub scaling_mode: SparsityScaling,
57 /// Target reconstruction accuracy (0.0-1.0)
58 /// Used by `for_accuracy()` to select appropriate dimensions
59 pub target_accuracy: f64,
60}
61
62/// Sparsity scaling modes for dynamic dimension handling
63#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Default)]
64pub enum SparsityScaling {
65 /// Fixed density - nnz scales linearly with dimension
66 /// Good for: Consistent memory usage per dimension
67 Fixed,
68 /// Square root scaling - nnz scales as sqrt(dimension)
69 /// Good for: Maintaining constant similarity precision
70 /// Formula: target_nnz = base_nnz * sqrt(dim / base_dim)
71 #[default]
72 SquareRoot,
73 /// Logarithmic scaling - nnz scales as log(dimension)
74 /// Good for: Memory-constrained environments with large dimensions
75 /// Formula: target_nnz = base_nnz * log(dim) / log(base_dim)
76 Logarithmic,
77 /// Custom scaling with user-provided function coefficients
78 /// Formula: target_nnz = a * dim^b + c
79 Custom { a: f64, b: f64, c: f64 },
80}
81
82impl Default for VsaConfig {
83 fn default() -> Self {
84 Self {
85 dimension: DIM,
86 density: 0.01, // 1% density
87 min_nonzero: 10,
88 max_nonzero: DIM / 10,
89 auto_thin_threshold: DIM / 5, // Auto-thin when > 20% dense
90 scaling_mode: SparsityScaling::SquareRoot,
91 target_accuracy: 0.95,
92 }
93 }
94}
95
96impl VsaConfig {
97 /// Create a new VSA configuration with the specified dimension
98 pub fn new(dimension: usize) -> Self {
99 Self {
100 dimension,
101 density: 0.01,
102 min_nonzero: 10,
103 max_nonzero: dimension / 10,
104 auto_thin_threshold: dimension / 5,
105 scaling_mode: SparsityScaling::SquareRoot,
106 target_accuracy: 0.95,
107 }
108 }
109
110 /// Set the target density for non-zero elements
111 pub fn with_density(mut self, density: f64) -> Self {
112 self.density = density.clamp(0.001, 0.5);
113 self
114 }
115
116 /// Set minimum number of non-zero elements
117 pub fn with_min_nonzero(mut self, min: usize) -> Self {
118 self.min_nonzero = min;
119 self
120 }
121
122 /// Set maximum number of non-zero elements
123 pub fn with_max_nonzero(mut self, max: usize) -> Self {
124 self.max_nonzero = max;
125 self
126 }
127
128 /// Set auto-thinning threshold (0 to disable)
129 pub fn with_auto_thin(mut self, threshold: usize) -> Self {
130 self.auto_thin_threshold = threshold;
131 self
132 }
133
134 /// Set sparsity scaling mode
135 pub fn with_scaling(mut self, mode: SparsityScaling) -> Self {
136 self.scaling_mode = mode;
137 self
138 }
139
140 /// Set target reconstruction accuracy
141 pub fn with_target_accuracy(mut self, accuracy: f64) -> Self {
142 self.target_accuracy = accuracy.clamp(0.5, 0.9999);
143 self
144 }
145
146 /// Calculate the sparsity (number of non-zero elements per polarity)
147 /// based on dimension, density, and scaling mode
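    ///
    /// # Example (illustrative; values assume the default min/max non-zero bounds)
    ///
    /// ```rust
    /// use embeddenator_vsa::{SparsityScaling, VsaConfig};
    ///
    /// let fixed = VsaConfig::new(40_000)
    ///     .with_density(0.01)
    ///     .with_scaling(SparsityScaling::Fixed);
    /// let sqrt = VsaConfig::new(40_000)
    ///     .with_density(0.01)
    ///     .with_scaling(SparsityScaling::SquareRoot);
    ///
    /// // Base target is 40_000 * 0.01 / 2 = 200 per polarity;
    /// // SquareRoot scales it by sqrt(40_000 / 10_000) = 2.
    /// assert_eq!(fixed.sparsity(), 200);
    /// assert_eq!(sqrt.sparsity(), 400);
    /// ```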
148 pub fn sparsity(&self) -> usize {
149 let base_target = ((self.dimension as f64 * self.density) / 2.0).round() as usize;
150
151 let scaled_target = match self.scaling_mode {
152 SparsityScaling::Fixed => base_target,
153 SparsityScaling::SquareRoot => {
154 // Scale as sqrt(dim) relative to base 10K dimensions
155 let base_dim = 10_000.0_f64;
156 let scale = (self.dimension as f64 / base_dim).sqrt();
157 (base_target as f64 * scale).round() as usize
158 }
159 SparsityScaling::Logarithmic => {
160 // Scale as log(dim) relative to base 10K dimensions
161 let base_dim = 10_000.0_f64;
162 let scale = (self.dimension as f64).ln() / base_dim.ln();
163 (base_target as f64 * scale).round() as usize
164 }
165 SparsityScaling::Custom { a, b, c } => {
166 // Custom formula: a * dim^b + c
167 (a * (self.dimension as f64).powf(b) + c).round() as usize
168 }
169 };
170
171 scaled_target.clamp(self.min_nonzero, self.max_nonzero)
172 }
173
174 /// Create configuration optimized for a target reconstruction accuracy
175 ///
176 /// This automatically selects dimension and density based on empirical
177 /// relationships between VSA parameters and encoding fidelity.
178 ///
    /// | Target accuracy `a` | Dimension | Density | Typical Use Case |
    /// |---------------------|-----------|---------|------------------|
    /// | 0.85 ≤ a < 0.90 | 5,000 | 2% | Fast prototyping |
    /// | 0.90 ≤ a < 0.95 | 10,000 | 1% | Default balance |
    /// | 0.95 ≤ a < 0.98 | 25,000 | 0.8% | High fidelity |
    /// | 0.98 ≤ a < 0.99 | 50,000 | 0.5% | Very high fidelity |
    /// | 0.99 ≤ a < 0.995 | 100,000 | 0.3% | Production |
185 ///
186 /// # Example
187 ///
188 /// ```rust
189 /// use embeddenator_vsa::VsaConfig;
190 ///
191 /// // Create config targeting 95% reconstruction accuracy
192 /// let config = VsaConfig::for_accuracy(0.95);
193 /// // Accuracy maps to 25K dimensions (between 0.95 and 0.98)
194 /// assert!(config.dimension >= 10_000 && config.dimension <= 50_000);
195 /// ```
196 pub fn for_accuracy(target: f64) -> Self {
197 let target = target.clamp(0.5, 0.9999);
198
199 // Empirical mapping: accuracy → (dimension, density)
200 // Based on information-theoretic capacity of ternary HDV
201 let (dimension, density) = if target < 0.85 {
202 (2_000, 0.03)
203 } else if target < 0.90 {
204 (5_000, 0.02)
205 } else if target < 0.95 {
206 (10_000, 0.01)
207 } else if target < 0.98 {
208 (25_000, 0.008)
209 } else if target < 0.99 {
210 (50_000, 0.005)
211 } else if target < 0.995 {
212 (100_000, 0.003)
213 } else if target < 0.999 {
214 (250_000, 0.002)
215 } else {
216 (500_000, 0.001)
217 };
218
219 Self::new(dimension)
220 .with_density(density)
221 .with_target_accuracy(target)
222 .with_scaling(SparsityScaling::SquareRoot)
223 }
224
225 /// Create configuration from a custom schema/specification
226 ///
227 /// Allows users to provide their own parameter mapping for specialized use cases.
228 ///
229 /// # Example
230 ///
231 /// ```rust
232 /// use embeddenator_vsa::{VsaConfig, VsaConfigSchema, SparsityScaling};
233 ///
234 /// // Custom schema for memory-constrained embedded systems
235 /// let schema = VsaConfigSchema {
236 /// dimension: 4096,
237 /// density: 0.025,
238 /// scaling: Some(SparsityScaling::Logarithmic),
239 /// auto_thin: Some(512),
240 /// min_nnz: Some(8),
241 /// max_nnz: Some(256),
242 /// target_accuracy: None,
243 /// };
244 /// let config = VsaConfig::from_schema(schema);
245 /// assert_eq!(config.dimension, 4096);
246 /// ```
247 pub fn from_schema(schema: VsaConfigSchema) -> Self {
248 let mut config = Self::new(schema.dimension).with_density(schema.density);
249
250 if let Some(scaling) = schema.scaling {
251 config = config.with_scaling(scaling);
252 }
253 if let Some(threshold) = schema.auto_thin {
254 config = config.with_auto_thin(threshold);
255 }
256 if let Some(min) = schema.min_nnz {
257 config = config.with_min_nonzero(min);
258 }
259 if let Some(max) = schema.max_nnz {
260 config = config.with_max_nonzero(max);
261 }
262 if let Some(accuracy) = schema.target_accuracy {
263 config = config.with_target_accuracy(accuracy);
264 }
265
266 config
267 }
268
269 /// Configuration for small vectors (1,000 dimensions)
270 pub fn small() -> Self {
271 Self::new(1_000).with_density(0.02)
272 }
273
274 /// Configuration for medium vectors (10,000 dimensions - default)
275 pub fn medium() -> Self {
276 Self::default()
277 }
278
279 /// Configuration for large vectors (100,000 dimensions)
280 pub fn large() -> Self {
281 Self::new(100_000).with_density(0.005)
282 }
283
284 /// Configuration for huge vectors (1,000,000 dimensions)
285 /// Use sparingly - memory intensive
286 pub fn huge() -> Self {
287 Self::new(1_000_000).with_density(0.001)
288 }
289
290 /// Get the effective auto-thin threshold, accounting for dimension
291 pub fn effective_auto_thin(&self) -> usize {
292 if self.auto_thin_threshold == 0 {
293 usize::MAX // Disabled
294 } else {
295 self.auto_thin_threshold
296 }
297 }
298
299 /// Check if a vector exceeds the auto-thin threshold
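    ///
    /// # Example (illustrative, using the default threshold of `DIM / 5`)
    ///
    /// ```rust
    /// use embeddenator_vsa::VsaConfig;
    ///
    /// let cfg = VsaConfig::default();
    /// assert!(cfg.should_thin(cfg.auto_thin_threshold + 1));
    /// assert!(!cfg.should_thin(10));
    /// ```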
300 pub fn should_thin(&self, nnz: usize) -> bool {
301 self.auto_thin_threshold > 0 && nnz > self.auto_thin_threshold
302 }
303}
304
305/// User-provided schema for custom VSA configuration
306///
307/// This allows users to specify their own parameters for specialized use cases
308/// without needing to understand the internal scaling formulas.
309///
310/// # Example
311///
312/// ```rust
313/// use embeddenator_vsa::{VsaConfig, VsaConfigSchema, SparsityScaling};
314///
315/// // Custom schema for a specific application
316/// let schema = VsaConfigSchema {
317/// dimension: 8192,
318/// density: 0.015,
319/// scaling: Some(SparsityScaling::SquareRoot),
320/// auto_thin: Some(1000),
321/// min_nnz: Some(16),
322/// max_nnz: Some(512),
323/// target_accuracy: Some(0.97),
324/// };
325///
326/// let config = VsaConfig::from_schema(schema);
327/// assert_eq!(config.dimension, 8192);
328/// ```
329#[derive(Clone, Debug, Serialize, Deserialize)]
330pub struct VsaConfigSchema {
331 /// Required: Number of dimensions
332 pub dimension: usize,
333 /// Required: Target density (0.0-1.0)
334 pub density: f64,
335 /// Optional: Sparsity scaling mode
336 #[serde(default)]
337 pub scaling: Option<SparsityScaling>,
338 /// Optional: Auto-thinning threshold (0 to disable)
339 #[serde(default)]
340 pub auto_thin: Option<usize>,
341 /// Optional: Minimum non-zero elements
342 #[serde(default)]
343 pub min_nnz: Option<usize>,
344 /// Optional: Maximum non-zero elements
345 #[serde(default)]
346 pub max_nnz: Option<usize>,
347 /// Optional: Target reconstruction accuracy
348 #[serde(default)]
349 pub target_accuracy: Option<f64>,
350}
351
352impl VsaConfigSchema {
353 /// Create a minimal schema with just dimension and density
354 pub fn new(dimension: usize, density: f64) -> Self {
355 Self {
356 dimension,
357 density,
358 scaling: None,
359 auto_thin: None,
360 min_nnz: None,
361 max_nnz: None,
362 target_accuracy: None,
363 }
364 }
365
366 /// Set scaling mode
367 pub fn with_scaling(mut self, mode: SparsityScaling) -> Self {
368 self.scaling = Some(mode);
369 self
370 }
371
372 /// Set auto-thin threshold
373 pub fn with_auto_thin(mut self, threshold: usize) -> Self {
374 self.auto_thin = Some(threshold);
375 self
376 }
377
378 /// Parse from JSON string
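    ///
    /// # Example (illustrative JSON payload; optional fields may be omitted)
    ///
    /// ```rust
    /// use embeddenator_vsa::{VsaConfig, VsaConfigSchema};
    ///
    /// let json = r#"{ "dimension": 8192, "density": 0.015, "scaling": "SquareRoot" }"#;
    /// let schema = VsaConfigSchema::from_json(json).expect("valid schema JSON");
    /// let config = VsaConfig::from_schema(schema);
    /// assert_eq!(config.dimension, 8192);
    /// ```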
379 pub fn from_json(json: &str) -> Result<Self, serde_json::Error> {
380 serde_json::from_str(json)
381 }
382
383 /// Serialize to JSON string
384 pub fn to_json(&self) -> Result<String, serde_json::Error> {
385 serde_json::to_string_pretty(self)
386 }
387}
388
389thread_local! {
390 // Reused packed buffers for hot paths. Using TLS keeps this allocation
391 // amortized while remaining safe in multi-threaded contexts.
392 static PACKED_SCRATCH_A: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
393 static PACKED_SCRATCH_B: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
394 static PACKED_SCRATCH_OUT: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
395}
396
397/// Configuration for reversible VSA encoding/decoding operations
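///
/// # Example (illustrative; shows the default and the small-block preset)
///
/// ```rust
/// use embeddenator_vsa::ReversibleVSAConfig;
///
/// let cfg = ReversibleVSAConfig::default();
/// assert_eq!(cfg.block_size, 256);
///
/// let small = ReversibleVSAConfig::small_blocks();
/// assert_eq!(small.block_size, 64);
/// ```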
398#[derive(Clone, Debug, Serialize, Deserialize)]
399pub struct ReversibleVSAConfig {
400 /// Block size for chunked encoding (must be power of 2 for efficiency)
401 pub block_size: usize,
402 /// Maximum path depth for hierarchical encoding
403 pub max_path_depth: usize,
404 /// Base permutation shift for path-based encoding
405 pub base_shift: usize,
406 /// Target sparsity level for operations (number of non-zero elements)
407 pub target_sparsity: usize,
408}
409
410impl Default for ReversibleVSAConfig {
411 fn default() -> Self {
412 ReversibleVSAConfig {
413 block_size: 256, // 256-byte blocks
414 max_path_depth: 10,
415 base_shift: 1000,
416 target_sparsity: 200, // Default sparsity level
417 }
418 }
419}
420
421impl ReversibleVSAConfig {
422 /// Create config optimized for small data blocks
423 pub fn small_blocks() -> Self {
424 ReversibleVSAConfig {
425 block_size: 64,
426 max_path_depth: 5,
427 base_shift: 500,
428 target_sparsity: 100,
429 }
430 }
431
432 /// Create config optimized for large data blocks
433 pub fn large_blocks() -> Self {
434 ReversibleVSAConfig {
435 block_size: 1024,
436 max_path_depth: 20,
437 base_shift: 2000,
438 target_sparsity: 400,
439 }
440 }
441}
442
443/// Sparse ternary vector with positive and negative indices
444#[derive(Clone, Debug, Serialize, Deserialize)]
445pub struct SparseVec {
446 /// Indices with +1 value
447 pub pos: Vec<usize>,
448 /// Indices with -1 value
449 pub neg: Vec<usize>,
450}
451
452impl Default for SparseVec {
453 fn default() -> Self {
454 Self::new()
455 }
456}
457
458impl SparseVec {
459 #[inline]
460 fn nnz(&self) -> usize {
461 self.pos.len() + self.neg.len()
462 }
463
464 fn intersection_count_sorted(a: &[usize], b: &[usize]) -> usize {
465 let mut i = 0usize;
466 let mut j = 0usize;
467 let mut count = 0usize;
468 while i < a.len() && j < b.len() {
469 match a[i].cmp(&b[j]) {
470 std::cmp::Ordering::Less => i += 1,
471 std::cmp::Ordering::Greater => j += 1,
472 std::cmp::Ordering::Equal => {
473 count += 1;
474 i += 1;
475 j += 1;
476 }
477 }
478 }
479 count
480 }
481
482 fn union_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
483 let mut out = Vec::with_capacity(a.len() + b.len());
484 let mut i = 0usize;
485 let mut j = 0usize;
486
487 while i < a.len() && j < b.len() {
488 match a[i].cmp(&b[j]) {
489 std::cmp::Ordering::Less => {
490 out.push(a[i]);
491 i += 1;
492 }
493 std::cmp::Ordering::Greater => {
494 out.push(b[j]);
495 j += 1;
496 }
497 std::cmp::Ordering::Equal => {
498 out.push(a[i]);
499 i += 1;
500 j += 1;
501 }
502 }
503 }
504
505 if i < a.len() {
506 out.extend_from_slice(&a[i..]);
507 }
508 if j < b.len() {
509 out.extend_from_slice(&b[j..]);
510 }
511
512 out
513 }
514
515 fn difference_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
516 let mut out = Vec::with_capacity(a.len());
517 let mut i = 0usize;
518 let mut j = 0usize;
519
520 while i < a.len() && j < b.len() {
521 match a[i].cmp(&b[j]) {
522 std::cmp::Ordering::Less => {
523 out.push(a[i]);
524 i += 1;
525 }
526 std::cmp::Ordering::Greater => j += 1,
527 std::cmp::Ordering::Equal => {
528 i += 1;
529 j += 1;
530 }
531 }
532 }
533
534 if i < a.len() {
535 out.extend_from_slice(&a[i..]);
536 }
537
538 out
539 }
540
541 fn intersection_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
542 let mut out = Vec::with_capacity(a.len().min(b.len()));
543 let mut i = 0usize;
544 let mut j = 0usize;
545
546 while i < a.len() && j < b.len() {
547 match a[i].cmp(&b[j]) {
548 std::cmp::Ordering::Less => i += 1,
549 std::cmp::Ordering::Greater => j += 1,
550 std::cmp::Ordering::Equal => {
551 out.push(a[i]);
552 i += 1;
553 j += 1;
554 }
555 }
556 }
557
558 out
559 }
560 /// Create an empty sparse vector
561 ///
562 /// # Examples
563 ///
564 /// ```
565 /// use embeddenator_vsa::SparseVec;
566 ///
567 /// let vec = SparseVec::new();
568 /// assert!(vec.pos.is_empty());
569 /// assert!(vec.neg.is_empty());
570 /// ```
571 pub fn new() -> Self {
572 SparseVec {
573 pos: Vec::new(),
574 neg: Vec::new(),
575 }
576 }
577
    /// Generate a random sparse vector with ~1% of dimensions set per polarity (~2% total non-zeros)
579 ///
580 /// # Examples
581 ///
582 /// ```
583 /// use embeddenator_vsa::SparseVec;
584 ///
585 /// let vec = SparseVec::random();
    /// // With DIM = 10_000 this yields 100 positive and 100 negative indices (~2% total non-zeros)
587 /// assert!(vec.pos.len() > 0);
588 /// assert!(vec.neg.len() > 0);
589 /// ```
590 pub fn random() -> Self {
591 let mut rng = rand::rng();
        let sparsity = DIM / 100; // 1% of dimensions per polarity (~2% total non-zeros)
593
594 // Generate random indices without replacement
595 let mut indices: Vec<usize> = (0..DIM).collect();
596 indices.shuffle(&mut rng);
597
598 let mut pos: Vec<_> = indices[..sparsity].to_vec();
599 let mut neg: Vec<_> = indices[sparsity..sparsity * 2].to_vec();
600
601 pos.sort_unstable();
602 neg.sort_unstable();
603
604 SparseVec { pos, neg }
605 }
606
607 /// Generate a random sparse vector with configurable dimensions and density
608 ///
609 /// This method allows creating vectors with custom dimensions, useful for
610 /// benchmarking with different data types or when dimensions need to be
611 /// determined at runtime.
612 ///
613 /// # Arguments
614 /// * `config` - Configuration specifying dimension and density
615 ///
616 /// # Examples
617 ///
618 /// ```rust
619 /// use embeddenator_vsa::{SparseVec, VsaConfig};
620 ///
621 /// // Create a small vector (1000 dims, 2% density)
622 /// let config = VsaConfig::small();
623 /// let vec = SparseVec::random_with_config(&config);
624 /// assert!(vec.pos.len() > 0);
625 ///
626 /// // Create a custom-sized vector
627 /// let custom = VsaConfig::new(5000).with_density(0.02);
628 /// let vec = SparseVec::random_with_config(&custom);
629 /// ```
630 pub fn random_with_config(config: &VsaConfig) -> Self {
631 let mut rng = rand::rng();
632 let sparsity = config.sparsity();
633
634 // Generate random indices without replacement
635 let mut indices: Vec<usize> = (0..config.dimension).collect();
636 indices.shuffle(&mut rng);
637
638 let mut pos: Vec<_> = indices[..sparsity].to_vec();
639 let mut neg: Vec<_> = indices[sparsity..sparsity * 2].to_vec();
640
641 pos.sort_unstable();
642 neg.sort_unstable();
643
644 SparseVec { pos, neg }
645 }
646
647 /// Encode data into a reversible sparse vector using block-based mapping
648 ///
649 /// This method implements hierarchical encoding with path-based permutations
650 /// for lossless data recovery. The encoding process:
651 /// 1. Splits data into blocks of configurable size
652 /// 2. Applies path-based permutations to each block
653 /// 3. Combines blocks using hierarchical bundling
654 ///
655 /// # Arguments
656 /// * `data` - The data to encode
657 /// * `config` - Configuration for encoding parameters
658 /// * `path` - Optional path string for hierarchical encoding (affects permutation)
659 ///
660 /// # Returns
661 /// A SparseVec that can be decoded back to the original data
662 ///
663 /// # Examples
664 ///
665 /// ```
666 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
667 ///
668 /// let data = b"hello world";
669 /// let config = ReversibleVSAConfig::default();
670 /// let encoded = SparseVec::encode_data(data, &config, None);
671 ///
672 /// // encoded vector contains reversible representation of the data
673 /// assert!(!encoded.pos.is_empty() || !encoded.neg.is_empty());
674 /// ```
675 pub fn encode_data(data: &[u8], config: &ReversibleVSAConfig, path: Option<&str>) -> Self {
676 if data.is_empty() {
677 return SparseVec::new();
678 }
679
680 // Calculate path-based shift for hierarchical encoding
681 let path_shift = if let Some(path_str) = path {
682 // Use path hash to determine shift, constrained to prevent overflow
683 let mut hasher = Sha256::new();
684 hasher.update(path_str.as_bytes());
685 let hash = hasher.finalize();
686 // SAFETY: SHA256 always produces 32 bytes, first 4 bytes are always valid
687 let hash_bytes: [u8; 4] = hash[0..4]
688 .try_into()
689 .expect("SHA256 hash must produce at least 4 bytes");
690 let path_hash = u32::from_le_bytes(hash_bytes) as usize;
691 (path_hash % config.max_path_depth) * config.base_shift
692 } else {
693 0
694 };
695
696 // Split data into blocks
697 let mut blocks = Vec::new();
698 for chunk in data.chunks(config.block_size) {
699 blocks.push(chunk);
700 }
701
702 // Encode each block with position-based permutation
703 let mut encoded_blocks = Vec::new();
704 for (i, block) in blocks.iter().enumerate() {
705 let block_shift = path_shift + (i * config.base_shift / blocks.len().max(1));
706 let block_vec = Self::encode_block(block, block_shift);
707 encoded_blocks.push(block_vec);
708 }
709
710 // Combine blocks hierarchically
711 if encoded_blocks.is_empty() {
712 SparseVec::new()
713 } else if encoded_blocks.len() == 1 {
714 // SAFETY: we just checked len() == 1, so next() must return Some
715 encoded_blocks
716 .into_iter()
717 .next()
718 .expect("encoded_blocks must have exactly one element")
719 } else {
            // Combine blocks via hierarchical_bundle (currently a sequential fold)
721 Self::hierarchical_bundle(&encoded_blocks)
722 }
723 }
724
725 /// Decode data from a reversible sparse vector
726 ///
727 /// Reverses the encoding process to recover the original data.
728 /// Requires the same configuration and path used during encoding.
729 ///
730 /// # Arguments
731 /// * `config` - Same configuration used for encoding
732 /// * `path` - Same path string used for encoding
733 /// * `expected_size` - Expected size of the decoded data (for validation)
734 ///
735 /// # Returns
736 /// The original data bytes (may need correction layer for 100% fidelity)
737 ///
738 /// # Examples
739 ///
740 /// ```no_run
741 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
742 ///
743 /// let data = b"hello world";
744 /// let config = ReversibleVSAConfig::default();
745 /// let encoded = SparseVec::encode_data(data, &config, None);
746 /// let decoded = encoded.decode_data(&config, None, data.len());
747 ///
748 /// // Note: For 100% fidelity, use CorrectionStore with EmbrFS
749 /// // Raw decode may have minor differences that corrections compensate for
750 /// ```
751 pub fn decode_data(
752 &self,
753 config: &ReversibleVSAConfig,
754 path: Option<&str>,
755 expected_size: usize,
756 ) -> Vec<u8> {
757 if self.pos.is_empty() && self.neg.is_empty() {
758 return Vec::new();
759 }
760
761 if expected_size == 0 {
762 return Vec::new();
763 }
764
765 // Calculate path-based shift (same as encoding)
766 let path_shift = if let Some(path_str) = path {
767 let mut hasher = Sha256::new();
768 hasher.update(path_str.as_bytes());
769 let hash = hasher.finalize();
770 // SAFETY: SHA256 always produces 32 bytes, first 4 bytes are always valid
771 let hash_bytes: [u8; 4] = hash[0..4]
772 .try_into()
773 .expect("SHA256 hash must produce at least 4 bytes");
774 let path_hash = u32::from_le_bytes(hash_bytes) as usize;
775 (path_hash % config.max_path_depth) * config.base_shift
776 } else {
777 0
778 };
779
780 // Estimate number of blocks based on expected size
781 let estimated_blocks = expected_size.div_ceil(config.block_size);
782
783 // For single block case
784 if estimated_blocks <= 1 {
785 return Self::decode_block(self, path_shift, expected_size);
786 }
787
788 // =======================================================================
789 // MULTI-BLOCK DECODING LIMITATIONS (v0.21.0)
790 // =======================================================================
791 //
792 // Current Status: EXPERIMENTAL / SIMPLIFIED IMPLEMENTATION
793 //
794 // The multi-block decoding uses a simplified linear approach without proper
795 // hierarchical factorization. This has the following implications:
796 //
797 // 1. **Information Loss**: When multiple blocks are bundled hierarchically,
798 // separating them requires VSA factorization techniques (e.g., HRR unbinding,
799 // MAP estimation) that are not yet implemented.
800 //
801 // 2. **Use Case**: Works reliably for:
802 // - Single-block data (data.len() <= config.block_size)
803 // - Data with clear block boundaries
804 //
805 // 3. **Known Limitation**: Multi-block data (> block_size) may have reduced
806 // fidelity. For 100% accuracy, use the Codebook's residual mechanism or
807 // external correction stores.
808 //
809 // 4. **Future Work**: Full hierarchical decoding is planned for v1.1.0.
810 // See IMPLEMENTATION_PLAN.md Task 4.2 for details.
811 //
812 // =======================================================================
813
814 // For multiple blocks, we need to factorize the hierarchical bundle
815 // This is a simplified approach - in practice, we'd need more sophisticated
816 // factorization to separate the blocks
817 let mut result = Vec::new();
818
819 // Simplified best-effort linear decoding for multi-block data.
820 // WARNING: This path trades accuracy for speed and may:
821 // - Lose information across blocks, and/or
822 // - Mis-assign tokens between adjacent blocks.
823 // For applications that require 100% fidelity, prefer the Codebook's
824 // residual mechanism or external correction stores instead of relying
825 // solely on this multi-block decoder.
826 for i in 0..estimated_blocks {
827 let block_shift = path_shift + (i * config.base_shift / estimated_blocks.max(1));
828 let remaining = expected_size.saturating_sub(result.len());
829 if remaining == 0 {
830 break;
831 }
832 let max_len = remaining.min(config.block_size);
833 let block_data = Self::decode_block(self, block_shift, max_len);
834 if block_data.is_empty() {
835 break;
836 }
837 result.extend(block_data);
838 if result.len() >= expected_size {
839 break;
840 }
841 }
842
843 // Truncate to expected size
844 result.truncate(expected_size);
845 result
846 }
847
848 /// Encode a single block of data with position-based permutation
849 fn encode_block(data: &[u8], shift: usize) -> SparseVec {
850 if data.is_empty() {
851 return SparseVec::new();
852 }
853
854 // Map data bytes to vector indices using the permuted mapping
855 let mut pos = Vec::new();
856 let mut neg = Vec::new();
857
858 for (i, &byte) in data.iter().enumerate() {
859 let base_idx = (i + shift) % DIM;
860
861 // Use byte value to determine polarity and offset
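            // Worked example (illustrative): with shift = 0, byte 0x68 ('h') at position 0
            // has the high bit clear and maps to pos index (0 + 0x68) % DIM = 104, while
            // byte 0xE8 at position 1 has the high bit set and maps to
            // neg index (1 + (0xE8 & 0x7F)) % DIM = 105.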
862 if byte & 0x80 != 0 {
863 // High bit set -> negative
864 neg.push((base_idx + (byte & 0x7F) as usize) % DIM);
865 } else {
866 // High bit clear -> positive
867 pos.push((base_idx + byte as usize) % DIM);
868 }
869 }
870
871 pos.sort_unstable();
872 pos.dedup();
873 neg.sort_unstable();
874 neg.dedup();
875
876 // CRITICAL FIX: Ensure pos and neg are disjoint
877 // If an index appears in both pos and neg, it means we have conflicting
878 // encodings. Remove from both to treat as 0 (cancel out).
879 let overlap = Self::intersection_sorted(&pos, &neg);
880 if !overlap.is_empty() {
881 pos = Self::difference_sorted(&pos, &overlap);
882 neg = Self::difference_sorted(&neg, &overlap);
883 }
884
885 SparseVec { pos, neg }
886 }
887
888 /// Decode a single block of data
889 fn decode_block(encoded: &SparseVec, shift: usize, max_len: usize) -> Vec<u8> {
890 if max_len == 0 {
891 return Vec::new();
892 }
893
894 let mut result = Vec::with_capacity(max_len);
895
896 // Reconstruct data by reversing the permutation.
897 // Note: `pos` and `neg` are kept sorted, so membership can be checked via binary search.
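        // Worked example (illustrative): continuing the encode_block example above, position 0
        // scans offsets 0..128 from base index 0 and first hits pos index 104, recovering
        // byte 0x68; a hit in `neg` would instead set the high bit (offset | 0x80).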
898 for i in 0..max_len {
899 let base_idx = (i + shift) % DIM;
900
901 // Look for indices that map back to this position
902 let mut found_byte = None;
903 for offset in 0..128u8 {
904 let test_idx = (base_idx + offset as usize) % DIM;
905
906 if encoded.pos.binary_search(&test_idx).is_ok() {
907 found_byte = Some(offset);
908 break;
909 } else if encoded.neg.binary_search(&test_idx).is_ok() {
910 found_byte = Some(offset | 0x80);
911 break;
912 }
913 }
914
915 if let Some(byte) = found_byte {
916 result.push(byte);
917 } else {
918 // No more data found
919 break;
920 }
921 }
922
923 result
924 }
925
926 /// Combine multiple vectors using hierarchical bundling
927 fn hierarchical_bundle(vectors: &[SparseVec]) -> SparseVec {
928 if vectors.is_empty() {
929 return SparseVec::new();
930 }
931 if vectors.len() == 1 {
932 return vectors[0].clone();
933 }
934
        // Sequential left fold over the remaining vectors (not a balanced binary-tree reduction)
936 let mut result = vectors[0].clone();
937 for vec in &vectors[1..] {
938 result = result.bundle(vec);
939 }
940 result
941 }
942
943 /// Generate a deterministic sparse vector from data using SHA256 seed
944 /// DEPRECATED: Use encode_data() for new code
945 ///
946 /// # Examples
947 ///
948 /// ```
949 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
950 ///
951 /// let data = b"hello world";
952 /// let config = ReversibleVSAConfig::default();
953 /// let vec1 = SparseVec::encode_data(data, &config, None);
954 /// let vec2 = SparseVec::encode_data(data, &config, None);
955 ///
956 /// // Same input produces same vector (deterministic)
957 /// assert_eq!(vec1.pos, vec2.pos);
958 /// assert_eq!(vec1.neg, vec2.neg);
959 /// ```
960 #[deprecated(since = "0.2.0", note = "Use encode_data() for reversible encoding")]
961 pub fn from_data(data: &[u8]) -> Self {
962 let mut hasher = Sha256::new();
963 hasher.update(data);
964 let hash = hasher.finalize();
965
966 // SAFETY: SHA256 always produces exactly 32 bytes
967 let seed: [u8; 32] = hash[..32]
968 .try_into()
969 .expect("SHA256 must produce exactly 32 bytes");
970 let mut rng = rand::rngs::StdRng::from_seed(seed);
971
972 let mut indices: Vec<usize> = (0..DIM).collect();
973 indices.shuffle(&mut rng);
974
975 let sparsity = DIM / 100;
976 let mut pos = indices[..sparsity].to_vec();
977 let mut neg = indices[sparsity..sparsity * 2].to_vec();
978
979 pos.sort_unstable();
980 neg.sort_unstable();
981
982 SparseVec { pos, neg }
983 }
984
985 /// Bundle operation: pairwise conflict-cancel superposition (A ⊕ B)
986 ///
987 /// This is a fast, commutative merge for two vectors:
988 /// - same sign => keep
989 /// - opposite signs => cancel to 0
990 /// - sign vs 0 => keep sign
991 ///
992 /// Note: While this is well-defined for two vectors, repeated application across 3+
993 /// vectors is generally **not associative** because early cancellation/thresholding can
994 /// discard multiplicity information.
995 ///
996 /// # Arguments
997 /// * `other` - The vector to bundle with self
998 /// * `config` - Optional ReversibleVSAConfig for controlling sparsity via thinning
999 ///
1000 /// # Examples
1001 ///
1002 /// ```
1003 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
1004 ///
1005 /// let config = ReversibleVSAConfig::default();
1006 /// let vec1 = SparseVec::encode_data(b"data1", &config, None);
1007 /// let vec2 = SparseVec::encode_data(b"data2", &config, None);
1008 /// let bundled = vec1.bundle_with_config(&vec2, Some(&config));
1009 ///
1010 /// // Bundled vector contains superposition of both inputs
1011 /// // Should be similar to both original vectors
1012 /// let sim1 = vec1.cosine(&bundled);
1013 /// let sim2 = vec2.cosine(&bundled);
1014 /// assert!(sim1 > 0.3);
1015 /// assert!(sim2 > 0.3);
1016 /// ```
1017 pub fn bundle_with_config(
1018 &self,
1019 other: &SparseVec,
1020 config: Option<&ReversibleVSAConfig>,
1021 ) -> SparseVec {
1022 let mut result = self.bundle(other);
1023
1024 // Apply thinning if config provided and result exceeds target sparsity
1025 if let Some(cfg) = config {
1026 let current_count = result.pos.len() + result.neg.len();
1027 if current_count > cfg.target_sparsity {
1028 result = result.thin(cfg.target_sparsity);
1029 }
1030 }
1031
1032 result
1033 }
1034
    /// Bundle operation: pairwise conflict-cancel superposition (A ⊕ B)
    ///
    /// Commutative merge of two vectors: matching signs are kept, opposite signs cancel to 0,
    /// and a sign paired with 0 is kept. See `bundle_with_config()` to optionally apply
    /// thinning via `ReversibleVSAConfig` after bundling.
1039 ///
1040 /// # Examples
1041 ///
1042 /// ```
1043 /// use embeddenator_vsa::SparseVec;
1044 ///
1045 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
1046 /// let vec1 = SparseVec::encode_data(b"data1", &config, None);
1047 /// let vec2 = SparseVec::encode_data(b"data2", &config, None);
1048 /// let bundled = vec1.bundle(&vec2);
1049 ///
1050 /// // Bundled vector contains superposition of both inputs
1051 /// // Should be similar to both original vectors
1052 /// let sim1 = vec1.cosine(&bundled);
1053 /// let sim2 = vec2.cosine(&bundled);
1054 /// assert!(sim1 > 0.3);
1055 /// assert!(sim2 > 0.3);
1056 /// ```
1057 pub fn bundle(&self, other: &SparseVec) -> SparseVec {
1058 // Optional ternary-native fast path (migration gate).
1059 // This is primarily intended for cases where vectors become dense enough
1060 // that packed word-wise operations are competitive.
1061 {
1062 // Converting SparseVec -> PackedTritVec is O(DIM) for allocation/zeroing, plus O(nnz)
1063 // for setting lanes. Only attempt the packed path when total density is high enough,
1064 // and avoid converting an extremely sparse operand.
1065 // Thresholds lowered from DIM/4 to DIM/8 for earlier SIMD dispatch now that
1066 // PackedTritVec has proper bitsliced operations.
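            // With the default DIM = 10_000 the gates below are: combined nnz > 1_250 and the
            // sparser operand's nnz > 156 before the packed path is taken.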
1067 let a_nnz = self.nnz();
1068 let b_nnz = other.nnz();
1069 let total = a_nnz + b_nnz;
1070 if total > DIM / 8 {
1071 let min_nnz = a_nnz.min(b_nnz);
1072 if min_nnz > DIM / 64 {
1073 return PACKED_SCRATCH_A.with(|a_cell| {
1074 PACKED_SCRATCH_B.with(|b_cell| {
1075 PACKED_SCRATCH_OUT.with(|out_cell| {
1076 let mut a = a_cell.borrow_mut();
1077 let mut b = b_cell.borrow_mut();
1078 let mut out = out_cell.borrow_mut();
1079
1080 a.fill_from_sparsevec(self, DIM);
1081 b.fill_from_sparsevec(other, DIM);
1082 a.bundle_into(&b, &mut out);
1083 out.to_sparsevec()
1084 })
1085 })
1086 });
1087 }
1088 }
1089 }
1090
1091 // Majority voting for two sparse ternary vectors:
1092 // - Same sign => keep
1093 // - Opposite signs => cancel to 0
1094 // - Sign vs 0 => keep sign
1095 // This can be expressed via sorted set differences/unions.
1096 let pos_a = Self::difference_sorted(&self.pos, &other.neg);
1097 let pos_b = Self::difference_sorted(&other.pos, &self.neg);
1098 let neg_a = Self::difference_sorted(&self.neg, &other.pos);
1099 let neg_b = Self::difference_sorted(&other.neg, &self.pos);
1100
1101 let mut pos = Self::union_sorted(&pos_a, &pos_b);
1102 let mut neg = Self::union_sorted(&neg_a, &neg_b);
1103
        // Defensive check: ensure pos and neg remain disjoint.
        // The set constructions above guarantee this for well-formed inputs, but it can be
        // violated if an input vector's own pos/neg sets overlap.
1107 let overlap = Self::intersection_sorted(&pos, &neg);
1108 if !overlap.is_empty() {
1109 pos = Self::difference_sorted(&pos, &overlap);
1110 neg = Self::difference_sorted(&neg, &overlap);
1111 }
1112
1113 SparseVec { pos, neg }
1114 }
1115
1116 /// Associative bundle over many vectors: sums contributions per index, then thresholds to sign.
1117 /// This is order-independent because all contributions are accumulated before applying sign.
1118 /// Complexity: O(K log K) where K is total non-zero entries across inputs.
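    ///
    /// # Example (illustrative; constructs vectors directly via the public `pos`/`neg` fields)
    ///
    /// ```rust
    /// use embeddenator_vsa::SparseVec;
    ///
    /// let a = SparseVec { pos: vec![1, 2], neg: vec![7] };
    /// let b = SparseVec { pos: vec![2, 3], neg: vec![1] };
    /// let c = SparseVec { pos: vec![2], neg: vec![3] };
    ///
    /// // Index 2 sums to +3, indices 1 and 3 cancel to 0, index 7 sums to -1.
    /// let bundled = SparseVec::bundle_sum_many([&a, &b, &c]);
    /// assert_eq!(bundled.pos, vec![2]);
    /// assert_eq!(bundled.neg, vec![7]);
    /// ```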
1119 pub fn bundle_sum_many<'a, I>(vectors: I) -> SparseVec
1120 where
1121 I: IntoIterator<Item = &'a SparseVec>,
1122 {
1123 let mut contributions: Vec<(usize, i32)> = Vec::new();
1124
1125 for vec in vectors {
1126 contributions.extend(vec.pos.iter().map(|&idx| (idx, 1i32)));
1127 contributions.extend(vec.neg.iter().map(|&idx| (idx, -1i32)));
1128 }
1129
1130 if contributions.is_empty() {
1131 return SparseVec {
1132 pos: Vec::new(),
1133 neg: Vec::new(),
1134 };
1135 }
1136
1137 contributions.sort_unstable_by_key(|(idx, _)| *idx);
1138
1139 let mut pos = Vec::new();
1140 let mut neg = Vec::new();
1141
1142 let mut iter = contributions.into_iter();
1143 // SAFETY: we checked contributions.is_empty() above and returned early if empty
1144 let (mut current_idx, mut acc) = iter
1145 .next()
1146 .expect("contributions must be non-empty after early return check");
1147
1148 for (idx, value) in iter {
1149 if idx == current_idx {
1150 acc += value;
1151 } else {
1152 if acc > 0 {
1153 pos.push(current_idx);
1154 } else if acc < 0 {
1155 neg.push(current_idx);
1156 }
1157 current_idx = idx;
1158 acc = value;
1159 }
1160 }
1161
1162 if acc > 0 {
1163 pos.push(current_idx);
1164 } else if acc < 0 {
1165 neg.push(current_idx);
1166 }
1167
1168 SparseVec { pos, neg }
1169 }
1170
1171 /// Hybrid bundle: choose a fast pairwise fold for very sparse regimes (to preserve sparsity),
1172 /// otherwise use the associative sum-then-threshold path (order-independent, more faithful to majority).
1173 ///
1174 /// Heuristic: estimate expected overlap/collision count assuming uniform hashing into `DIM`.
1175 /// If expected colliding dimensions is below a small budget, use pairwise `bundle`; else use
1176 /// `bundle_sum_many`.
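    ///
    /// Worked estimate (illustrative): with `DIM = 10_000` and ~400 total non-zeros across the
    /// inputs, λ = 400 / 10_000 = 0.04, so P(K ≥ 2) ≈ 1 - e^(-0.04) * 1.04 ≈ 7.8e-4, i.e. roughly
    /// 8 expected colliding dimensions. That is under the budget of 32, so the pairwise fold is used.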
1177 pub fn bundle_hybrid_many<'a, I>(vectors: I) -> SparseVec
1178 where
1179 I: IntoIterator<Item = &'a SparseVec>,
1180 {
1181 let collected: Vec<&'a SparseVec> = vectors.into_iter().collect();
1182 if collected.is_empty() {
1183 return SparseVec {
1184 pos: Vec::new(),
1185 neg: Vec::new(),
1186 };
1187 }
1188
1189 if collected.len() == 1 {
1190 return collected[0].clone();
1191 }
1192
1193 if collected.len() == 2 {
1194 return collected[0].bundle(collected[1]);
1195 }
1196
1197 let total_nnz: usize = collected.iter().map(|v| v.pos.len() + v.neg.len()).sum();
1198
1199 if total_nnz == 0 {
1200 return SparseVec {
1201 pos: Vec::new(),
1202 neg: Vec::new(),
1203 };
1204 }
1205
1206 // Constant-time overlap/collision risk estimate.
1207 // Model: each non-zero lands uniformly in DIM dimensions.
1208 // Let λ = total_nnz / DIM be expected hits per dimension.
1209 // Then P(K>=2) ≈ 1 - e^{-λ}(1+λ) for Poisson(λ).
1210 // If expected number of colliding dimensions is tiny, pairwise fold is effectively safe
1211 // (and faster). Otherwise, use associative accumulation to avoid order sensitivity.
1212 let lambda = total_nnz as f64 / DIM as f64;
1213 let p_ge_2 = 1.0 - (-lambda).exp() * (1.0 + lambda);
1214 let expected_colliding_dims = p_ge_2 * DIM as f64;
1215
1216 // Budget: allow a small number of potentially order-sensitive dimensions.
1217 // Tune via benchmarks; this is conservative for integrity.
1218 let collision_budget_dims = 32.0;
1219
1220 if expected_colliding_dims <= collision_budget_dims {
1221 let mut iter = collected.into_iter();
            // SAFETY: `collected` was checked to be non-empty above, so next() returns Some.
            let mut acc = iter
                .next()
                .expect("collected must be non-empty after the early-return check")
                .clone();
1227 for v in iter {
1228 acc = acc.bundle(v);
1229 }
1230 return acc;
1231 }
1232
1233 SparseVec::bundle_sum_many(collected)
1234 }
1235
    /// Bind operation: element-wise composition (A ⊙ B)
    /// Performs element-wise multiplication of the ternary entries; commutative and
    /// self-inverse (A ⊙ A yields +1 on A's support, i.e. the identity restricted to
    /// that support).
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator_vsa::SparseVec;
    ///
    /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
    /// let vec = SparseVec::encode_data(b"test", &config, None);
    /// let bound = vec.bind(&vec);
    ///
    /// // Binding a vector with itself yields a vector highly similar to the original
    /// // (identical here, since this encoding produces only positive entries).
    /// let sim = bound.cosine(&vec);
    /// assert!(sim > 0.9);
1253 /// ```
1254 pub fn bind(&self, other: &SparseVec) -> SparseVec {
1255 {
1256 // Packed bind is only worthwhile when both operands are dense enough.
1257 // Using a short-circuiting check avoids paying extra overhead for sparse workloads.
1258 // Thresholds lowered from DIM/4 to DIM/8 for earlier SIMD dispatch.
1259 let a_nnz = self.nnz();
1260 if a_nnz > DIM / 8 {
1261 let b_nnz = other.nnz();
1262 if b_nnz > DIM / 8 {
1263 return PACKED_SCRATCH_A.with(|a_cell| {
1264 PACKED_SCRATCH_B.with(|b_cell| {
1265 PACKED_SCRATCH_OUT.with(|out_cell| {
1266 let mut a = a_cell.borrow_mut();
1267 let mut b = b_cell.borrow_mut();
1268 let mut out = out_cell.borrow_mut();
1269
1270 a.fill_from_sparsevec(self, DIM);
1271 b.fill_from_sparsevec(other, DIM);
1272 a.bind_into(&b, &mut out);
1273 out.to_sparsevec()
1274 })
1275 })
1276 });
1277 }
1278 }
1279 }
1280
1281 // Sparse bind is element-wise multiplication restricted to non-zero support.
1282 // We can compute this in a single merge-join over the signed supports, keeping
1283 // outputs sorted without any post-sort.
1284 let mut result_pos = Vec::new();
1285 let mut result_neg = Vec::new();
1286
1287 let mut a_pos = 0usize;
1288 let mut a_neg = 0usize;
1289 let mut b_pos = 0usize;
1290 let mut b_neg = 0usize;
1291
1292 loop {
1293 let next_a = match (self.pos.get(a_pos), self.neg.get(a_neg)) {
1294 (Some(&p), Some(&n)) => {
1295 if p < n {
1296 Some((p, 1i8, true))
1297 } else {
1298 Some((n, -1i8, false))
1299 }
1300 }
1301 (Some(&p), None) => Some((p, 1i8, true)),
1302 (None, Some(&n)) => Some((n, -1i8, false)),
1303 (None, None) => None,
1304 };
1305
1306 let next_b = match (other.pos.get(b_pos), other.neg.get(b_neg)) {
1307 (Some(&p), Some(&n)) => {
1308 if p < n {
1309 Some((p, 1i8, true))
1310 } else {
1311 Some((n, -1i8, false))
1312 }
1313 }
1314 (Some(&p), None) => Some((p, 1i8, true)),
1315 (None, Some(&n)) => Some((n, -1i8, false)),
1316 (None, None) => None,
1317 };
1318
1319 let Some((idx_a, sign_a, a_is_pos)) = next_a else {
1320 break;
1321 };
1322 let Some((idx_b, sign_b, b_is_pos)) = next_b else {
1323 break;
1324 };
1325
1326 match idx_a.cmp(&idx_b) {
1327 std::cmp::Ordering::Less => {
1328 if a_is_pos {
1329 a_pos += 1;
1330 } else {
1331 a_neg += 1;
1332 }
1333 }
1334 std::cmp::Ordering::Greater => {
1335 if b_is_pos {
1336 b_pos += 1;
1337 } else {
1338 b_neg += 1;
1339 }
1340 }
1341 std::cmp::Ordering::Equal => {
1342 let prod = sign_a * sign_b;
1343 if prod == 1 {
1344 result_pos.push(idx_a);
1345 } else {
1346 result_neg.push(idx_a);
1347 }
1348
1349 if a_is_pos {
1350 a_pos += 1;
1351 } else {
1352 a_neg += 1;
1353 }
1354 if b_is_pos {
1355 b_pos += 1;
1356 } else {
1357 b_neg += 1;
1358 }
1359 }
1360 }
1361 }
1362
1363 SparseVec {
1364 pos: result_pos,
1365 neg: result_neg,
1366 }
1367 }
1368
1369 /// Calculate cosine similarity between two sparse vectors
1370 /// Returns value in [-1, 1] where 1 is identical, 0 is orthogonal
1371 ///
1372 /// When the `simd` feature is enabled, this will automatically use
1373 /// AVX2 (x86_64) or NEON (aarch64) acceleration if available.
1374 ///
1375 /// # Examples
1376 ///
1377 /// ```
1378 /// use embeddenator_vsa::SparseVec;
1379 ///
1380 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
1381 /// let vec1 = SparseVec::encode_data(b"cat", &config, None);
1382 /// let vec2 = SparseVec::encode_data(b"cat", &config, None);
1383 /// let vec3 = SparseVec::encode_data(b"dog", &config, None);
1384 ///
1385 /// // Identical data produces identical vectors
1386 /// assert!((vec1.cosine(&vec2) - 1.0).abs() < 0.01);
1387 ///
1388 /// // Different data produces low similarity
1389 /// let sim = vec1.cosine(&vec3);
1390 /// assert!(sim < 0.3);
1391 /// ```
1392 pub fn cosine(&self, other: &SparseVec) -> f64 {
1393 #[cfg(feature = "simd")]
1394 {
1395 crate::simd_cosine::cosine_simd(self, other)
1396 }
1397
1398 #[cfg(not(feature = "simd"))]
1399 {
1400 // Original implementation remains as fallback
1401 self.cosine_scalar(other)
1402 }
1403 }
1404
    /// Scalar (non-SIMD) cosine similarity implementation.
    ///
    /// Baseline for the SIMD path and the fallback when the `simd` feature is disabled.
    /// For sufficiently dense operands it still delegates to the packed word-wise dot
    /// product; otherwise it counts signed intersections over the sorted supports.
1409 #[inline]
1410 pub fn cosine_scalar(&self, other: &SparseVec) -> f64 {
1411 {
1412 // Only use packed cosine when total density is high enough to amortize conversion,
1413 // and avoid converting an extremely sparse operand.
1414 // Thresholds lowered from DIM/4 to DIM/8 for earlier SIMD dispatch.
1415 let a_nnz = self.nnz();
1416 let b_nnz = other.nnz();
1417 let total = a_nnz + b_nnz;
1418 if total > DIM / 8 {
1419 let min_nnz = a_nnz.min(b_nnz);
1420 if min_nnz > DIM / 64 {
1421 let dot = PACKED_SCRATCH_A.with(|a_cell| {
1422 PACKED_SCRATCH_B.with(|b_cell| {
1423 let mut a = a_cell.borrow_mut();
1424 let mut b = b_cell.borrow_mut();
1425 a.fill_from_sparsevec(self, DIM);
1426 b.fill_from_sparsevec(other, DIM);
1427 a.dot(&b)
1428 })
1429 }) as f64;
1430 let self_norm = a_nnz as f64;
1431 let other_norm = b_nnz as f64;
1432 if self_norm == 0.0 || other_norm == 0.0 {
1433 return 0.0;
1434 }
1435 return dot / (self_norm.sqrt() * other_norm.sqrt());
1436 }
1437 }
1438 }
1439
1440 // Sparse ternary dot product:
1441 // +1 when signs match, -1 when signs oppose.
1442 let pp = Self::intersection_count_sorted(&self.pos, &other.pos) as i32;
1443 let nn = Self::intersection_count_sorted(&self.neg, &other.neg) as i32;
1444 let pn = Self::intersection_count_sorted(&self.pos, &other.neg) as i32;
1445 let np = Self::intersection_count_sorted(&self.neg, &other.pos) as i32;
1446 let dot = (pp + nn) - (pn + np);
1447
1448 let self_norm = self.nnz() as f64;
1449 let other_norm = other.nnz() as f64;
1450
1451 if self_norm == 0.0 || other_norm == 0.0 {
1452 return 0.0;
1453 }
1454
1455 dot as f64 / (self_norm.sqrt() * other_norm.sqrt())
1456 }
1457
1458 /// Apply cyclic permutation to vector indices
1459 /// Used for encoding sequence order in hierarchical structures
1460 ///
1461 /// # Arguments
1462 /// * `shift` - Number of positions to shift indices cyclically
1463 ///
1464 /// # Examples
1465 ///
1466 /// ```
1467 /// use embeddenator_vsa::SparseVec;
1468 ///
1469 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
1470 /// let vec = SparseVec::encode_data(b"test", &config, None);
1471 /// let permuted = vec.permute(100);
1472 ///
1473 /// // Permuted vector should have different indices but same structure
1474 /// assert_eq!(vec.pos.len(), permuted.pos.len());
1475 /// assert_eq!(vec.neg.len(), permuted.neg.len());
1476 /// ```
1477 pub fn permute(&self, shift: usize) -> SparseVec {
1478 let permute_index = |idx: usize| (idx + shift) % DIM;
1479
1480 let pos: Vec<usize> = self.pos.iter().map(|&idx| permute_index(idx)).collect();
1481 let neg: Vec<usize> = self.neg.iter().map(|&idx| permute_index(idx)).collect();
1482
1483 // Indices must remain sorted for efficient operations
1484 let mut pos = pos;
1485 let mut neg = neg;
1486 pos.sort_unstable();
1487 neg.sort_unstable();
1488
1489 SparseVec { pos, neg }
1490 }
1491
1492 /// Apply inverse cyclic permutation to vector indices
1493 /// Decodes sequence order by reversing the permutation shift
1494 ///
1495 /// # Arguments
1496 /// * `shift` - Number of positions to reverse shift indices cyclically
1497 ///
1498 /// # Examples
1499 ///
1500 /// ```
1501 /// use embeddenator_vsa::SparseVec;
1502 ///
1503 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
1504 /// let vec = SparseVec::encode_data(b"test", &config, None);
1505 /// let permuted = vec.permute(100);
1506 /// let recovered = permuted.inverse_permute(100);
1507 ///
1508 /// // Round-trip should recover original vector
1509 /// assert_eq!(vec.pos, recovered.pos);
1510 /// assert_eq!(vec.neg, recovered.neg);
1511 /// ```
1512 pub fn inverse_permute(&self, shift: usize) -> SparseVec {
1513 let inverse_permute_index = |idx: usize| (idx + DIM - (shift % DIM)) % DIM;
1514
1515 let pos: Vec<usize> = self
1516 .pos
1517 .iter()
1518 .map(|&idx| inverse_permute_index(idx))
1519 .collect();
1520 let neg: Vec<usize> = self
1521 .neg
1522 .iter()
1523 .map(|&idx| inverse_permute_index(idx))
1524 .collect();
1525
1526 // Indices must remain sorted for efficient operations
1527 let mut pos = pos;
1528 let mut neg = neg;
1529 pos.sort_unstable();
1530 neg.sort_unstable();
1531
1532 SparseVec { pos, neg }
1533 }
1534
1535 /// Context-Dependent Thinning Algorithm
1536 ///
1537 /// Thinning controls vector sparsity during bundle operations to prevent
1538 /// exponential density growth that degrades VSA performance. The algorithm:
1539 ///
    /// 1. Compute the current non-zero count: pos.len() + neg.len()
    /// 2. If that count is already <= `target_non_zero`, return the vector unchanged
    /// 3. Otherwise, randomly sample indices down to the target count
    /// 4. Preserve the pos/neg ratio to maintain signal polarity balance
    /// 5. Use deterministic seeding for reproducible results
1545 ///
1546 /// Edge Cases:
1547 /// - Empty vector: return unchanged
1548 /// - target_non_zero = 0: return empty vector (not recommended)
1549 /// - target_non_zero >= current: return clone
1550 /// - Single polarity vectors: preserve polarity distribution
1551 ///
    /// Performance: O(m) for cloning and sampling plus O(k log k) for re-sorting,
    /// where m is the current non-zero count and k = target_non_zero
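    ///
    /// # Example (illustrative; constructs a vector directly via the public fields)
    ///
    /// ```rust
    /// use embeddenator_vsa::SparseVec;
    ///
    /// let vec = SparseVec { pos: vec![1, 4, 9, 16, 25, 36], neg: vec![2, 8, 18, 32] };
    /// let thinned = vec.thin(5);
    ///
    /// // The 6:4 polarity ratio is preserved: 3 positive and 2 negative indices survive.
    /// assert_eq!(thinned.pos.len(), 3);
    /// assert_eq!(thinned.neg.len(), 2);
    /// ```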
1553 pub fn thin(&self, target_non_zero: usize) -> SparseVec {
1554 let current_count = self.pos.len() + self.neg.len();
1555
1556 // Edge case: already at or below target
1557 if current_count <= target_non_zero {
1558 return self.clone();
1559 }
1560
1561 // Edge case: target is zero
1562 if target_non_zero == 0 {
1563 return SparseVec::new();
1564 }
1565
1566 // Calculate how many to keep from each polarity
1567 let pos_ratio = self.pos.len() as f32 / current_count as f32;
1568 let target_pos = (target_non_zero as f32 * pos_ratio).round() as usize;
1569 let target_neg = target_non_zero - target_pos;
1570
1571 // Randomly sample indices using deterministic seed based on vector content
1572 let mut seed = [0u8; 32];
1573 seed[0..4].copy_from_slice(&(self.pos.len() as u32).to_le_bytes());
1574 seed[4..8].copy_from_slice(&(self.neg.len() as u32).to_le_bytes());
1575 seed[8..12].copy_from_slice(&(target_non_zero as u32).to_le_bytes());
1576 // Rest remains zero for deterministic seeding
1577 let mut rng = rand::rngs::StdRng::from_seed(seed);
1578
1579 // Sample positive indices
1580 let mut sampled_pos: Vec<usize> = self.pos.clone();
1581 sampled_pos.shuffle(&mut rng);
1582 sampled_pos.truncate(target_pos);
1583 sampled_pos.sort_unstable();
1584
1585 // Sample negative indices
1586 let mut sampled_neg: Vec<usize> = self.neg.clone();
1587 sampled_neg.shuffle(&mut rng);
1588 sampled_neg.truncate(target_neg);
1589 sampled_neg.sort_unstable();
1590
1591 SparseVec {
1592 pos: sampled_pos,
1593 neg: sampled_neg,
1594 }
1595 }
1596}