embeddenator_vsa/vsa.rs
//! Vector Symbolic Architecture (VSA) Implementation
//!
//! Sparse ternary vector representation with algebraic operations:
//! - Bundle (⊕): Superposition (pairwise conflict-cancel; `bundle_sum_many` provides an order-independent variant)
//! - Bind (⊙): Element-wise multiplication (commutative, self-inverse)
//! - Cosine similarity for retrieval

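//! # Example
//!
//! A minimal illustrative sketch of the core operations (bundle, bind, cosine):
//!
//! ```rust
//! use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
//!
//! let config = ReversibleVSAConfig::default();
//! let a = SparseVec::encode_data(b"alpha", &config, None);
//! let b = SparseVec::encode_data(b"beta", &config, None);
//!
//! // Bundling keeps the result similar to both inputs.
//! let ab = a.bundle(&b);
//! assert!(a.cosine(&ab) > 0.0);
//!
//! // Binding is self-inverse on the shared support.
//! let aa = a.bind(&a);
//! assert!(aa.cosine(&a) > 0.9);
//! ```
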
8use rand::seq::SliceRandom;
9use rand::SeedableRng;
10use serde::{Deserialize, Serialize};
11use sha2::{Digest, Sha256};
12
13use crate::ternary_vec::PackedTritVec;
14
15use std::cell::RefCell;
16
17/// Default dimension of VSA vectors (10,000 dimensions)
18pub const DIM: usize = 10000;
19
20/// Configuration for VSA vector operations
21///
22/// Controls the dimensionality and sparsity of vectors. This allows
23/// dynamic configuration of vector dimensions at runtime rather than
24/// being locked to the compile-time `DIM` constant.
25///
26/// # Example
27///
28/// ```rust
29/// use embeddenator_vsa::VsaConfig;
30///
31/// // Default 10,000 dimensions
32/// let default_config = VsaConfig::default();
33/// assert_eq!(default_config.dimension, 10_000);
34///
35/// // Custom configuration for smaller vectors
36/// let small_config = VsaConfig::new(1000).with_density(0.02); // 2% density
37///
38/// // Configuration for larger vectors
39/// let large_config = VsaConfig::new(100_000).with_density(0.005); // 0.5% density
40/// ```
41#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
42pub struct VsaConfig {
43 /// Number of dimensions in vectors
44 pub dimension: usize,
45 /// Target density of non-zero elements (0.0 to 1.0)
46 /// Default is 0.01 (1% density)
47 pub density: f64,
48 /// Minimum non-zero elements per vector
49 pub min_nonzero: usize,
50 /// Maximum non-zero elements per vector
51 pub max_nonzero: usize,
52 /// Auto-thinning threshold: automatically thin bundles when nnz exceeds this
53 /// Set to 0 to disable auto-thinning
54 pub auto_thin_threshold: usize,
55 /// Dynamic sparsity scaling mode
56 pub scaling_mode: SparsityScaling,
57 /// Target reconstruction accuracy (0.0-1.0)
58 /// Used by `for_accuracy()` to select appropriate dimensions
59 pub target_accuracy: f64,
60}
61
62/// Sparsity scaling modes for dynamic dimension handling
63#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Default)]
64pub enum SparsityScaling {
65 /// Fixed density - nnz scales linearly with dimension
66 /// Good for: Consistent memory usage per dimension
67 Fixed,
68 /// Square root scaling - nnz scales as sqrt(dimension)
69 /// Good for: Maintaining constant similarity precision
70 /// Formula: target_nnz = base_nnz * sqrt(dim / base_dim)
71 #[default]
72 SquareRoot,
73 /// Logarithmic scaling - nnz scales as log(dimension)
74 /// Good for: Memory-constrained environments with large dimensions
75 /// Formula: target_nnz = base_nnz * log(dim) / log(base_dim)
76 Logarithmic,
77 /// Custom scaling with user-provided function coefficients
78 /// Formula: target_nnz = a * dim^b + c
79 Custom { a: f64, b: f64, c: f64 },
80}
81
82impl Default for VsaConfig {
83 fn default() -> Self {
84 Self {
85 dimension: DIM,
86 density: 0.01, // 1% density
87 min_nonzero: 10,
88 max_nonzero: DIM / 10,
89 auto_thin_threshold: DIM / 5, // Auto-thin when > 20% dense
90 scaling_mode: SparsityScaling::SquareRoot,
91 target_accuracy: 0.95,
92 }
93 }
94}
95
96impl VsaConfig {
97 /// Create a new VSA configuration with the specified dimension
98 pub fn new(dimension: usize) -> Self {
99 Self {
100 dimension,
101 density: 0.01,
102 min_nonzero: 10,
103 max_nonzero: dimension / 10,
104 auto_thin_threshold: dimension / 5,
105 scaling_mode: SparsityScaling::SquareRoot,
106 target_accuracy: 0.95,
107 }
108 }
109
110 /// Set the target density for non-zero elements
111 pub fn with_density(mut self, density: f64) -> Self {
112 self.density = density.clamp(0.001, 0.5);
113 self
114 }
115
116 /// Set minimum number of non-zero elements
117 pub fn with_min_nonzero(mut self, min: usize) -> Self {
118 self.min_nonzero = min;
119 self
120 }
121
122 /// Set maximum number of non-zero elements
123 pub fn with_max_nonzero(mut self, max: usize) -> Self {
124 self.max_nonzero = max;
125 self
126 }
127
128 /// Set auto-thinning threshold (0 to disable)
129 pub fn with_auto_thin(mut self, threshold: usize) -> Self {
130 self.auto_thin_threshold = threshold;
131 self
132 }
133
134 /// Set sparsity scaling mode
135 pub fn with_scaling(mut self, mode: SparsityScaling) -> Self {
136 self.scaling_mode = mode;
137 self
138 }
139
140 /// Set target reconstruction accuracy
141 pub fn with_target_accuracy(mut self, accuracy: f64) -> Self {
142 self.target_accuracy = accuracy.clamp(0.5, 0.9999);
143 self
144 }
145
146 /// Calculate the sparsity (number of non-zero elements per polarity)
147 /// based on dimension, density, and scaling mode
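    ///
    /// # Example (illustrative; values assume the default min/max non-zero bounds)
    ///
    /// ```rust
    /// use embeddenator_vsa::{SparsityScaling, VsaConfig};
    ///
    /// let fixed = VsaConfig::new(40_000)
    ///     .with_density(0.01)
    ///     .with_scaling(SparsityScaling::Fixed);
    /// let sqrt = VsaConfig::new(40_000)
    ///     .with_density(0.01)
    ///     .with_scaling(SparsityScaling::SquareRoot);
    ///
    /// // Base target is 40_000 * 0.01 / 2 = 200 per polarity;
    /// // SquareRoot scales it by sqrt(40_000 / 10_000) = 2.
    /// assert_eq!(fixed.sparsity(), 200);
    /// assert_eq!(sqrt.sparsity(), 400);
    /// ```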
148 pub fn sparsity(&self) -> usize {
149 let base_target = ((self.dimension as f64 * self.density) / 2.0).round() as usize;
150
151 let scaled_target = match self.scaling_mode {
152 SparsityScaling::Fixed => base_target,
153 SparsityScaling::SquareRoot => {
154 // Scale as sqrt(dim) relative to base 10K dimensions
155 let base_dim = 10_000.0_f64;
156 let scale = (self.dimension as f64 / base_dim).sqrt();
157 (base_target as f64 * scale).round() as usize
158 }
159 SparsityScaling::Logarithmic => {
160 // Scale as log(dim) relative to base 10K dimensions
161 let base_dim = 10_000.0_f64;
162 let scale = (self.dimension as f64).ln() / base_dim.ln();
163 (base_target as f64 * scale).round() as usize
164 }
165 SparsityScaling::Custom { a, b, c } => {
166 // Custom formula: a * dim^b + c
167 (a * (self.dimension as f64).powf(b) + c).round() as usize
168 }
169 };
170
171 scaled_target.clamp(self.min_nonzero, self.max_nonzero)
172 }
173
174 /// Create configuration optimized for a target reconstruction accuracy
175 ///
176 /// This automatically selects dimension and density based on empirical
177 /// relationships between VSA parameters and encoding fidelity.
178 ///
    /// | Target accuracy `a` | Dimension | Density | Typical Use Case |
    /// |---------------------|-----------|---------|------------------|
    /// | 0.85 ≤ a < 0.90 | 5,000 | 2% | Fast prototyping |
    /// | 0.90 ≤ a < 0.95 | 10,000 | 1% | Default balance |
    /// | 0.95 ≤ a < 0.98 | 25,000 | 0.8% | High fidelity |
    /// | 0.98 ≤ a < 0.99 | 50,000 | 0.5% | Very high fidelity |
    /// | 0.99 ≤ a < 0.995 | 100,000 | 0.3% | Production |
185 ///
186 /// # Example
187 ///
188 /// ```rust
189 /// use embeddenator_vsa::VsaConfig;
190 ///
191 /// // Create config targeting 95% reconstruction accuracy
192 /// let config = VsaConfig::for_accuracy(0.95);
193 /// // Accuracy maps to 25K dimensions (between 0.95 and 0.98)
194 /// assert!(config.dimension >= 10_000 && config.dimension <= 50_000);
195 /// ```
196 pub fn for_accuracy(target: f64) -> Self {
197 let target = target.clamp(0.5, 0.9999);
198
199 // Empirical mapping: accuracy → (dimension, density)
200 // Based on information-theoretic capacity of ternary HDV
201 let (dimension, density) = if target < 0.85 {
202 (2_000, 0.03)
203 } else if target < 0.90 {
204 (5_000, 0.02)
205 } else if target < 0.95 {
206 (10_000, 0.01)
207 } else if target < 0.98 {
208 (25_000, 0.008)
209 } else if target < 0.99 {
210 (50_000, 0.005)
211 } else if target < 0.995 {
212 (100_000, 0.003)
213 } else if target < 0.999 {
214 (250_000, 0.002)
215 } else {
216 (500_000, 0.001)
217 };
218
219 Self::new(dimension)
220 .with_density(density)
221 .with_target_accuracy(target)
222 .with_scaling(SparsityScaling::SquareRoot)
223 }
224
225 /// Create configuration from a custom schema/specification
226 ///
227 /// Allows users to provide their own parameter mapping for specialized use cases.
228 ///
229 /// # Example
230 ///
231 /// ```rust
232 /// use embeddenator_vsa::{VsaConfig, VsaConfigSchema, SparsityScaling};
233 ///
234 /// // Custom schema for memory-constrained embedded systems
235 /// let schema = VsaConfigSchema {
236 /// dimension: 4096,
237 /// density: 0.025,
238 /// scaling: Some(SparsityScaling::Logarithmic),
239 /// auto_thin: Some(512),
240 /// min_nnz: Some(8),
241 /// max_nnz: Some(256),
242 /// target_accuracy: None,
243 /// };
244 /// let config = VsaConfig::from_schema(schema);
245 /// assert_eq!(config.dimension, 4096);
246 /// ```
247 pub fn from_schema(schema: VsaConfigSchema) -> Self {
248 let mut config = Self::new(schema.dimension).with_density(schema.density);
249
250 if let Some(scaling) = schema.scaling {
251 config = config.with_scaling(scaling);
252 }
253 if let Some(threshold) = schema.auto_thin {
254 config = config.with_auto_thin(threshold);
255 }
256 if let Some(min) = schema.min_nnz {
257 config = config.with_min_nonzero(min);
258 }
259 if let Some(max) = schema.max_nnz {
260 config = config.with_max_nonzero(max);
261 }
262 if let Some(accuracy) = schema.target_accuracy {
263 config = config.with_target_accuracy(accuracy);
264 }
265
266 config
267 }
268
269 /// Configuration for small vectors (1,000 dimensions)
270 pub fn small() -> Self {
271 Self::new(1_000).with_density(0.02)
272 }
273
274 /// Configuration for medium vectors (10,000 dimensions - default)
275 pub fn medium() -> Self {
276 Self::default()
277 }
278
279 /// Configuration for large vectors (100,000 dimensions)
280 pub fn large() -> Self {
281 Self::new(100_000).with_density(0.005)
282 }
283
284 /// Configuration for huge vectors (1,000,000 dimensions)
285 /// Use sparingly - memory intensive
286 pub fn huge() -> Self {
287 Self::new(1_000_000).with_density(0.001)
288 }
289
290 /// Get the effective auto-thin threshold, accounting for dimension
291 pub fn effective_auto_thin(&self) -> usize {
292 if self.auto_thin_threshold == 0 {
293 usize::MAX // Disabled
294 } else {
295 self.auto_thin_threshold
296 }
297 }
298
299 /// Check if a vector exceeds the auto-thin threshold
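    ///
    /// # Example (illustrative, using the default threshold of `DIM / 5`)
    ///
    /// ```rust
    /// use embeddenator_vsa::VsaConfig;
    ///
    /// let cfg = VsaConfig::default();
    /// assert!(cfg.should_thin(cfg.auto_thin_threshold + 1));
    /// assert!(!cfg.should_thin(10));
    /// ```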
300 pub fn should_thin(&self, nnz: usize) -> bool {
301 self.auto_thin_threshold > 0 && nnz > self.auto_thin_threshold
302 }
303}
304
305/// User-provided schema for custom VSA configuration
306///
307/// This allows users to specify their own parameters for specialized use cases
308/// without needing to understand the internal scaling formulas.
309///
310/// # Example
311///
312/// ```rust
313/// use embeddenator_vsa::{VsaConfig, VsaConfigSchema, SparsityScaling};
314///
315/// // Custom schema for a specific application
316/// let schema = VsaConfigSchema {
317/// dimension: 8192,
318/// density: 0.015,
319/// scaling: Some(SparsityScaling::SquareRoot),
320/// auto_thin: Some(1000),
321/// min_nnz: Some(16),
322/// max_nnz: Some(512),
323/// target_accuracy: Some(0.97),
324/// };
325///
326/// let config = VsaConfig::from_schema(schema);
327/// assert_eq!(config.dimension, 8192);
328/// ```
329#[derive(Clone, Debug, Serialize, Deserialize)]
330pub struct VsaConfigSchema {
331 /// Required: Number of dimensions
332 pub dimension: usize,
333 /// Required: Target density (0.0-1.0)
334 pub density: f64,
335 /// Optional: Sparsity scaling mode
336 #[serde(default)]
337 pub scaling: Option<SparsityScaling>,
338 /// Optional: Auto-thinning threshold (0 to disable)
339 #[serde(default)]
340 pub auto_thin: Option<usize>,
341 /// Optional: Minimum non-zero elements
342 #[serde(default)]
343 pub min_nnz: Option<usize>,
344 /// Optional: Maximum non-zero elements
345 #[serde(default)]
346 pub max_nnz: Option<usize>,
347 /// Optional: Target reconstruction accuracy
348 #[serde(default)]
349 pub target_accuracy: Option<f64>,
350}
351
352impl VsaConfigSchema {
353 /// Create a minimal schema with just dimension and density
354 pub fn new(dimension: usize, density: f64) -> Self {
355 Self {
356 dimension,
357 density,
358 scaling: None,
359 auto_thin: None,
360 min_nnz: None,
361 max_nnz: None,
362 target_accuracy: None,
363 }
364 }
365
366 /// Set scaling mode
367 pub fn with_scaling(mut self, mode: SparsityScaling) -> Self {
368 self.scaling = Some(mode);
369 self
370 }
371
372 /// Set auto-thin threshold
373 pub fn with_auto_thin(mut self, threshold: usize) -> Self {
374 self.auto_thin = Some(threshold);
375 self
376 }
377
378 /// Parse from JSON string
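    ///
    /// # Example (illustrative JSON payload; optional fields may be omitted)
    ///
    /// ```rust
    /// use embeddenator_vsa::{VsaConfig, VsaConfigSchema};
    ///
    /// let json = r#"{ "dimension": 8192, "density": 0.015, "scaling": "SquareRoot" }"#;
    /// let schema = VsaConfigSchema::from_json(json).expect("valid schema JSON");
    /// let config = VsaConfig::from_schema(schema);
    /// assert_eq!(config.dimension, 8192);
    /// ```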
379 pub fn from_json(json: &str) -> Result<Self, serde_json::Error> {
380 serde_json::from_str(json)
381 }
382
383 /// Serialize to JSON string
384 pub fn to_json(&self) -> Result<String, serde_json::Error> {
385 serde_json::to_string_pretty(self)
386 }
387}
388
389thread_local! {
390 // Reused packed buffers for hot paths. Using TLS keeps this allocation
391 // amortized while remaining safe in multi-threaded contexts.
392 static PACKED_SCRATCH_A: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
393 static PACKED_SCRATCH_B: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
394 static PACKED_SCRATCH_OUT: RefCell<PackedTritVec> = RefCell::new(PackedTritVec::new_zero(DIM));
395}
396
397/// Configuration for reversible VSA encoding/decoding operations
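///
/// # Example (illustrative; shows the default and the small-block preset)
///
/// ```rust
/// use embeddenator_vsa::ReversibleVSAConfig;
///
/// let cfg = ReversibleVSAConfig::default();
/// assert_eq!(cfg.block_size, 256);
///
/// let small = ReversibleVSAConfig::small_blocks();
/// assert_eq!(small.block_size, 64);
/// ```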
398#[derive(Clone, Debug, Serialize, Deserialize)]
399pub struct ReversibleVSAConfig {
400 /// Block size for chunked encoding (must be power of 2 for efficiency)
401 pub block_size: usize,
402 /// Maximum path depth for hierarchical encoding
403 pub max_path_depth: usize,
404 /// Base permutation shift for path-based encoding
405 pub base_shift: usize,
406 /// Target sparsity level for operations (number of non-zero elements)
407 pub target_sparsity: usize,
408}
409
410impl Default for ReversibleVSAConfig {
411 fn default() -> Self {
412 ReversibleVSAConfig {
413 block_size: 256, // 256-byte blocks
414 max_path_depth: 10,
415 base_shift: 1000,
416 target_sparsity: 200, // Default sparsity level
417 }
418 }
419}
420
421impl ReversibleVSAConfig {
422 /// Create config optimized for small data blocks
423 pub fn small_blocks() -> Self {
424 ReversibleVSAConfig {
425 block_size: 64,
426 max_path_depth: 5,
427 base_shift: 500,
428 target_sparsity: 100,
429 }
430 }
431
432 /// Create config optimized for large data blocks
433 pub fn large_blocks() -> Self {
434 ReversibleVSAConfig {
435 block_size: 1024,
436 max_path_depth: 20,
437 base_shift: 2000,
438 target_sparsity: 400,
439 }
440 }
441}
442
443/// Sparse ternary vector with positive and negative indices
444#[derive(Clone, Debug, Serialize, Deserialize)]
445pub struct SparseVec {
446 /// Indices with +1 value
447 pub pos: Vec<usize>,
448 /// Indices with -1 value
449 pub neg: Vec<usize>,
450}
451
452impl Default for SparseVec {
453 fn default() -> Self {
454 Self::new()
455 }
456}
457
458impl SparseVec {
459 #[inline]
460 fn nnz(&self) -> usize {
461 self.pos.len() + self.neg.len()
462 }
463
464 fn intersection_count_sorted(a: &[usize], b: &[usize]) -> usize {
465 let mut i = 0usize;
466 let mut j = 0usize;
467 let mut count = 0usize;
468 while i < a.len() && j < b.len() {
469 match a[i].cmp(&b[j]) {
470 std::cmp::Ordering::Less => i += 1,
471 std::cmp::Ordering::Greater => j += 1,
472 std::cmp::Ordering::Equal => {
473 count += 1;
474 i += 1;
475 j += 1;
476 }
477 }
478 }
479 count
480 }
481
482 fn union_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
483 let mut out = Vec::with_capacity(a.len() + b.len());
484 let mut i = 0usize;
485 let mut j = 0usize;
486
487 while i < a.len() && j < b.len() {
488 match a[i].cmp(&b[j]) {
489 std::cmp::Ordering::Less => {
490 out.push(a[i]);
491 i += 1;
492 }
493 std::cmp::Ordering::Greater => {
494 out.push(b[j]);
495 j += 1;
496 }
497 std::cmp::Ordering::Equal => {
498 out.push(a[i]);
499 i += 1;
500 j += 1;
501 }
502 }
503 }
504
505 if i < a.len() {
506 out.extend_from_slice(&a[i..]);
507 }
508 if j < b.len() {
509 out.extend_from_slice(&b[j..]);
510 }
511
512 out
513 }
514
515 fn difference_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
516 let mut out = Vec::with_capacity(a.len());
517 let mut i = 0usize;
518 let mut j = 0usize;
519
520 while i < a.len() && j < b.len() {
521 match a[i].cmp(&b[j]) {
522 std::cmp::Ordering::Less => {
523 out.push(a[i]);
524 i += 1;
525 }
526 std::cmp::Ordering::Greater => j += 1,
527 std::cmp::Ordering::Equal => {
528 i += 1;
529 j += 1;
530 }
531 }
532 }
533
534 if i < a.len() {
535 out.extend_from_slice(&a[i..]);
536 }
537
538 out
539 }
540
541 fn intersection_sorted(a: &[usize], b: &[usize]) -> Vec<usize> {
542 let mut out = Vec::with_capacity(a.len().min(b.len()));
543 let mut i = 0usize;
544 let mut j = 0usize;
545
546 while i < a.len() && j < b.len() {
547 match a[i].cmp(&b[j]) {
548 std::cmp::Ordering::Less => i += 1,
549 std::cmp::Ordering::Greater => j += 1,
550 std::cmp::Ordering::Equal => {
551 out.push(a[i]);
552 i += 1;
553 j += 1;
554 }
555 }
556 }
557
558 out
559 }
560 /// Create an empty sparse vector
561 ///
562 /// # Examples
563 ///
564 /// ```
565 /// use embeddenator_vsa::SparseVec;
566 ///
567 /// let vec = SparseVec::new();
568 /// assert!(vec.pos.is_empty());
569 /// assert!(vec.neg.is_empty());
570 /// ```
571 pub fn new() -> Self {
572 SparseVec {
573 pos: Vec::new(),
574 neg: Vec::new(),
575 }
576 }
577
    /// Generate a random sparse vector with ~1% of dimensions set per polarity (~2% total non-zeros)
579 ///
580 /// # Examples
581 ///
582 /// ```
583 /// use embeddenator_vsa::SparseVec;
584 ///
585 /// let vec = SparseVec::random();
    /// // With DIM = 10_000 this yields 100 positive and 100 negative indices (~2% total non-zeros)
587 /// assert!(vec.pos.len() > 0);
588 /// assert!(vec.neg.len() > 0);
589 /// ```
590 pub fn random() -> Self {
591 let mut rng = rand::rng();
        let sparsity = DIM / 100; // 1% of dimensions per polarity (~2% total non-zeros)
593
594 // Generate random indices without replacement
595 let mut indices: Vec<usize> = (0..DIM).collect();
596 indices.shuffle(&mut rng);
597
598 let mut pos: Vec<_> = indices[..sparsity].to_vec();
599 let mut neg: Vec<_> = indices[sparsity..sparsity * 2].to_vec();
600
601 pos.sort_unstable();
602 neg.sort_unstable();
603
604 SparseVec { pos, neg }
605 }
606
607 /// Generate a random sparse vector with configurable dimensions and density
608 ///
609 /// This method allows creating vectors with custom dimensions, useful for
610 /// benchmarking with different data types or when dimensions need to be
611 /// determined at runtime.
612 ///
613 /// # Arguments
614 /// * `config` - Configuration specifying dimension and density
615 ///
616 /// # Examples
617 ///
618 /// ```rust
619 /// use embeddenator_vsa::{SparseVec, VsaConfig};
620 ///
621 /// // Create a small vector (1000 dims, 2% density)
622 /// let config = VsaConfig::small();
623 /// let vec = SparseVec::random_with_config(&config);
624 /// assert!(vec.pos.len() > 0);
625 ///
626 /// // Create a custom-sized vector
627 /// let custom = VsaConfig::new(5000).with_density(0.02);
628 /// let vec = SparseVec::random_with_config(&custom);
629 /// ```
630 pub fn random_with_config(config: &VsaConfig) -> Self {
631 let mut rng = rand::rng();
632 let sparsity = config.sparsity();
633
634 // Generate random indices without replacement
635 let mut indices: Vec<usize> = (0..config.dimension).collect();
636 indices.shuffle(&mut rng);
637
638 let mut pos: Vec<_> = indices[..sparsity].to_vec();
639 let mut neg: Vec<_> = indices[sparsity..sparsity * 2].to_vec();
640
641 pos.sort_unstable();
642 neg.sort_unstable();
643
644 SparseVec { pos, neg }
645 }
646
647 /// Encode data into a reversible sparse vector using block-based mapping
648 ///
649 /// This method implements hierarchical encoding with path-based permutations
650 /// for lossless data recovery. The encoding process:
651 /// 1. Splits data into blocks of configurable size
652 /// 2. Applies path-based permutations to each block
653 /// 3. Combines blocks using hierarchical bundling
654 ///
655 /// # Arguments
656 /// * `data` - The data to encode
657 /// * `config` - Configuration for encoding parameters
658 /// * `path` - Optional path string for hierarchical encoding (affects permutation)
659 ///
660 /// # Returns
661 /// A SparseVec that can be decoded back to the original data
662 ///
663 /// # Examples
664 ///
665 /// ```
666 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
667 ///
668 /// let data = b"hello world";
669 /// let config = ReversibleVSAConfig::default();
670 /// let encoded = SparseVec::encode_data(data, &config, None);
671 ///
672 /// // encoded vector contains reversible representation of the data
673 /// assert!(!encoded.pos.is_empty() || !encoded.neg.is_empty());
674 /// ```
675 pub fn encode_data(data: &[u8], config: &ReversibleVSAConfig, path: Option<&str>) -> Self {
676 if data.is_empty() {
677 return SparseVec::new();
678 }
679
680 // Calculate path-based shift for hierarchical encoding
681 let path_shift = if let Some(path_str) = path {
682 // Use path hash to determine shift, constrained to prevent overflow
683 let mut hasher = Sha256::new();
684 hasher.update(path_str.as_bytes());
685 let hash = hasher.finalize();
686 // SAFETY: SHA256 always produces 32 bytes, first 4 bytes are always valid
687 let hash_bytes: [u8; 4] = hash[0..4]
688 .try_into()
689 .expect("SHA256 hash must produce at least 4 bytes");
690 let path_hash = u32::from_le_bytes(hash_bytes) as usize;
691 (path_hash % config.max_path_depth) * config.base_shift
692 } else {
693 0
694 };
695
696 // Split data into blocks
697 let mut blocks = Vec::new();
698 for chunk in data.chunks(config.block_size) {
699 blocks.push(chunk);
700 }
701
702 // Encode each block with position-based permutation
703 let mut encoded_blocks = Vec::new();
704 for (i, block) in blocks.iter().enumerate() {
705 let block_shift = path_shift + (i * config.base_shift / blocks.len().max(1));
706 let block_vec = Self::encode_block(block, block_shift);
707 encoded_blocks.push(block_vec);
708 }
709
710 // Combine blocks hierarchically
711 if encoded_blocks.is_empty() {
712 SparseVec::new()
713 } else if encoded_blocks.len() == 1 {
714 // SAFETY: we just checked len() == 1, so next() must return Some
715 encoded_blocks
716 .into_iter()
717 .next()
718 .expect("encoded_blocks must have exactly one element")
719 } else {
            // Combine blocks via hierarchical_bundle (currently a sequential fold)
721 Self::hierarchical_bundle(&encoded_blocks)
722 }
723 }
724
725 /// Decode data from a reversible sparse vector
726 ///
727 /// Reverses the encoding process to recover the original data.
728 /// Requires the same configuration and path used during encoding.
729 ///
730 /// # Arguments
731 /// * `config` - Same configuration used for encoding
732 /// * `path` - Same path string used for encoding
733 /// * `expected_size` - Expected size of the decoded data (for validation)
734 ///
735 /// # Returns
736 /// The original data bytes (may need correction layer for 100% fidelity)
737 ///
738 /// # Examples
739 ///
740 /// ```no_run
741 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
742 ///
743 /// let data = b"hello world";
744 /// let config = ReversibleVSAConfig::default();
745 /// let encoded = SparseVec::encode_data(data, &config, None);
746 /// let decoded = encoded.decode_data(&config, None, data.len());
747 ///
748 /// // Note: For 100% fidelity, use CorrectionStore with EmbrFS
749 /// // Raw decode may have minor differences that corrections compensate for
750 /// ```
751 pub fn decode_data(
752 &self,
753 config: &ReversibleVSAConfig,
754 path: Option<&str>,
755 expected_size: usize,
756 ) -> Vec<u8> {
757 if self.pos.is_empty() && self.neg.is_empty() {
758 return Vec::new();
759 }
760
761 if expected_size == 0 {
762 return Vec::new();
763 }
764
765 // Calculate path-based shift (same as encoding)
766 let path_shift = if let Some(path_str) = path {
767 let mut hasher = Sha256::new();
768 hasher.update(path_str.as_bytes());
769 let hash = hasher.finalize();
770 // SAFETY: SHA256 always produces 32 bytes, first 4 bytes are always valid
771 let hash_bytes: [u8; 4] = hash[0..4]
772 .try_into()
773 .expect("SHA256 hash must produce at least 4 bytes");
774 let path_hash = u32::from_le_bytes(hash_bytes) as usize;
775 (path_hash % config.max_path_depth) * config.base_shift
776 } else {
777 0
778 };
779
780 // Estimate number of blocks based on expected size
781 let estimated_blocks = expected_size.div_ceil(config.block_size);
782
783 // For single block case
784 if estimated_blocks <= 1 {
785 return Self::decode_block(self, path_shift, expected_size);
786 }
787
788 // =======================================================================
789 // MULTI-BLOCK DECODING LIMITATIONS (v0.21.0)
790 // =======================================================================
791 //
792 // Current Status: EXPERIMENTAL / SIMPLIFIED IMPLEMENTATION
793 //
794 // The multi-block decoding uses a simplified linear approach without proper
795 // hierarchical factorization. This has the following implications:
796 //
797 // 1. **Information Loss**: When multiple blocks are bundled hierarchically,
798 // separating them requires VSA factorization techniques (e.g., HRR unbinding,
799 // MAP estimation) that are not yet implemented.
800 //
801 // 2. **Use Case**: Works reliably for:
802 // - Single-block data (data.len() <= config.block_size)
803 // - Data with clear block boundaries
804 //
805 // 3. **Known Limitation**: Multi-block data (> block_size) may have reduced
806 // fidelity. For 100% accuracy, use the Codebook's residual mechanism or
807 // external correction stores.
808 //
809 // 4. **Future Work**: Full hierarchical decoding is planned for v1.1.0.
810 // See IMPLEMENTATION_PLAN.md Task 4.2 for details.
811 //
812 // =======================================================================
813
814 // For multiple blocks, we need to factorize the hierarchical bundle
815 // This is a simplified approach - in practice, we'd need more sophisticated
816 // factorization to separate the blocks
817 let mut result = Vec::new();
818
819 // Simplified best-effort linear decoding for multi-block data.
820 // WARNING: This path trades accuracy for speed and may:
821 // - Lose information across blocks, and/or
822 // - Mis-assign tokens between adjacent blocks.
823 // For applications that require 100% fidelity, prefer the Codebook's
824 // residual mechanism or external correction stores instead of relying
825 // solely on this multi-block decoder.
826 for i in 0..estimated_blocks {
827 let block_shift = path_shift + (i * config.base_shift / estimated_blocks.max(1));
828 let remaining = expected_size.saturating_sub(result.len());
829 if remaining == 0 {
830 break;
831 }
832 let max_len = remaining.min(config.block_size);
833 let block_data = Self::decode_block(self, block_shift, max_len);
834 if block_data.is_empty() {
835 break;
836 }
837 result.extend(block_data);
838 if result.len() >= expected_size {
839 break;
840 }
841 }
842
843 // Truncate to expected size
844 result.truncate(expected_size);
845 result
846 }
847
848 /// Encode a single block of data with position-based permutation
849 fn encode_block(data: &[u8], shift: usize) -> SparseVec {
850 if data.is_empty() {
851 return SparseVec::new();
852 }
853
854 // Map data bytes to vector indices using the permuted mapping
855 let mut pos = Vec::new();
856 let mut neg = Vec::new();
857
858 for (i, &byte) in data.iter().enumerate() {
859 let base_idx = (i + shift) % DIM;
860
861 // Use byte value to determine polarity and offset
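            // Worked example (illustrative): with shift = 0, byte 0x68 ('h') at position 0
            // has the high bit clear and maps to pos index (0 + 0x68) % DIM = 104, while
            // byte 0xE8 at position 1 has the high bit set and maps to
            // neg index (1 + (0xE8 & 0x7F)) % DIM = 105.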
862 if byte & 0x80 != 0 {
863 // High bit set -> negative
864 neg.push((base_idx + (byte & 0x7F) as usize) % DIM);
865 } else {
866 // High bit clear -> positive
867 pos.push((base_idx + byte as usize) % DIM);
868 }
869 }
870
871 pos.sort_unstable();
872 pos.dedup();
873 neg.sort_unstable();
874 neg.dedup();
875
876 // CRITICAL FIX: Ensure pos and neg are disjoint
877 // If an index appears in both pos and neg, it means we have conflicting
878 // encodings. Remove from both to treat as 0 (cancel out).
879 let overlap = Self::intersection_sorted(&pos, &neg);
880 if !overlap.is_empty() {
881 pos = Self::difference_sorted(&pos, &overlap);
882 neg = Self::difference_sorted(&neg, &overlap);
883 }
884
885 SparseVec { pos, neg }
886 }
887
888 /// Decode a single block of data
889 fn decode_block(encoded: &SparseVec, shift: usize, max_len: usize) -> Vec<u8> {
890 if max_len == 0 {
891 return Vec::new();
892 }
893
894 let mut result = Vec::with_capacity(max_len);
895
896 // Reconstruct data by reversing the permutation.
897 // Note: `pos` and `neg` are kept sorted, so membership can be checked via binary search.
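        // Worked example (illustrative): continuing the encode_block example above, position 0
        // scans offsets 0..128 from base index 0 and first hits pos index 104, recovering
        // byte 0x68; a hit in `neg` would instead set the high bit (offset | 0x80).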
898 for i in 0..max_len {
899 let base_idx = (i + shift) % DIM;
900
901 // Look for indices that map back to this position
902 let mut found_byte = None;
903 for offset in 0..128u8 {
904 let test_idx = (base_idx + offset as usize) % DIM;
905
906 if encoded.pos.binary_search(&test_idx).is_ok() {
907 found_byte = Some(offset);
908 break;
909 } else if encoded.neg.binary_search(&test_idx).is_ok() {
910 found_byte = Some(offset | 0x80);
911 break;
912 }
913 }
914
915 if let Some(byte) = found_byte {
916 result.push(byte);
917 } else {
918 // No more data found
919 break;
920 }
921 }
922
923 result
924 }
925
926 /// Combine multiple vectors using hierarchical bundling
927 fn hierarchical_bundle(vectors: &[SparseVec]) -> SparseVec {
928 if vectors.is_empty() {
929 return SparseVec::new();
930 }
931 if vectors.len() == 1 {
932 return vectors[0].clone();
933 }
934
        // Sequential left fold over the remaining vectors (not a balanced binary-tree reduction)
936 let mut result = vectors[0].clone();
937 for vec in &vectors[1..] {
938 result = result.bundle(vec);
939 }
940 result
941 }
942
943 /// Generate a deterministic sparse vector from data using SHA256 seed
944 /// DEPRECATED: Use encode_data() for new code
945 ///
946 /// # Examples
947 ///
948 /// ```
949 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
950 ///
951 /// let data = b"hello world";
952 /// let config = ReversibleVSAConfig::default();
953 /// let vec1 = SparseVec::encode_data(data, &config, None);
954 /// let vec2 = SparseVec::encode_data(data, &config, None);
955 ///
956 /// // Same input produces same vector (deterministic)
957 /// assert_eq!(vec1.pos, vec2.pos);
958 /// assert_eq!(vec1.neg, vec2.neg);
959 /// ```
960 #[deprecated(since = "0.2.0", note = "Use encode_data() for reversible encoding")]
961 pub fn from_data(data: &[u8]) -> Self {
962 let mut hasher = Sha256::new();
963 hasher.update(data);
964 let hash = hasher.finalize();
965
966 // SAFETY: SHA256 always produces exactly 32 bytes
967 let seed: [u8; 32] = hash[..32]
968 .try_into()
969 .expect("SHA256 must produce exactly 32 bytes");
970 let mut rng = rand::rngs::StdRng::from_seed(seed);
971
972 let mut indices: Vec<usize> = (0..DIM).collect();
973 indices.shuffle(&mut rng);
974
975 let sparsity = DIM / 100;
976 let mut pos = indices[..sparsity].to_vec();
977 let mut neg = indices[sparsity..sparsity * 2].to_vec();
978
979 pos.sort_unstable();
980 neg.sort_unstable();
981
982 SparseVec { pos, neg }
983 }
984
985 /// Bundle operation: pairwise conflict-cancel superposition (A ⊕ B)
986 ///
987 /// This is a fast, commutative merge for two vectors:
988 /// - same sign => keep
989 /// - opposite signs => cancel to 0
990 /// - sign vs 0 => keep sign
991 ///
992 /// Note: While this is well-defined for two vectors, repeated application across 3+
993 /// vectors is generally **not associative** because early cancellation/thresholding can
994 /// discard multiplicity information.
995 ///
996 /// # Arguments
997 /// * `other` - The vector to bundle with self
998 /// * `config` - Optional ReversibleVSAConfig for controlling sparsity via thinning
999 ///
1000 /// # Examples
1001 ///
1002 /// ```
1003 /// use embeddenator_vsa::{SparseVec, ReversibleVSAConfig};
1004 ///
1005 /// let config = ReversibleVSAConfig::default();
1006 /// let vec1 = SparseVec::encode_data(b"data1", &config, None);
1007 /// let vec2 = SparseVec::encode_data(b"data2", &config, None);
1008 /// let bundled = vec1.bundle_with_config(&vec2, Some(&config));
1009 ///
1010 /// // Bundled vector contains superposition of both inputs
1011 /// // Should be similar to both original vectors
1012 /// let sim1 = vec1.cosine(&bundled);
1013 /// let sim2 = vec2.cosine(&bundled);
1014 /// assert!(sim1 > 0.3);
1015 /// assert!(sim2 > 0.3);
1016 /// ```
1017 pub fn bundle_with_config(
1018 &self,
1019 other: &SparseVec,
1020 config: Option<&ReversibleVSAConfig>,
1021 ) -> SparseVec {
1022 let mut result = self.bundle(other);
1023
1024 // Apply thinning if config provided and result exceeds target sparsity
1025 if let Some(cfg) = config {
1026 let current_count = result.pos.len() + result.neg.len();
1027 if current_count > cfg.target_sparsity {
1028 result = result.thin(cfg.target_sparsity);
1029 }
1030 }
1031
1032 result
1033 }
1034
    /// Bundle operation: pairwise conflict-cancel superposition (A ⊕ B)
    ///
    /// Commutative merge of two vectors: matching signs are kept, opposite signs cancel to 0,
    /// and a sign paired with 0 is kept. See `bundle_with_config()` to optionally apply
    /// thinning via `ReversibleVSAConfig` after bundling.
1039 ///
1040 /// # Examples
1041 ///
1042 /// ```
1043 /// use embeddenator_vsa::SparseVec;
1044 ///
1045 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
1046 /// let vec1 = SparseVec::encode_data(b"data1", &config, None);
1047 /// let vec2 = SparseVec::encode_data(b"data2", &config, None);
1048 /// let bundled = vec1.bundle(&vec2);
1049 ///
1050 /// // Bundled vector contains superposition of both inputs
1051 /// // Should be similar to both original vectors
1052 /// let sim1 = vec1.cosine(&bundled);
1053 /// let sim2 = vec2.cosine(&bundled);
1054 /// assert!(sim1 > 0.3);
1055 /// assert!(sim2 > 0.3);
1056 /// ```
1057 pub fn bundle(&self, other: &SparseVec) -> SparseVec {
1058 // Optional ternary-native fast path (migration gate).
1059 // This is primarily intended for cases where vectors become dense enough
1060 // that packed word-wise operations are competitive.
1061 {
1062 // Converting SparseVec -> PackedTritVec is O(DIM) for allocation/zeroing, plus O(nnz)
1063 // for setting lanes. Only attempt the packed path when total density is high enough,
1064 // and avoid converting an extremely sparse operand.
1065 // Thresholds lowered from DIM/4 to DIM/8 for earlier SIMD dispatch now that
1066 // PackedTritVec has proper bitsliced operations.
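            // With the default DIM = 10_000 the gates below are: combined nnz > 1_250 and the
            // sparser operand's nnz > 156 before the packed path is taken.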
1067 let a_nnz = self.nnz();
1068 let b_nnz = other.nnz();
1069 let total = a_nnz + b_nnz;
1070 if total > DIM / 8 {
1071 let min_nnz = a_nnz.min(b_nnz);
1072 if min_nnz > DIM / 64 {
1073 return PACKED_SCRATCH_A.with(|a_cell| {
1074 PACKED_SCRATCH_B.with(|b_cell| {
1075 PACKED_SCRATCH_OUT.with(|out_cell| {
1076 let mut a = a_cell.borrow_mut();
1077 let mut b = b_cell.borrow_mut();
1078 let mut out = out_cell.borrow_mut();
1079
1080 a.fill_from_sparsevec(self, DIM);
1081 b.fill_from_sparsevec(other, DIM);
1082 a.bundle_into(&b, &mut out);
1083 out.to_sparsevec()
1084 })
1085 })
1086 });
1087 }
1088 }
1089 }
1090
1091 // Majority voting for two sparse ternary vectors:
1092 // - Same sign => keep
1093 // - Opposite signs => cancel to 0
1094 // - Sign vs 0 => keep sign
1095 // This can be expressed via sorted set differences/unions.
1096 let pos_a = Self::difference_sorted(&self.pos, &other.neg);
1097 let pos_b = Self::difference_sorted(&other.pos, &self.neg);
1098 let neg_a = Self::difference_sorted(&self.neg, &other.pos);
1099 let neg_b = Self::difference_sorted(&other.neg, &self.pos);
1100
1101 let mut pos = Self::union_sorted(&pos_a, &pos_b);
1102 let mut neg = Self::union_sorted(&neg_a, &neg_b);
1103
        // Defensive check: ensure pos and neg remain disjoint.
        // The set constructions above guarantee this for well-formed inputs, but it can be
        // violated if an input vector's own pos/neg sets overlap.
1107 let overlap = Self::intersection_sorted(&pos, &neg);
1108 if !overlap.is_empty() {
1109 pos = Self::difference_sorted(&pos, &overlap);
1110 neg = Self::difference_sorted(&neg, &overlap);
1111 }
1112
1113 SparseVec { pos, neg }
1114 }
1115
1116 /// Associative bundle over many vectors: sums contributions per index, then thresholds to sign.
1117 /// This is order-independent because all contributions are accumulated before applying sign.
1118 /// Complexity: O(K log K) where K is total non-zero entries across inputs.
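    ///
    /// # Example (illustrative; constructs vectors directly via the public `pos`/`neg` fields)
    ///
    /// ```rust
    /// use embeddenator_vsa::SparseVec;
    ///
    /// let a = SparseVec { pos: vec![1, 2], neg: vec![7] };
    /// let b = SparseVec { pos: vec![2, 3], neg: vec![1] };
    /// let c = SparseVec { pos: vec![2], neg: vec![3] };
    ///
    /// // Index 2 sums to +3, indices 1 and 3 cancel to 0, index 7 sums to -1.
    /// let bundled = SparseVec::bundle_sum_many([&a, &b, &c]);
    /// assert_eq!(bundled.pos, vec![2]);
    /// assert_eq!(bundled.neg, vec![7]);
    /// ```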
1119 pub fn bundle_sum_many<'a, I>(vectors: I) -> SparseVec
1120 where
1121 I: IntoIterator<Item = &'a SparseVec>,
1122 {
1123 let mut contributions: Vec<(usize, i32)> = Vec::new();
1124
1125 for vec in vectors {
1126 contributions.extend(vec.pos.iter().map(|&idx| (idx, 1i32)));
1127 contributions.extend(vec.neg.iter().map(|&idx| (idx, -1i32)));
1128 }
1129
1130 if contributions.is_empty() {
1131 return SparseVec {
1132 pos: Vec::new(),
1133 neg: Vec::new(),
1134 };
1135 }
1136
1137 contributions.sort_unstable_by_key(|(idx, _)| *idx);
1138
1139 let mut pos = Vec::new();
1140 let mut neg = Vec::new();
1141
1142 let mut iter = contributions.into_iter();
1143 // SAFETY: we checked contributions.is_empty() above and returned early if empty
1144 let (mut current_idx, mut acc) = iter
1145 .next()
1146 .expect("contributions must be non-empty after early return check");
1147
1148 for (idx, value) in iter {
1149 if idx == current_idx {
1150 acc += value;
1151 } else {
1152 if acc > 0 {
1153 pos.push(current_idx);
1154 } else if acc < 0 {
1155 neg.push(current_idx);
1156 }
1157 current_idx = idx;
1158 acc = value;
1159 }
1160 }
1161
1162 if acc > 0 {
1163 pos.push(current_idx);
1164 } else if acc < 0 {
1165 neg.push(current_idx);
1166 }
1167
1168 SparseVec { pos, neg }
1169 }
1170
1171 /// Hybrid bundle: choose a fast pairwise fold for very sparse regimes (to preserve sparsity),
1172 /// otherwise use the associative sum-then-threshold path (order-independent, more faithful to majority).
1173 ///
1174 /// Heuristic: estimate expected overlap/collision count assuming uniform hashing into `DIM`.
1175 /// If expected colliding dimensions is below a small budget, use pairwise `bundle`; else use
1176 /// `bundle_sum_many`.
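    ///
    /// Worked estimate (illustrative): with `DIM = 10_000` and ~400 total non-zeros across the
    /// inputs, λ = 400 / 10_000 = 0.04, so P(K ≥ 2) ≈ 1 - e^(-0.04) * 1.04 ≈ 7.8e-4, i.e. roughly
    /// 8 expected colliding dimensions. That is under the budget of 32, so the pairwise fold is used.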
1177 pub fn bundle_hybrid_many<'a, I>(vectors: I) -> SparseVec
1178 where
1179 I: IntoIterator<Item = &'a SparseVec>,
1180 {
1181 let collected: Vec<&'a SparseVec> = vectors.into_iter().collect();
1182 if collected.is_empty() {
1183 return SparseVec {
1184 pos: Vec::new(),
1185 neg: Vec::new(),
1186 };
1187 }
1188
1189 if collected.len() == 1 {
1190 return collected[0].clone();
1191 }
1192
1193 if collected.len() == 2 {
1194 return collected[0].bundle(collected[1]);
1195 }
1196
1197 let total_nnz: usize = collected.iter().map(|v| v.pos.len() + v.neg.len()).sum();
1198
1199 if total_nnz == 0 {
1200 return SparseVec {
1201 pos: Vec::new(),
1202 neg: Vec::new(),
1203 };
1204 }
1205
1206 // Constant-time overlap/collision risk estimate.
1207 // Model: each non-zero lands uniformly in DIM dimensions.
1208 // Let λ = total_nnz / DIM be expected hits per dimension.
1209 // Then P(K>=2) ≈ 1 - e^{-λ}(1+λ) for Poisson(λ).
1210 // If expected number of colliding dimensions is tiny, pairwise fold is effectively safe
1211 // (and faster). Otherwise, use associative accumulation to avoid order sensitivity.
1212 let lambda = total_nnz as f64 / DIM as f64;
1213 let p_ge_2 = 1.0 - (-lambda).exp() * (1.0 + lambda);
1214 let expected_colliding_dims = p_ge_2 * DIM as f64;
1215
1216 // Budget: allow a small number of potentially order-sensitive dimensions.
1217 // Tune via benchmarks; this is conservative for integrity.
1218 let collision_budget_dims = 32.0;
1219
1220 if expected_colliding_dims <= collision_budget_dims {
1221 let mut iter = collected.into_iter();
            // SAFETY: `collected` was checked to be non-empty above, so next() returns Some.
            let mut acc = iter
                .next()
                .expect("collected must be non-empty after the early-return check")
                .clone();
1227 for v in iter {
1228 acc = acc.bundle(v);
1229 }
1230 return acc;
1231 }
1232
1233 SparseVec::bundle_sum_many(collected)
1234 }
1235
    /// Bind operation: element-wise composition (A ⊙ B)
    /// Performs element-wise multiplication of the ternary entries; commutative and
    /// self-inverse (A ⊙ A yields +1 on A's support, i.e. the identity restricted to
    /// that support).
    ///
    /// # Examples
    ///
    /// ```
    /// use embeddenator_vsa::SparseVec;
    ///
    /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
    /// let vec = SparseVec::encode_data(b"test", &config, None);
    /// let bound = vec.bind(&vec);
    ///
    /// // Binding a vector with itself yields a vector highly similar to the original
    /// // (identical here, since this encoding produces only positive entries).
    /// let sim = bound.cosine(&vec);
    /// assert!(sim > 0.9);
1253 /// ```
1254 pub fn bind(&self, other: &SparseVec) -> SparseVec {
1255 {
1256 // Packed bind is only worthwhile when both operands are dense enough.
1257 // Using a short-circuiting check avoids paying extra overhead for sparse workloads.
1258 // Thresholds lowered from DIM/4 to DIM/8 for earlier SIMD dispatch.
1259 let a_nnz = self.nnz();
1260 if a_nnz > DIM / 8 {
1261 let b_nnz = other.nnz();
1262 if b_nnz > DIM / 8 {
1263 return PACKED_SCRATCH_A.with(|a_cell| {
1264 PACKED_SCRATCH_B.with(|b_cell| {
1265 PACKED_SCRATCH_OUT.with(|out_cell| {
1266 let mut a = a_cell.borrow_mut();
1267 let mut b = b_cell.borrow_mut();
1268 let mut out = out_cell.borrow_mut();
1269
1270 a.fill_from_sparsevec(self, DIM);
1271 b.fill_from_sparsevec(other, DIM);
1272 a.bind_into(&b, &mut out);
1273 out.to_sparsevec()
1274 })
1275 })
1276 });
1277 }
1278 }
1279 }
1280
1281 // Sparse bind is element-wise multiplication restricted to non-zero support.
1282 // We can compute this in a single merge-join over the signed supports, keeping
1283 // outputs sorted without any post-sort.
1284 let mut result_pos = Vec::new();
1285 let mut result_neg = Vec::new();
1286
1287 let mut a_pos = 0usize;
1288 let mut a_neg = 0usize;
1289 let mut b_pos = 0usize;
1290 let mut b_neg = 0usize;
1291
1292 loop {
1293 let next_a = match (self.pos.get(a_pos), self.neg.get(a_neg)) {
1294 (Some(&p), Some(&n)) => {
1295 if p < n {
1296 Some((p, 1i8, true))
1297 } else {
1298 Some((n, -1i8, false))
1299 }
1300 }
1301 (Some(&p), None) => Some((p, 1i8, true)),
1302 (None, Some(&n)) => Some((n, -1i8, false)),
1303 (None, None) => None,
1304 };
1305
1306 let next_b = match (other.pos.get(b_pos), other.neg.get(b_neg)) {
1307 (Some(&p), Some(&n)) => {
1308 if p < n {
1309 Some((p, 1i8, true))
1310 } else {
1311 Some((n, -1i8, false))
1312 }
1313 }
1314 (Some(&p), None) => Some((p, 1i8, true)),
1315 (None, Some(&n)) => Some((n, -1i8, false)),
1316 (None, None) => None,
1317 };
1318
1319 let Some((idx_a, sign_a, a_is_pos)) = next_a else {
1320 break;
1321 };
1322 let Some((idx_b, sign_b, b_is_pos)) = next_b else {
1323 break;
1324 };
1325
1326 match idx_a.cmp(&idx_b) {
1327 std::cmp::Ordering::Less => {
1328 if a_is_pos {
1329 a_pos += 1;
1330 } else {
1331 a_neg += 1;
1332 }
1333 }
1334 std::cmp::Ordering::Greater => {
1335 if b_is_pos {
1336 b_pos += 1;
1337 } else {
1338 b_neg += 1;
1339 }
1340 }
1341 std::cmp::Ordering::Equal => {
1342 let prod = sign_a * sign_b;
1343 if prod == 1 {
1344 result_pos.push(idx_a);
1345 } else {
1346 result_neg.push(idx_a);
1347 }
1348
1349 if a_is_pos {
1350 a_pos += 1;
1351 } else {
1352 a_neg += 1;
1353 }
1354 if b_is_pos {
1355 b_pos += 1;
1356 } else {
1357 b_neg += 1;
1358 }
1359 }
1360 }
1361 }
1362
1363 SparseVec {
1364 pos: result_pos,
1365 neg: result_neg,
1366 }
1367 }
1368
1369 /// Calculate cosine similarity between two sparse vectors
1370 /// Returns value in [-1, 1] where 1 is identical, 0 is orthogonal
1371 ///
1372 /// When the `simd` feature is enabled, this will automatically use
1373 /// AVX2 (x86_64) or NEON (aarch64) acceleration if available.
1374 ///
1375 /// # Examples
1376 ///
1377 /// ```
1378 /// use embeddenator_vsa::SparseVec;
1379 ///
1380 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
1381 /// let vec1 = SparseVec::encode_data(b"cat", &config, None);
1382 /// let vec2 = SparseVec::encode_data(b"cat", &config, None);
1383 /// let vec3 = SparseVec::encode_data(b"dog", &config, None);
1384 ///
1385 /// // Identical data produces identical vectors
1386 /// assert!((vec1.cosine(&vec2) - 1.0).abs() < 0.01);
1387 ///
1388 /// // Different data produces low similarity
1389 /// let sim = vec1.cosine(&vec3);
1390 /// assert!(sim < 0.3);
1391 /// ```
1392 pub fn cosine(&self, other: &SparseVec) -> f64 {
1393 #[cfg(feature = "simd")]
1394 {
1395 crate::simd_cosine::cosine_simd(self, other)
1396 }
1397
1398 #[cfg(not(feature = "simd"))]
1399 {
1400 // Original implementation remains as fallback
1401 self.cosine_scalar(other)
1402 }
1403 }
1404
    /// Scalar (non-SIMD) cosine similarity implementation.
    ///
    /// Baseline for the SIMD path and the fallback when the `simd` feature is disabled.
    /// For sufficiently dense operands it still delegates to the packed word-wise dot
    /// product; otherwise it counts signed intersections over the sorted supports.
1409 #[inline]
1410 pub fn cosine_scalar(&self, other: &SparseVec) -> f64 {
1411 {
1412 // Only use packed cosine when total density is high enough to amortize conversion,
1413 // and avoid converting an extremely sparse operand.
1414 // Thresholds lowered from DIM/4 to DIM/8 for earlier SIMD dispatch.
1415 let a_nnz = self.nnz();
1416 let b_nnz = other.nnz();
1417 let total = a_nnz + b_nnz;
1418 if total > DIM / 8 {
1419 let min_nnz = a_nnz.min(b_nnz);
1420 if min_nnz > DIM / 64 {
1421 let dot = PACKED_SCRATCH_A.with(|a_cell| {
1422 PACKED_SCRATCH_B.with(|b_cell| {
1423 let mut a = a_cell.borrow_mut();
1424 let mut b = b_cell.borrow_mut();
1425 a.fill_from_sparsevec(self, DIM);
1426 b.fill_from_sparsevec(other, DIM);
1427 a.dot(&b)
1428 })
1429 }) as f64;
1430 let self_norm = a_nnz as f64;
1431 let other_norm = b_nnz as f64;
1432 if self_norm == 0.0 || other_norm == 0.0 {
1433 return 0.0;
1434 }
1435 return dot / (self_norm.sqrt() * other_norm.sqrt());
1436 }
1437 }
1438 }
1439
1440 // Sparse ternary dot product:
1441 // +1 when signs match, -1 when signs oppose.
1442 let pp = Self::intersection_count_sorted(&self.pos, &other.pos) as i32;
1443 let nn = Self::intersection_count_sorted(&self.neg, &other.neg) as i32;
1444 let pn = Self::intersection_count_sorted(&self.pos, &other.neg) as i32;
1445 let np = Self::intersection_count_sorted(&self.neg, &other.pos) as i32;
1446 let dot = (pp + nn) - (pn + np);
1447
1448 let self_norm = self.nnz() as f64;
1449 let other_norm = other.nnz() as f64;
1450
1451 if self_norm == 0.0 || other_norm == 0.0 {
1452 return 0.0;
1453 }
1454
1455 dot as f64 / (self_norm.sqrt() * other_norm.sqrt())
1456 }
1457
1458 /// Apply cyclic permutation to vector indices
1459 /// Used for encoding sequence order in hierarchical structures
1460 ///
1461 /// # Arguments
1462 /// * `shift` - Number of positions to shift indices cyclically
1463 ///
1464 /// # Examples
1465 ///
1466 /// ```
1467 /// use embeddenator_vsa::SparseVec;
1468 ///
1469 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
1470 /// let vec = SparseVec::encode_data(b"test", &config, None);
1471 /// let permuted = vec.permute(100);
1472 ///
1473 /// // Permuted vector should have different indices but same structure
1474 /// assert_eq!(vec.pos.len(), permuted.pos.len());
1475 /// assert_eq!(vec.neg.len(), permuted.neg.len());
1476 /// ```
1477 pub fn permute(&self, shift: usize) -> SparseVec {
1478 let permute_index = |idx: usize| (idx + shift) % DIM;
1479
1480 let pos: Vec<usize> = self.pos.iter().map(|&idx| permute_index(idx)).collect();
1481 let neg: Vec<usize> = self.neg.iter().map(|&idx| permute_index(idx)).collect();
1482
1483 // Indices must remain sorted for efficient operations
1484 let mut pos = pos;
1485 let mut neg = neg;
1486 pos.sort_unstable();
1487 neg.sort_unstable();
1488
1489 SparseVec { pos, neg }
1490 }
1491
1492 /// Apply inverse cyclic permutation to vector indices
1493 /// Decodes sequence order by reversing the permutation shift
1494 ///
1495 /// # Arguments
1496 /// * `shift` - Number of positions to reverse shift indices cyclically
1497 ///
1498 /// # Examples
1499 ///
1500 /// ```
1501 /// use embeddenator_vsa::SparseVec;
1502 ///
1503 /// let config = embeddenator_vsa::ReversibleVSAConfig::default();
1504 /// let vec = SparseVec::encode_data(b"test", &config, None);
1505 /// let permuted = vec.permute(100);
1506 /// let recovered = permuted.inverse_permute(100);
1507 ///
1508 /// // Round-trip should recover original vector
1509 /// assert_eq!(vec.pos, recovered.pos);
1510 /// assert_eq!(vec.neg, recovered.neg);
1511 /// ```
1512 pub fn inverse_permute(&self, shift: usize) -> SparseVec {
1513 let inverse_permute_index = |idx: usize| (idx + DIM - (shift % DIM)) % DIM;
1514
1515 let pos: Vec<usize> = self
1516 .pos
1517 .iter()
1518 .map(|&idx| inverse_permute_index(idx))
1519 .collect();
1520 let neg: Vec<usize> = self
1521 .neg
1522 .iter()
1523 .map(|&idx| inverse_permute_index(idx))
1524 .collect();
1525
1526 // Indices must remain sorted for efficient operations
1527 let mut pos = pos;
1528 let mut neg = neg;
1529 pos.sort_unstable();
1530 neg.sort_unstable();
1531
1532 SparseVec { pos, neg }
1533 }
1534
1535 /// Context-Dependent Thinning Algorithm
1536 ///
1537 /// Thinning controls vector sparsity during bundle operations to prevent
1538 /// exponential density growth that degrades VSA performance. The algorithm:
1539 ///
    /// 1. Compute the current non-zero count: pos.len() + neg.len()
    /// 2. If that count is already <= `target_non_zero`, return the vector unchanged
    /// 3. Otherwise, randomly sample indices down to the target count
    /// 4. Preserve the pos/neg ratio to maintain signal polarity balance
    /// 5. Use deterministic seeding for reproducible results
1545 ///
1546 /// Edge Cases:
1547 /// - Empty vector: return unchanged
1548 /// - target_non_zero = 0: return empty vector (not recommended)
1549 /// - target_non_zero >= current: return clone
1550 /// - Single polarity vectors: preserve polarity distribution
1551 ///
    /// Performance: O(m) for cloning and sampling plus O(k log k) for re-sorting,
    /// where m is the current non-zero count and k = target_non_zero
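    ///
    /// # Example (illustrative; constructs a vector directly via the public fields)
    ///
    /// ```rust
    /// use embeddenator_vsa::SparseVec;
    ///
    /// let vec = SparseVec { pos: vec![1, 4, 9, 16, 25, 36], neg: vec![2, 8, 18, 32] };
    /// let thinned = vec.thin(5);
    ///
    /// // The 6:4 polarity ratio is preserved: 3 positive and 2 negative indices survive.
    /// assert_eq!(thinned.pos.len(), 3);
    /// assert_eq!(thinned.neg.len(), 2);
    /// ```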
1553 pub fn thin(&self, target_non_zero: usize) -> SparseVec {
1554 let current_count = self.pos.len() + self.neg.len();
1555
1556 // Edge case: already at or below target
1557 if current_count <= target_non_zero {
1558 return self.clone();
1559 }
1560
1561 // Edge case: target is zero
1562 if target_non_zero == 0 {
1563 return SparseVec::new();
1564 }
1565
1566 // Calculate how many to keep from each polarity
1567 let pos_ratio = self.pos.len() as f32 / current_count as f32;
1568 let target_pos = (target_non_zero as f32 * pos_ratio).round() as usize;
1569 let target_neg = target_non_zero - target_pos;
1570
1571 // Randomly sample indices using deterministic seed based on vector content
1572 let mut seed = [0u8; 32];
1573 seed[0..4].copy_from_slice(&(self.pos.len() as u32).to_le_bytes());
1574 seed[4..8].copy_from_slice(&(self.neg.len() as u32).to_le_bytes());
1575 seed[8..12].copy_from_slice(&(target_non_zero as u32).to_le_bytes());
1576 // Rest remains zero for deterministic seeding
1577 let mut rng = rand::rngs::StdRng::from_seed(seed);
1578
1579 // Sample positive indices
1580 let mut sampled_pos: Vec<usize> = self.pos.clone();
1581 sampled_pos.shuffle(&mut rng);
1582 sampled_pos.truncate(target_pos);
1583 sampled_pos.sort_unstable();
1584
1585 // Sample negative indices
1586 let mut sampled_neg: Vec<usize> = self.neg.clone();
1587 sampled_neg.shuffle(&mut rng);
1588 sampled_neg.truncate(target_neg);
1589 sampled_neg.sort_unstable();
1590
1591 SparseVec {
1592 pos: sampled_pos,
1593 neg: sampled_neg,
1594 }
1595 }
1596}