// embeddenator_vsa/codebook.rs

//! Codebook - Differential Encoding Base Model
//!
//! The codebook serves as a learned/constructed basis set for differential encoding.
//! Data is projected onto this basis, and only the residuals (what can't be expressed
//! by the codebook) plus semantic markers are stored in the engram.
//!
//! # Architecture
//!
//! ```text
//! Codebook = { basis_vectors: [B₀, B₁, ..., Bₙ], semantic_markers: [...] }
//!
//! Encoding:  data → coefficients × basis + residual + semantic_outliers
//! Decoding:  coefficients × basis + residual + semantic_outliers → data
//! ```
//!
//! # Note on Reconstruction
//!
//! The codebook is required for data reconstruction:
//! - Reconstruction needs the same codebook that produced the encoding
//! - Different codebooks produce different representations
//!
//! Note: This is not a security feature. The codebook is not encryption.
//! Security features will be implemented separately in the future.
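//!
//! # Example
//!
//! A minimal round-trip sketch using the types defined in this module:
//!
//! ```text
//! let mut codebook = Codebook::new(DIM);
//! codebook.initialize_standard_basis();
//!
//! let data = b"hello world";
//! let projection = codebook.project(data);
//! let restored = codebook.reconstruct(&projection, data.len());
//! assert_eq!(&restored[..], &data[..]);
//! ```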

use crate::vsa::{SparseVec, DIM};
use crate::VsaError;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// 64-bit balanced ternary encoding unit
/// - 61 bits: data payload (38 trits of information)
/// - 3 bits: metadata tag (see [`WordMetadata`])
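///
/// For example (a sketch of the layout): `BalancedTernaryWord::new(42, WordMetadata::Data)`
/// packs 42's 38-trit balanced-ternary encoding into bits 0-60 and the tag
/// `0b000` into bits 61-63; `decode()` then recovers 42 and `metadata()`
/// recovers `WordMetadata::Data`.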
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct BalancedTernaryWord {
    /// Raw 64-bit representation
    /// Bits 0-60: 38 trits of data (each trit = log₂(3) ≈ 1.585 bits)
    /// Bits 61-63: metadata tag
    packed: u64,
}

/// Metadata flags stored in the upper 3 bits
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum WordMetadata {
    /// Standard data word
    Data = 0b000,
    /// Semantic outlier marker
    SemanticOutlier = 0b001,
    /// Residual correction word
    Residual = 0b010,
    /// Continuation of previous word
    Continuation = 0b011,
    /// End of sequence marker
    EndOfSequence = 0b100,
    /// Parity check word
    Parity = 0b101,
}

impl BalancedTernaryWord {
    /// Maximum value representable in 38 trits (signed balanced)
    /// 3^38 = 1,350,851,717,672,992,089 (fits in 61 bits)
    /// Range: -(3^38-1)/2 to +(3^38-1)/2
    pub const MAX_VALUE: i64 = 675_425_858_836_496_044;
    pub const MIN_VALUE: i64 = -675_425_858_836_496_044;

    /// Number of trits in the data portion (38 trits ≈ 60.2 bits,
    /// stored in the lower 61 bits)
    pub const DATA_TRITS: usize = 38;

    /// Nominal number of metadata trits (the tag occupies the upper 3 bits)
    pub const META_TRITS: usize = 2;

    /// Create a new word from a signed integer value and metadata
    pub fn new(value: i64, metadata: WordMetadata) -> Result<Self, VsaError> {
        if !(Self::MIN_VALUE..=Self::MAX_VALUE).contains(&value) {
            return Err(VsaError::ValueOutOfRange {
                value,
                min: Self::MIN_VALUE,
                max: Self::MAX_VALUE,
            });
        }

        // Convert signed value to balanced ternary representation
        let encoded = Self::encode_balanced_ternary(value);

        // Pack metadata into upper 3 bits
        let meta_bits = (metadata as u64) << 61;

        Ok(BalancedTernaryWord {
            packed: encoded | meta_bits,
        })
    }

    /// Create from raw packed representation
    pub fn from_raw(packed: u64) -> Self {
        BalancedTernaryWord { packed }
    }

    /// Get the raw packed value
    pub fn raw(&self) -> u64 {
        self.packed
    }

    /// Extract the data portion (lower 61 bits)
    pub fn data_bits(&self) -> u64 {
        self.packed & 0x1FFF_FFFF_FFFF_FFFF
    }

    /// Extract metadata
    pub fn metadata(&self) -> WordMetadata {
        match (self.packed >> 61) & 0b111 {
            0b000 => WordMetadata::Data,
            0b001 => WordMetadata::SemanticOutlier,
            0b010 => WordMetadata::Residual,
            0b011 => WordMetadata::Continuation,
            0b100 => WordMetadata::EndOfSequence,
            0b101 => WordMetadata::Parity,
            _ => WordMetadata::Data, // Default fallback
        }
    }

    /// Decode to signed integer value
    pub fn decode(&self) -> i64 {
        Self::decode_balanced_ternary(self.data_bits())
    }

    /// Encode a signed integer to balanced ternary packed representation
    ///
    /// We store the value directly as a base-3 representation where:
    /// - Digit 0 = trit 0
    /// - Digit 1 = trit +1
    /// - Digit 2 = trit -1
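    ///
    /// Worked example for value 5 (trits written least-significant first):
    /// 5 = (-1)·1 + (-1)·3 + (+1)·9, i.e. trits [-1, -1, +1], stored as
    /// digits [2, 2, 1], so packed = 2·1 + 2·3 + 1·9 = 17.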
    fn encode_balanced_ternary(value: i64) -> u64 {
        // For balanced ternary, we convert by repeatedly dividing
        // and adjusting for the balanced representation
        let mut v = value;
        let mut result: u64 = 0;
        let mut power: u64 = 1;

        for _ in 0..Self::DATA_TRITS {
            // Get remainder in range [-1, 0, 1]
            let mut rem = v % 3;
            v /= 3;

            if rem == 2 {
                rem = -1;
                v += 1;
            } else if rem == -2 {
                rem = 1;
                v -= 1;
            }

            // Encode: -1 -> 2, 0 -> 0, +1 -> 1
            let encoded = match rem {
                -1 => 2u64,
                0 => 0u64,
                1 => 1u64,
                _ => 0u64, // Safety fallback
            };

            result += encoded * power;
            power *= 3;
        }

        result
    }

    /// Decode balanced ternary packed representation to signed integer
    fn decode_balanced_ternary(packed: u64) -> i64 {
        let mut result: i64 = 0;
        let mut power: i64 = 1;
        let mut remaining = packed;

        for _ in 0..Self::DATA_TRITS {
            let trit = remaining % 3;
            remaining /= 3;

            match trit {
                0 => {} // Add 0
                1 => result += power,
                2 => result -= power, // -1 in balanced ternary
                _ => unreachable!(),
            }
            power *= 3;
        }

        result
    }

    /// Negate all trits in a packed representation
    #[allow(dead_code)]
    fn negate_trits(packed: u64) -> u64 {
        let mut result: u64 = 0;
        let mut remaining = packed;
        let mut power: u64 = 1;

        for _ in 0..Self::DATA_TRITS {
            let trit = remaining % 3;
            remaining /= 3;

            // Negate: 0->0, 1->2, 2->1
            let negated = match trit {
                0 => 0,
                1 => 2,
                2 => 1,
                _ => unreachable!(),
            };
            result += negated * power;
            power *= 3;
        }

        result
    }

    /// Compute parity trit for error detection
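    ///
    /// For example, if the balanced trit sum of the data is 1, the parity
    /// trit is -1, since 1 + (-1) ≡ 0 (mod 3).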
    pub fn compute_parity(&self) -> i8 {
        let mut sum: i64 = 0;
        let mut remaining = self.data_bits();

        for _ in 0..Self::DATA_TRITS {
            let trit = (remaining % 3) as i64;
            remaining /= 3;

            // Convert to balanced: 0->0, 1->1, 2->-1
            sum += match trit {
                0 => 0,
                1 => 1,
                2 => -1,
                _ => 0,
            };
        }

        // Parity trit: the balanced trit that makes the sum divisible by 3.
        // The unbalanced digit 2 corresponds to the balanced trit -1.
        match (3 - sum.rem_euclid(3)) % 3 {
            2 => -1,
            digit => digit as i8,
        }
    }
}

/// Semantic outlier detected during analysis
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SemanticOutlier {
    /// Position in the original data
    pub position: usize,
    /// Length of the outlier pattern
    pub length: usize,
    /// Entropy score (higher = more unusual)
    pub entropy_score: f64,
    /// The outlier pattern encoded as balanced ternary words
    pub encoded_pattern: Vec<BalancedTernaryWord>,
    /// Semantic vector for similarity matching
    pub semantic_vec: SparseVec,
}

/// Basis vector in the codebook
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct BasisVector {
    /// Unique identifier for this basis
    pub id: u32,
    /// The sparse ternary representation
    pub vector: SparseVec,
    /// Human-readable label (optional)
    pub label: Option<String>,
    /// Frequency weight (how often this pattern appears)
    pub weight: f64,
}

/// The Codebook - the shared basis required for reconstruction
/// (not a security feature; see the module-level notes)
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Codebook {
    /// Version for compatibility
    pub version: u32,

    /// Dimensionality of basis vectors
    pub dimensionality: usize,

    /// The basis vectors forming the encoding dictionary
    /// Data is projected onto these bases
    pub basis_vectors: Vec<BasisVector>,

    /// Semantic marker vectors for outlier detection
    pub semantic_markers: Vec<SparseVec>,

    /// Statistics for adaptive encoding
    pub statistics: CodebookStatistics,

    /// Optional salt mixed into deterministic basis generation
    pub salt: Option<[u8; 32]>,
}

/// Statistics tracked by the codebook
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct CodebookStatistics {
    /// Total bytes encoded using this codebook
    pub total_bytes_encoded: u64,
    /// Average compression ratio achieved
    pub avg_compression_ratio: f64,
    /// Number of semantic outliers detected
    pub outlier_count: u64,
    /// Distribution of coefficient magnitudes
    pub coefficient_histogram: [u64; 16],
}

/// Configuration for codebook projection operations
#[derive(Clone, Debug)]
pub struct ProjectionConfig {
    /// Size of data chunks to process (bytes)
    pub chunk_size: usize,
    /// Similarity threshold for basis vector relevance
    pub similarity_threshold: f64,
    /// Maximum number of top basis matches to use
    pub max_basis_matches: usize,
    /// Scaling factor for coefficient encoding
    pub coefficient_scale: f64,
    /// Key spacing factor for coefficient indexing
    pub coefficient_key_spacing: u32,
}

impl Default for ProjectionConfig {
    fn default() -> Self {
        Self {
            chunk_size: 64,
            similarity_threshold: 0.3,
            max_basis_matches: 4,
            coefficient_scale: 1000.0,
            coefficient_key_spacing: 1000,
        }
    }
}

/// Result of projecting data onto the codebook
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProjectionResult {
    /// Coefficients for each basis vector (sparse - only non-zero)
    pub coefficients: HashMap<u32, BalancedTernaryWord>,
    /// Residual that couldn't be expressed by the basis
    pub residual: Vec<BalancedTernaryWord>,
    /// Detected semantic outliers
    pub outliers: Vec<SemanticOutlier>,
    /// Reconstruction quality score (1.0 = perfect)
    pub quality_score: f64,
}

impl Default for Codebook {
    fn default() -> Self {
        Self::new(DIM)
    }
}

impl Codebook {
    /// Create a new empty codebook
    pub fn new(dimensionality: usize) -> Self {
        Codebook {
            version: 1,
            dimensionality,
            basis_vectors: Vec::new(),
            semantic_markers: Vec::new(),
            statistics: CodebookStatistics::default(),
            salt: None,
        }
    }

    /// Create a codebook with a salt that is mixed into deterministic
    /// basis-vector generation (not a cryptographic key; see module notes)
    pub fn with_salt(dimensionality: usize, salt: [u8; 32]) -> Self {
        let mut codebook = Self::new(dimensionality);
        codebook.salt = Some(salt);
        codebook
    }

    /// Initialize with common basis vectors for text/binary data
    pub fn initialize_standard_basis(&mut self) {
        // Add basis vectors for common byte patterns
        // These act as a "vocabulary" for differential encoding

        // Zero runs (common in binary)
        self.add_basis_for_pattern(0, b"\x00\x00\x00\x00", "zero_run");

        // ASCII space/newline (common in text)
        self.add_basis_for_pattern(1, b"    ", "space_run");
        self.add_basis_for_pattern(2, b"\n\n", "newline_pair");

        // Common text patterns
        self.add_basis_for_pattern(3, b"the ", "the_space");
        self.add_basis_for_pattern(4, b"ing ", "ing_space");
        self.add_basis_for_pattern(5, b"tion", "tion");

        // Binary markers
        self.add_basis_for_pattern(6, b"\x89PNG", "png_header");
        self.add_basis_for_pattern(7, b"\xFF\xD8\xFF", "jpeg_header");
        self.add_basis_for_pattern(8, b"PK\x03\x04", "zip_header");

        // Add semantic markers for entropy detection
        self.initialize_semantic_markers();
    }

    /// Add a basis vector for a specific pattern
    fn add_basis_for_pattern(&mut self, id: u32, pattern: &[u8], label: &str) {
        use sha2::{Digest, Sha256};

        // Generate deterministic sparse vector from pattern
        let mut hasher = Sha256::new();
        hasher.update(pattern);
        if let Some(salt) = &self.salt {
            hasher.update(salt);
        }
        let hash = hasher.finalize();

        // Use hash to seed sparse vector generation
        let seed: [u8; 32] = hash.into();
        let vector = SparseVec::from_seed(&seed, self.dimensionality);

        self.basis_vectors.push(BasisVector {
            id,
            vector,
            label: Some(label.to_string()),
            weight: 1.0,
        });
    }

    /// Initialize semantic markers for outlier detection
    fn initialize_semantic_markers(&mut self) {
        use sha2::{Digest, Sha256};

        let seed_for = |label: &str| -> [u8; 32] {
            let mut hasher = Sha256::new();
            hasher.update(b"embeddenator:semantic_marker:v1:");
            hasher.update(label.as_bytes());
            hasher.update((self.dimensionality as u64).to_le_bytes());
            if let Some(salt) = &self.salt {
                hasher.update(salt);
            }
            hasher.finalize().into()
        };

        // High entropy marker
        let seed = seed_for("high_entropy");
        self.semantic_markers
            .push(SparseVec::from_seed(&seed, self.dimensionality));

        // Repetition marker
        let seed = seed_for("repetition");
        self.semantic_markers
            .push(SparseVec::from_seed(&seed, self.dimensionality));

        // Boundary marker (transitions)
        let seed = seed_for("boundary");
        self.semantic_markers
            .push(SparseVec::from_seed(&seed, self.dimensionality));
    }

    /// Project data onto the codebook basis
    /// Returns coefficients, residual, and detected outliers
    pub fn project(&self, data: &[u8]) -> ProjectionResult {
        self.project_with_config(data, &ProjectionConfig::default())
    }

    /// Project data onto the codebook using custom configuration
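    ///
    /// A minimal sketch of custom-config use:
    ///
    /// ```text
    /// let config = ProjectionConfig { chunk_size: 128, ..Default::default() };
    /// let projection = codebook.project_with_config(data, &config);
    /// ```
    ///
    /// Note: `reconstruct_chunk` and `reconstruct` currently assume
    /// `ProjectionConfig::default()`, so projections made with non-default
    /// values may not reconstruct correctly.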
    pub fn project_with_config(&self, data: &[u8], config: &ProjectionConfig) -> ProjectionResult {
        let mut coefficients = HashMap::new();
        let mut residual = Vec::new();
        let mut outliers = Vec::new();

        // 1. Analyze data for semantic outliers (entropy spikes)
        let detected_outliers = self.detect_semantic_outliers(data);
        outliers.extend(detected_outliers);

        // 2. Project data chunks onto basis vectors
        let chunk_size = config.chunk_size;
        for (chunk_idx, chunk) in data.chunks(chunk_size).enumerate() {
            let chunk_vec = SparseVec::from_bytes(chunk);

            // Find best matching basis vectors
            let mut best_matches: Vec<(u32, f64)> = self
                .basis_vectors
                .iter()
                .map(|basis| (basis.id, chunk_vec.cosine(&basis.vector)))
                .filter(|(_, sim)| *sim > config.similarity_threshold)
                .collect();

            // Sort by similarity (descending), treating NaN as less than any value
            best_matches.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Less));

            // Take top N matches
            for (basis_id, similarity) in best_matches.iter().take(config.max_basis_matches) {
                // Encode coefficient as balanced ternary
                let coef_value = (*similarity * config.coefficient_scale) as i64;
                if let Ok(word) = BalancedTernaryWord::new(coef_value, WordMetadata::Data) {
                    // Key layout: basis_id * spacing + chunk_idx. This assumes
                    // chunk_idx < coefficient_key_spacing; larger inputs would
                    // cause key collisions between bases.
                    coefficients.insert(
                        *basis_id * config.coefficient_key_spacing + chunk_idx as u32,
                        word,
                    );
                }
            }

            // 3. Compute residual (what basis couldn't capture)
            let reconstructed = self.reconstruct_chunk(&coefficients, chunk_idx, chunk.len());
            let chunk_residual = self.compute_residual(chunk, &reconstructed);

            for residual_byte in chunk_residual {
                if let Ok(word) =
                    BalancedTernaryWord::new(residual_byte as i64, WordMetadata::Residual)
                {
                    residual.push(word);
                }
            }
        }

        // Calculate quality score
        let quality_score = self.calculate_quality_score(data, &coefficients, &residual);

        ProjectionResult {
            coefficients,
            residual,
            outliers,
            quality_score,
        }
    }

    /// Detect semantic outliers (high entropy, rare patterns)
    fn detect_semantic_outliers(&self, data: &[u8]) -> Vec<SemanticOutlier> {
        let mut outliers = Vec::new();
        let window_size = 32;

        if data.len() < window_size {
            return outliers;
        }

        for i in 0..=data.len() - window_size {
            let window = &data[i..i + window_size];
            let entropy = self.calculate_entropy(window);

            // High entropy windows are outliers (compressed/encrypted data)
            if entropy > 7.5 {
                let pattern_vec = SparseVec::from_bytes(window);

                // Encode the outlier pattern, 7 bytes (56 bits) per word so the
                // value always fits in the 38-trit range (MAX_VALUE > 2^56);
                // 8-byte chunks could overflow the encodable range.
                let mut encoded_pattern = Vec::new();
                for chunk in window.chunks(7) {
                    let value = chunk
                        .iter()
                        .enumerate()
                        .fold(0i64, |acc, (j, &b)| acc + ((b as i64) << (j * 8)));
                    if let Ok(word) = BalancedTernaryWord::new(value, WordMetadata::SemanticOutlier)
                    {
                        encoded_pattern.push(word);
                    }
                }

                outliers.push(SemanticOutlier {
                    position: i,
                    length: window_size,
                    entropy_score: entropy,
                    encoded_pattern,
                    semantic_vec: pattern_vec,
                });

                // Skip ahead to avoid overlapping outliers
                // i += window_size / 2; // Can't mutate loop variable, handled by dedup below
            }
        }

        // Deduplicate overlapping outliers
        outliers.dedup_by(|a, b| a.position.abs_diff(b.position) < window_size / 2);

        outliers
    }

    /// Calculate Shannon entropy of a byte slice
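    ///
    /// Computes H = -Σ pᵢ·log₂(pᵢ) over the byte-frequency distribution.
    /// The result lies in [0.0, 8.0] bits per byte; `detect_semantic_outliers`
    /// flags windows scoring above 7.5 (near-uniform, e.g. compressed data).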
    fn calculate_entropy(&self, data: &[u8]) -> f64 {
        let mut counts = [0u32; 256];
        for &byte in data {
            counts[byte as usize] += 1;
        }

        let len = data.len() as f64;
        counts
            .iter()
            .filter(|&&c| c > 0)
            .map(|&c| {
                let p = c as f64 / len;
                -p * p.log2()
            })
            .sum()
    }

    /// Reconstruct a chunk from coefficients
    ///
    /// This implementation combines basis vectors weighted by their coefficients.
    /// The coefficient keys are structured as: `basis_id * coefficient_key_spacing + chunk_idx`
    ///
    /// Note: Since basis vectors are sparse ternary representations (SparseVec),
    /// reconstruction is an approximation. The actual byte-level reconstruction
    /// relies on the residual corrections applied after this step.
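    ///
    /// For example, with the default spacing of 1000, the coefficient of
    /// basis 3 for chunk 7 is stored under key 3 * 1000 + 7 = 3007.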
    fn reconstruct_chunk(
        &self,
        coefficients: &HashMap<u32, BalancedTernaryWord>,
        chunk_idx: usize,
        chunk_len: usize,
    ) -> Vec<u8> {
        // Early return for empty cases
        if chunk_len == 0 || coefficients.is_empty() || self.basis_vectors.is_empty() {
            return vec![0u8; chunk_len];
        }

        // Assumes the projection was made with the default config; a custom
        // key spacing or scale would need to be threaded through here.
        let config = ProjectionConfig::default();
        let key_spacing = config.coefficient_key_spacing;
        let coef_scale = config.coefficient_scale;

        // Accumulator for weighted reconstruction (using i32 for intermediate values)
        let mut reconstruction: Vec<i32> = vec![0i32; chunk_len];

        // Find coefficients for this chunk
        for basis in &self.basis_vectors {
            let key = basis.id * key_spacing + chunk_idx as u32;
            if let Some(coef_word) = coefficients.get(&key) {
                let coef_value = coef_word.decode();
                // Convert back from scaled similarity to weight factor
                let weight = coef_value as f64 / coef_scale;

                // Apply weighted contribution from this basis vector
                // Since basis vectors are sparse ternary, we use their structure
                // to influence reconstruction. The magnitude is based on coefficient.
                //
                // For reconstruction: treat pos indices as contributing +weight
                // and neg indices as contributing -weight to nearby byte positions.
                let chunk_weight = (weight * 128.0) as i32;

                // Map basis vector indices to chunk positions using modulo.
                // Note: This is an approximation since the basis covers the full DIM space.
                // The modulo mapping may cause non-uniform distribution, but this is
                // acceptable because the residual mechanism compensates for any
                // reconstruction errors in the final output.
                for &idx in &basis.vector.pos {
                    let pos = idx % chunk_len;
                    reconstruction[pos] = reconstruction[pos].saturating_add(chunk_weight);
                }
                for &idx in &basis.vector.neg {
                    let pos = idx % chunk_len;
                    reconstruction[pos] = reconstruction[pos].saturating_sub(chunk_weight);
                }
            }
        }

        // Clamp accumulated values to the u8 range
        reconstruction
            .iter()
            .map(|&val| val.clamp(0, 255) as u8)
            .collect()
    }

    /// Compute residual between original and reconstructed
    fn compute_residual(&self, original: &[u8], reconstructed: &[u8]) -> Vec<u8> {
        original
            .iter()
            .zip(reconstructed.iter())
            .map(|(&o, &r)| o.wrapping_sub(r))
            .collect()
    }

    /// Calculate reconstruction quality score
    ///
    /// Quality is measured based on how well the basis vectors capture the data.
    /// A score of 1.0 means excellent basis coverage; near 0.0 means poor coverage.
    ///
    /// The score reflects:
    /// 1. Number of basis vectors that matched the data (coefficients stored)
    /// 2. The magnitude of similarity scores (higher = better match)
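    ///
    /// For example, an average stored similarity of 0.6 with 80% chunk
    /// coverage yields 0.6 * 0.5 + 0.8 * 0.5 = 0.7.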
    fn calculate_quality_score(
        &self,
        original: &[u8],
        coefficients: &HashMap<u32, BalancedTernaryWord>,
        _residual: &[BalancedTernaryWord],
    ) -> f64 {
        if original.is_empty() {
            return 1.0; // Empty data is trivially reconstructed
        }

        if coefficients.is_empty() {
            // No basis vectors matched - low quality
            // But not zero since residual can still reconstruct
            return 0.1;
        }

        let config = ProjectionConfig::default();

        // Calculate average coefficient magnitude (similarity scores)
        let total_coef_magnitude: f64 = coefficients
            .values()
            .map(|word| {
                let val = word.decode() as f64;
                // Convert back from scaled similarity
                (val / config.coefficient_scale).abs()
            })
            .sum();

        let avg_similarity = total_coef_magnitude / coefficients.len() as f64;

        // Calculate coverage: how many chunks have at least one coefficient
        let chunk_count = original.len().div_ceil(config.chunk_size);
        let key_spacing = config.coefficient_key_spacing;

        let chunks_with_coefs: std::collections::HashSet<u32> =
            coefficients.keys().map(|&key| key % key_spacing).collect();

        let coverage_ratio = chunks_with_coefs.len() as f64 / chunk_count.max(1) as f64;

        // Quality combines average similarity with coverage
        // Both factors are in [0, 1] range
        let quality = (avg_similarity * 0.5 + coverage_ratio * 0.5).min(1.0);

        // Ensure non-zero quality when coefficients exist
        quality.max(0.1)
    }

    /// Reconstruct original data from projection result
    pub fn reconstruct(&self, projection: &ProjectionResult, expected_size: usize) -> Vec<u8> {
        let mut result = Vec::with_capacity(expected_size);

        // 1. Reconstruct from basis coefficients (must mirror the chunking
        //    used in `project_with_config`, including a short final chunk)
        let chunk_size = ProjectionConfig::default().chunk_size;
        let num_chunks = expected_size.div_ceil(chunk_size);

        for chunk_idx in 0..num_chunks {
            let len = chunk_size.min(expected_size - chunk_idx * chunk_size);
            let chunk = self.reconstruct_chunk(&projection.coefficients, chunk_idx, len);
            result.extend(chunk);
        }

        // 2. Apply residual corrections
        for (i, residual_word) in projection.residual.iter().enumerate() {
            if i < result.len() {
                let correction = residual_word.decode() as u8;
                result[i] = result[i].wrapping_add(correction);
            }
        }

        // 3. Apply semantic outlier corrections
        for outlier in &projection.outliers {
            if outlier.position + outlier.length <= result.len() {
                // Decode outlier pattern and overwrite (7 bytes per word,
                // matching the encoding in `detect_semantic_outliers`)
                let mut decoded = Vec::new();
                for word in &outlier.encoded_pattern {
                    let value = word.decode();
                    for j in 0..7 {
                        decoded.push(((value >> (j * 8)) & 0xFF) as u8);
                    }
                }

                for (j, &byte) in decoded.iter().enumerate().take(outlier.length) {
                    if outlier.position + j < result.len() {
                        result[outlier.position + j] = byte;
                    }
                }
            }
        }

        result.truncate(expected_size);
        result
    }
}

impl SparseVec {
    /// Create a sparse vector from a seed (deterministic)
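    ///
    /// With `dim = 10_000`, for example, this selects 100 positive and 100
    /// negative indices (1% density each), identical for identical seeds.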
    pub fn from_seed(seed: &[u8; 32], dim: usize) -> Self {
        use rand::seq::SliceRandom;
        use rand::SeedableRng;

        let mut rng = rand::rngs::StdRng::from_seed(*seed);
        let sparsity = (dim / 100).max(1); // 1% density (guard: at least one index)

        let mut indices: Vec<usize> = (0..dim).collect();
        indices.shuffle(&mut rng);

        let mut pos: Vec<_> = indices[..sparsity].to_vec();
        let mut neg: Vec<_> = indices[sparsity..sparsity * 2].to_vec();

        pos.sort_unstable();
        neg.sort_unstable();

        SparseVec { pos, neg }
    }

    /// Create a sparse vector directly from bytes
    pub fn from_bytes(data: &[u8]) -> Self {
        use sha2::{Digest, Sha256};

        let mut hasher = Sha256::new();
        hasher.update(data);
        let hash = hasher.finalize();
        let seed: [u8; 32] = hash.into();

        Self::from_seed(&seed, DIM)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_balanced_ternary_roundtrip() {
        let test_values = [
            0i64,
            1,
            -1,
            100,
            -100,
            12345,
            -12345,
            BalancedTernaryWord::MAX_VALUE / 2,
            BalancedTernaryWord::MIN_VALUE / 2,
        ];

        for &value in &test_values {
            let word = BalancedTernaryWord::new(value, WordMetadata::Data)
                .expect("Test value should be encodable");
            let decoded = word.decode();
            assert_eq!(value, decoded, "Failed roundtrip for {}", value);
        }
    }

    #[test]
    fn test_balanced_ternary_metadata() {
        let word = BalancedTernaryWord::new(42, WordMetadata::SemanticOutlier)
            .expect("42 should be encodable");
        assert_eq!(word.metadata(), WordMetadata::SemanticOutlier);
        assert_eq!(word.decode(), 42);
    }

    #[test]
    fn test_balanced_ternary_range() {
        // Should succeed at boundaries
        assert!(
            BalancedTernaryWord::new(BalancedTernaryWord::MAX_VALUE, WordMetadata::Data).is_ok()
        );
        assert!(
            BalancedTernaryWord::new(BalancedTernaryWord::MIN_VALUE, WordMetadata::Data).is_ok()
        );

        // Should fail outside boundaries
        assert!(
            BalancedTernaryWord::new(BalancedTernaryWord::MAX_VALUE + 1, WordMetadata::Data)
                .is_err()
        );
        assert!(
            BalancedTernaryWord::new(BalancedTernaryWord::MIN_VALUE - 1, WordMetadata::Data)
                .is_err()
        );
    }

    #[test]
    fn test_codebook_projection() {
        let mut codebook = Codebook::new(10000);
        codebook.initialize_standard_basis();

        let data = b"the quick brown fox jumps over the lazy dog";
        let projection = codebook.project(data);

        assert!(projection.quality_score > 0.0);
        assert!(!projection.coefficients.is_empty() || !projection.residual.is_empty());
    }
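
    #[test]
    fn test_projection_roundtrip() {
        // Sketch of the intended invariant: under the default config the
        // stored residuals are the exact per-byte differences, so
        // project -> reconstruct should be byte-exact for this input.
        let mut codebook = Codebook::new(10000);
        codebook.initialize_standard_basis();

        let data = b"the quick brown fox jumps over the lazy dog";
        let projection = codebook.project(data);
        let restored = codebook.reconstruct(&projection, data.len());

        assert_eq!(&restored[..], &data[..]);
    }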

    #[test]
    fn test_parity_computation() {
        let word =
            BalancedTernaryWord::new(12345, WordMetadata::Data).expect("12345 should be encodable");
        let parity = word.compute_parity();
        assert!((-1..=1).contains(&parity));
    }
}
865}