embeddenator_vsa/codebook.rs

//! Codebook - Differential Encoding Base Model
//!
//! The codebook serves as a learned/constructed basis set for differential encoding.
//! Data is projected onto this basis, and only the residuals (what can't be expressed
//! by the codebook) plus semantic markers are stored in the engram.
//!
//! # Architecture
//!
//! ```text
//! Codebook = { basis_vectors: [B₀, B₁, ..., Bₙ], semantic_markers: [...] }
//!
//! Encoding:  data → coefficients × basis + residual + semantic_outliers
//! Decoding:  coefficients × basis + residual + semantic_outliers → data
//! ```
//!
//! # Note on Reconstruction
//!
//! The codebook is required for data reconstruction:
//! - Without the codebook, reconstruction is not possible
//! - Different codebooks produce different representations
//!
//! Note: This is not a security feature. The codebook is not encryption.
//! Security features will be implemented separately in the future.
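//!
//! # Example
//!
//! A minimal usage sketch (illustrative; assumes the crate's default `DIM`
//! and the byte-level basis described below):
//!
//! ```ignore
//! let mut codebook = Codebook::default();
//! codebook.initialize_byte_basis();
//!
//! let data = b"hello, world";
//! let projection = codebook.project(data);
//!
//! // Residual corrections make reconstruction byte-exact.
//! let restored = codebook.reconstruct(&projection, data.len());
//! assert_eq!(restored, data.to_vec());
//! ```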

use crate::vsa::{SparseVec, DIM};
use crate::VsaError;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// 64-bit balanced ternary encoding unit
/// - 61 bits: data payload (38 trits of information)
/// - 3 bits: metadata tag (see `WordMetadata`)
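///
/// A concrete packing sketch (illustrative values):
///
/// ```text
/// BalancedTernaryWord::new(5, WordMetadata::Residual):
///   5 = +9 − 3 − 1 → trits (LSB first): −1, −1, +1
///   stored digits (−1→2, 0→0, +1→1):     2,  2,  1
///   encoded = 2 + 2·3 + 1·9 = 17
///   packed  = 17 | (0b010 << 61)
/// ```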
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct BalancedTernaryWord {
    /// Raw 64-bit representation
    /// Bits 0-60: 38 trits of data (each trit carries log₂(3) ≈ 1.585 bits)
    /// Bits 61-63: metadata tag
    packed: u64,
}

/// Metadata flags stored in the upper 3 bits
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum WordMetadata {
    /// Standard data word
    Data = 0b000,
    /// Semantic outlier marker
    SemanticOutlier = 0b001,
    /// Residual correction word
    Residual = 0b010,
    /// Continuation of previous word
    Continuation = 0b011,
    /// End of sequence marker
    EndOfSequence = 0b100,
    /// Parity check word
    Parity = 0b101,
}

impl BalancedTernaryWord {
    /// Maximum value representable in 38 trits (signed balanced)
    /// 3^38 = 1,350,851,717,672,992,089 (fits in 61 bits)
    /// Range: -(3^38-1)/2 to +(3^38-1)/2
    pub const MAX_VALUE: i64 = 675_425_858_836_496_044;
    pub const MIN_VALUE: i64 = -675_425_858_836_496_044;

    /// Number of trits in the data portion (38 trits packed into 61 bits)
    pub const DATA_TRITS: usize = 38;

    /// Nominal trit budget for metadata/parity; in practice the upper 3 bits
    /// hold a binary `WordMetadata` tag rather than packed trits
    pub const META_TRITS: usize = 2;

    /// Create a new word from a signed integer value and metadata
    pub fn new(value: i64, metadata: WordMetadata) -> Result<Self, VsaError> {
        if !(Self::MIN_VALUE..=Self::MAX_VALUE).contains(&value) {
            return Err(VsaError::ValueOutOfRange {
                value,
                min: Self::MIN_VALUE,
                max: Self::MAX_VALUE,
            });
        }

        // Convert signed value to balanced ternary representation
        let encoded = Self::encode_balanced_ternary(value);

        // Pack metadata into upper 3 bits
        let meta_bits = (metadata as u64) << 61;

        Ok(BalancedTernaryWord {
            packed: encoded | meta_bits,
        })
    }

    /// Create from raw packed representation
    pub fn from_raw(packed: u64) -> Self {
        BalancedTernaryWord { packed }
    }

    /// Get the raw packed value
    pub fn raw(&self) -> u64 {
        self.packed
    }

    /// Extract the data portion (lower 61 bits)
    pub fn data_bits(&self) -> u64 {
        self.packed & 0x1FFF_FFFF_FFFF_FFFF
    }

    /// Extract metadata
    pub fn metadata(&self) -> WordMetadata {
        match (self.packed >> 61) & 0b111 {
            0b000 => WordMetadata::Data,
            0b001 => WordMetadata::SemanticOutlier,
            0b010 => WordMetadata::Residual,
            0b011 => WordMetadata::Continuation,
            0b100 => WordMetadata::EndOfSequence,
            0b101 => WordMetadata::Parity,
            _ => WordMetadata::Data, // Unknown tags (0b110, 0b111) fall back to Data
        }
    }

    /// Decode to signed integer value
    pub fn decode(&self) -> i64 {
        Self::decode_balanced_ternary(self.data_bits())
    }

    /// Encode a signed integer to balanced ternary packed representation
    ///
    /// We store the value as a base-3 representation where:
    /// - Digit 0 = trit 0
    /// - Digit 1 = trit +1
    /// - Digit 2 = trit -1
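    ///
    /// A worked example (illustrative): encoding −4
    ///
    /// ```text
    /// −4 = −3 − 1 → trits (LSB first): −1, −1
    /// stored digits: 2, 2 → packed = 2 + 2·3 = 8
    /// ```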
    fn encode_balanced_ternary(value: i64) -> u64 {
        // For balanced ternary, we convert by repeatedly dividing
        // and adjusting for the balanced representation
        let mut v = value;
        let mut result: u64 = 0;
        let mut power: u64 = 1;

        for _ in 0..Self::DATA_TRITS {
            // Get remainder in range [-1, 0, 1]
            let mut rem = v % 3;
            v /= 3;

            if rem == 2 {
                rem = -1;
                v += 1;
            } else if rem == -2 {
                rem = 1;
                v -= 1;
            }

            // Encode: -1 -> 2, 0 -> 0, +1 -> 1
            let encoded = match rem {
                -1 => 2u64,
                0 => 0u64,
                1 => 1u64,
                _ => 0u64, // Safety fallback
            };

            result += encoded * power;
            power *= 3;
        }

        result
    }

    /// Decode balanced ternary packed representation to signed integer
    fn decode_balanced_ternary(packed: u64) -> i64 {
        let mut result: i64 = 0;
        let mut power: i64 = 1;
        let mut remaining = packed;

        for _ in 0..Self::DATA_TRITS {
            let trit = remaining % 3;
            remaining /= 3;

            match trit {
                0 => {} // Add 0
                1 => result += power,
                2 => result -= power, // -1 in balanced ternary
                _ => unreachable!(),
            }
            power *= 3;
        }

        result
    }

    /// Negate all trits in a packed representation
    #[allow(dead_code)]
    fn negate_trits(packed: u64) -> u64 {
        let mut result: u64 = 0;
        let mut remaining = packed;
        let mut power: u64 = 1;

        for _ in 0..Self::DATA_TRITS {
            let trit = remaining % 3;
            remaining /= 3;

            // Negate: 0->0, 1->2, 2->1
            let negated = match trit {
                0 => 0,
                1 => 2,
                2 => 1,
                _ => unreachable!(),
            };
            result += negated * power;
            power *= 3;
        }

        result
    }

    /// Compute parity trit for error detection
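    ///
    /// The parity trit is chosen so that (trit sum + parity) ≡ 0 (mod 3).
    /// For example, a word whose balanced trits sum to 4 gets parity −1,
    /// since 4 − 1 = 3 is divisible by 3.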
    pub fn compute_parity(&self) -> i8 {
        let mut sum: i64 = 0;
        let mut remaining = self.data_bits();

        for _ in 0..Self::DATA_TRITS {
            let trit = (remaining % 3) as i64;
            remaining /= 3;

            // Convert to balanced: 0->0, 1->1, 2->-1
            sum += match trit {
                0 => 0,
                1 => 1,
                2 => -1,
                _ => 0,
            };
        }

        // Balanced parity trit in {-1, 0, +1}: makes (sum + parity) divisible by 3
        match sum.rem_euclid(3) {
            0 => 0,
            1 => -1,
            _ => 1,
        }
    }
}

/// Semantic outlier detected during analysis
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SemanticOutlier {
    /// Position in the original data
    pub position: usize,
    /// Length of the outlier pattern
    pub length: usize,
    /// Entropy score (higher = more unusual)
    pub entropy_score: f64,
    /// The outlier pattern encoded as balanced ternary words
    pub encoded_pattern: Vec<BalancedTernaryWord>,
    /// Semantic vector for similarity matching
    pub semantic_vec: SparseVec,
}

/// Basis vector in the codebook
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct BasisVector {
    /// Unique identifier for this basis
    pub id: u32,
    /// The sparse ternary representation
    pub vector: SparseVec,
    /// Human-readable label (optional)
    pub label: Option<String>,
    /// Frequency weight (how often this pattern appears)
    pub weight: f64,
}

/// The Codebook - the basis set required for reconstruction
/// (not a security mechanism; see the module-level notes)
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Codebook {
    /// Version for compatibility
    pub version: u32,

    /// Dimensionality of basis vectors
    pub dimensionality: usize,

    /// The basis vectors forming the encoding dictionary
    /// Data is projected onto these bases
    pub basis_vectors: Vec<BasisVector>,

    /// Semantic marker vectors for outlier detection
    pub semantic_markers: Vec<SparseVec>,

    /// Statistics for adaptive encoding
    pub statistics: CodebookStatistics,

    /// Salt mixed into deterministic basis derivation (optional)
    pub salt: Option<[u8; 32]>,
}

/// Statistics tracked by the codebook
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct CodebookStatistics {
    /// Total bytes encoded using this codebook
    pub total_bytes_encoded: u64,
    /// Average compression ratio achieved
    pub avg_compression_ratio: f64,
    /// Number of semantic outliers detected
    pub outlier_count: u64,
    /// Distribution of coefficient magnitudes
    pub coefficient_histogram: [u64; 16],
}

/// Configuration for codebook projection operations
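///
/// A sketch of overriding a single field (struct-update syntax):
///
/// ```ignore
/// let config = ProjectionConfig {
///     chunk_size: 128,
///     ..ProjectionConfig::default()
/// };
/// ```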
#[derive(Clone, Debug)]
pub struct ProjectionConfig {
    /// Size of data chunks to process (bytes)
    pub chunk_size: usize,
    /// Similarity threshold for basis vector relevance
    pub similarity_threshold: f64,
    /// Maximum number of top basis matches to use
    pub max_basis_matches: usize,
    /// Scaling factor for coefficient encoding
    pub coefficient_scale: f64,
    /// Key spacing factor for coefficient indexing. Chunk indices at or above
    /// this value alias the keys of other basis vectors, so inputs larger than
    /// `chunk_size * coefficient_key_spacing` bytes are not reliably indexed.
    pub coefficient_key_spacing: u32,
}

impl Default for ProjectionConfig {
    fn default() -> Self {
        Self {
            chunk_size: 64,
            similarity_threshold: 0.3,
            max_basis_matches: 4,
            coefficient_scale: 1000.0,
            coefficient_key_spacing: 1000,
        }
    }
}

/// Configuration for codebook training
#[derive(Clone, Debug)]
pub struct CodebookTrainingConfig {
    /// Maximum number of learned basis vectors
    pub max_basis_vectors: usize,
    /// Minimum frequency for a pattern to become a basis
    pub min_frequency: u64,
    /// Whether to include byte-level basis vectors (0-255)
    pub include_byte_basis: bool,
    /// Whether to include position-aware basis vectors
    pub include_position_basis: bool,
}

impl Default for CodebookTrainingConfig {
    fn default() -> Self {
        Self {
            max_basis_vectors: 512,
            min_frequency: 5,
            include_byte_basis: true,
            include_position_basis: true,
        }
    }
}

/// Result of projecting data onto the codebook
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProjectionResult {
    /// Coefficients for each basis vector (sparse - only non-zero)
    pub coefficients: HashMap<u32, BalancedTernaryWord>,
    /// Residual that couldn't be expressed by the basis
    pub residual: Vec<BalancedTernaryWord>,
    /// Detected semantic outliers
    pub outliers: Vec<SemanticOutlier>,
    /// Reconstruction quality score (1.0 = perfect)
    pub quality_score: f64,
}

impl Default for Codebook {
    fn default() -> Self {
        Self::new(DIM)
    }
}

impl Codebook {
    /// Create a new empty codebook
    pub fn new(dimensionality: usize) -> Self {
        Codebook {
            version: 1,
            dimensionality,
            basis_vectors: Vec::new(),
            semantic_markers: Vec::new(),
            statistics: CodebookStatistics::default(),
            salt: None,
        }
    }

    /// Create a codebook with a salt mixed into basis derivation
    pub fn with_salt(dimensionality: usize, salt: [u8; 32]) -> Self {
        let mut codebook = Self::new(dimensionality);
        codebook.salt = Some(salt);
        codebook
    }

    /// Initialize with common basis vectors for text/binary data
    pub fn initialize_standard_basis(&mut self) {
        // Add basis vectors for common byte patterns
        // These act as a "vocabulary" for differential encoding

        // Zero runs (common in binary)
        self.add_basis_for_pattern(0, b"\x00\x00\x00\x00", "zero_run");

        // ASCII space/newline (common in text)
        self.add_basis_for_pattern(1, b"    ", "space_run");
        self.add_basis_for_pattern(2, b"\n\n", "newline_pair");

        // Common text patterns
        self.add_basis_for_pattern(3, b"the ", "the_space");
        self.add_basis_for_pattern(4, b"ing ", "ing_space");
        self.add_basis_for_pattern(5, b"tion", "tion");

        // Binary markers
        self.add_basis_for_pattern(6, b"\x89PNG", "png_header");
        self.add_basis_for_pattern(7, b"\xFF\xD8\xFF", "jpeg_header");
        self.add_basis_for_pattern(8, b"PK\x03\x04", "zip_header");

        // Add semantic markers for entropy detection
        self.initialize_semantic_markers();
    }

    /// Initialize with byte-level basis vectors (one basis vector per byte value)
    ///
    /// This creates a complete basis that can represent any byte data.
    /// Each byte value 0-255 gets its own basis vector.
    ///
    /// Position basis vectors (64 vectors for positions 0-63) are also added
    /// by default. Use `initialize_byte_basis_with_config` to control this.
    pub fn initialize_byte_basis(&mut self) {
        self.initialize_byte_basis_with_config(true);
    }

    /// Initialize with byte-level basis vectors with optional position basis
    ///
    /// # Arguments
    /// * `include_position_basis` - Whether to add position-aware basis vectors (64 vectors)
    pub fn initialize_byte_basis_with_config(&mut self, include_position_basis: bool) {
        use sha2::{Digest, Sha256};

        // Create basis vectors for all 256 byte values
        for byte_val in 0u8..=255 {
            let mut hasher = Sha256::new();
            hasher.update(b"embeddenator:byte_basis:v1:");
            hasher.update([byte_val]);
            hasher.update((self.dimensionality as u64).to_le_bytes());
            if let Some(salt) = &self.salt {
                hasher.update(salt);
            }
            let hash = hasher.finalize();
            let seed: [u8; 32] = hash.into();

            let vector = SparseVec::from_seed(&seed, self.dimensionality);
            self.basis_vectors.push(BasisVector {
                id: byte_val as u32,
                vector,
                label: Some(format!("byte_{:02x}", byte_val)),
                weight: 1.0,
            });
        }

        // Optionally add position basis vectors (for position-aware encoding)
        // This helps distinguish the same byte at different positions
        if include_position_basis {
            for pos in 0..64 {
                let mut hasher = Sha256::new();
                hasher.update(b"embeddenator:position_basis:v1:");
                hasher.update((pos as u64).to_le_bytes());
                hasher.update((self.dimensionality as u64).to_le_bytes());
                if let Some(salt) = &self.salt {
                    hasher.update(salt);
                }
                let hash = hasher.finalize();
                let seed: [u8; 32] = hash.into();

                let vector = SparseVec::from_seed(&seed, self.dimensionality);
                self.basis_vectors.push(BasisVector {
                    id: 256 + pos as u32,
                    vector,
                    label: Some(format!("pos_{}", pos)),
                    weight: 1.0,
                });
            }
        }
    }

    /// Train the codebook on representative data
    ///
    /// This learns basis vectors by analyzing patterns in the training data.
    /// The algorithm:
    /// 1. Chunk the data into blocks
    /// 2. Find frequently occurring patterns (n-grams)
    /// 3. Create basis vectors for the most common patterns
    /// 4. Optionally add byte-level basis as fallback
    ///
    /// # Arguments
    /// * `training_data` - Slice of training samples
    /// * `config` - Training configuration
    ///
    /// # Returns
    /// Number of basis vectors learned
    ///
    /// # ID Allocation
    ///
    /// Basis vector IDs are allocated in non-overlapping ranges:
    /// - Byte basis: 0-255
    /// - Position basis: 256-319
    /// - Learned patterns: 1000+
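    ///
    /// # Example
    ///
    /// A training sketch (illustrative sample data):
    ///
    /// ```ignore
    /// let mut codebook = Codebook::default();
    /// let samples: Vec<&[u8]> = vec![b"abcabcabc", b"abc123abc123"];
    /// let learned = codebook.train(&samples, &CodebookTrainingConfig::default());
    /// assert!(learned >= 256); // byte basis plus any learned patterns
    /// ```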
    pub fn train(&mut self, training_data: &[&[u8]], config: &CodebookTrainingConfig) -> usize {
        // 1. If enabled, add byte-level basis first (IDs 0-319)
        // This ensures byte basis gets the reserved ID range
        let mut added = 0;
        if config.include_byte_basis {
            self.initialize_byte_basis_with_config(config.include_position_basis);
            added += 256; // 256 byte basis
            if config.include_position_basis {
                added += 64; // 64 position basis
            }
        }

        // 2. Count n-gram frequencies across all training data
        let mut ngram_counts: HashMap<Vec<u8>, u64> = HashMap::new();

        for data in training_data {
            for window_size in &[2usize, 3, 4, 6, 8] {
                if data.len() >= *window_size {
                    for window in data.windows(*window_size) {
                        *ngram_counts.entry(window.to_vec()).or_insert(0) += 1;
                    }
                }
            }
        }

        // 3. Sort by frequency and take top patterns
        let mut patterns: Vec<(Vec<u8>, u64)> = ngram_counts.into_iter().collect();
        patterns.sort_by(|a, b| b.1.cmp(&a.1));

        // 4. Add top patterns as basis vectors with IDs starting at 1000
        // This avoids collision with byte basis (0-319)
        const PATTERN_ID_START: u32 = 1000;
        let mut pattern_id = PATTERN_ID_START;
        for (pattern, count) in patterns.iter().take(config.max_basis_vectors) {
            if *count >= config.min_frequency && pattern.len() >= 2 {
                let label = format!("pattern_{:02x}_{}_freq{}", pattern[0], pattern.len(), count);
                self.add_basis_for_pattern(pattern_id, pattern, &label);
                pattern_id += 1;
                added += 1;
            }
        }

        self.statistics.total_bytes_encoded = training_data.iter().map(|d| d.len() as u64).sum();

        added
    }

    /// Train codebook from files on disk
    ///
    /// Convenience method that reads files and trains on their content.
    /// Fails with the underlying I/O error if any file cannot be read.
    pub fn train_from_files(
        &mut self,
        paths: &[&std::path::Path],
        config: &CodebookTrainingConfig,
    ) -> std::io::Result<usize> {
        let mut training_data: Vec<Vec<u8>> = Vec::new();

        for path in paths {
            training_data.push(std::fs::read(path)?);
        }

        let refs: Vec<&[u8]> = training_data.iter().map(|v| v.as_slice()).collect();
        Ok(self.train(&refs, config))
    }

    /// Add a basis vector for a specific pattern
    fn add_basis_for_pattern(&mut self, id: u32, pattern: &[u8], label: &str) {
        use sha2::{Digest, Sha256};

        // Generate deterministic sparse vector from pattern
        let mut hasher = Sha256::new();
        hasher.update(pattern);
        if let Some(salt) = &self.salt {
            hasher.update(salt);
        }
        let hash = hasher.finalize();

        // Use hash to seed sparse vector generation
        let seed: [u8; 32] = hash.into();
        let vector = SparseVec::from_seed(&seed, self.dimensionality);

        self.basis_vectors.push(BasisVector {
            id,
            vector,
            label: Some(label.to_string()),
            weight: 1.0,
        });
    }

    /// Initialize semantic markers for outlier detection
    fn initialize_semantic_markers(&mut self) {
        use sha2::{Digest, Sha256};

        let seed_for = |label: &str| -> [u8; 32] {
            let mut hasher = Sha256::new();
            hasher.update(b"embeddenator:semantic_marker:v1:");
            hasher.update(label.as_bytes());
            hasher.update((self.dimensionality as u64).to_le_bytes());
            if let Some(salt) = &self.salt {
                hasher.update(salt);
            }
            hasher.finalize().into()
        };

        // High entropy marker
        let seed = seed_for("high_entropy");
        self.semantic_markers
            .push(SparseVec::from_seed(&seed, self.dimensionality));

        // Repetition marker
        let seed = seed_for("repetition");
        self.semantic_markers
            .push(SparseVec::from_seed(&seed, self.dimensionality));

        // Boundary marker (transitions)
        let seed = seed_for("boundary");
        self.semantic_markers
            .push(SparseVec::from_seed(&seed, self.dimensionality));
    }

    /// Project data onto the codebook basis
    /// Returns coefficients, residual, and detected outliers
    pub fn project(&self, data: &[u8]) -> ProjectionResult {
        self.project_with_config(data, &ProjectionConfig::default())
    }

    /// Project data onto the codebook using custom configuration
    pub fn project_with_config(&self, data: &[u8], config: &ProjectionConfig) -> ProjectionResult {
        let mut coefficients = HashMap::new();
        let mut residual = Vec::new();
        let mut outliers = Vec::new();

        // 1. Analyze data for semantic outliers (entropy spikes)
        let detected_outliers = self.detect_semantic_outliers(data);
        outliers.extend(detected_outliers);

        // 2. Project data chunks onto basis vectors
        let chunk_size = config.chunk_size;
        for (chunk_idx, chunk) in data.chunks(chunk_size).enumerate() {
            let chunk_vec = SparseVec::from_bytes(chunk);

            // Find best matching basis vectors
            let mut best_matches: Vec<(u32, f64)> = self
                .basis_vectors
                .iter()
                .map(|basis| (basis.id, chunk_vec.cosine(&basis.vector)))
                .filter(|(_, sim)| *sim > config.similarity_threshold)
                .collect();

            // Sort by similarity (descending), treating NaN as less than any value
            best_matches.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Less));

            // Take top N matches
            for (basis_id, similarity) in best_matches.iter().take(config.max_basis_matches) {
                // Encode coefficient as balanced ternary
                let coef_value = (*similarity * config.coefficient_scale) as i64;
                if let Ok(word) = BalancedTernaryWord::new(coef_value, WordMetadata::Data) {
                    coefficients.insert(
                        *basis_id * config.coefficient_key_spacing + chunk_idx as u32,
                        word,
                    );
                }
            }

            // 3. Compute residual (what the basis couldn't capture), using the
            // same config that keyed the coefficients above
            let reconstructed =
                self.reconstruct_chunk(&coefficients, chunk_idx, chunk.len(), config);
            let chunk_residual = self.compute_residual(chunk, &reconstructed);

            for residual_byte in chunk_residual {
                if let Ok(word) =
                    BalancedTernaryWord::new(residual_byte as i64, WordMetadata::Residual)
                {
                    residual.push(word);
                }
            }
        }

        // Calculate quality score
        let quality_score = self.calculate_quality_score(data, &coefficients, &residual, config);

        ProjectionResult {
            coefficients,
            residual,
            outliers,
            quality_score,
        }
    }

    /// Detect semantic outliers (high entropy, rare patterns)
    fn detect_semantic_outliers(&self, data: &[u8]) -> Vec<SemanticOutlier> {
        let mut outliers = Vec::new();
        let window_size = 32;

        if data.len() < window_size {
            return outliers;
        }

        // Shannon entropy of an n-byte window is capped at log2(n) bits
        // (5.0 for a 32-byte window), so the threshold is set relative to
        // that cap; a fixed 8-bit-per-byte threshold could never fire here.
        let entropy_threshold = 0.95 * (window_size as f64).log2();

        for i in 0..=data.len() - window_size {
            let window = &data[i..i + window_size];
            let entropy = self.calculate_entropy(window);

            // Near-maximal entropy windows are outliers (compressed/encrypted data)
            if entropy > entropy_threshold {
                let pattern_vec = SparseVec::from_bytes(window);

                // Encode the outlier pattern, 7 bytes per word: 2^56 - 1 fits
                // within the 38-trit range, whereas a full 8 bytes could
                // exceed MAX_VALUE and be silently dropped.
                let mut encoded_pattern = Vec::new();
                for chunk in window.chunks(7) {
                    let value = chunk
                        .iter()
                        .enumerate()
                        .fold(0i64, |acc, (j, &b)| acc | ((b as i64) << (j * 8)));
                    if let Ok(word) = BalancedTernaryWord::new(value, WordMetadata::SemanticOutlier)
                    {
                        encoded_pattern.push(word);
                    }
                }

                outliers.push(SemanticOutlier {
                    position: i,
                    length: window_size,
                    entropy_score: entropy,
                    encoded_pattern,
                    semantic_vec: pattern_vec,
                });

                // Overlapping detections are collapsed by the dedup pass below
                // (the loop index cannot be advanced from inside a for loop).
            }
        }

        // Deduplicate overlapping outliers
        outliers.dedup_by(|a, b| a.position.abs_diff(b.position) < window_size / 2);

        outliers
    }

    /// Calculate Shannon entropy of a byte slice
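    ///
    /// H = −Σ pᵢ·log₂(pᵢ), where pᵢ is the relative frequency of byte value i.
    /// The result is in bits per byte: 0.0 when all bytes are identical, up to
    /// min(8, log₂(len)) for a slice of fully distinct bytes.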
    fn calculate_entropy(&self, data: &[u8]) -> f64 {
        let mut counts = [0u32; 256];
        for &byte in data {
            counts[byte as usize] += 1;
        }

        let len = data.len() as f64;
        counts
            .iter()
            .filter(|&&c| c > 0)
            .map(|&c| {
                let p = c as f64 / len;
                -p * p.log2()
            })
            .sum()
    }

    /// Reconstruct a chunk from coefficients
    ///
    /// This implementation combines basis vectors weighted by their coefficients.
    /// The coefficient keys are structured as: `basis_id * coefficient_key_spacing + chunk_idx`
    ///
    /// Note: Since basis vectors are sparse ternary representations (SparseVec),
    /// reconstruction is an approximation. The actual byte-level reconstruction
    /// relies on the residual corrections applied after this step.
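    ///
    /// For example, with the default spacing of 1000, the coefficient for
    /// basis 42 in chunk 3 is stored under key 42 * 1000 + 3 = 42003.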
    fn reconstruct_chunk(
        &self,
        coefficients: &HashMap<u32, BalancedTernaryWord>,
        chunk_idx: usize,
        chunk_len: usize,
        config: &ProjectionConfig,
    ) -> Vec<u8> {
        // Early return for empty cases
        if chunk_len == 0 || coefficients.is_empty() || self.basis_vectors.is_empty() {
            return vec![0u8; chunk_len];
        }

        // Use the caller's config so key spacing and scale match the values
        // used when the coefficients were stored
        let key_spacing = config.coefficient_key_spacing;
        let coef_scale = config.coefficient_scale;

        // Accumulator for weighted reconstruction (using i32 for intermediate values)
        let mut reconstruction: Vec<i32> = vec![0i32; chunk_len];

        // Find coefficients for this chunk
        for basis in &self.basis_vectors {
            let key = basis.id * key_spacing + chunk_idx as u32;
            if let Some(coef_word) = coefficients.get(&key) {
                let coef_value = coef_word.decode();
                // Convert back from scaled similarity to weight factor
                let weight = coef_value as f64 / coef_scale;

                // Apply weighted contribution from this basis vector
                // Since basis vectors are sparse ternary, we use their structure
                // to influence reconstruction. The magnitude is based on the coefficient.
                //
                // For reconstruction: treat pos indices as contributing +weight
                // and neg indices as contributing -weight to nearby byte positions.
                let chunk_weight = (weight * 128.0) as i32;

                // Map basis vector indices to chunk positions using modulo.
                // Note: This is an approximation since the basis covers the full DIM space.
                // The modulo mapping may cause non-uniform distribution, but this is
                // acceptable because the residual mechanism compensates for any
                // reconstruction errors in the final output.
                for &idx in &basis.vector.pos {
                    let pos = idx % chunk_len;
                    reconstruction[pos] = reconstruction[pos].saturating_add(chunk_weight);
                }
                for &idx in &basis.vector.neg {
                    let pos = idx % chunk_len;
                    reconstruction[pos] = reconstruction[pos].saturating_sub(chunk_weight);
                }
            }
        }

        // Clamp to u8 range
        reconstruction
            .iter()
            .map(|&val| val.clamp(0, 255) as u8)
            .collect()
    }

    /// Compute residual between original and reconstructed
    fn compute_residual(&self, original: &[u8], reconstructed: &[u8]) -> Vec<u8> {
        original
            .iter()
            .zip(reconstructed.iter())
            .map(|(&o, &r)| o.wrapping_sub(r))
            .collect()
    }

    /// Calculate reconstruction quality score
    ///
    /// Quality is measured based on how well the basis vectors capture the data.
    /// A score of 1.0 means excellent basis coverage; near 0.0 means poor coverage.
    ///
    /// The score reflects:
    /// 1. Number of basis vectors that matched the data (coefficients stored)
    /// 2. The magnitude of similarity scores (higher = better match)
    fn calculate_quality_score(
        &self,
        original: &[u8],
        coefficients: &HashMap<u32, BalancedTernaryWord>,
        _residual: &[BalancedTernaryWord],
        config: &ProjectionConfig,
    ) -> f64 {
        if original.is_empty() {
            return 1.0; // Empty data is trivially reconstructed
        }

        if coefficients.is_empty() {
            // No basis vectors matched - low quality,
            // but not zero since the residual can still reconstruct
            return 0.1;
        }

        // Calculate average coefficient magnitude (similarity scores)
        let total_coef_magnitude: f64 = coefficients
            .values()
            .map(|word| {
                let val = word.decode() as f64;
                // Convert back from scaled similarity
                (val / config.coefficient_scale).abs()
            })
            .sum();

        let avg_similarity = total_coef_magnitude / coefficients.len() as f64;

        // Calculate coverage: how many chunks have at least one coefficient
        let chunk_count = original.len().div_ceil(config.chunk_size);
        let key_spacing = config.coefficient_key_spacing;

        let chunks_with_coefs: std::collections::HashSet<u32> =
            coefficients.keys().map(|&key| key % key_spacing).collect();

        let coverage_ratio = chunks_with_coefs.len() as f64 / chunk_count.max(1) as f64;

        // Quality combines average similarity with coverage
        // Both factors are in [0, 1] range
        let quality = (avg_similarity * 0.5 + coverage_ratio * 0.5).min(1.0);

        // Ensure non-zero quality when coefficients exist
        quality.max(0.1)
    }

    /// Reconstruct original data from projection result
    ///
    /// Assumes the projection was produced with `ProjectionConfig::default()`;
    /// a projection made with a custom config must be reconstructed with
    /// matching parameters.
    pub fn reconstruct(&self, projection: &ProjectionResult, expected_size: usize) -> Vec<u8> {
        let config = ProjectionConfig::default();
        let mut result = Vec::with_capacity(expected_size);

        // 1. Reconstruct from basis coefficients
        let chunk_size = config.chunk_size;
        let num_chunks = expected_size.div_ceil(chunk_size);

        for chunk_idx in 0..num_chunks {
            // Mirror the chunk lengths used during projection: the final chunk
            // may be shorter, and reconstruct_chunk's index mapping depends on
            // the chunk length.
            let this_len = chunk_size.min(expected_size - chunk_idx * chunk_size);
            let chunk =
                self.reconstruct_chunk(&projection.coefficients, chunk_idx, this_len, &config);
            result.extend(chunk);
        }

        // 2. Apply residual corrections
        for (i, residual_word) in projection.residual.iter().enumerate() {
            if i < result.len() {
                let correction = residual_word.decode() as u8;
                result[i] = result[i].wrapping_add(correction);
            }
        }

        // 3. Apply semantic outlier corrections
        for outlier in &projection.outliers {
            if outlier.position + outlier.length <= result.len() {
                // Decode outlier pattern and overwrite (7 bytes per word,
                // matching the packing in detect_semantic_outliers)
                let mut decoded = Vec::new();
                for word in &outlier.encoded_pattern {
                    let value = word.decode();
                    for j in 0..7 {
                        decoded.push(((value >> (j * 8)) & 0xFF) as u8);
                    }
                }

                for (j, &byte) in decoded.iter().enumerate().take(outlier.length) {
                    if outlier.position + j < result.len() {
                        result[outlier.position + j] = byte;
                    }
                }
            }
        }

        result.truncate(expected_size);
        result
    }
}

impl SparseVec {
    /// Create a sparse vector from a seed (deterministic)
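    ///
    /// Draws `dim / 100` positive and `dim / 100` negative indices (≈1% density
    /// each) from an RNG seeded with the 32-byte seed, so the same seed always
    /// produces the same vector. For `dim < 100` the result is empty.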
    pub fn from_seed(seed: &[u8; 32], dim: usize) -> Self {
        use rand::seq::SliceRandom;
        use rand::SeedableRng;

        let mut rng = rand::rngs::StdRng::from_seed(*seed);
        let sparsity = dim / 100; // 1% density

        let mut indices: Vec<usize> = (0..dim).collect();
        indices.shuffle(&mut rng);

        let mut pos: Vec<_> = indices[..sparsity].to_vec();
        let mut neg: Vec<_> = indices[sparsity..sparsity * 2].to_vec();

        pos.sort_unstable();
        neg.sort_unstable();

        SparseVec { pos, neg }
    }

    /// Create a sparse vector directly from bytes
    pub fn from_bytes(data: &[u8]) -> Self {
        use sha2::{Digest, Sha256};

        let mut hasher = Sha256::new();
        hasher.update(data);
        let hash = hasher.finalize();
        let seed: [u8; 32] = hash.into();

        Self::from_seed(&seed, DIM)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_balanced_ternary_roundtrip() {
        let test_values = [
            0i64,
            1,
            -1,
            100,
            -100,
            12345,
            -12345,
            BalancedTernaryWord::MAX_VALUE / 2,
            BalancedTernaryWord::MIN_VALUE / 2,
        ];

        for &value in &test_values {
            let word = BalancedTernaryWord::new(value, WordMetadata::Data)
                .expect("Test value should be encodable");
            let decoded = word.decode();
            assert_eq!(value, decoded, "Failed roundtrip for {}", value);
        }
    }

    #[test]
    fn test_balanced_ternary_metadata() {
        let word = BalancedTernaryWord::new(42, WordMetadata::SemanticOutlier)
            .expect("42 should be encodable");
        assert_eq!(word.metadata(), WordMetadata::SemanticOutlier);
        assert_eq!(word.decode(), 42);
    }

    #[test]
    fn test_balanced_ternary_range() {
        // Should succeed at boundaries
        assert!(
            BalancedTernaryWord::new(BalancedTernaryWord::MAX_VALUE, WordMetadata::Data).is_ok()
        );
        assert!(
            BalancedTernaryWord::new(BalancedTernaryWord::MIN_VALUE, WordMetadata::Data).is_ok()
        );

        // Should fail outside boundaries
        assert!(
            BalancedTernaryWord::new(BalancedTernaryWord::MAX_VALUE + 1, WordMetadata::Data)
                .is_err()
        );
        assert!(
            BalancedTernaryWord::new(BalancedTernaryWord::MIN_VALUE - 1, WordMetadata::Data)
                .is_err()
        );
    }

    #[test]
    fn test_codebook_projection() {
        let mut codebook = Codebook::new(10000);
        codebook.initialize_standard_basis();

        let data = b"the quick brown fox jumps over the lazy dog";
        let projection = codebook.project(data);

        assert!(projection.quality_score > 0.0);
        assert!(!projection.coefficients.is_empty() || !projection.residual.is_empty());
    }

    #[test]
    fn test_parity_computation() {
        let word =
            BalancedTernaryWord::new(12345, WordMetadata::Data).expect("12345 should be encodable");
        let parity = word.compute_parity();
        assert!((-1..=1).contains(&parity));
    }
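
    /// A roundtrip sketch: residual corrections compensate for whatever the
    /// basis projection fails to capture, so reconstructing a small input
    /// should be byte-exact (assumes the default `DIM` and the default
    /// `ProjectionConfig` used by `project` and `reconstruct`).
    #[test]
    fn test_projection_reconstruction_roundtrip() {
        let mut codebook = Codebook::default();
        codebook.initialize_byte_basis();

        let data = b"the quick brown fox";
        let projection = codebook.project(data);
        let restored = codebook.reconstruct(&projection, data.len());
        assert_eq!(restored, data.to_vec());
    }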
}