// embeddenator_vsa/codebook.rs

//! Codebook - Differential Encoding Base Model
//!
//! The codebook serves as a learned/constructed basis set for differential encoding.
//! Data is projected onto this basis, and only the residuals (what can't be expressed
//! by the codebook) plus semantic markers are stored in the engram.
//!
//! # Architecture
//!
//! ```text
//! Codebook = { basis_vectors: [B₀, B₁, ..., Bₙ], semantic_markers: [...] }
//!
//! Encoding:  data → coefficients × basis + residual + semantic_outliers
//! Decoding:  coefficients × basis + residual + semantic_outliers → data
//! ```
//!
//! # Security Model
//!
//! The codebook is treated like a private key:
//! - Without the codebook, reconstruction is intended to be infeasible
//! - The engram alone is intended to reveal nothing useful; note this is a
//!   design goal, not a proven information-theoretic guarantee
//! - Different codebooks act as different "encryption keys"

23use crate::vsa::{SparseVec, DIM};
24use serde::{Deserialize, Serialize};
25use std::collections::HashMap;
26
27/// 64-bit balanced ternary encoding unit
28/// - 61 bits: data payload (39 trits worth of information)
29/// - 3 bits: parity/metadata (2 trits)
30#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
31pub struct BalancedTernaryWord {
32    /// Raw 64-bit representation
33    /// Bits 0-60: 39 trits of data (each trit = log₂(3) ≈ 1.585 bits)
34    /// Bits 61-63: parity trit + metadata trit
35    packed: u64,
36}
37
38/// Metadata flags stored in the upper 3 bits
39#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
40pub enum WordMetadata {
41    /// Standard data word
42    Data = 0b000,
43    /// Semantic outlier marker
44    SemanticOutlier = 0b001,
45    /// Residual correction word
46    Residual = 0b010,
47    /// Continuation of previous word
48    Continuation = 0b011,
49    /// End of sequence marker
50    EndOfSequence = 0b100,
51    /// Parity check word
52    Parity = 0b101,
53}
54
55impl BalancedTernaryWord {
56    /// Maximum value representable in 38 trits (signed balanced)
57    /// 3^38 = 1,350,851,717,672,992,089 (fits in 61 bits)
58    /// Range: -(3^38-1)/2 to +(3^38-1)/2
59    pub const MAX_VALUE: i64 = 675_425_858_836_496_044;
60    pub const MIN_VALUE: i64 = -675_425_858_836_496_044;
61    
62    /// Number of trits in the data portion (38 trits = 61 bits)
63    pub const DATA_TRITS: usize = 38;
64    
65    /// Number of trits for metadata/parity (stored in upper 3 bits)
66    pub const META_TRITS: usize = 2;
67
68    /// Create a new word from a signed integer value and metadata
69    pub fn new(value: i64, metadata: WordMetadata) -> Option<Self> {
70        if value < Self::MIN_VALUE || value > Self::MAX_VALUE {
71            return None;
72        }
73        
74        // Convert signed value to balanced ternary representation
75        let encoded = Self::encode_balanced_ternary(value);
76        
77        // Pack metadata into upper 3 bits
78        let meta_bits = (metadata as u64) << 61;
79        
80        Some(BalancedTernaryWord {
81            packed: encoded | meta_bits,
82        })
83    }
84
85    /// Create from raw packed representation
86    pub fn from_raw(packed: u64) -> Self {
87        BalancedTernaryWord { packed }
88    }
89
90    /// Get the raw packed value
91    pub fn raw(&self) -> u64 {
92        self.packed
93    }
94
95    /// Extract the data portion (lower 61 bits)
96    pub fn data_bits(&self) -> u64 {
97        self.packed & 0x1FFF_FFFF_FFFF_FFFF
98    }
99
100    /// Extract metadata
101    pub fn metadata(&self) -> WordMetadata {
102        match (self.packed >> 61) & 0b111 {
103            0b000 => WordMetadata::Data,
104            0b001 => WordMetadata::SemanticOutlier,
105            0b010 => WordMetadata::Residual,
106            0b011 => WordMetadata::Continuation,
107            0b100 => WordMetadata::EndOfSequence,
108            0b101 => WordMetadata::Parity,
109            _ => WordMetadata::Data, // Default fallback
110        }
111    }
112
113    /// Decode to signed integer value
114    pub fn decode(&self) -> i64 {
115        Self::decode_balanced_ternary(self.data_bits())
116    }
117
118    /// Encode a signed integer to balanced ternary packed representation
119    /// 
120    /// We store the value directly as a base-3 representation where:
121    /// - Digit 0 = trit 0
122    /// - Digit 1 = trit +1  
123    /// - Digit 2 = trit -1
124    fn encode_balanced_ternary(value: i64) -> u64 {
125        // For balanced ternary, we convert by repeatedly dividing
126        // and adjusting for the balanced representation
127        let mut v = value;
128        let mut result: u64 = 0;
129        let mut power: u64 = 1;
130        
131        for _ in 0..Self::DATA_TRITS {
132            // Get remainder in range [-1, 0, 1]
133            let mut rem = v % 3;
134            v /= 3;
135            
136            if rem == 2 {
137                rem = -1;
138                v += 1;
139            } else if rem == -2 {
140                rem = 1;
141                v -= 1;
142            }
143            
144            // Encode: -1 -> 2, 0 -> 0, +1 -> 1
145            let encoded = match rem {
146                -1 => 2u64,
147                0 => 0u64,
148                1 => 1u64,
149                _ => 0u64, // Safety fallback
150            };
151            
152            result += encoded * power;
153            power *= 3;
154        }
155        
156        result
157    }
158
159    /// Decode balanced ternary packed representation to signed integer
160    fn decode_balanced_ternary(packed: u64) -> i64 {
161        let mut result: i64 = 0;
162        let mut power: i64 = 1;
163        let mut remaining = packed;
164        
165        for _ in 0..Self::DATA_TRITS {
166            let trit = remaining % 3;
167            remaining /= 3;
168            
169            match trit {
170                0 => {}, // Add 0
171                1 => result += power,
172                2 => result -= power, // -1 in balanced ternary
173                _ => unreachable!(),
174            }
175            power *= 3;
176        }
177        
178        result
179    }
180
181    /// Negate all trits in a packed representation
182    #[allow(dead_code)]
183    fn negate_trits(packed: u64) -> u64 {
184        let mut result: u64 = 0;
185        let mut remaining = packed;
186        let mut power: u64 = 1;
187        
188        for _ in 0..Self::DATA_TRITS {
189            let trit = remaining % 3;
190            remaining /= 3;
191            
192            // Negate: 0->0, 1->2, 2->1
193            let negated = match trit {
194                0 => 0,
195                1 => 2,
196                2 => 1,
197                _ => unreachable!(),
198            };
199            result += negated * power;
200            power *= 3;
201        }
202        
203        result
204    }
205
206    /// Compute parity trit for error detection
207    pub fn compute_parity(&self) -> i8 {
208        let mut sum: i64 = 0;
209        let mut remaining = self.data_bits();
210        
211        for _ in 0..Self::DATA_TRITS {
212            let trit = (remaining % 3) as i64;
213            remaining /= 3;
214            
215            // Convert to balanced: 0->0, 1->1, 2->-1
216            sum += match trit {
217                0 => 0,
218                1 => 1,
219                2 => -1,
220                _ => 0,
221            };
222        }
223        
224        // Parity trit: makes sum divisible by 3
225        ((3 - (sum.rem_euclid(3))) % 3) as i8
226    }
227}
228
229/// Semantic outlier detected during analysis
230#[derive(Clone, Debug, Serialize, Deserialize)]
231pub struct SemanticOutlier {
232    /// Position in the original data
233    pub position: usize,
234    /// Length of the outlier pattern
235    pub length: usize,
236    /// Entropy score (higher = more unusual)
237    pub entropy_score: f64,
238    /// The outlier pattern encoded as balanced ternary words
239    pub encoded_pattern: Vec<BalancedTernaryWord>,
240    /// Semantic vector for similarity matching
241    pub semantic_vec: SparseVec,
242}
243
244/// Basis vector in the codebook
245#[derive(Clone, Debug, Serialize, Deserialize)]
246pub struct BasisVector {
247    /// Unique identifier for this basis
248    pub id: u32,
249    /// The sparse ternary representation
250    pub vector: SparseVec,
251    /// Human-readable label (optional)
252    pub label: Option<String>,
253    /// Frequency weight (how often this pattern appears)
254    pub weight: f64,
255}
256
257/// The Codebook - acts as the private key for reconstruction
258#[derive(Clone, Debug, Serialize, Deserialize)]
259pub struct Codebook {
260    /// Version for compatibility
261    pub version: u32,
262    
263    /// Dimensionality of basis vectors
264    pub dimensionality: usize,
265    
266    /// The basis vectors forming the encoding dictionary
267    /// Data is projected onto these bases
268    pub basis_vectors: Vec<BasisVector>,
269    
270    /// Semantic marker vectors for outlier detection
271    pub semantic_markers: Vec<SparseVec>,
272    
273    /// Statistics for adaptive encoding
274    pub statistics: CodebookStatistics,
275    
276    /// Cryptographic salt for key derivation (optional)
277    pub salt: Option<[u8; 32]>,
278}
279
280/// Statistics tracked by the codebook
281#[derive(Clone, Debug, Default, Serialize, Deserialize)]
282pub struct CodebookStatistics {
283    /// Total bytes encoded using this codebook
284    pub total_bytes_encoded: u64,
285    /// Average compression ratio achieved
286    pub avg_compression_ratio: f64,
287    /// Number of semantic outliers detected
288    pub outlier_count: u64,
289    /// Distribution of coefficient magnitudes
290    pub coefficient_histogram: [u64; 16],
291}
292
293/// Result of projecting data onto the codebook
294#[derive(Clone, Debug, Serialize, Deserialize)]
295pub struct ProjectionResult {
296    /// Coefficients for each basis vector (sparse - only non-zero)
297    pub coefficients: HashMap<u32, BalancedTernaryWord>,
298    /// Residual that couldn't be expressed by the basis
299    pub residual: Vec<BalancedTernaryWord>,
300    /// Detected semantic outliers
301    pub outliers: Vec<SemanticOutlier>,
302    /// Reconstruction quality score (1.0 = perfect)
303    pub quality_score: f64,
304}
305
306impl Default for Codebook {
307    fn default() -> Self {
308        Self::new(DIM)
309    }
310}
311
312impl Codebook {
313    /// Create a new empty codebook
314    pub fn new(dimensionality: usize) -> Self {
315        Codebook {
316            version: 1,
317            dimensionality,
318            basis_vectors: Vec::new(),
319            semantic_markers: Vec::new(),
320            statistics: CodebookStatistics::default(),
321            salt: None,
322        }
323    }
324
325    /// Create a codebook with cryptographic salt for key derivation
326    pub fn with_salt(dimensionality: usize, salt: [u8; 32]) -> Self {
327        let mut codebook = Self::new(dimensionality);
328        codebook.salt = Some(salt);
329        codebook
330    }
331
332    /// Initialize with common basis vectors for text/binary data
333    pub fn initialize_standard_basis(&mut self) {
334        // Add basis vectors for common byte patterns
335        // These act as a "vocabulary" for differential encoding
336        
337        // Zero runs (common in binary)
338        self.add_basis_for_pattern(0, b"\x00\x00\x00\x00", "zero_run");
339        
340        // ASCII space/newline (common in text)
341        self.add_basis_for_pattern(1, b"    ", "space_run");
342        self.add_basis_for_pattern(2, b"\n\n", "newline_pair");
343        
344        // Common text patterns
345        self.add_basis_for_pattern(3, b"the ", "the_space");
346        self.add_basis_for_pattern(4, b"ing ", "ing_space");
347        self.add_basis_for_pattern(5, b"tion", "tion");
348        
349        // Binary markers
350        self.add_basis_for_pattern(6, b"\x89PNG", "png_header");
351        self.add_basis_for_pattern(7, b"\xFF\xD8\xFF", "jpeg_header");
352        self.add_basis_for_pattern(8, b"PK\x03\x04", "zip_header");
353        
354        // Add semantic markers for entropy detection
355        self.initialize_semantic_markers();
356    }
357
358    /// Add a basis vector for a specific pattern
359    fn add_basis_for_pattern(&mut self, id: u32, pattern: &[u8], label: &str) {
360        use sha2::{Sha256, Digest};
361        
362        // Generate deterministic sparse vector from pattern
363        let mut hasher = Sha256::new();
364        hasher.update(pattern);
365        if let Some(salt) = &self.salt {
366            hasher.update(salt);
367        }
368        let hash = hasher.finalize();
369        
370        // Use hash to seed sparse vector generation
371        let seed: [u8; 32] = hash.into();
372        let vector = SparseVec::from_seed(&seed, self.dimensionality);
373        
374        self.basis_vectors.push(BasisVector {
375            id,
376            vector,
377            label: Some(label.to_string()),
378            weight: 1.0,
379        });
380    }
381
382    /// Initialize semantic markers for outlier detection
383    fn initialize_semantic_markers(&mut self) {
384        use sha2::{Digest, Sha256};
385
386        let seed_for = |label: &str| -> [u8; 32] {
387            let mut hasher = Sha256::new();
388            hasher.update(b"embeddenator:semantic_marker:v1:");
389            hasher.update(label.as_bytes());
390            hasher.update(&(self.dimensionality as u64).to_le_bytes());
391            if let Some(salt) = &self.salt {
392                hasher.update(salt);
393            }
394            hasher.finalize().into()
395        };
396
397        // High entropy marker
398        let seed = seed_for("high_entropy");
399        self.semantic_markers
400            .push(SparseVec::from_seed(&seed, self.dimensionality));
401
402        // Repetition marker
403        let seed = seed_for("repetition");
404        self.semantic_markers
405            .push(SparseVec::from_seed(&seed, self.dimensionality));
406
407        // Boundary marker (transitions)
408        let seed = seed_for("boundary");
409        self.semantic_markers
410            .push(SparseVec::from_seed(&seed, self.dimensionality));
411    }
412
413    /// Project data onto the codebook basis
414    /// Returns coefficients, residual, and detected outliers
415    pub fn project(&self, data: &[u8]) -> ProjectionResult {
416        let mut coefficients = HashMap::new();
417        let mut residual = Vec::new();
418        let mut outliers = Vec::new();
419        
420        // 1. Analyze data for semantic outliers (entropy spikes)
421        let detected_outliers = self.detect_semantic_outliers(data);
422        outliers.extend(detected_outliers);
423        
424        // 2. Project data chunks onto basis vectors
425        let chunk_size = 64; // Process in 64-byte chunks
426        for (chunk_idx, chunk) in data.chunks(chunk_size).enumerate() {
427            let chunk_vec = SparseVec::from_bytes(chunk);
428            
429            // Find best matching basis vectors
430            let mut best_matches: Vec<(u32, f64)> = self.basis_vectors
431                .iter()
432                .map(|basis| (basis.id, chunk_vec.cosine(&basis.vector)))
433                .filter(|(_, sim)| *sim > 0.3) // Threshold for relevance
434                .collect();
435            
436            // Sort by similarity (descending), treating NaN as less than any value
437            best_matches.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Less));
438            
439            // Take top N matches
440            for (basis_id, similarity) in best_matches.iter().take(4) {
441                // Encode coefficient as balanced ternary
442                let coef_value = (*similarity * 1000.0) as i64;
443                if let Some(word) = BalancedTernaryWord::new(coef_value, WordMetadata::Data) {
444                    coefficients.insert(
445                        *basis_id * 1000 + chunk_idx as u32,
446                        word,
447                    );
448                }
449            }
450            
451            // 3. Compute residual (what basis couldn't capture)
452            let reconstructed = self.reconstruct_chunk(&coefficients, chunk_idx, chunk.len());
453            let chunk_residual = self.compute_residual(chunk, &reconstructed);
454            
455            for residual_byte in chunk_residual {
456                if let Some(word) = BalancedTernaryWord::new(residual_byte as i64, WordMetadata::Residual) {
457                    residual.push(word);
458                }
459            }
460        }
461        
462        // Calculate quality score
463        let quality_score = self.calculate_quality_score(data, &coefficients, &residual);
464        
465        ProjectionResult {
466            coefficients,
467            residual,
468            outliers,
469            quality_score,
470        }
471    }
472
473    /// Detect semantic outliers (high entropy, rare patterns)
474    fn detect_semantic_outliers(&self, data: &[u8]) -> Vec<SemanticOutlier> {
475        let mut outliers = Vec::new();
476        let window_size = 32;
477        
478        if data.len() < window_size {
479            return outliers;
480        }
481        
482        for i in 0..data.len() - window_size {
483            let window = &data[i..i + window_size];
484            let entropy = self.calculate_entropy(window);
485            
486            // High entropy windows are outliers (compressed/encrypted data)
487            if entropy > 7.5 {
488                let pattern_vec = SparseVec::from_bytes(window);
489                
490                // Encode the outlier pattern
491                let mut encoded_pattern = Vec::new();
492                for chunk in window.chunks(8) {
493                    let value = chunk.iter()
494                        .enumerate()
495                        .fold(0i64, |acc, (j, &b)| acc + ((b as i64) << (j * 8)));
496                    if let Some(word) = BalancedTernaryWord::new(value, WordMetadata::SemanticOutlier) {
497                        encoded_pattern.push(word);
498                    }
499                }
500                
501                outliers.push(SemanticOutlier {
502                    position: i,
503                    length: window_size,
504                    entropy_score: entropy,
505                    encoded_pattern,
506                    semantic_vec: pattern_vec,
507                });
508                
509                // Skip ahead to avoid overlapping outliers
510                // i += window_size / 2; // Can't mutate loop variable, handled by dedup later
511            }
512        }
513        
514        // Deduplicate overlapping outliers
515        outliers.dedup_by(|a, b| a.position.abs_diff(b.position) < window_size / 2);
516        
517        outliers
518    }
519
520    /// Calculate Shannon entropy of a byte slice
521    fn calculate_entropy(&self, data: &[u8]) -> f64 {
522        let mut counts = [0u32; 256];
523        for &byte in data {
524            counts[byte as usize] += 1;
525        }
526        
527        let len = data.len() as f64;
528        counts.iter()
529            .filter(|&&c| c > 0)
530            .map(|&c| {
531                let p = c as f64 / len;
532                -p * p.log2()
533            })
534            .sum()
535    }
536
537    /// Reconstruct a chunk from coefficients
538    fn reconstruct_chunk(
539        &self,
540        _coefficients: &HashMap<u32, BalancedTernaryWord>,
541        _chunk_idx: usize,
542        chunk_len: usize,
543    ) -> Vec<u8> {
544        // Placeholder - full implementation would combine basis vectors
545        // weighted by coefficients
546        vec![0u8; chunk_len]
547    }
548
549    /// Compute residual between original and reconstructed
550    fn compute_residual(&self, original: &[u8], reconstructed: &[u8]) -> Vec<u8> {
551        original.iter()
552            .zip(reconstructed.iter())
553            .map(|(&o, &r)| o.wrapping_sub(r))
554            .collect()
555    }
556
557    /// Calculate reconstruction quality score
558    fn calculate_quality_score(
559        &self,
560        _original: &[u8],
561        _coefficients: &HashMap<u32, BalancedTernaryWord>,
562        _residual: &[BalancedTernaryWord],
563    ) -> f64 {
564        // Placeholder - would compare reconstruction to original
565        1.0
566    }
567
568    /// Reconstruct original data from projection result
569    pub fn reconstruct(&self, projection: &ProjectionResult, expected_size: usize) -> Vec<u8> {
570        let mut result = Vec::with_capacity(expected_size);
571        
572        // 1. Reconstruct from basis coefficients
573        let chunk_size = 64;
574        let num_chunks = (expected_size + chunk_size - 1) / chunk_size;
575        
576        for chunk_idx in 0..num_chunks {
577            let chunk = self.reconstruct_chunk(&projection.coefficients, chunk_idx, chunk_size);
578            result.extend(chunk);
579        }
580        
581        // 2. Apply residual corrections
582        for (i, residual_word) in projection.residual.iter().enumerate() {
583            if i < result.len() {
584                let correction = residual_word.decode() as u8;
585                result[i] = result[i].wrapping_add(correction);
586            }
587        }
588        
589        // 3. Apply semantic outlier corrections
590        for outlier in &projection.outliers {
591            if outlier.position + outlier.length <= result.len() {
592                // Decode outlier pattern and overwrite
593                let mut decoded = Vec::new();
594                for word in &outlier.encoded_pattern {
595                    let value = word.decode();
596                    for j in 0..8 {
597                        decoded.push(((value >> (j * 8)) & 0xFF) as u8);
598                    }
599                }
600                
601                for (j, &byte) in decoded.iter().enumerate().take(outlier.length) {
602                    if outlier.position + j < result.len() {
603                        result[outlier.position + j] = byte;
604                    }
605                }
606            }
607        }
608        
609        result.truncate(expected_size);
610        result
611    }
612}
613
614impl SparseVec {
615    /// Create a sparse vector from a seed (deterministic)
616    pub fn from_seed(seed: &[u8; 32], dim: usize) -> Self {
617        use rand::SeedableRng;
618        use rand::seq::SliceRandom;
619        
620        let mut rng = rand::rngs::StdRng::from_seed(*seed);
621        let sparsity = dim / 100; // 1% density
622        
623        let mut indices: Vec<usize> = (0..dim).collect();
624        indices.shuffle(&mut rng);
625        
626        let mut pos: Vec<_> = indices[..sparsity].to_vec();
627        let mut neg: Vec<_> = indices[sparsity..sparsity * 2].to_vec();
628        
629        pos.sort_unstable();
630        neg.sort_unstable();
631        
632        SparseVec { pos, neg }
633    }
634
635    /// Create a sparse vector directly from bytes
636    pub fn from_bytes(data: &[u8]) -> Self {
637        use sha2::{Sha256, Digest};
638        
639        let mut hasher = Sha256::new();
640        hasher.update(data);
641        let hash = hasher.finalize();
642        let seed: [u8; 32] = hash.into();
643        
644        Self::from_seed(&seed, DIM)
645    }
646}
647
648#[cfg(test)]
649mod tests {
650    use super::*;
651
652    #[test]
653    fn test_balanced_ternary_roundtrip() {
654        let test_values = [0i64, 1, -1, 100, -100, 12345, -12345, 
655                          BalancedTernaryWord::MAX_VALUE / 2,
656                          BalancedTernaryWord::MIN_VALUE / 2];
657        
658        for &value in &test_values {
659            let word = BalancedTernaryWord::new(value, WordMetadata::Data).unwrap();
660            let decoded = word.decode();
661            assert_eq!(value, decoded, "Failed roundtrip for {}", value);
662        }
663    }
664
665    #[test]
666    fn test_balanced_ternary_metadata() {
667        let word = BalancedTernaryWord::new(42, WordMetadata::SemanticOutlier).unwrap();
668        assert_eq!(word.metadata(), WordMetadata::SemanticOutlier);
669        assert_eq!(word.decode(), 42);
670    }
671
672    #[test]
673    fn test_balanced_ternary_range() {
674        // Should succeed at boundaries
675        assert!(BalancedTernaryWord::new(BalancedTernaryWord::MAX_VALUE, WordMetadata::Data).is_some());
676        assert!(BalancedTernaryWord::new(BalancedTernaryWord::MIN_VALUE, WordMetadata::Data).is_some());
677        
678        // Should fail outside boundaries
679        assert!(BalancedTernaryWord::new(BalancedTernaryWord::MAX_VALUE + 1, WordMetadata::Data).is_none());
680        assert!(BalancedTernaryWord::new(BalancedTernaryWord::MIN_VALUE - 1, WordMetadata::Data).is_none());
681    }
682
683    #[test]
684    fn test_codebook_projection() {
685        let mut codebook = Codebook::new(10000);
686        codebook.initialize_standard_basis();
687        
688        let data = b"the quick brown fox jumps over the lazy dog";
689        let projection = codebook.project(data);
690        
691        assert!(projection.quality_score > 0.0);
692        assert!(!projection.coefficients.is_empty() || !projection.residual.is_empty());
693    }
694
695    #[test]
696    fn test_parity_computation() {
697        let word = BalancedTernaryWord::new(12345, WordMetadata::Data).unwrap();
698        let parity = word.compute_parity();
699        assert!(parity >= -1 && parity <= 1);
700    }
701}