1use crate::vsa::{SparseVec, DIM};
26use crate::VsaError;
27use serde::{Deserialize, Serialize};
28use std::collections::HashMap;
29
/// A 64-bit word holding a balanced-ternary payload plus a metadata tag.
///
/// Layout: the low 61 bits store `DATA_TRITS` (38) trits packed base-3
/// (the balanced digit -1 is stored as base-3 digit 2); the top 3 bits
/// store the `WordMetadata` discriminant.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct BalancedTernaryWord {
    // Bits 0..=60: base-3 packed trit data (see `data_bits()`).
    // Bits 61..=63: metadata tag (see `metadata()`).
    packed: u64,
}
40
/// 3-bit tag stored in the top bits of a `BalancedTernaryWord`,
/// classifying what the word's payload represents.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum WordMetadata {
    /// Ordinary payload word (also the fallback for unassigned tag bits).
    Data = 0b000,
    /// Part of a verbatim-encoded high-entropy (outlier) pattern.
    SemanticOutlier = 0b001,
    /// One byte of the residual left after basis projection.
    Residual = 0b010,
    /// Continuation marker — NOTE(review): not emitted anywhere in this
    /// file; presumably a multi-word value spills into the next word.
    Continuation = 0b011,
    /// End-of-sequence marker — NOTE(review): not emitted in this file.
    EndOfSequence = 0b100,
    /// Parity/check word — NOTE(review): see `compute_parity`, though no
    /// code in this file constructs a word with this tag.
    Parity = 0b101,
}
57
58impl BalancedTernaryWord {
59 pub const MAX_VALUE: i64 = 675_425_858_836_496_044;
63 pub const MIN_VALUE: i64 = -675_425_858_836_496_044;
64
65 pub const DATA_TRITS: usize = 38;
67
68 pub const META_TRITS: usize = 2;
70
71 pub fn new(value: i64, metadata: WordMetadata) -> Result<Self, VsaError> {
73 if !(Self::MIN_VALUE..=Self::MAX_VALUE).contains(&value) {
74 return Err(VsaError::ValueOutOfRange {
75 value,
76 min: Self::MIN_VALUE,
77 max: Self::MAX_VALUE,
78 });
79 }
80
81 let encoded = Self::encode_balanced_ternary(value);
83
84 let meta_bits = (metadata as u64) << 61;
86
87 Ok(BalancedTernaryWord {
88 packed: encoded | meta_bits,
89 })
90 }
91
92 pub fn from_raw(packed: u64) -> Self {
94 BalancedTernaryWord { packed }
95 }
96
97 pub fn raw(&self) -> u64 {
99 self.packed
100 }
101
102 pub fn data_bits(&self) -> u64 {
104 self.packed & 0x1FFF_FFFF_FFFF_FFFF
105 }
106
107 pub fn metadata(&self) -> WordMetadata {
109 match (self.packed >> 61) & 0b111 {
110 0b000 => WordMetadata::Data,
111 0b001 => WordMetadata::SemanticOutlier,
112 0b010 => WordMetadata::Residual,
113 0b011 => WordMetadata::Continuation,
114 0b100 => WordMetadata::EndOfSequence,
115 0b101 => WordMetadata::Parity,
116 _ => WordMetadata::Data, }
118 }
119
120 pub fn decode(&self) -> i64 {
122 Self::decode_balanced_ternary(self.data_bits())
123 }
124
125 fn encode_balanced_ternary(value: i64) -> u64 {
132 let mut v = value;
135 let mut result: u64 = 0;
136 let mut power: u64 = 1;
137
138 for _ in 0..Self::DATA_TRITS {
139 let mut rem = v % 3;
141 v /= 3;
142
143 if rem == 2 {
144 rem = -1;
145 v += 1;
146 } else if rem == -2 {
147 rem = 1;
148 v -= 1;
149 }
150
151 let encoded = match rem {
153 -1 => 2u64,
154 0 => 0u64,
155 1 => 1u64,
156 _ => 0u64, };
158
159 result += encoded * power;
160 power *= 3;
161 }
162
163 result
164 }
165
166 fn decode_balanced_ternary(packed: u64) -> i64 {
168 let mut result: i64 = 0;
169 let mut power: i64 = 1;
170 let mut remaining = packed;
171
172 for _ in 0..Self::DATA_TRITS {
173 let trit = remaining % 3;
174 remaining /= 3;
175
176 match trit {
177 0 => {} 1 => result += power,
179 2 => result -= power, _ => unreachable!(),
181 }
182 power *= 3;
183 }
184
185 result
186 }
187
188 #[allow(dead_code)]
190 fn negate_trits(packed: u64) -> u64 {
191 let mut result: u64 = 0;
192 let mut remaining = packed;
193 let mut power: u64 = 1;
194
195 for _ in 0..Self::DATA_TRITS {
196 let trit = remaining % 3;
197 remaining /= 3;
198
199 let negated = match trit {
201 0 => 0,
202 1 => 2,
203 2 => 1,
204 _ => unreachable!(),
205 };
206 result += negated * power;
207 power *= 3;
208 }
209
210 result
211 }
212
213 pub fn compute_parity(&self) -> i8 {
215 let mut sum: i64 = 0;
216 let mut remaining = self.data_bits();
217
218 for _ in 0..Self::DATA_TRITS {
219 let trit = (remaining % 3) as i64;
220 remaining /= 3;
221
222 sum += match trit {
224 0 => 0,
225 1 => 1,
226 2 => -1,
227 _ => 0,
228 };
229 }
230
231 ((3 - (sum.rem_euclid(3))) % 3) as i8
233 }
234}
235
/// A detected high-entropy region of the input, stored verbatim so it
/// can be restored exactly during reconstruction.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SemanticOutlier {
    /// Byte offset of the window within the original data.
    pub position: usize,
    /// Window length in bytes.
    pub length: usize,
    /// Shannon entropy (bits per byte) measured over the window.
    pub entropy_score: f64,
    /// The raw window bytes packed little-endian into ternary words
    /// (produced by `Codebook::detect_semantic_outliers`).
    pub encoded_pattern: Vec<BalancedTernaryWord>,
    /// Hypervector fingerprint of the window contents.
    pub semantic_vec: SparseVec,
}
250
/// One named basis hypervector in the codebook.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct BasisVector {
    /// Stable identifier; combined with the key spacing to form
    /// coefficient keys during projection.
    pub id: u32,
    /// The sparse ternary hypervector itself.
    pub vector: SparseVec,
    /// Optional human-readable name (e.g. "zero_run", "png_header").
    pub label: Option<String>,
    /// Relative weight — NOTE(review): always set to 1.0 in this file and
    /// never consulted during projection; confirm intended use.
    pub weight: f64,
}
263
/// A versioned collection of basis hypervectors used to project raw
/// bytes into coefficient space and reconstruct them again.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Codebook {
    /// Serialization/format version (set to 1 by `new`).
    pub version: u32,

    /// Dimensionality of every vector derived by this codebook.
    pub dimensionality: usize,

    /// Pattern basis vectors matched against input chunks.
    pub basis_vectors: Vec<BasisVector>,

    /// Marker vectors for semantic categories ("high_entropy",
    /// "repetition", "boundary").
    pub semantic_markers: Vec<SparseVec>,

    /// Running usage statistics — NOTE(review): nothing in this file
    /// updates them; presumably maintained by callers.
    pub statistics: CodebookStatistics,

    /// Optional salt mixed into basis derivation, producing a keyed
    /// codebook distinct from the standard (unsalted) one.
    pub salt: Option<[u8; 32]>,
}
286
/// Aggregate usage statistics for a `Codebook`.
///
/// NOTE(review): no code in this file writes these fields; verify the
/// writer lives elsewhere before relying on them.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct CodebookStatistics {
    /// Total number of input bytes encoded so far.
    pub total_bytes_encoded: u64,
    /// Mean compression ratio across encodings.
    pub avg_compression_ratio: f64,
    /// Number of semantic outliers detected.
    pub outlier_count: u64,
    /// Histogram of coefficient values, 16 buckets — bucketing scheme
    /// not defined in this file.
    pub coefficient_histogram: [u64; 16],
}
299
/// Tuning knobs for `Codebook::project_with_config`.
#[derive(Clone, Debug)]
pub struct ProjectionConfig {
    /// Input is processed in chunks of this many bytes.
    pub chunk_size: usize,
    /// Minimum cosine similarity for a basis match to be recorded.
    pub similarity_threshold: f64,
    /// At most this many basis matches are kept per chunk.
    pub max_basis_matches: usize,
    /// Similarity is multiplied by this, then truncated to an integer
    /// coefficient.
    pub coefficient_scale: f64,
    /// Coefficient keys are `basis_id * spacing + chunk_idx`; must
    /// exceed the number of chunks or keys collide.
    pub coefficient_key_spacing: u32,
}
314
315impl Default for ProjectionConfig {
316 fn default() -> Self {
317 Self {
318 chunk_size: 64,
319 similarity_threshold: 0.3,
320 max_basis_matches: 4,
321 coefficient_scale: 1000.0,
322 coefficient_key_spacing: 1000,
323 }
324 }
325}
326
/// Output of projecting a byte buffer through a `Codebook`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProjectionResult {
    /// Basis coefficients keyed by `basis_id * key_spacing + chunk_idx`.
    pub coefficients: HashMap<u32, BalancedTernaryWord>,
    /// Per-byte corrections (original minus lossy reconstruction), one
    /// word per input byte.
    pub residual: Vec<BalancedTernaryWord>,
    /// High-entropy regions stored verbatim.
    pub outliers: Vec<SemanticOutlier>,
    /// Heuristic quality estimate, clamped to [0.1, 1.0].
    pub quality_score: f64,
}
339
340impl Default for Codebook {
341 fn default() -> Self {
342 Self::new(DIM)
343 }
344}
345
346impl Codebook {
347 pub fn new(dimensionality: usize) -> Self {
349 Codebook {
350 version: 1,
351 dimensionality,
352 basis_vectors: Vec::new(),
353 semantic_markers: Vec::new(),
354 statistics: CodebookStatistics::default(),
355 salt: None,
356 }
357 }
358
359 pub fn with_salt(dimensionality: usize, salt: [u8; 32]) -> Self {
361 let mut codebook = Self::new(dimensionality);
362 codebook.salt = Some(salt);
363 codebook
364 }
365
366 pub fn initialize_standard_basis(&mut self) {
368 self.add_basis_for_pattern(0, b"\x00\x00\x00\x00", "zero_run");
373
374 self.add_basis_for_pattern(1, b" ", "space_run");
376 self.add_basis_for_pattern(2, b"\n\n", "newline_pair");
377
378 self.add_basis_for_pattern(3, b"the ", "the_space");
380 self.add_basis_for_pattern(4, b"ing ", "ing_space");
381 self.add_basis_for_pattern(5, b"tion", "tion");
382
383 self.add_basis_for_pattern(6, b"\x89PNG", "png_header");
385 self.add_basis_for_pattern(7, b"\xFF\xD8\xFF", "jpeg_header");
386 self.add_basis_for_pattern(8, b"PK\x03\x04", "zip_header");
387
388 self.initialize_semantic_markers();
390 }
391
392 fn add_basis_for_pattern(&mut self, id: u32, pattern: &[u8], label: &str) {
394 use sha2::{Digest, Sha256};
395
396 let mut hasher = Sha256::new();
398 hasher.update(pattern);
399 if let Some(salt) = &self.salt {
400 hasher.update(salt);
401 }
402 let hash = hasher.finalize();
403
404 let seed: [u8; 32] = hash.into();
406 let vector = SparseVec::from_seed(&seed, self.dimensionality);
407
408 self.basis_vectors.push(BasisVector {
409 id,
410 vector,
411 label: Some(label.to_string()),
412 weight: 1.0,
413 });
414 }
415
416 fn initialize_semantic_markers(&mut self) {
418 use sha2::{Digest, Sha256};
419
420 let seed_for = |label: &str| -> [u8; 32] {
421 let mut hasher = Sha256::new();
422 hasher.update(b"embeddenator:semantic_marker:v1:");
423 hasher.update(label.as_bytes());
424 hasher.update((self.dimensionality as u64).to_le_bytes());
425 if let Some(salt) = &self.salt {
426 hasher.update(salt);
427 }
428 hasher.finalize().into()
429 };
430
431 let seed = seed_for("high_entropy");
433 self.semantic_markers
434 .push(SparseVec::from_seed(&seed, self.dimensionality));
435
436 let seed = seed_for("repetition");
438 self.semantic_markers
439 .push(SparseVec::from_seed(&seed, self.dimensionality));
440
441 let seed = seed_for("boundary");
443 self.semantic_markers
444 .push(SparseVec::from_seed(&seed, self.dimensionality));
445 }
446
447 pub fn project(&self, data: &[u8]) -> ProjectionResult {
450 self.project_with_config(data, &ProjectionConfig::default())
451 }
452
453 pub fn project_with_config(&self, data: &[u8], config: &ProjectionConfig) -> ProjectionResult {
455 let mut coefficients = HashMap::new();
456 let mut residual = Vec::new();
457 let mut outliers = Vec::new();
458
459 let detected_outliers = self.detect_semantic_outliers(data);
461 outliers.extend(detected_outliers);
462
463 let chunk_size = config.chunk_size;
465 for (chunk_idx, chunk) in data.chunks(chunk_size).enumerate() {
466 let chunk_vec = SparseVec::from_bytes(chunk);
467
468 let mut best_matches: Vec<(u32, f64)> = self
470 .basis_vectors
471 .iter()
472 .map(|basis| (basis.id, chunk_vec.cosine(&basis.vector)))
473 .filter(|(_, sim)| *sim > config.similarity_threshold)
474 .collect();
475
476 best_matches.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Less));
478
479 for (basis_id, similarity) in best_matches.iter().take(config.max_basis_matches) {
481 let coef_value = (*similarity * config.coefficient_scale) as i64;
483 if let Ok(word) = BalancedTernaryWord::new(coef_value, WordMetadata::Data) {
484 coefficients.insert(
485 *basis_id * config.coefficient_key_spacing + chunk_idx as u32,
486 word,
487 );
488 }
489 }
490
491 let reconstructed = self.reconstruct_chunk(&coefficients, chunk_idx, chunk.len());
493 let chunk_residual = self.compute_residual(chunk, &reconstructed);
494
495 for residual_byte in chunk_residual {
496 if let Ok(word) =
497 BalancedTernaryWord::new(residual_byte as i64, WordMetadata::Residual)
498 {
499 residual.push(word);
500 }
501 }
502 }
503
504 let quality_score = self.calculate_quality_score(data, &coefficients, &residual);
506
507 ProjectionResult {
508 coefficients,
509 residual,
510 outliers,
511 quality_score,
512 }
513 }
514
515 fn detect_semantic_outliers(&self, data: &[u8]) -> Vec<SemanticOutlier> {
517 let mut outliers = Vec::new();
518 let window_size = 32;
519
520 if data.len() < window_size {
521 return outliers;
522 }
523
524 for i in 0..data.len() - window_size {
525 let window = &data[i..i + window_size];
526 let entropy = self.calculate_entropy(window);
527
528 if entropy > 7.5 {
530 let pattern_vec = SparseVec::from_bytes(window);
531
532 let mut encoded_pattern = Vec::new();
534 for chunk in window.chunks(8) {
535 let value = chunk
536 .iter()
537 .enumerate()
538 .fold(0i64, |acc, (j, &b)| acc + ((b as i64) << (j * 8)));
539 if let Ok(word) = BalancedTernaryWord::new(value, WordMetadata::SemanticOutlier)
540 {
541 encoded_pattern.push(word);
542 }
543 }
544
545 outliers.push(SemanticOutlier {
546 position: i,
547 length: window_size,
548 entropy_score: entropy,
549 encoded_pattern,
550 semantic_vec: pattern_vec,
551 });
552
553 }
556 }
557
558 outliers.dedup_by(|a, b| a.position.abs_diff(b.position) < window_size / 2);
560
561 outliers
562 }
563
564 fn calculate_entropy(&self, data: &[u8]) -> f64 {
566 let mut counts = [0u32; 256];
567 for &byte in data {
568 counts[byte as usize] += 1;
569 }
570
571 let len = data.len() as f64;
572 counts
573 .iter()
574 .filter(|&&c| c > 0)
575 .map(|&c| {
576 let p = c as f64 / len;
577 -p * p.log2()
578 })
579 .sum()
580 }
581
582 fn reconstruct_chunk(
591 &self,
592 coefficients: &HashMap<u32, BalancedTernaryWord>,
593 chunk_idx: usize,
594 chunk_len: usize,
595 ) -> Vec<u8> {
596 if chunk_len == 0 || coefficients.is_empty() || self.basis_vectors.is_empty() {
598 return vec![0u8; chunk_len];
599 }
600
601 let config = ProjectionConfig::default();
602 let key_spacing = config.coefficient_key_spacing;
603 let coef_scale = config.coefficient_scale;
604
605 let mut reconstruction: Vec<i32> = vec![0i32; chunk_len];
607
608 for basis in &self.basis_vectors {
610 let key = basis.id * key_spacing + chunk_idx as u32;
611 if let Some(coef_word) = coefficients.get(&key) {
612 let coef_value = coef_word.decode();
613 let weight = coef_value as f64 / coef_scale;
615
616 let chunk_weight = (weight * 128.0) as i32;
623
624 for &idx in &basis.vector.pos {
630 let pos = idx % chunk_len;
631 reconstruction[pos] = reconstruction[pos].saturating_add(chunk_weight);
632 }
633 for &idx in &basis.vector.neg {
634 let pos = idx % chunk_len;
635 reconstruction[pos] = reconstruction[pos].saturating_sub(chunk_weight);
636 }
637 }
638 }
639
640 reconstruction
642 .iter()
643 .map(|&val| val.clamp(0, 255) as u8)
644 .collect()
645 }
646
647 fn compute_residual(&self, original: &[u8], reconstructed: &[u8]) -> Vec<u8> {
649 original
650 .iter()
651 .zip(reconstructed.iter())
652 .map(|(&o, &r)| o.wrapping_sub(r))
653 .collect()
654 }
655
656 fn calculate_quality_score(
665 &self,
666 original: &[u8],
667 coefficients: &HashMap<u32, BalancedTernaryWord>,
668 _residual: &[BalancedTernaryWord],
669 ) -> f64 {
670 if original.is_empty() {
671 return 1.0; }
673
674 if coefficients.is_empty() {
675 return 0.1;
678 }
679
680 let config = ProjectionConfig::default();
681
682 let total_coef_magnitude: f64 = coefficients
684 .values()
685 .map(|word| {
686 let val = word.decode() as f64;
687 (val / config.coefficient_scale).abs()
689 })
690 .sum();
691
692 let avg_similarity = total_coef_magnitude / coefficients.len() as f64;
693
694 let chunk_count = original.len().div_ceil(config.chunk_size);
696 let key_spacing = config.coefficient_key_spacing;
697
698 let chunks_with_coefs: std::collections::HashSet<u32> =
699 coefficients.keys().map(|&key| key % key_spacing).collect();
700
701 let coverage_ratio = chunks_with_coefs.len() as f64 / chunk_count.max(1) as f64;
702
703 let quality = (avg_similarity * 0.5 + coverage_ratio * 0.5).min(1.0);
706
707 quality.max(0.1)
709 }
710
711 pub fn reconstruct(&self, projection: &ProjectionResult, expected_size: usize) -> Vec<u8> {
713 let mut result = Vec::with_capacity(expected_size);
714
715 let chunk_size = 64;
717 let num_chunks = expected_size.div_ceil(chunk_size);
718
719 for chunk_idx in 0..num_chunks {
720 let chunk = self.reconstruct_chunk(&projection.coefficients, chunk_idx, chunk_size);
721 result.extend(chunk);
722 }
723
724 for (i, residual_word) in projection.residual.iter().enumerate() {
726 if i < result.len() {
727 let correction = residual_word.decode() as u8;
728 result[i] = result[i].wrapping_add(correction);
729 }
730 }
731
732 for outlier in &projection.outliers {
734 if outlier.position + outlier.length <= result.len() {
735 let mut decoded = Vec::new();
737 for word in &outlier.encoded_pattern {
738 let value = word.decode();
739 for j in 0..8 {
740 decoded.push(((value >> (j * 8)) & 0xFF) as u8);
741 }
742 }
743
744 for (j, &byte) in decoded.iter().enumerate().take(outlier.length) {
745 if outlier.position + j < result.len() {
746 result[outlier.position + j] = byte;
747 }
748 }
749 }
750 }
751
752 result.truncate(expected_size);
753 result
754 }
755}
756
757impl SparseVec {
758 pub fn from_seed(seed: &[u8; 32], dim: usize) -> Self {
760 use rand::seq::SliceRandom;
761 use rand::SeedableRng;
762
763 let mut rng = rand::rngs::StdRng::from_seed(*seed);
764 let sparsity = dim / 100; let mut indices: Vec<usize> = (0..dim).collect();
767 indices.shuffle(&mut rng);
768
769 let mut pos: Vec<_> = indices[..sparsity].to_vec();
770 let mut neg: Vec<_> = indices[sparsity..sparsity * 2].to_vec();
771
772 pos.sort_unstable();
773 neg.sort_unstable();
774
775 SparseVec { pos, neg }
776 }
777
778 pub fn from_bytes(data: &[u8]) -> Self {
780 use sha2::{Digest, Sha256};
781
782 let mut hasher = Sha256::new();
783 hasher.update(data);
784 let hash = hasher.finalize();
785 let seed: [u8; 32] = hash.into();
786
787 Self::from_seed(&seed, DIM)
788 }
789}
790
#[cfg(test)]
mod tests {
    use super::*;

    /// Encode/decode must be the identity across the valid range.
    #[test]
    fn test_balanced_ternary_roundtrip() {
        let test_values = [
            0i64,
            1,
            -1,
            100,
            -100,
            12345,
            -12345,
            BalancedTernaryWord::MAX_VALUE / 2,
            BalancedTernaryWord::MIN_VALUE / 2,
        ];

        for &value in &test_values {
            let word = BalancedTernaryWord::new(value, WordMetadata::Data)
                .expect("Test value should be encodable");
            assert_eq!(value, word.decode(), "Failed roundtrip for {}", value);
        }
    }

    /// The metadata tag and the payload are stored independently.
    #[test]
    fn test_balanced_ternary_metadata() {
        let word = BalancedTernaryWord::new(42, WordMetadata::SemanticOutlier)
            .expect("42 should be encodable");
        assert_eq!(word.decode(), 42);
        assert_eq!(word.metadata(), WordMetadata::SemanticOutlier);
    }

    /// Both extremes encode; one step past either extreme is rejected.
    #[test]
    fn test_balanced_ternary_range() {
        for &v in &[BalancedTernaryWord::MAX_VALUE, BalancedTernaryWord::MIN_VALUE] {
            assert!(BalancedTernaryWord::new(v, WordMetadata::Data).is_ok());
        }

        for &v in &[
            BalancedTernaryWord::MAX_VALUE + 1,
            BalancedTernaryWord::MIN_VALUE - 1,
        ] {
            assert!(BalancedTernaryWord::new(v, WordMetadata::Data).is_err());
        }
    }

    /// Projecting plain text yields a non-trivial result.
    #[test]
    fn test_codebook_projection() {
        let mut codebook = Codebook::new(10000);
        codebook.initialize_standard_basis();

        let projection = codebook.project(b"the quick brown fox jumps over the lazy dog");

        assert!(projection.quality_score > 0.0);
        assert!(!projection.coefficients.is_empty() || !projection.residual.is_empty());
    }

    /// The parity check digit is a balanced trit.
    #[test]
    fn test_parity_computation() {
        let word =
            BalancedTernaryWord::new(12345, WordMetadata::Data).expect("12345 should be encodable");
        assert!((-1..=1).contains(&word.compute_parity()));
    }
}