1use crate::vsa::{SparseVec, DIM};
24use serde::{Deserialize, Serialize};
25use std::collections::HashMap;
26
/// A 64-bit word storing a signed integer in balanced ternary plus a 3-bit
/// metadata tag.
///
/// Layout: bits 0..=60 hold `DATA_TRITS` (38) balanced-ternary trits packed
/// as a base-3 integer (stored digit 0 = trit 0, 1 = trit +1, 2 = trit -1);
/// bits 61..=63 hold the `WordMetadata` discriminant.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct BalancedTernaryWord {
    // Raw packed representation: data trits in the low 61 bits, metadata tag
    // in the top 3 bits (see `data_bits()` / `metadata()`).
    packed: u64,
}
37
/// 3-bit tag stored in the top bits of a `BalancedTernaryWord`, identifying
/// what the word's payload represents. Discriminants 0b110 and 0b111 are
/// unassigned (and decode as `Data` in `BalancedTernaryWord::metadata`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum WordMetadata {
    /// Ordinary payload (e.g. a projection coefficient).
    Data = 0b000,
    /// Verbatim bytes of a high-entropy window (see `detect_semantic_outliers`).
    SemanticOutlier = 0b001,
    /// Byte-level correction applied after basis reconstruction.
    Residual = 0b010,
    /// Continuation marker — not used in this file; presumably the value
    /// continues in the following word (TODO confirm against callers).
    Continuation = 0b011,
    /// End-of-sequence marker — not used in this file.
    EndOfSequence = 0b100,
    /// Parity/check word — not used in this file (but see `compute_parity`).
    Parity = 0b101,
}
54
55impl BalancedTernaryWord {
56 pub const MAX_VALUE: i64 = 675_425_858_836_496_044;
60 pub const MIN_VALUE: i64 = -675_425_858_836_496_044;
61
62 pub const DATA_TRITS: usize = 38;
64
65 pub const META_TRITS: usize = 2;
67
68 pub fn new(value: i64, metadata: WordMetadata) -> Option<Self> {
70 if value < Self::MIN_VALUE || value > Self::MAX_VALUE {
71 return None;
72 }
73
74 let encoded = Self::encode_balanced_ternary(value);
76
77 let meta_bits = (metadata as u64) << 61;
79
80 Some(BalancedTernaryWord {
81 packed: encoded | meta_bits,
82 })
83 }
84
85 pub fn from_raw(packed: u64) -> Self {
87 BalancedTernaryWord { packed }
88 }
89
90 pub fn raw(&self) -> u64 {
92 self.packed
93 }
94
95 pub fn data_bits(&self) -> u64 {
97 self.packed & 0x1FFF_FFFF_FFFF_FFFF
98 }
99
100 pub fn metadata(&self) -> WordMetadata {
102 match (self.packed >> 61) & 0b111 {
103 0b000 => WordMetadata::Data,
104 0b001 => WordMetadata::SemanticOutlier,
105 0b010 => WordMetadata::Residual,
106 0b011 => WordMetadata::Continuation,
107 0b100 => WordMetadata::EndOfSequence,
108 0b101 => WordMetadata::Parity,
109 _ => WordMetadata::Data, }
111 }
112
113 pub fn decode(&self) -> i64 {
115 Self::decode_balanced_ternary(self.data_bits())
116 }
117
118 fn encode_balanced_ternary(value: i64) -> u64 {
125 let mut v = value;
128 let mut result: u64 = 0;
129 let mut power: u64 = 1;
130
131 for _ in 0..Self::DATA_TRITS {
132 let mut rem = v % 3;
134 v /= 3;
135
136 if rem == 2 {
137 rem = -1;
138 v += 1;
139 } else if rem == -2 {
140 rem = 1;
141 v -= 1;
142 }
143
144 let encoded = match rem {
146 -1 => 2u64,
147 0 => 0u64,
148 1 => 1u64,
149 _ => 0u64, };
151
152 result += encoded * power;
153 power *= 3;
154 }
155
156 result
157 }
158
159 fn decode_balanced_ternary(packed: u64) -> i64 {
161 let mut result: i64 = 0;
162 let mut power: i64 = 1;
163 let mut remaining = packed;
164
165 for _ in 0..Self::DATA_TRITS {
166 let trit = remaining % 3;
167 remaining /= 3;
168
169 match trit {
170 0 => {}, 1 => result += power,
172 2 => result -= power, _ => unreachable!(),
174 }
175 power *= 3;
176 }
177
178 result
179 }
180
181 #[allow(dead_code)]
183 fn negate_trits(packed: u64) -> u64 {
184 let mut result: u64 = 0;
185 let mut remaining = packed;
186 let mut power: u64 = 1;
187
188 for _ in 0..Self::DATA_TRITS {
189 let trit = remaining % 3;
190 remaining /= 3;
191
192 let negated = match trit {
194 0 => 0,
195 1 => 2,
196 2 => 1,
197 _ => unreachable!(),
198 };
199 result += negated * power;
200 power *= 3;
201 }
202
203 result
204 }
205
206 pub fn compute_parity(&self) -> i8 {
208 let mut sum: i64 = 0;
209 let mut remaining = self.data_bits();
210
211 for _ in 0..Self::DATA_TRITS {
212 let trit = (remaining % 3) as i64;
213 remaining /= 3;
214
215 sum += match trit {
217 0 => 0,
218 1 => 1,
219 2 => -1,
220 _ => 0,
221 };
222 }
223
224 ((3 - (sum.rem_euclid(3))) % 3) as i8
226 }
227}
228
/// A high-entropy region of the input that is stored verbatim rather than
/// projected onto the codebook basis (see `Codebook::detect_semantic_outliers`).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SemanticOutlier {
    /// Byte offset of the window within the original input.
    pub position: usize,
    /// Window length in bytes.
    pub length: usize,
    /// Shannon entropy of the window in bits/byte; only windows above 7.5
    /// are recorded.
    pub entropy_score: f64,
    /// Raw window bytes packed several-per-word in little-endian byte order.
    pub encoded_pattern: Vec<BalancedTernaryWord>,
    /// Hash-seeded hypervector fingerprint of the window.
    pub semantic_vec: SparseVec,
}
243
/// A named basis hypervector in the codebook, deterministically derived from
/// a byte pattern (see `Codebook::add_basis_for_pattern`).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct BasisVector {
    /// Stable identifier; used to key projection coefficients.
    pub id: u32,
    /// The basis hypervector itself.
    pub vector: SparseVec,
    /// Optional human-readable label (e.g. "png_header").
    pub label: Option<String>,
    /// Relative weight; always 1.0 for vectors created in this file.
    pub weight: f64,
}
256
/// A versioned dictionary of basis hypervectors and semantic markers used to
/// project byte streams into coefficient / residual / outlier form.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Codebook {
    /// Format version; `new` creates version 1.
    pub version: u32,

    /// Dimensionality of every vector derived for this codebook.
    pub dimensionality: usize,

    /// Basis vectors that input chunks are matched against during projection.
    pub basis_vectors: Vec<BasisVector>,

    /// Marker vectors for semantic categories ("high_entropy", "repetition",
    /// "boundary" — see `initialize_semantic_markers`).
    pub semantic_markers: Vec<SparseVec>,

    /// Aggregate usage statistics.
    pub statistics: CodebookStatistics,

    /// Optional salt mixed into vector derivation so that codebooks with
    /// different salts produce unrelated bases.
    pub salt: Option<[u8; 32]>,
}
279
/// Aggregate statistics about codebook usage.
/// NOTE(review): no code in this file updates these fields — presumably they
/// are maintained by callers elsewhere; verify before relying on them.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct CodebookStatistics {
    /// Total number of input bytes encoded with this codebook.
    pub total_bytes_encoded: u64,
    /// Running average compression ratio.
    pub avg_compression_ratio: f64,
    /// Total number of semantic outliers detected.
    pub outlier_count: u64,
    /// Histogram of coefficients bucketed into 16 bins.
    pub coefficient_histogram: [u64; 16],
}
292
/// Output of `Codebook::project`: together with the codebook, everything
/// needed to rebuild the original bytes via `Codebook::reconstruct`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ProjectionResult {
    /// Coefficient words keyed by `basis_id * 1000 + chunk_idx`.
    pub coefficients: HashMap<u32, BalancedTernaryWord>,
    /// Per-byte corrections applied after basis reconstruction.
    pub residual: Vec<BalancedTernaryWord>,
    /// High-entropy regions stored verbatim.
    pub outliers: Vec<SemanticOutlier>,
    /// Reconstruction quality estimate; currently always 1.0
    /// (`calculate_quality_score` is a stub).
    pub quality_score: f64,
}
305
306impl Default for Codebook {
307 fn default() -> Self {
308 Self::new(DIM)
309 }
310}
311
312impl Codebook {
313 pub fn new(dimensionality: usize) -> Self {
315 Codebook {
316 version: 1,
317 dimensionality,
318 basis_vectors: Vec::new(),
319 semantic_markers: Vec::new(),
320 statistics: CodebookStatistics::default(),
321 salt: None,
322 }
323 }
324
325 pub fn with_salt(dimensionality: usize, salt: [u8; 32]) -> Self {
327 let mut codebook = Self::new(dimensionality);
328 codebook.salt = Some(salt);
329 codebook
330 }
331
332 pub fn initialize_standard_basis(&mut self) {
334 self.add_basis_for_pattern(0, b"\x00\x00\x00\x00", "zero_run");
339
340 self.add_basis_for_pattern(1, b" ", "space_run");
342 self.add_basis_for_pattern(2, b"\n\n", "newline_pair");
343
344 self.add_basis_for_pattern(3, b"the ", "the_space");
346 self.add_basis_for_pattern(4, b"ing ", "ing_space");
347 self.add_basis_for_pattern(5, b"tion", "tion");
348
349 self.add_basis_for_pattern(6, b"\x89PNG", "png_header");
351 self.add_basis_for_pattern(7, b"\xFF\xD8\xFF", "jpeg_header");
352 self.add_basis_for_pattern(8, b"PK\x03\x04", "zip_header");
353
354 self.initialize_semantic_markers();
356 }
357
358 fn add_basis_for_pattern(&mut self, id: u32, pattern: &[u8], label: &str) {
360 use sha2::{Sha256, Digest};
361
362 let mut hasher = Sha256::new();
364 hasher.update(pattern);
365 if let Some(salt) = &self.salt {
366 hasher.update(salt);
367 }
368 let hash = hasher.finalize();
369
370 let seed: [u8; 32] = hash.into();
372 let vector = SparseVec::from_seed(&seed, self.dimensionality);
373
374 self.basis_vectors.push(BasisVector {
375 id,
376 vector,
377 label: Some(label.to_string()),
378 weight: 1.0,
379 });
380 }
381
382 fn initialize_semantic_markers(&mut self) {
384 use sha2::{Digest, Sha256};
385
386 let seed_for = |label: &str| -> [u8; 32] {
387 let mut hasher = Sha256::new();
388 hasher.update(b"embeddenator:semantic_marker:v1:");
389 hasher.update(label.as_bytes());
390 hasher.update(&(self.dimensionality as u64).to_le_bytes());
391 if let Some(salt) = &self.salt {
392 hasher.update(salt);
393 }
394 hasher.finalize().into()
395 };
396
397 let seed = seed_for("high_entropy");
399 self.semantic_markers
400 .push(SparseVec::from_seed(&seed, self.dimensionality));
401
402 let seed = seed_for("repetition");
404 self.semantic_markers
405 .push(SparseVec::from_seed(&seed, self.dimensionality));
406
407 let seed = seed_for("boundary");
409 self.semantic_markers
410 .push(SparseVec::from_seed(&seed, self.dimensionality));
411 }
412
413 pub fn project(&self, data: &[u8]) -> ProjectionResult {
416 let mut coefficients = HashMap::new();
417 let mut residual = Vec::new();
418 let mut outliers = Vec::new();
419
420 let detected_outliers = self.detect_semantic_outliers(data);
422 outliers.extend(detected_outliers);
423
424 let chunk_size = 64; for (chunk_idx, chunk) in data.chunks(chunk_size).enumerate() {
427 let chunk_vec = SparseVec::from_bytes(chunk);
428
429 let mut best_matches: Vec<(u32, f64)> = self.basis_vectors
431 .iter()
432 .map(|basis| (basis.id, chunk_vec.cosine(&basis.vector)))
433 .filter(|(_, sim)| *sim > 0.3) .collect();
435
436 best_matches.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Less));
438
439 for (basis_id, similarity) in best_matches.iter().take(4) {
441 let coef_value = (*similarity * 1000.0) as i64;
443 if let Some(word) = BalancedTernaryWord::new(coef_value, WordMetadata::Data) {
444 coefficients.insert(
445 *basis_id * 1000 + chunk_idx as u32,
446 word,
447 );
448 }
449 }
450
451 let reconstructed = self.reconstruct_chunk(&coefficients, chunk_idx, chunk.len());
453 let chunk_residual = self.compute_residual(chunk, &reconstructed);
454
455 for residual_byte in chunk_residual {
456 if let Some(word) = BalancedTernaryWord::new(residual_byte as i64, WordMetadata::Residual) {
457 residual.push(word);
458 }
459 }
460 }
461
462 let quality_score = self.calculate_quality_score(data, &coefficients, &residual);
464
465 ProjectionResult {
466 coefficients,
467 residual,
468 outliers,
469 quality_score,
470 }
471 }
472
473 fn detect_semantic_outliers(&self, data: &[u8]) -> Vec<SemanticOutlier> {
475 let mut outliers = Vec::new();
476 let window_size = 32;
477
478 if data.len() < window_size {
479 return outliers;
480 }
481
482 for i in 0..data.len() - window_size {
483 let window = &data[i..i + window_size];
484 let entropy = self.calculate_entropy(window);
485
486 if entropy > 7.5 {
488 let pattern_vec = SparseVec::from_bytes(window);
489
490 let mut encoded_pattern = Vec::new();
492 for chunk in window.chunks(8) {
493 let value = chunk.iter()
494 .enumerate()
495 .fold(0i64, |acc, (j, &b)| acc + ((b as i64) << (j * 8)));
496 if let Some(word) = BalancedTernaryWord::new(value, WordMetadata::SemanticOutlier) {
497 encoded_pattern.push(word);
498 }
499 }
500
501 outliers.push(SemanticOutlier {
502 position: i,
503 length: window_size,
504 entropy_score: entropy,
505 encoded_pattern,
506 semantic_vec: pattern_vec,
507 });
508
509 }
512 }
513
514 outliers.dedup_by(|a, b| a.position.abs_diff(b.position) < window_size / 2);
516
517 outliers
518 }
519
520 fn calculate_entropy(&self, data: &[u8]) -> f64 {
522 let mut counts = [0u32; 256];
523 for &byte in data {
524 counts[byte as usize] += 1;
525 }
526
527 let len = data.len() as f64;
528 counts.iter()
529 .filter(|&&c| c > 0)
530 .map(|&c| {
531 let p = c as f64 / len;
532 -p * p.log2()
533 })
534 .sum()
535 }
536
537 fn reconstruct_chunk(
539 &self,
540 _coefficients: &HashMap<u32, BalancedTernaryWord>,
541 _chunk_idx: usize,
542 chunk_len: usize,
543 ) -> Vec<u8> {
544 vec![0u8; chunk_len]
547 }
548
549 fn compute_residual(&self, original: &[u8], reconstructed: &[u8]) -> Vec<u8> {
551 original.iter()
552 .zip(reconstructed.iter())
553 .map(|(&o, &r)| o.wrapping_sub(r))
554 .collect()
555 }
556
557 fn calculate_quality_score(
559 &self,
560 _original: &[u8],
561 _coefficients: &HashMap<u32, BalancedTernaryWord>,
562 _residual: &[BalancedTernaryWord],
563 ) -> f64 {
564 1.0
566 }
567
568 pub fn reconstruct(&self, projection: &ProjectionResult, expected_size: usize) -> Vec<u8> {
570 let mut result = Vec::with_capacity(expected_size);
571
572 let chunk_size = 64;
574 let num_chunks = (expected_size + chunk_size - 1) / chunk_size;
575
576 for chunk_idx in 0..num_chunks {
577 let chunk = self.reconstruct_chunk(&projection.coefficients, chunk_idx, chunk_size);
578 result.extend(chunk);
579 }
580
581 for (i, residual_word) in projection.residual.iter().enumerate() {
583 if i < result.len() {
584 let correction = residual_word.decode() as u8;
585 result[i] = result[i].wrapping_add(correction);
586 }
587 }
588
589 for outlier in &projection.outliers {
591 if outlier.position + outlier.length <= result.len() {
592 let mut decoded = Vec::new();
594 for word in &outlier.encoded_pattern {
595 let value = word.decode();
596 for j in 0..8 {
597 decoded.push(((value >> (j * 8)) & 0xFF) as u8);
598 }
599 }
600
601 for (j, &byte) in decoded.iter().enumerate().take(outlier.length) {
602 if outlier.position + j < result.len() {
603 result[outlier.position + j] = byte;
604 }
605 }
606 }
607 }
608
609 result.truncate(expected_size);
610 result
611 }
612}
613
614impl SparseVec {
615 pub fn from_seed(seed: &[u8; 32], dim: usize) -> Self {
617 use rand::SeedableRng;
618 use rand::seq::SliceRandom;
619
620 let mut rng = rand::rngs::StdRng::from_seed(*seed);
621 let sparsity = dim / 100; let mut indices: Vec<usize> = (0..dim).collect();
624 indices.shuffle(&mut rng);
625
626 let mut pos: Vec<_> = indices[..sparsity].to_vec();
627 let mut neg: Vec<_> = indices[sparsity..sparsity * 2].to_vec();
628
629 pos.sort_unstable();
630 neg.sort_unstable();
631
632 SparseVec { pos, neg }
633 }
634
635 pub fn from_bytes(data: &[u8]) -> Self {
637 use sha2::{Sha256, Digest};
638
639 let mut hasher = Sha256::new();
640 hasher.update(data);
641 let hash = hasher.finalize();
642 let seed: [u8; 32] = hash.into();
643
644 Self::from_seed(&seed, DIM)
645 }
646}
647
#[cfg(test)]
mod tests {
    use super::*;

    // Encode-then-decode must recover the original value for zero, small,
    // and half-range magnitudes of both signs.
    #[test]
    fn test_balanced_ternary_roundtrip() {
        let test_values = [0i64, 1, -1, 100, -100, 12345, -12345,
            BalancedTernaryWord::MAX_VALUE / 2,
            BalancedTernaryWord::MIN_VALUE / 2];

        for &value in &test_values {
            let word = BalancedTernaryWord::new(value, WordMetadata::Data).unwrap();
            let decoded = word.decode();
            assert_eq!(value, decoded, "Failed roundtrip for {}", value);
        }
    }

    // The metadata tag must survive packing without disturbing the payload.
    #[test]
    fn test_balanced_ternary_metadata() {
        let word = BalancedTernaryWord::new(42, WordMetadata::SemanticOutlier).unwrap();
        assert_eq!(word.metadata(), WordMetadata::SemanticOutlier);
        assert_eq!(word.decode(), 42);
    }

    // Exact range boundaries: MIN/MAX encode; one step beyond is rejected.
    #[test]
    fn test_balanced_ternary_range() {
        assert!(BalancedTernaryWord::new(BalancedTernaryWord::MAX_VALUE, WordMetadata::Data).is_some());
        assert!(BalancedTernaryWord::new(BalancedTernaryWord::MIN_VALUE, WordMetadata::Data).is_some());

        assert!(BalancedTernaryWord::new(BalancedTernaryWord::MAX_VALUE + 1, WordMetadata::Data).is_none());
        assert!(BalancedTernaryWord::new(BalancedTernaryWord::MIN_VALUE - 1, WordMetadata::Data).is_none());
    }

    // Projecting ordinary text should produce some output (coefficients or
    // residual) and a positive quality score.
    #[test]
    fn test_codebook_projection() {
        let mut codebook = Codebook::new(10000);
        codebook.initialize_standard_basis();

        let data = b"the quick brown fox jumps over the lazy dog";
        let projection = codebook.project(data);

        assert!(projection.quality_score > 0.0);
        assert!(!projection.coefficients.is_empty() || !projection.residual.is_empty());
    }

    // The parity value is expected to be a balanced trit in {-1, 0, 1}.
    #[test]
    fn test_parity_computation() {
        let word = BalancedTernaryWord::new(12345, WordMetadata::Data).unwrap();
        let parity = word.compute_parity();
        assert!(parity >= -1 && parity <= 1);
    }
}