1use super::versioned::types::{ChunkId, ChunkOffset};
/// A single edit to a file's byte content.
///
/// Variants fall into two families: in-place edits that never move later
/// bytes (`ByteReplace`, `MultiByteReplace`, same-length `RangeReplace`)
/// and edits that change the file length (`Insert`, `Delete`, `Append`,
/// `Truncate`). See `is_non_shifting` / `changes_length`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DeltaType {
    /// Replace one byte in place (length-preserving).
    ByteReplace {
        /// Absolute byte offset into the file.
        offset: usize,
        /// Byte expected at `offset` before the edit — presumably checked
        /// when `Delta::verify_old_data` is set; confirm in the applier.
        old_value: u8,
        /// Byte to write at `offset`.
        new_value: u8,
    },

    /// Replace several scattered bytes in place (length-preserving).
    MultiByteReplace {
        /// `(file_offset, old_byte, new_byte)` triples.
        changes: Vec<(usize, u8, u8)>,
    },

    /// Replace a contiguous byte range. When `old_data` and `new_data`
    /// have equal lengths this is in-place; otherwise later bytes shift.
    RangeReplace {
        /// Absolute byte offset where the range begins.
        offset: usize,
        /// Bytes currently occupying the range (enables verification).
        old_data: Vec<u8>,
        /// Replacement bytes.
        new_data: Vec<u8>,
    },

    /// Insert bytes, shifting everything at and after `offset`.
    Insert {
        /// Absolute byte offset of the insertion point.
        offset: usize,
        /// Bytes to insert.
        data: Vec<u8>,
    },

    /// Remove `length` bytes starting at `offset`.
    Delete {
        /// Absolute byte offset where the deletion begins.
        offset: usize,
        /// Number of bytes removed.
        length: usize,
        /// Copy of the removed bytes (enables verification/undo).
        deleted_data: Vec<u8>,
    },

    /// Add bytes at the end of the file.
    Append {
        /// Bytes appended after the current end of file.
        data: Vec<u8>,
    },

    /// Shrink the file to `new_length` bytes.
    Truncate {
        /// Size of the file after truncation.
        new_length: usize,
        /// Copy of the bytes cut off (enables verification/undo).
        truncated_data: Vec<u8>,
    },
}
128impl DeltaType {
129 pub fn is_non_shifting(&self) -> bool {
134 matches!(
135 self,
136 DeltaType::ByteReplace { .. }
137 | DeltaType::MultiByteReplace { .. }
138 | DeltaType::RangeReplace { .. }
139 )
140 }
141
142 pub fn changes_length(&self) -> bool {
144 matches!(
145 self,
146 DeltaType::Insert { .. }
147 | DeltaType::Delete { .. }
148 | DeltaType::Append { .. }
149 | DeltaType::Truncate { .. }
150 )
151 }
152
153 pub fn affected_chunk_count(&self, chunk_size: usize) -> usize {
155 match self {
156 DeltaType::ByteReplace { .. } => 1,
157 DeltaType::MultiByteReplace { changes } => {
158 if changes.is_empty() {
159 return 0;
160 }
161 let min_offset = changes.iter().map(|(o, _, _)| *o).min().unwrap_or(0);
162 let max_offset = changes.iter().map(|(o, _, _)| *o).max().unwrap_or(0);
163 (max_offset / chunk_size) - (min_offset / chunk_size) + 1
164 }
165 DeltaType::RangeReplace {
166 offset, new_data, ..
167 } => {
168 let end = offset + new_data.len();
169 (end / chunk_size) - (offset / chunk_size) + 1
170 }
171 DeltaType::Insert { offset: _, data } => {
172 let direct_chunks = data.len().div_ceil(chunk_size);
175 direct_chunks.max(1)
176 }
177 DeltaType::Delete { offset, length, .. } => {
178 let end = offset + length;
179 (end / chunk_size) - (offset / chunk_size) + 1
180 }
181 DeltaType::Append { data } => data.len().div_ceil(chunk_size),
182 DeltaType::Truncate { .. } => 1, }
184 }
185}
186
/// A mutation request: the edit itself plus concurrency-control and
/// verification options.
#[derive(Debug, Clone)]
pub struct Delta {
    /// The edit to apply.
    pub delta_type: DeltaType,

    /// Optimistic-concurrency guard: when `Some(v)`, the edit is intended
    /// to apply only if the target is at version `v`; `None` skips the
    /// check. (Enforcement happens in the applier, not in this module.)
    pub expected_version: Option<u64>,

    /// When true (the default), the `old_*` fields carried by
    /// `delta_type` should be compared against current content before
    /// applying — presumably enforced by the applier; confirm there.
    pub verify_old_data: bool,
}
200impl Delta {
201 pub fn new(delta_type: DeltaType) -> Self {
203 Self {
204 delta_type,
205 expected_version: None,
206 verify_old_data: true,
207 }
208 }
209
210 pub fn with_version(delta_type: DeltaType, expected_version: u64) -> Self {
212 Self {
213 delta_type,
214 expected_version: Some(expected_version),
215 verify_old_data: true,
216 }
217 }
218
219 pub fn without_verification(mut self) -> Self {
221 self.verify_old_data = false;
222 self
223 }
224}
225
/// The impact of a delta on the chunked encoding of a file, grouped by
/// the kind of work each chunk needs.
#[derive(Debug, Clone)]
pub struct AffectedChunks {
    /// Chunks patchable in place via individual byte substitutions.
    pub modified_chunks: Vec<ChunkModification>,

    /// Chunks whose content must be fully re-encoded.
    pub reencoded_chunks: Vec<ChunkId>,

    /// Chunks that must be created because the file grew.
    pub new_chunks: Vec<NewChunk>,

    /// Chunks that must be dropped because the file shrank.
    pub removed_chunks: Vec<ChunkId>,
}
/// An in-place patch for a single chunk.
#[derive(Debug, Clone)]
pub struct ChunkModification {
    /// Identifier of the chunk being patched.
    pub chunk_id: ChunkId,

    /// `(offset_within_chunk, old_byte, new_byte)` substitutions.
    pub byte_changes: Vec<(usize, u8, u8)>,

    /// Offset metadata for the chunk, present only when the caller
    /// supplied a `ChunkOffset` table to `analyze_delta`.
    pub chunk_offset: Option<ChunkOffset>,
}
/// A chunk to be created because the file grew.
#[derive(Debug, Clone)]
pub struct NewChunk {
    /// Bytes for the new chunk.
    /// NOTE(review): `analyze_delta` pushes empty placeholders here for
    /// `Insert` (with `file_offset` 0) — consumers must fill them from
    /// the actual shifted file content; confirm callers do.
    pub data: Vec<u8>,

    /// Absolute file offset where the chunk's data begins.
    pub file_offset: usize,
}
265impl AffectedChunks {
266 pub fn empty() -> Self {
268 Self {
269 modified_chunks: Vec::new(),
270 reencoded_chunks: Vec::new(),
271 new_chunks: Vec::new(),
272 removed_chunks: Vec::new(),
273 }
274 }
275
276 pub fn is_empty(&self) -> bool {
278 self.modified_chunks.is_empty()
279 && self.reencoded_chunks.is_empty()
280 && self.new_chunks.is_empty()
281 && self.removed_chunks.is_empty()
282 }
283
284 pub fn total_affected(&self) -> usize {
286 self.modified_chunks.len()
287 + self.reencoded_chunks.len()
288 + self.new_chunks.len()
289 + self.removed_chunks.len()
290 }
291}
292
/// Maps a `Delta` onto the chunked layout of a file, classifying each
/// touched chunk as modified in place, re-encoded, newly created, or
/// removed.
///
/// `file_size` is the current size in bytes, `chunk_size` the fixed chunk
/// width, and `chunk_offsets` an optional table translating chunk indices
/// to stable `ChunkId`s plus per-chunk metadata; without it (or for
/// out-of-range indices) the raw index doubles as the id — see
/// `get_chunk_id`.
///
/// Classification rules, as implemented below:
/// - in-place byte substitutions become `modified_chunks`;
/// - anything that shifts or rewrites chunk content becomes
///   `reencoded_chunks`;
/// - growth past the current last chunk produces `new_chunks`;
/// - shrink past a chunk boundary produces `removed_chunks`.
pub fn analyze_delta(
    delta: &Delta,
    file_size: usize,
    chunk_size: usize,
    chunk_offsets: Option<&[ChunkOffset]>,
) -> AffectedChunks {
    let mut result = AffectedChunks::empty();

    match &delta.delta_type {
        DeltaType::ByteReplace {
            offset,
            old_value,
            new_value,
        } => {
            // Out-of-range replacements are silently ignored.
            if *offset >= file_size {
                return result;
            }
            let chunk_idx = offset / chunk_size;
            let offset_in_chunk = offset % chunk_size;

            result.modified_chunks.push(ChunkModification {
                chunk_id: get_chunk_id(chunk_idx, chunk_offsets),
                byte_changes: vec![(offset_in_chunk, *old_value, *new_value)],
                chunk_offset: chunk_offsets.and_then(|co| co.get(chunk_idx).copied()),
            });
        }

        DeltaType::MultiByteReplace { changes } => {
            // Group the scattered byte edits by the chunk they land in so
            // each chunk yields exactly one ChunkModification.
            let mut changes_by_chunk: std::collections::HashMap<usize, Vec<(usize, u8, u8)>> =
                std::collections::HashMap::new();

            for &(offset, old_val, new_val) in changes {
                // Edits outside the file are dropped, not an error.
                if offset >= file_size {
                    continue;
                }
                let chunk_idx = offset / chunk_size;
                let offset_in_chunk = offset % chunk_size;
                changes_by_chunk.entry(chunk_idx).or_default().push((
                    offset_in_chunk,
                    old_val,
                    new_val,
                ));
            }

            for (chunk_idx, byte_changes) in changes_by_chunk {
                result.modified_chunks.push(ChunkModification {
                    chunk_id: get_chunk_id(chunk_idx, chunk_offsets),
                    byte_changes,
                    chunk_offset: chunk_offsets.and_then(|co| co.get(chunk_idx).copied()),
                });
            }
        }

        DeltaType::RangeReplace {
            offset,
            old_data,
            new_data,
        } => {
            if old_data.len() != new_data.len() {
                // Length-changing replace: content after the range shifts,
                // so every chunk the (larger) range touches is re-encoded.
                // NOTE(review): end_chunk is derived from the exclusive end
                // offset, so a range ending exactly on a chunk boundary
                // re-encodes one extra chunk — confirm if intended.
                let start_chunk = offset / chunk_size;
                let end_chunk = (offset + old_data.len().max(new_data.len())) / chunk_size;
                for chunk_idx in start_chunk..=end_chunk {
                    result
                        .reencoded_chunks
                        .push(get_chunk_id(chunk_idx, chunk_offsets));
                }
            } else {
                // Same-length replace: diff the buffers and emit only the
                // bytes that actually differ, grouped per chunk.
                let mut changes_by_chunk: std::collections::HashMap<usize, Vec<(usize, u8, u8)>> =
                    std::collections::HashMap::new();

                for (i, (&old_byte, &new_byte)) in old_data.iter().zip(new_data.iter()).enumerate()
                {
                    if old_byte != new_byte {
                        let file_offset = offset + i;
                        let chunk_idx = file_offset / chunk_size;
                        let offset_in_chunk = file_offset % chunk_size;
                        changes_by_chunk.entry(chunk_idx).or_default().push((
                            offset_in_chunk,
                            old_byte,
                            new_byte,
                        ));
                    }
                }

                for (chunk_idx, byte_changes) in changes_by_chunk {
                    result.modified_chunks.push(ChunkModification {
                        chunk_id: get_chunk_id(chunk_idx, chunk_offsets),
                        byte_changes,
                        chunk_offset: chunk_offsets.and_then(|co| co.get(chunk_idx).copied()),
                    });
                }
            }
        }

        DeltaType::Insert { offset, data } => {
            // Insertion shifts every byte from `offset` to EOF, so all
            // chunks from the insertion point onward must be re-encoded.
            let start_chunk = offset / chunk_size;
            let total_chunks = file_size.div_ceil(chunk_size);

            for chunk_idx in start_chunk..total_chunks {
                result
                    .reencoded_chunks
                    .push(get_chunk_id(chunk_idx, chunk_offsets));
            }

            // If the grown file needs more chunks, emit placeholders.
            // NOTE(review): placeholders carry empty data and file_offset
            // 0 — the consumer must rebuild the tail chunks from the
            // actual (shifted) file content; confirm callers handle this.
            let new_total_size = file_size + data.len();
            let new_total_chunks = new_total_size.div_ceil(chunk_size);
            if new_total_chunks > total_chunks {
                for _ in total_chunks..new_total_chunks {
                    result.new_chunks.push(NewChunk {
                        data: Vec::new(),
                        file_offset: 0,
                    });
                }
            }
        }

        DeltaType::Delete { offset, length, .. } => {
            // Deletion shifts every byte after the hole, so all chunks
            // from the deletion point onward must be re-encoded...
            let start_chunk = offset / chunk_size;
            let total_chunks = file_size.div_ceil(chunk_size);

            for chunk_idx in start_chunk..total_chunks {
                result
                    .reencoded_chunks
                    .push(get_chunk_id(chunk_idx, chunk_offsets));
            }

            // ...and chunks wholly past the new end of file disappear.
            let new_total_size = file_size.saturating_sub(*length);
            let new_total_chunks = new_total_size.div_ceil(chunk_size);
            if new_total_chunks < total_chunks {
                for chunk_idx in new_total_chunks..total_chunks {
                    result
                        .removed_chunks
                        .push(get_chunk_id(chunk_idx, chunk_offsets));
                }
            }
        }

        DeltaType::Append { data } => {
            let current_chunks = file_size.div_ceil(chunk_size);
            // Occupancy of the last chunk, in 1..=chunk_size; 0 only for
            // an empty file.
            let bytes_in_last_chunk = if file_size == 0 {
                0
            } else {
                ((file_size - 1) % chunk_size) + 1
            };
            // Free space left in the last chunk (0 when it is exactly
            // full, or when the file is empty).
            let remaining_in_last = if bytes_in_last_chunk > 0 {
                chunk_size - bytes_in_last_chunk
            } else {
                0
            };

            // A partially-filled last chunk absorbs the first appended
            // bytes, so it must be re-encoded.
            if remaining_in_last > 0 && !data.is_empty() {
                if current_chunks > 0 {
                    result
                        .reencoded_chunks
                        .push(get_chunk_id(current_chunks - 1, chunk_offsets));
                }
            }

            // Whatever does not fit into the last chunk spills into new
            // chunks, carved chunk_size bytes at a time from `data`.
            let data_for_new_chunks = if remaining_in_last >= data.len() {
                0
            } else {
                data.len() - remaining_in_last
            };
            let new_chunk_count = data_for_new_chunks.div_ceil(chunk_size);

            for i in 0..new_chunk_count {
                // chunk_start indexes into `data`; the file offset is the
                // old EOF plus that index.
                let chunk_start = remaining_in_last + i * chunk_size;
                let chunk_end = (chunk_start + chunk_size).min(data.len());
                result.new_chunks.push(NewChunk {
                    data: data[chunk_start..chunk_end].to_vec(),
                    file_offset: file_size + chunk_start,
                });
            }
        }

        DeltaType::Truncate { new_length, .. } => {
            // A "truncate" that does not shrink the file is a no-op.
            if *new_length >= file_size {
                return result;
            }

            let new_chunks = (*new_length).div_ceil(chunk_size);
            let old_chunks = file_size.div_ceil(chunk_size);

            // If the cut lands mid-chunk, the new final chunk becomes
            // partial and must be re-encoded.
            if *new_length % chunk_size != 0 && new_chunks > 0 {
                result
                    .reencoded_chunks
                    .push(get_chunk_id(new_chunks - 1, chunk_offsets));
            }

            // Chunks wholly beyond the new length are removed.
            for chunk_idx in new_chunks..old_chunks {
                result
                    .removed_chunks
                    .push(get_chunk_id(chunk_idx, chunk_offsets));
            }
        }
    }

    result
}
517fn get_chunk_id(chunk_idx: usize, chunk_offsets: Option<&[ChunkOffset]>) -> ChunkId {
519 chunk_offsets
520 .and_then(|co| co.get(chunk_idx))
521 .map(|c| c.chunk_id)
522 .unwrap_or(chunk_idx)
523}
524
/// VSA (vector-symbolic architecture) helpers for incrementally updating
/// a chunked encoding without re-encoding the whole file.
///
/// NOTE(review): these helpers assume the algebra of `embeddenator_vsa`:
/// `bind` pairs position/byte vectors (and is presumably its own
/// inverse) while `bundle` superimposes vectors — confirm against that
/// crate's documentation.
pub mod vsa_delta {
    use embeddenator_vsa::{ReversibleVSAEncoder, SparseVec};

    /// Encodes `modified_data` as one sparse vector: each byte's codebook
    /// vector is bound to the vector of its absolute file position
    /// (`start_position + i`), and the per-byte results are bundled.
    ///
    /// Returns an empty `SparseVec` for empty input.
    pub fn reencode_chunk(
        encoder: &mut ReversibleVSAEncoder,
        modified_data: &[u8],
        start_position: usize,
    ) -> SparseVec {
        if modified_data.is_empty() {
            return SparseVec::new();
        }

        // Pre-generate position vectors up to the last byte we encode.
        // NOTE(review): positions at or beyond MAX_POSITIONS are silently
        // not ensured here — confirm get_position_vector_ref copes with
        // that case rather than panicking.
        let max_pos = start_position + modified_data.len() - 1;
        if max_pos < embeddenator_vsa::MAX_POSITIONS {
            encoder.ensure_positions(max_pos);
        }

        let byte_vectors = encoder.get_byte_vectors();

        // Seed the accumulator with the first byte, then bundle in the
        // rest.
        let pos_vec = encoder.get_position_vector_ref(start_position);
        let mut result = pos_vec.bind(&byte_vectors[modified_data[0] as usize]);

        for (i, &byte) in modified_data.iter().enumerate().skip(1) {
            let pos_vec = encoder.get_position_vector_ref(start_position + i);
            let encoded = pos_vec.bind(&byte_vectors[byte as usize]);
            result = result.bundle(&encoded);
        }

        result
    }

    /// Swaps one chunk's contribution inside a root vector: binds away
    /// the old chunk, then bundles in the new one.
    ///
    /// NOTE(review): relies on `bind` cancelling the old chunk's
    /// contribution (self-inverse binding) — confirm this holds when
    /// `root` is a bundle rather than a bind product.
    pub fn update_root_chunk(
        root: &SparseVec,
        old_chunk: &SparseVec,
        new_chunk: &SparseVec,
    ) -> SparseVec {
        let unbound = root.bind(old_chunk);
        unbound.bundle(new_chunk)
    }

    /// Encodes a brand-new chunk; identical to `reencode_chunk`, kept as
    /// a separate name to document intent at call sites.
    pub fn encode_new_chunk(
        encoder: &mut ReversibleVSAEncoder,
        data: &[u8],
        start_position: usize,
    ) -> SparseVec {
        reencode_chunk(encoder, data, start_position)
    }

    /// Adds a chunk's contribution to the root by superposition.
    pub fn add_chunk_to_root(root: &SparseVec, new_chunk: &SparseVec) -> SparseVec {
        root.bundle(new_chunk)
    }

    /// Removes a chunk's contribution from the root.
    ///
    /// NOTE(review): implemented as `bind`, mirroring the "unbind" step
    /// in `update_root_chunk` — confirm this actually subtracts a bundled
    /// component in this VSA.
    pub fn remove_chunk_from_root(root: &SparseVec, chunk: &SparseVec) -> SparseVec {
        root.bind(chunk)
    }
}
/// Outcome of applying a delta to a chunked encoding.
#[derive(Debug)]
pub struct DeltaResult {
    /// Chunks rewritten in place: `(chunk_id, new_chunk_bytes)`.
    pub modified_chunks: Vec<(ChunkId, Vec<u8>)>,

    /// Newly created chunks: `(data, file_offset)`.
    pub new_chunks: Vec<(Vec<u8>, usize)>,

    /// Chunks dropped from the encoding.
    pub removed_chunks: Vec<ChunkId>,

    /// File size in bytes after the delta.
    pub new_size: usize,

    /// Whether correction/auxiliary data must be regenerated.
    /// NOTE(review): semantics inferred from the name only — confirm
    /// against the code that consumes this flag.
    pub needs_correction_update: bool,
}
662impl DeltaResult {
663 pub fn empty(current_size: usize) -> Self {
665 Self {
666 modified_chunks: Vec::new(),
667 new_chunks: Vec::new(),
668 removed_chunks: Vec::new(),
669 new_size: current_size,
670 needs_correction_update: false,
671 }
672 }
673}
674
#[cfg(test)]
mod tests {
    use super::*;

    // Offset 100 with 64-byte chunks lands in chunk 1 (100 / 64) at
    // in-chunk offset 36 (100 % 64).
    #[test]
    fn test_byte_replace_analysis() {
        let delta = Delta::new(DeltaType::ByteReplace {
            offset: 100,
            old_value: b'A',
            new_value: b'B',
        });

        let result = analyze_delta(&delta, 1000, 64, None);

        assert_eq!(result.modified_chunks.len(), 1);
        assert_eq!(result.modified_chunks[0].chunk_id, 1);
        assert_eq!(result.modified_chunks[0].byte_changes.len(), 1);
        assert_eq!(result.modified_chunks[0].byte_changes[0], (36, b'A', b'B'));
    }

    // Offsets 10 and 20 share chunk 0; offset 100 is chunk 1 — so two
    // grouped modifications are expected.
    #[test]
    fn test_multi_byte_replace_groups_by_chunk() {
        let delta = Delta::new(DeltaType::MultiByteReplace {
            changes: vec![
                (10, b'A', b'X'),
                (20, b'B', b'Y'),
                (100, b'C', b'Z'),
            ],
        });

        let result = analyze_delta(&delta, 1000, 64, None);

        assert_eq!(result.modified_chunks.len(), 2);
    }

    // Appending 200 bytes to a chunk-aligned 64-byte file spills into
    // ceil(200 / 64) == 4 fresh chunks (the assert is a lower bound).
    #[test]
    fn test_append_creates_new_chunks() {
        let delta = Delta::new(DeltaType::Append {
            data: vec![0u8; 200],
        });

        let result = analyze_delta(&delta, 64, 64, None);

        assert!(result.new_chunks.len() >= 2);
    }

    // Truncating 256 -> 100 bytes keeps chunks 0-1 (ceil(100/64) == 2)
    // and drops chunks 2-3 of the original 4.
    #[test]
    fn test_truncate_removes_chunks() {
        let delta = Delta::new(DeltaType::Truncate {
            new_length: 100,
            truncated_data: vec![0u8; 156],
        });

        let result = analyze_delta(&delta, 256, 64, None);

        assert_eq!(result.removed_chunks.len(), 2);
    }

    // In-place variants report non-shifting; length-changing ones do not.
    #[test]
    fn test_delta_is_non_shifting() {
        assert!(DeltaType::ByteReplace {
            offset: 0,
            old_value: 0,
            new_value: 1
        }
        .is_non_shifting());
        assert!(DeltaType::RangeReplace {
            offset: 0,
            old_data: vec![0],
            new_data: vec![1]
        }
        .is_non_shifting());
        assert!(!DeltaType::Insert {
            offset: 0,
            data: vec![1]
        }
        .is_non_shifting());
        assert!(!DeltaType::Append { data: vec![1] }.is_non_shifting());
    }

    // Round-trips a small buffer through reencode_chunk + decode.
    #[test]
    fn test_vsa_reencode_chunk() {
        use embeddenator_vsa::ReversibleVSAEncoder;

        let mut encoder = ReversibleVSAEncoder::new();

        let modified_data = b"Hallo";

        let chunk = vsa_delta::reencode_chunk(&mut encoder, modified_data, 0);

        let decoded = encoder.decode(&chunk, 5);

        assert_eq!(decoded[1], b'a', "Modified byte should be 'a'");
        assert_eq!(decoded[0], b'H', "First byte should be 'H'");
    }

    // After swapping chunk1 for new_chunk1 in the root, the root should
    // correlate with the replacement.
    #[test]
    fn test_vsa_update_root_chunk() {
        use embeddenator_vsa::ReversibleVSAEncoder;

        let mut encoder = ReversibleVSAEncoder::new();

        let chunk1_data = b"AAAA";
        let chunk2_data = b"BBBB";

        let chunk1 = encoder.encode(chunk1_data);
        let chunk2 = vsa_delta::reencode_chunk(&mut encoder, chunk2_data, 4);

        let root = chunk1.bundle(&chunk2);

        let new_chunk1_data = b"CCCC";
        let new_chunk1 = encoder.encode(new_chunk1_data);

        let new_root = vsa_delta::update_root_chunk(&root, &chunk1, &new_chunk1);

        let chunk1_sim = new_root.cosine(&new_chunk1);
        assert!(
            chunk1_sim > 0.0,
            "New chunk should have positive correlation with root"
        );
    }

    // A chunk encoded at start position 64 should decode its first byte
    // when probed with the position-64 vector.
    #[test]
    fn test_vsa_encode_new_chunk() {
        use embeddenator_vsa::ReversibleVSAEncoder;

        let mut encoder = ReversibleVSAEncoder::new();

        let data = b"New";
        let chunk = vsa_delta::encode_new_chunk(&mut encoder, data, 64);

        // Sanity: a non-empty encoding is self-similar.
        let self_sim = chunk.cosine(&chunk);
        assert!(self_sim > 0.99, "New chunk should have valid structure");

        // Unbind position 64, then find the best-matching byte codebook
        // vector by cosine similarity.
        let pos_vec_64 = encoder.get_position_vector_ref(64);
        let query = chunk.bind(pos_vec_64);

        let byte_vectors = encoder.get_byte_vectors();
        let mut best_byte = 0u8;
        let mut best_sim = f64::NEG_INFINITY;
        for (i, bv) in byte_vectors.iter().enumerate() {
            let sim = query.cosine(bv);
            if sim > best_sim {
                best_sim = sim;
                best_byte = i as u8;
            }
        }

        assert_eq!(best_byte, b'N', "First byte at position 64 should be 'N'");
    }

    // Bundling two chunk encodings yields a root correlated with both.
    #[test]
    fn test_vsa_add_chunk_to_root() {
        use embeddenator_vsa::ReversibleVSAEncoder;

        let mut encoder = ReversibleVSAEncoder::new();

        let chunk1 = encoder.encode(b"Hello");

        let chunk2 = vsa_delta::encode_new_chunk(&mut encoder, b"World", 5);
        let root = vsa_delta::add_chunk_to_root(&chunk1, &chunk2);

        let sim1 = root.cosine(&chunk1);
        let sim2 = root.cosine(&chunk2);

        assert!(sim1 > 0.0, "Root should correlate with first chunk");
        assert!(sim2 > 0.0, "Root should correlate with second chunk");
    }
}