use super::versioned::types::{ChunkId, ChunkOffset};
/// The kinds of edits that can be applied to a chunked file.
///
/// All `offset` fields are absolute byte offsets into the file. Variants
/// that carry the pre-edit bytes (`old_value`, `old_data`, `deleted_data`,
/// `truncated_data`) make the edit verifiable — see `Delta::verify_old_data`
/// — and presumably reversible by the applier (the applier is not visible
/// in this file; confirm against callers).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DeltaType {
    /// Replace a single byte in place.
    ByteReplace {
        offset: usize,
        old_value: u8,
        new_value: u8,
    },
    /// Replace several scattered bytes in place; each tuple is
    /// `(offset, old_value, new_value)`.
    MultiByteReplace {
        changes: Vec<(usize, u8, u8)>,
    },
    /// Replace a contiguous byte range starting at `offset`. When
    /// `old_data` and `new_data` have different lengths, `analyze_delta`
    /// falls back to re-encoding every touched chunk.
    RangeReplace {
        offset: usize,
        old_data: Vec<u8>,
        new_data: Vec<u8>,
    },
    /// Insert `data` at `offset`, shifting all subsequent bytes right.
    Insert {
        offset: usize,
        data: Vec<u8>,
    },
    /// Remove `length` bytes starting at `offset`; `deleted_data` records
    /// the removed bytes.
    Delete {
        offset: usize,
        length: usize,
        deleted_data: Vec<u8>,
    },
    /// Append `data` at the end of the file.
    Append {
        data: Vec<u8>,
    },
    /// Shrink the file to `new_length` bytes; `truncated_data` records the
    /// bytes that were cut off.
    Truncate {
        new_length: usize,
        truncated_data: Vec<u8>,
    },
}
impl DeltaType {
    /// Returns `true` for edits that rewrite bytes in place and never move
    /// the position of any other byte in the file.
    pub fn is_non_shifting(&self) -> bool {
        matches!(
            self,
            DeltaType::ByteReplace { .. }
                | DeltaType::MultiByteReplace { .. }
                | DeltaType::RangeReplace { .. }
        )
    }

    /// Returns `true` for edits that change the total file length.
    ///
    /// NOTE(review): a `RangeReplace` with differently-sized `old_data` /
    /// `new_data` also changes the length (see `analyze_delta`) but is not
    /// reported here — confirm this asymmetry is intentional.
    pub fn changes_length(&self) -> bool {
        matches!(
            self,
            DeltaType::Insert { .. }
                | DeltaType::Delete { .. }
                | DeltaType::Append { .. }
                | DeltaType::Truncate { .. }
        )
    }

    /// Estimates how many `chunk_size`-byte chunks this edit touches.
    ///
    /// `chunk_size` must be non-zero (a zero chunk size divides by zero).
    pub fn affected_chunk_count(&self, chunk_size: usize) -> usize {
        match self {
            DeltaType::ByteReplace { .. } => 1,
            DeltaType::MultiByteReplace { changes } => {
                if changes.is_empty() {
                    return 0;
                }
                // Span of chunks between the lowest and highest edited
                // offset (inclusive). NOTE(review): this counts the chunk
                // *span*, not distinct chunks, so sparse edits over-count
                // relative to what `analyze_delta` actually reports.
                let min_offset = changes.iter().map(|(o, _, _)| *o).min().unwrap_or(0);
                let max_offset = changes.iter().map(|(o, _, _)| *o).max().unwrap_or(0);
                (max_offset / chunk_size) - (min_offset / chunk_size) + 1
            }
            DeltaType::RangeReplace {
                offset, new_data, ..
            } => {
                // The range occupies [offset, offset + len); its last byte
                // is at `offset + len - 1`. Using the inclusive last byte
                // (instead of the exclusive end) avoids counting an extra
                // chunk when the range ends exactly on a chunk boundary.
                // An empty range still counts its own chunk.
                let last = offset + new_data.len().max(1) - 1;
                (last / chunk_size) - (offset / chunk_size) + 1
            }
            DeltaType::Insert { offset: _, data } => {
                // Approximation: counts only the chunks the inserted bytes
                // span (at least one); chunks shifted after the insertion
                // point are not included here.
                data.len().div_ceil(chunk_size).max(1)
            }
            DeltaType::Delete { offset, length, .. } => {
                // Same inclusive-last-byte reasoning as RangeReplace above.
                let last = offset + (*length).max(1) - 1;
                (last / chunk_size) - (offset / chunk_size) + 1
            }
            DeltaType::Append { data } => data.len().div_ceil(chunk_size),
            DeltaType::Truncate { .. } => 1,
        }
    }
}
/// A single mutation request, pairing the raw [`DeltaType`] with
/// application metadata.
#[derive(Debug, Clone)]
pub struct Delta {
    /// The concrete edit to apply.
    pub delta_type: DeltaType,
    /// If `Some`, the delta presumably only applies when the file is at
    /// this version (optimistic concurrency) — the applier is not visible
    /// here; confirm against callers.
    pub expected_version: Option<u64>,
    /// When `true` (the default), the old bytes recorded in the delta are
    /// presumably checked against the file before applying — confirm.
    pub verify_old_data: bool,
}
impl Delta {
    /// Creates a delta with no version precondition and old-data
    /// verification enabled.
    pub fn new(delta_type: DeltaType) -> Self {
        Self {
            delta_type,
            expected_version: None,
            verify_old_data: true,
        }
    }

    /// Creates a delta that carries an expected file version; otherwise
    /// identical to [`Delta::new`].
    pub fn with_version(delta_type: DeltaType, expected_version: u64) -> Self {
        Self {
            expected_version: Some(expected_version),
            ..Self::new(delta_type)
        }
    }

    /// Disables old-data verification for this delta (builder-style).
    pub fn without_verification(self) -> Self {
        Self {
            verify_old_data: false,
            ..self
        }
    }
}
/// The outcome of [`analyze_delta`]: which chunks a delta touches and how.
#[derive(Debug, Clone)]
pub struct AffectedChunks {
    /// Chunks that can be patched in place via per-byte changes.
    pub modified_chunks: Vec<ChunkModification>,
    /// Chunks whose content must be fully re-encoded.
    pub reencoded_chunks: Vec<ChunkId>,
    /// Chunks that must be created.
    pub new_chunks: Vec<NewChunk>,
    /// Chunks that no longer exist after the delta.
    pub removed_chunks: Vec<ChunkId>,
}
/// An in-place patch for a single chunk.
#[derive(Debug, Clone)]
pub struct ChunkModification {
    /// Id of the chunk; falls back to the plain chunk index when no
    /// offset table is available (see `get_chunk_id`).
    pub chunk_id: ChunkId,
    /// Byte edits as `(offset_within_chunk, old_value, new_value)`.
    pub byte_changes: Vec<(usize, u8, u8)>,
    /// The chunk's offset-table entry, when a table was supplied.
    pub chunk_offset: Option<ChunkOffset>,
}
/// A chunk to be created as a result of applying a delta.
#[derive(Debug, Clone)]
pub struct NewChunk {
    /// The chunk's content. May be empty for Insert-driven growth, where
    /// the final content is not known at analysis time.
    pub data: Vec<u8>,
    /// Absolute byte offset of the chunk's start within the file.
    pub file_offset: usize,
}
impl AffectedChunks {
    /// Creates a result with no affected chunks of any kind.
    pub fn empty() -> Self {
        Self {
            modified_chunks: vec![],
            reencoded_chunks: vec![],
            new_chunks: vec![],
            removed_chunks: vec![],
        }
    }

    /// Returns `true` when no chunk is affected in any way.
    pub fn is_empty(&self) -> bool {
        self.total_affected() == 0
    }

    /// Total number of chunk entries across all four categories.
    pub fn total_affected(&self) -> usize {
        [
            self.modified_chunks.len(),
            self.reencoded_chunks.len(),
            self.new_chunks.len(),
            self.removed_chunks.len(),
        ]
        .iter()
        .sum()
    }
}
/// Determines which chunks are affected by `delta` for a file of
/// `file_size` bytes divided into `chunk_size`-byte chunks.
///
/// Returns, per chunk, whether it can be patched in place
/// (`modified_chunks`), must be re-encoded (`reencoded_chunks`), must be
/// created (`new_chunks`), or disappears (`removed_chunks`).
///
/// `chunk_offsets`, when present, maps chunk indices to their stored
/// [`ChunkOffset`] entries so stable chunk ids can be reported; otherwise
/// the chunk index itself serves as the id.
///
/// `chunk_size` must be non-zero (a zero chunk size divides by zero).
pub fn analyze_delta(
    delta: &Delta,
    file_size: usize,
    chunk_size: usize,
    chunk_offsets: Option<&[ChunkOffset]>,
) -> AffectedChunks {
    let mut result = AffectedChunks::empty();
    match &delta.delta_type {
        DeltaType::ByteReplace {
            offset,
            old_value,
            new_value,
        } => {
            // Writes past the end of the file are silently ignored.
            if *offset >= file_size {
                return result;
            }
            let chunk_idx = offset / chunk_size;
            result.modified_chunks.push(ChunkModification {
                chunk_id: get_chunk_id(chunk_idx, chunk_offsets),
                byte_changes: vec![(offset % chunk_size, *old_value, *new_value)],
                chunk_offset: chunk_offsets.and_then(|co| co.get(chunk_idx).copied()),
            });
        }
        DeltaType::MultiByteReplace { changes } => {
            // Group the per-byte edits by target chunk so each chunk is
            // reported once with all of its byte changes.
            let mut changes_by_chunk: std::collections::HashMap<usize, Vec<(usize, u8, u8)>> =
                std::collections::HashMap::new();
            for &(offset, old_val, new_val) in changes {
                if offset >= file_size {
                    continue; // out-of-range edits are dropped
                }
                changes_by_chunk
                    .entry(offset / chunk_size)
                    .or_default()
                    .push((offset % chunk_size, old_val, new_val));
            }
            for (chunk_idx, byte_changes) in changes_by_chunk {
                result.modified_chunks.push(ChunkModification {
                    chunk_id: get_chunk_id(chunk_idx, chunk_offsets),
                    byte_changes,
                    chunk_offset: chunk_offsets.and_then(|co| co.get(chunk_idx).copied()),
                });
            }
        }
        DeltaType::RangeReplace {
            offset,
            old_data,
            new_data,
        } => {
            if old_data.len() != new_data.len() {
                // Length-changing replace: the touched chunks cannot be
                // patched byte-by-byte and must be fully re-encoded. The
                // last affected byte sits at `offset + span - 1`, so a
                // range ending exactly on a chunk boundary does not spill
                // a spurious extra chunk into the list.
                let span = old_data.len().max(new_data.len());
                let start_chunk = offset / chunk_size;
                let end_chunk = (offset + span.max(1) - 1) / chunk_size;
                for chunk_idx in start_chunk..=end_chunk {
                    result
                        .reencoded_chunks
                        .push(get_chunk_id(chunk_idx, chunk_offsets));
                }
            } else {
                // Same-length replace: emit only the bytes that actually
                // differ, grouped by chunk (like MultiByteReplace).
                let mut changes_by_chunk: std::collections::HashMap<usize, Vec<(usize, u8, u8)>> =
                    std::collections::HashMap::new();
                for (i, (&old_byte, &new_byte)) in
                    old_data.iter().zip(new_data.iter()).enumerate()
                {
                    if old_byte != new_byte {
                        let file_offset = offset + i;
                        changes_by_chunk
                            .entry(file_offset / chunk_size)
                            .or_default()
                            .push((file_offset % chunk_size, old_byte, new_byte));
                    }
                }
                for (chunk_idx, byte_changes) in changes_by_chunk {
                    result.modified_chunks.push(ChunkModification {
                        chunk_id: get_chunk_id(chunk_idx, chunk_offsets),
                        byte_changes,
                        chunk_offset: chunk_offsets.and_then(|co| co.get(chunk_idx).copied()),
                    });
                }
            }
        }
        DeltaType::Insert { offset, data } => {
            // Insertion shifts every byte at or after `offset`, so all
            // chunks from the insertion point to the current end must be
            // re-encoded.
            let start_chunk = offset / chunk_size;
            let total_chunks = file_size.div_ceil(chunk_size);
            for chunk_idx in start_chunk..total_chunks {
                result
                    .reencoded_chunks
                    .push(get_chunk_id(chunk_idx, chunk_offsets));
            }
            // Growth past the old chunk count creates new chunks. Their
            // content is not known at analysis time (it is existing data
            // shifted right, possibly mixed with inserted bytes), so only
            // their placement is reported; the caller fills in the data.
            let new_total_chunks = (file_size + data.len()).div_ceil(chunk_size);
            for chunk_idx in total_chunks..new_total_chunks {
                result.new_chunks.push(NewChunk {
                    data: Vec::new(),
                    file_offset: chunk_idx * chunk_size,
                });
            }
        }
        DeltaType::Delete { offset, length, .. } => {
            // Deletion shifts every byte after the removed range, so all
            // chunks from the deletion point onward must be re-encoded.
            let start_chunk = offset / chunk_size;
            let total_chunks = file_size.div_ceil(chunk_size);
            for chunk_idx in start_chunk..total_chunks {
                result
                    .reencoded_chunks
                    .push(get_chunk_id(chunk_idx, chunk_offsets));
            }
            // Shrinkage below the old chunk count removes trailing chunks.
            let new_total_chunks = file_size.saturating_sub(*length).div_ceil(chunk_size);
            for chunk_idx in new_total_chunks..total_chunks {
                result
                    .removed_chunks
                    .push(get_chunk_id(chunk_idx, chunk_offsets));
            }
        }
        DeltaType::Append { data } => {
            let current_chunks = file_size.div_ceil(chunk_size);
            // Bytes already occupying the final chunk: 0 for an empty
            // file, `chunk_size` when the file is chunk-aligned.
            let bytes_in_last_chunk = if file_size == 0 {
                0
            } else {
                ((file_size - 1) % chunk_size) + 1
            };
            let remaining_in_last = if bytes_in_last_chunk > 0 {
                chunk_size - bytes_in_last_chunk
            } else {
                0
            };
            // A partially-filled final chunk absorbs the first appended
            // bytes and must be re-encoded.
            if remaining_in_last > 0 && !data.is_empty() && current_chunks > 0 {
                result
                    .reencoded_chunks
                    .push(get_chunk_id(current_chunks - 1, chunk_offsets));
            }
            // Whatever does not fit in the final chunk becomes new chunks.
            let data_for_new_chunks = data.len().saturating_sub(remaining_in_last);
            let new_chunk_count = data_for_new_chunks.div_ceil(chunk_size);
            for i in 0..new_chunk_count {
                let chunk_start = remaining_in_last + i * chunk_size;
                let chunk_end = (chunk_start + chunk_size).min(data.len());
                result.new_chunks.push(NewChunk {
                    data: data[chunk_start..chunk_end].to_vec(),
                    file_offset: file_size + chunk_start,
                });
            }
        }
        DeltaType::Truncate { new_length, .. } => {
            // Truncating to the current size or larger is a no-op.
            if *new_length >= file_size {
                return result;
            }
            let new_chunks = (*new_length).div_ceil(chunk_size);
            let old_chunks = file_size.div_ceil(chunk_size);
            // A now-partial final chunk must be re-encoded.
            if *new_length % chunk_size != 0 && new_chunks > 0 {
                result
                    .reencoded_chunks
                    .push(get_chunk_id(new_chunks - 1, chunk_offsets));
            }
            // Chunks lying entirely past the new length are removed.
            for chunk_idx in new_chunks..old_chunks {
                result
                    .removed_chunks
                    .push(get_chunk_id(chunk_idx, chunk_offsets));
            }
        }
    }
    result
}
/// Resolves a chunk index to its stable id via the offset table, or falls
/// back to the index itself when no table (or no entry) is available.
fn get_chunk_id(chunk_idx: usize, chunk_offsets: Option<&[ChunkOffset]>) -> ChunkId {
    match chunk_offsets.and_then(|offsets| offsets.get(chunk_idx)) {
        Some(entry) => entry.chunk_id,
        None => chunk_idx,
    }
}
/// VSA helpers for incrementally updating chunk encodings without
/// re-encoding an entire file.
///
/// These lean on the `embeddenator_vsa` algebra: `bind` composes a
/// position vector with a byte vector, `bundle` superimposes encoded
/// elements. `update_root_chunk` / `remove_chunk_from_root` additionally
/// assume `bind` is its own inverse (binding the root with a chunk cancels
/// that chunk's contribution) — confirm against the embeddenator_vsa docs.
pub mod vsa_delta {
    use embeddenator_vsa::{ReversibleVSAEncoder, SparseVec};

    /// Encodes `modified_data` as if it occupied file positions
    /// `start_position..start_position + len`: each byte is bound to its
    /// position vector and the results are bundled together.
    pub fn reencode_chunk(
        encoder: &mut ReversibleVSAEncoder,
        modified_data: &[u8],
        start_position: usize,
    ) -> SparseVec {
        // An empty chunk encodes to the empty vector.
        if modified_data.is_empty() {
            return SparseVec::new();
        }
        let max_pos = start_position + modified_data.len() - 1;
        // Grow the encoder's position table up front.
        // NOTE(review): when max_pos >= MAX_POSITIONS this growth step is
        // skipped, yet the loop below still requests those positions — and
        // `ensure_positions(max_pos)` may be off by one if it takes a
        // count rather than a max index. Confirm how the encoder handles
        // out-of-range positions.
        if max_pos < embeddenator_vsa::MAX_POSITIONS {
            encoder.ensure_positions(max_pos);
        }
        let byte_vectors = encoder.get_byte_vectors();
        // Seed the result with the first byte, then superimpose the rest.
        let pos_vec = encoder.get_position_vector_ref(start_position);
        let mut result = pos_vec.bind(&byte_vectors[modified_data[0] as usize]);
        for (i, &byte) in modified_data.iter().enumerate().skip(1) {
            let pos_vec = encoder.get_position_vector_ref(start_position + i);
            let encoded = pos_vec.bind(&byte_vectors[byte as usize]);
            result = result.bundle(&encoded);
        }
        result
    }

    /// Replaces `old_chunk` with `new_chunk` inside `root`: binding with
    /// the old chunk removes its contribution (assuming bind is
    /// self-inverse — see module docs), then the new chunk is bundled in.
    pub fn update_root_chunk(
        root: &SparseVec,
        old_chunk: &SparseVec,
        new_chunk: &SparseVec,
    ) -> SparseVec {
        let unbound = root.bind(old_chunk);
        unbound.bundle(new_chunk)
    }

    /// Encodes a brand-new chunk whose data starts at `start_position`;
    /// identical to [`reencode_chunk`].
    pub fn encode_new_chunk(
        encoder: &mut ReversibleVSAEncoder,
        data: &[u8],
        start_position: usize,
    ) -> SparseVec {
        reencode_chunk(encoder, data, start_position)
    }

    /// Adds a chunk's encoding to the root by superposition.
    pub fn add_chunk_to_root(root: &SparseVec, new_chunk: &SparseVec) -> SparseVec {
        root.bundle(new_chunk)
    }

    /// Removes a chunk's contribution from the root (assumes bind is
    /// self-inverse — see module docs).
    pub fn remove_chunk_from_root(root: &SparseVec, chunk: &SparseVec) -> SparseVec {
        root.bind(chunk)
    }
}
/// The applied outcome of a delta, pairing chunk ids with concrete bytes.
///
/// NOTE(review): no producer of this type is visible in this file; field
/// meanings are inferred from names and `empty` — confirm against callers.
#[derive(Debug)]
pub struct DeltaResult {
    /// Chunks rewritten in place, with their new content.
    pub modified_chunks: Vec<(ChunkId, Vec<u8>)>,
    /// Newly created chunks as `(data, file_offset)` pairs.
    pub new_chunks: Vec<(Vec<u8>, usize)>,
    /// Chunks removed by the delta.
    pub removed_chunks: Vec<ChunkId>,
    /// File size after the delta is applied.
    pub new_size: usize,
    /// Whether correction/auxiliary data must be refreshed afterwards.
    pub needs_correction_update: bool,
}
impl DeltaResult {
    /// A no-op result: no chunk changes and the size left at
    /// `current_size`.
    pub fn empty(current_size: usize) -> Self {
        Self {
            new_size: current_size,
            needs_correction_update: false,
            modified_chunks: vec![],
            new_chunks: vec![],
            removed_chunks: vec![],
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // ByteReplace at offset 100 with 64-byte chunks lands in chunk 1
    // (100 / 64) at in-chunk offset 36 (100 % 64).
    #[test]
    fn test_byte_replace_analysis() {
        let delta = Delta::new(DeltaType::ByteReplace {
            offset: 100,
            old_value: b'A',
            new_value: b'B',
        });
        let result = analyze_delta(&delta, 1000, 64, None);
        assert_eq!(result.modified_chunks.len(), 1);
        assert_eq!(result.modified_chunks[0].chunk_id, 1);
        assert_eq!(result.modified_chunks[0].byte_changes.len(), 1);
        assert_eq!(result.modified_chunks[0].byte_changes[0], (36, b'A', b'B'));
    }

    // Offsets 10 and 20 share chunk 0 while 100 is in chunk 1, so the
    // three edits collapse into two chunk modifications.
    #[test]
    fn test_multi_byte_replace_groups_by_chunk() {
        let delta = Delta::new(DeltaType::MultiByteReplace {
            changes: vec![
                (10, b'A', b'X'),
                (20, b'B', b'Y'),
                (100, b'C', b'Z'),
            ],
        });
        let result = analyze_delta(&delta, 1000, 64, None);
        assert_eq!(result.modified_chunks.len(), 2);
    }

    // Appending 200 bytes to a chunk-aligned 64-byte file leaves no room
    // in the last chunk, so all appended data becomes new chunks.
    #[test]
    fn test_append_creates_new_chunks() {
        let delta = Delta::new(DeltaType::Append {
            data: vec![0u8; 200],
        });
        let result = analyze_delta(&delta, 64, 64, None);
        assert!(result.new_chunks.len() >= 2);
    }

    // Truncating 256 -> 100 bytes keeps chunks 0-1 (chunk 1 becomes
    // partial) and removes chunks 2 and 3.
    #[test]
    fn test_truncate_removes_chunks() {
        let delta = Delta::new(DeltaType::Truncate {
            new_length: 100,
            truncated_data: vec![0u8; 156],
        });
        let result = analyze_delta(&delta, 256, 64, None);
        assert_eq!(result.removed_chunks.len(), 2);
    }

    // In-place replacements are non-shifting; inserts/appends are not.
    #[test]
    fn test_delta_is_non_shifting() {
        assert!(DeltaType::ByteReplace {
            offset: 0,
            old_value: 0,
            new_value: 1
        }
        .is_non_shifting());
        assert!(DeltaType::RangeReplace {
            offset: 0,
            old_data: vec![0],
            new_data: vec![1]
        }
        .is_non_shifting());
        assert!(!DeltaType::Insert {
            offset: 0,
            data: vec![1]
        }
        .is_non_shifting());
        assert!(!DeltaType::Append { data: vec![1] }.is_non_shifting());
    }

    // Encoding from position 0 should round-trip through the encoder's
    // decode; spot-check the first two bytes.
    #[test]
    fn test_vsa_reencode_chunk() {
        use embeddenator_vsa::ReversibleVSAEncoder;
        let mut encoder = ReversibleVSAEncoder::new();
        let modified_data = b"Hallo";
        let chunk = vsa_delta::reencode_chunk(&mut encoder, modified_data, 0);
        let decoded = encoder.decode(&chunk, 5);
        assert_eq!(decoded[1], b'a', "Modified byte should be 'a'");
        assert_eq!(decoded[0], b'H', "First byte should be 'H'");
    }

    // Swapping chunk1 for new_chunk1 in the root should leave the root
    // positively correlated with the replacement chunk.
    #[test]
    fn test_vsa_update_root_chunk() {
        use embeddenator_vsa::ReversibleVSAEncoder;
        let mut encoder = ReversibleVSAEncoder::new();
        let chunk1_data = b"AAAA";
        let chunk2_data = b"BBBB";
        let chunk1 = encoder.encode(chunk1_data);
        let chunk2 = vsa_delta::reencode_chunk(&mut encoder, chunk2_data, 4);
        let root = chunk1.bundle(&chunk2);
        let new_chunk1_data = b"CCCC";
        let new_chunk1 = encoder.encode(new_chunk1_data);
        let new_root = vsa_delta::update_root_chunk(&root, &chunk1, &new_chunk1);
        let chunk1_sim = new_root.cosine(&new_chunk1);
        assert!(
            chunk1_sim > 0.0,
            "New chunk should have positive correlation with root"
        );
    }

    // A chunk encoded at a non-zero start position should expose its first
    // byte when probed with the matching position vector.
    #[test]
    fn test_vsa_encode_new_chunk() {
        use embeddenator_vsa::ReversibleVSAEncoder;
        let mut encoder = ReversibleVSAEncoder::new();
        let data = b"New";
        let chunk = vsa_delta::encode_new_chunk(&mut encoder, data, 64);
        let self_sim = chunk.cosine(&chunk);
        assert!(self_sim > 0.99, "New chunk should have valid structure");
        let pos_vec_64 = encoder.get_position_vector_ref(64);
        let query = chunk.bind(pos_vec_64);
        let byte_vectors = encoder.get_byte_vectors();
        // Nearest-neighbour search over all byte vectors.
        let mut best_byte = 0u8;
        let mut best_sim = f64::NEG_INFINITY;
        for (i, bv) in byte_vectors.iter().enumerate() {
            let sim = query.cosine(bv);
            if sim > best_sim {
                best_sim = sim;
                best_byte = i as u8;
            }
        }
        assert_eq!(best_byte, b'N', "First byte at position 64 should be 'N'");
    }

    // Bundling two chunks into a root should keep both recoverable by
    // similarity.
    #[test]
    fn test_vsa_add_chunk_to_root() {
        use embeddenator_vsa::ReversibleVSAEncoder;
        let mut encoder = ReversibleVSAEncoder::new();
        let chunk1 = encoder.encode(b"Hello");
        let chunk2 = vsa_delta::encode_new_chunk(&mut encoder, b"World", 5);
        let root = vsa_delta::add_chunk_to_root(&chunk1, &chunk2);
        let sim1 = root.cosine(&chunk1);
        let sim2 = root.cosine(&chunk2);
        assert!(sim1 > 0.0, "Root should correlate with first chunk");
        assert!(sim2 > 0.0, "Root should correlate with second chunk");
    }
}