use std::convert::TryInto;
use fnv::FnvHashMap;
const DEBUG: bool = false;
const STATS: bool = false;
type ForkID = u32;
type NodeID = u32;
type Position = u32;
const POSITION_BYTES: usize = 4;
#[derive(Debug)]
pub enum DiffError {
InvalidScanShift,
TooBigInput,
TooBigOutput,
TooManyForks,
TooManyNodes,
}
impl std::fmt::Display for DiffError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}", self)
}
}
impl std::error::Error for DiffError {}
impl From<DiffError> for String {
fn from(error: DiffError) -> Self {
format!("{}", error)
}
}
pub struct State {
nodes: Vec<Node>,
forks: Vec<Fork>,
allocated_forks: ForkID,
}
impl State {
pub fn new() -> State {
let mut state = State { nodes: vec![], forks: vec![], allocated_forks: 0 };
state.allocate_node(Node::Location(0)).unwrap();
state
}
fn allocate_fork(&mut self, first_location: Position, fork_at_depth: Position) -> Result<ForkID, DiffError> {
const CHUNK_SIZE: usize = 1024;
if self.allocated_forks % CHUNK_SIZE as Position == 0 {
if self.allocated_forks > 2u32.pow(30) {
return Err(DiffError::TooManyForks);
};
let node = Fork { first_location: 0, fork_at_depth: 0, tines: NodeHash::new() };
self.forks.resize((self.allocated_forks as usize / CHUNK_SIZE + 1) * CHUNK_SIZE, node);
};
self.forks[self.allocated_forks as usize].first_location = first_location;
self.forks[self.allocated_forks as usize].fork_at_depth = fork_at_depth;
self.allocated_forks += 1;
Ok(self.allocated_forks - 1)
}
fn allocate_node(&mut self, node: Node) -> Result<NodeID, DiffError> {
self.nodes.push(node);
let len = self.nodes.len() as NodeID;
if len > 2u32.pow(31) {
return Err(DiffError::TooManyNodes);
};
Ok((len - 1) as NodeID)
}
#[inline(always)]
fn longest_match(&self, old_data: &[u8], new_data: &[u8], search_start_location: Position) -> (Position, Position) {
let mut node_id = 0;
let mut matching_depth = 0;
loop {
match self.nodes[node_id as usize] {
Node::Location(location) => {
let deepest_posible_match = std::cmp::min((old_data.len() as Position) - location, (new_data.len() as Position) - search_start_location);
while matching_depth < deepest_posible_match
&& old_data[(location + matching_depth) as usize] == new_data[(search_start_location + matching_depth) as usize]
{
matching_depth += 1;
}
return (location, matching_depth);
}
Node::Fork(fork_id) => {
let fork = &self.forks[fork_id as usize];
let deepest_posible_match =
std::cmp::min((old_data.len() as Position) - fork.first_location, (new_data.len() as Position) - search_start_location);
while matching_depth < fork.fork_at_depth
&& matching_depth < deepest_posible_match
&& old_data[(fork.first_location + matching_depth) as usize] == new_data[(search_start_location + matching_depth) as usize]
{
matching_depth += 1;
}
if matching_depth == fork.fork_at_depth && matching_depth < deepest_posible_match.saturating_sub(3) {
if let Some(new_node_id) = fork.tines.get(symbol_at(new_data, search_start_location + matching_depth)) {
matching_depth += POSITION_BYTES as Position;
node_id = new_node_id;
} else {
return (fork.first_location, matching_depth);
}
} else {
return (fork.first_location, matching_depth);
}
}
}
}
}
pub fn diff(&self, old_data: &[u8], new_data: &[u8], scan_shift: Position, cursors_max_count: usize) -> Result<(Vec<u8>, State), DiffError> {
if scan_shift == 0 || scan_shift % POSITION_BYTES as Position != 0 || scan_shift > 1024 * 2024 {
return Err(DiffError::InvalidScanShift);
};
if old_data.len() > (Position::MAX as usize - POSITION_BYTES) {
return Err(DiffError::TooBigInput);
};
if new_data.len() > (Position::MAX as usize - POSITION_BYTES) {
return Err(DiffError::TooBigInput);
};
let mut cursors: Vec<TreeCursor> = Vec::with_capacity(cursors_max_count);
let mut new_state = State::new();
let mut max_cursor_count_encountered = 0;
let mut pick_next_escape_after_location = 0;
let mut output = vec![];
let mut escape_byte = 255;
let mut next_output_symbol_at_index: Position = 0;
for index in 0..new_data.len() as Position {
if index == next_output_symbol_at_index {
let (old_data_match_position_absolute, old_data_match_depth) = self.longest_match(old_data, new_data, index as Position);
let old_data_match_position = if old_data_match_position_absolute <= index {
(index - old_data_match_position_absolute) * 2
} else {
(old_data_match_position_absolute - index - 1) * 2 + 1
};
let bytes_saved_on_old_data_reference = old_data_match_depth as i64
- (1 + Self::encoding_length(old_data_match_position) as i64 + Self::encoding_length(old_data_match_depth) as i64);
let (new_data_match_position, new_data_match_depth, bytes_saved_on_new_data_reference) = if index > 0 {
let (new_data_match_position_absolute, new_data_match_depth) = new_state.longest_match(new_data, new_data, index as Position);
let new_data_match_position = index - new_data_match_position_absolute;
let bytes_saved_on_new_data_reference = new_data_match_depth as i64
- (1 + Self::encoding_length(new_data_match_position) as i64 + Self::encoding_length(new_data_match_depth) as i64);
assert_ne!(new_data_match_position_absolute, index);
(new_data_match_position, new_data_match_depth, bytes_saved_on_new_data_reference)
} else {
(0, 0, 0)
};
if bytes_saved_on_old_data_reference > 0 || bytes_saved_on_new_data_reference > 0 {
if bytes_saved_on_old_data_reference > bytes_saved_on_new_data_reference {
Self::push_chunk(&mut output, escape_byte, true, old_data_match_position, old_data_match_depth);
next_output_symbol_at_index += old_data_match_depth;
} else {
Self::push_chunk(&mut output, escape_byte, false, new_data_match_position, new_data_match_depth);
next_output_symbol_at_index += new_data_match_depth;
}
} else {
if DEBUG {
println!("{}", new_data[index as usize] as char);
}
if new_data[index as usize] == escape_byte {
output.push(escape_byte);
if index < pick_next_escape_after_location {
output.push(escape_byte);
} else {
let (new_escape, new_pick_nex_escape_after_location) = Self::pick_escape_byte(escape_byte, new_data, index);
if new_escape == escape_byte {
output.push(255);
} else {
output.push(127);
output.push(new_escape);
escape_byte = new_escape;
pick_next_escape_after_location = new_pick_nex_escape_after_location;
}
}
} else {
output.push(new_data[index as usize]);
};
next_output_symbol_at_index += 1;
}
}
if output.len() > (Position::MAX as usize - POSITION_BYTES) {
return Err(DiffError::TooBigOutput);
}
if (index + POSITION_BYTES as Position) < new_data.len() as Position && index % (POSITION_BYTES as Position) == 0 {
let symbol = symbol_at(new_data, index);
if DEBUG {
println!("-------------------------------------------------------------- {} - {:?}", index, symbol_to_str(symbol));
};
let mut i = 0;
let mut len = cursors.len();
while i < len {
if DEBUG {
println!("{}", cursors[i].inspect());
};
let depth = index - cursors[i].location;
if cursors[i].push(&mut new_state, depth, new_data, symbol)? {
i += 1;
} else {
len -= 1;
cursors.swap_remove(i);
};
}
if index % scan_shift == 0 {
if cursors.len() == cursors_max_count {
let mut deepest_cursor_index = None;
let mut deepest_cursor_location = Position::MAX;
for (i, cursor) in cursors.iter().enumerate() {
if cursor.location < deepest_cursor_location {
deepest_cursor_location = cursor.location;
deepest_cursor_index = Some(i as Position);
}
}
cursors.swap_remove(deepest_cursor_index.unwrap() as usize);
}
let mut new_cursor = TreeCursor { location: index, node_id: 0 };
if new_cursor.push(&mut new_state, index - new_cursor.location, new_data, symbol)? {
cursors.push(new_cursor);
}
}
if DEBUG {
println!("---------------------------------");
for cursor in cursors.iter() {
println!("{}", cursor.inspect());
}
}
if STATS && cursors.len() > max_cursor_count_encountered {
max_cursor_count_encountered = cursors.len()
}
}
}
if STATS {
println!("\n\nOutput length: {}", output.len());
println!("Max cursors count: {}", max_cursor_count_encountered);
println!("Allocated forks count: {}", new_state.allocated_forks);
println!("Allocated nodes count: {}", new_state.nodes.len());
}
Ok((output, new_state))
}
fn pick_escape_byte(current: u8, data: &[u8], start_at: Position) -> (u8, Position) {
let mut occurences: [bool; 256] = [false; 256];
let till = std::cmp::min(start_at + 255, data.len() as u32);
for i in start_at..till {
occurences[data[i as usize] as usize] = true;
}
if occurences[current as usize] {
for (i, occurs) in occurences.iter().enumerate() {
if !occurs {
return (i as u8, start_at + 255);
}
}
unreachable!();
} else {
(current, start_at + 255)
}
}
fn encoding_length(value: Position) -> usize {
if value < 124 {
1
} else if value < 2u32.pow(16) {
3
} else if value < 2u32.pow(24) {
4
} else {
5
}
}
fn push_chunk(output: &mut Vec<u8>, escape: u8, use_data1: bool, location: Position, length: Position) {
output.push(escape);
let use_data1_bit: u8 = if use_data1 { 128 } else { 0 };
if location < 124 {
output.push(location as u8 + use_data1_bit);
} else if location < 2u32.pow(16) {
output.push(126 + use_data1_bit);
output.extend_from_slice(&location.to_be_bytes()[2..4]);
} else if location < 2u32.pow(24) {
output.push(125 + use_data1_bit);
output.extend_from_slice(&location.to_be_bytes()[1..4]);
} else {
output.push(124 + use_data1_bit);
output.extend_from_slice(&location.to_be_bytes());
};
if length < 253 {
output.push(length as u8);
} else if length < 2u32.pow(16) {
output.push(255);
output.extend_from_slice(&length.to_be_bytes()[2..4]);
} else if length < 2u32.pow(24) {
output.push(254);
output.extend_from_slice(&length.to_be_bytes()[1..4]);
} else {
output.push(253);
output.extend_from_slice(&length.to_be_bytes());
};
}
}
pub fn patch(old_data: &[u8], diff: &[u8]) -> Result<Vec<u8>, DiffError> {
let mut escape_byte = 0xff;
let mut output = vec![];
let mut index = 0;
while index < diff.len() {
if diff[index] == escape_byte {
index += 1;
let reference_is_old_data = diff[index] / 128 == 1;
let (position_bytes, position_bytes_count) = match diff[index] {
127 => {
output.push(escape_byte);
escape_byte = diff[index + 1];
index += 2;
continue;
}
255 => {
output.push(escape_byte);
index += 1;
continue;
}
124 | 252 => ([diff[index + 1], diff[index + 2], diff[index + 3], diff[index + 4]], 5),
125 | 253 => ([0, diff[index + 1], diff[index + 2], diff[index + 3]], 4),
126 | 254 => ([0, 0, diff[index + 1], diff[index + 2]], 3),
_ => ([0, 0, 0, diff[index] - if reference_is_old_data { 128 } else { 0 }], 1),
};
index += position_bytes_count;
let (length_bytes, consume) = match diff[index] {
253 => ([diff[index + 1], diff[index + 2], diff[index + 3], diff[index + 4]], 5),
254 => ([0, diff[index + 1], diff[index + 2], diff[index + 3]], 4),
255 => ([0, 0, diff[index + 1], diff[index + 2]], 3),
_ => ([0, 0, 0, diff[index]], 1),
};
index += consume;
let position = Position::from_be_bytes(position_bytes); let length = Position::from_be_bytes(length_bytes);
let reference_position = output.len();
if reference_is_old_data {
if position % 2 == 0 {
output.extend_from_slice(
&old_data[(reference_position - position as usize / 2)..(reference_position - position as usize / 2 + length as usize)],
)
} else {
output.extend_from_slice(
&old_data[(reference_position + position as usize / 2 + 1)..(reference_position + position as usize / 2 + length as usize + 1)],
)
}
} else {
for i in (reference_position - position as usize)..(reference_position - position as usize + length as usize) {
output.push(output[i]);
}
}
} else {
output.push(diff[index]);
index += 1;
}
}
Ok(output)
}
#[derive(Debug, Clone)]
pub struct NodeHash {
tines: FnvHashMap<u32, NodeID>,
}
impl NodeHash {
fn new() -> NodeHash {
NodeHash { tines: FnvHashMap::default() }
}
fn get(&self, symbol: u32) -> Option<NodeID> {
self.tines.get(&symbol).cloned()
}
fn set(&mut self, symbol: u32, node_id: NodeID) {
self.tines.insert(symbol, node_id);
}
fn iter(&self) -> impl Iterator<Item = (&u32, &NodeID)> {
self.tines.iter()
}
}
#[derive(Debug, Copy, Clone)]
enum Node {
Location(Position),
Fork(ForkID),
}
#[derive(Debug, Clone)]
struct Fork {
first_location: Position,
fork_at_depth: Position,
tines: NodeHash,
}
impl Fork {
#[allow(dead_code)]
fn inspect(&self, tree: &State, symbol: u32, depth: Position) -> String {
let mut ret = "\t".repeat(depth as usize);
ret += &format!("{:?} {} @ {}\n", symbol_to_str(symbol), self.fork_at_depth, self.first_location);
for (symbol, node_id) in self.tines.iter() {
ret += &match &tree.nodes[*node_id as usize] {
Node::Location(location) => {
format!("{}{:?} @ {}\n", "\t".repeat(depth as usize + 1), symbol_to_str(*symbol), location)
}
Node::Fork(fork_id) => tree.forks[*fork_id as usize].inspect(tree, *symbol, depth + 1),
}
}
ret
}
}
#[derive(Debug)]
struct TreeCursor {
location: Position,
node_id: NodeID,
}
fn symbol_at(input: &[u8], location: Position) -> u32 {
u32::from_be_bytes(input[location as usize..(location as usize + POSITION_BYTES)].try_into().unwrap())
}
fn symbol_to_str(symbol: u32) -> String {
let bytes: [u8; POSITION_BYTES] = symbol.to_ne_bytes();
String::from_utf8_lossy(&bytes).into_owned()
}
impl TreeCursor {
fn push(&mut self, tree: &mut State, depth: Position, input: &[u8], symbol: u32) -> Result<bool, DiffError> {
match tree.nodes[self.node_id as usize] {
Node::Location(location) =>
if symbol_at(input, location + depth) == symbol {
if DEBUG {
println!("location-continue");
};
Ok(true)
} else {
if DEBUG {
println!("location-slice");
};
let old_branch_node_id = tree.allocate_node(Node::Location(location))?;
let new_branch_node_id = tree.allocate_node(Node::Location(self.location))?;
let new_common_fork_id = tree.allocate_fork(location, depth)?;
let new_common_fork = &mut tree.forks[new_common_fork_id as usize];
let old_symbol_at_fork = symbol_at(input, new_common_fork.first_location + new_common_fork.fork_at_depth);
new_common_fork.tines.set(old_symbol_at_fork, old_branch_node_id);
new_common_fork.tines.set(symbol, new_branch_node_id);
tree.nodes[self.node_id as usize] = Node::Fork(new_common_fork_id);
Ok(false)
},
Node::Fork(fork_id) => {
let fork_at_depth = tree.forks[fork_id as usize].fork_at_depth;
match depth.cmp(&fork_at_depth) {
std::cmp::Ordering::Less =>
if symbol_at(input, tree.forks[fork_id as usize].first_location + depth) == symbol {
if DEBUG {
println!("continue");
};
Ok(true)
} else {
if DEBUG {
println!("slice");
};
let old_branch_node_id = tree.allocate_node(Node::Fork(fork_id))?;
let new_branch_node_id = tree.allocate_node(Node::Location(self.location))?;
let new_common_fork_id = tree.allocate_fork(tree.forks[fork_id as usize].first_location, depth)?;
let new_common_fork = &mut tree.forks[new_common_fork_id as usize];
let old_symbol_at_fork = symbol_at(input, new_common_fork.first_location + new_common_fork.fork_at_depth);
new_common_fork.tines.set(old_symbol_at_fork, old_branch_node_id);
new_common_fork.tines.set(symbol, new_branch_node_id);
tree.nodes[self.node_id as usize] = Node::Fork(new_common_fork_id);
Ok(false)
},
std::cmp::Ordering::Equal =>
if let Some(new_node_id) = tree.forks[fork_id as usize].tines.get(symbol) {
if DEBUG {
println!("forward");
};
self.node_id = new_node_id;
Ok(true)
} else {
if DEBUG {
println!("fork");
};
let new_node_id = tree.allocate_node(Node::Location(self.location))?;
tree.forks[fork_id as usize].tines.set(symbol, new_node_id);
Ok(false)
},
std::cmp::Ordering::Greater => {
if DEBUG {
println!("beyond");
};
let node = &mut tree.forks[fork_id as usize];
let symbol_at_fork = symbol_at(input, node.first_location + node.fork_at_depth);
let new_node_id = node.tines.get(symbol_at_fork).unwrap();
self.node_id = new_node_id;
self.push(tree, depth, input, symbol)
}
}
}
}
}
fn inspect(&self) -> String {
format!("Cursor: location:{:?} node_id:{:?}", self.location, self.node_id)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn can_diff_and_patch_select_data_sets() {
let data_sets: Vec<Vec<Vec<u8>>> = vec![
vec![b"abc1abc2ab3".to_vec()],
vec![vec![141, 98, 73, 247, 16, 37, 141, 225, 200, 168, 58, 195, 113, 218, 78, 232, 36, 240, 81, 168]],
vec![vec![124, 194, 53, 107, 161, 53]],
vec![vec![14, 207, 135, 86, 14]],
vec![vec![45, 100, 45, 100, 45, 36, 152, 182, 116, 102]],
vec![vec![206, 131, 206, 131, 206, 179, 122]],
vec![b"ababacd".to_vec()],
vec![b"abcdef".to_vec()],
vec![vec![
116, 112, 118, 98, 101, 107, 120, 107, 116, 117, 104, 119, 112, 105, 120, 110, 107, 106, 118, 108, 122, 107, 119, 113, 101, 120, 107, 116, 115,
115, 108, 100, 111, 112, 109, 97, 109, 120, 107, 107, 104, 111, 111, 101, 97, 109, 117, 99, 115, 110, 101, 121, 106, 115, 103, 120, 109, 101,
102, 101, 107, 101, 103, 113, 101, 117, 117, 108, 123, 104, 105, 108, 105, 120, 110, 102, 98, 103, 102, 112, 110, 97, 99, 117, 102, 120, 112,
114, 98, 108, 115, 118, 117, 103, 97, 112, 120, 107, 116, 104,
]],
vec![b"abcdaabcababcd".to_vec()],
vec![b"abcdaabcaba".to_vec()],
vec![b"abcdaabcababcd".to_vec(), b"#abcdaab#cababcd##".to_vec()],
vec![b"aaaabbbbcccc1111aaaabbbbcccc2222aaaabbbb3333".to_vec(), b"aaaabbbbcccc1111aaaab##bcccc2222aaaabbbb3333".to_vec()],
vec![b"a".to_vec(), b"a".to_vec(), b"a".to_vec(), b"a".to_vec(), b"a".to_vec()],
vec![b"a".to_vec(), b"b".to_vec(), b"c".to_vec(), b"d".to_vec(), b"e".to_vec()],
vec![(0..=255).into_iter().collect::<Vec<u8>>().repeat(5)],
];
for data_set in data_sets {
let mut state = State::new();
let mut old_data = vec![];
for new_data in data_set {
let (diff, new_state) = state.diff(&old_data, &new_data, 8, 6).unwrap();
let patched = patch(&old_data, &diff).unwrap();
assert_eq!(&patched, &new_data);
old_data = patched;
state = new_state;
}
}
}
#[test]
fn can_diff_and_patch_random_data_sets_of_fixed_length() {
let mut state = State::new();
let mut old_data = vec![];
for _ in 0..10 {
let new_data: Vec<u8> =
if cfg!(debug_assertions) { (0..1000).map(|_| rand::random::<u8>()).collect() } else { (0..1000000).map(|_| rand::random::<u8>()).collect() };
let (diff, new_state) = state.diff(&old_data, &new_data, 8, 6).unwrap();
let patched = patch(&old_data, &diff).unwrap();
assert_eq!(&patched, &new_data);
old_data = patched;
state = new_state;
}
}
#[test]
fn can_diff_and_patch_random_data_sets_of_varying_length() {
let mut state = State::new();
let mut old_data = vec![];
for _ in 0..10 {
let length = if cfg!(debug_assertions) { rand::random::<usize>() % 1000 } else { rand::random::<usize>() % 1000000 };
let new_data: Vec<u8> = (0..length).map(|_| rand::random::<u8>()).collect();
let (diff, new_state) = state.diff(&old_data, &new_data, 8, 6).unwrap();
let patched = patch(&old_data, &diff).unwrap();
assert_eq!(&patched, &new_data);
old_data = patched;
state = new_state;
}
}
#[test]
fn can_diff_and_patch_data_sets_contstructed_by_copy_pasting_from_the_origin() {
let mut state = State::new();
let mut old_data = vec![];
for _ in 0..10 {
let new_data: Vec<u8> = if old_data.is_empty() {
if cfg!(debug_assertions) {
(0..100).map(|_| 97 + rand::random::<u8>() % 25).collect()
} else {
(0..1000000).map(|_| rand::random::<u8>()).collect()
}
} else {
let mut ret = old_data.clone();
for _ in 0..100 {
let locations = (rand::random::<usize>() % old_data.len(), rand::random::<usize>() % old_data.len());
let from = std::cmp::min(locations.0, locations.1);
let to = std::cmp::max(locations.0, locations.1);
let target = rand::random::<usize>() % (old_data.len() - (to - from));
for i in from..to {
ret[target + i - from] = old_data[i];
}
}
ret
};
let (diff, new_state) = state.diff(&old_data, &new_data, 8, 6).unwrap();
let patched = patch(&old_data, &diff).unwrap();
assert_eq!(&patched, &new_data);
old_data = patched;
state = new_state;
}
}
#[test]
#[ignore]
fn can_diff_and_patch_wiki_titles() {
let old_data = vec![];
let new_data = std::fs::read("test-data/enwiki-latest-all-titles-in-ns0").unwrap();
let state = State::new();
let (diff, _new_state) = state.diff(&old_data, &new_data, 8, 6).unwrap();
println!("{}", diff.len());
let patched = patch(&old_data, &diff).unwrap();
assert_eq!(&patched, &new_data);
}
}