hakuban 0.7.2 - Docs.rs

use std::convert::TryInto;

use fnv::FnvHashMap;

/*
This is basically a working draft.
It's slow and memory hungry. There is probably some lib out there which can do it well, but I haven't found it yet.
It's basically LZ77, with dictionary initialized by the "old" data.
The dictionary is kept for **entirety** of the old data. There is no moving window.
Code simplicity was a goal. Especially "patch" simplicity. It turned out much more complex than I wanted, but at 500loc it's still managable.
There is deliberately no explicit binary arithmetics like & and |. (There are multiplications and divisions for shifting though, meh).
The code is tuned for diffing, not compression, but it does back-refer to "new" data too, so it **is** a compressor too. Just shitty one.
Another goal was to have no quadratic complexity wrt input/output size, with any content. I have no idea if that goal has been reached.

TO-DO:
	* clean up
	* handle self repeating structures properly, not with cursor count limit
	* evolve best hash function
*/


const DEBUG: bool = false;
const STATS: bool = false;

type ForkID = u32;
type NodeID = u32;
type Position = u32;
const POSITION_BYTES: usize = 4;


#[derive(Debug)]
pub enum DiffError {
	InvalidScanShift,
	TooBigInput,
	TooBigOutput,
	TooManyForks,
	TooManyNodes,
}

impl std::fmt::Display for DiffError {
	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
		write!(f, "{:?}", self)
	}
}

impl std::error::Error for DiffError {}


impl From<DiffError> for String {
	fn from(error: DiffError) -> Self {
		format!("{}", error)
	}
}

pub struct State {
	nodes: Vec<Node>,
	forks: Vec<Fork>,
	allocated_forks: ForkID,
}

impl State {
	pub fn new() -> State {
		let mut state = State { nodes: vec![], forks: vec![], allocated_forks: 0 };
		state.allocate_node(Node::Location(0)).unwrap();
		state
	}

	fn allocate_fork(&mut self, first_location: Position, fork_at_depth: Position) -> Result<ForkID, DiffError> {
		const CHUNK_SIZE: usize = 1024;
		if self.allocated_forks % CHUNK_SIZE as Position == 0 {
			if self.allocated_forks > 2u32.pow(30) {
				return Err(DiffError::TooManyForks);
			};
			let node = Fork { first_location: 0, fork_at_depth: 0, tines: NodeHash::new() };
			self.forks.resize((self.allocated_forks as usize / CHUNK_SIZE + 1) * CHUNK_SIZE, node);
		};
		self.forks[self.allocated_forks as usize].first_location = first_location;
		self.forks[self.allocated_forks as usize].fork_at_depth = fork_at_depth;
		self.allocated_forks += 1;
		Ok(self.allocated_forks - 1)
	}

	fn allocate_node(&mut self, node: Node) -> Result<NodeID, DiffError> {
		self.nodes.push(node);
		let len = self.nodes.len() as NodeID;
		if len > 2u32.pow(31) {
			return Err(DiffError::TooManyNodes);
		};
		Ok((len - 1) as NodeID)
	}

	// this inline gives 10% of execution time reduction
	#[inline(always)]
	fn longest_match(&self, old_data: &[u8], new_data: &[u8], search_start_location: Position) -> (Position, Position) {
		let mut node_id = 0;
		let mut matching_depth = 0;
		loop {
			match self.nodes[node_id as usize] {
				Node::Location(location) => {
					let deepest_posible_match = std::cmp::min((old_data.len() as Position) - location, (new_data.len() as Position) - search_start_location);
					while matching_depth < deepest_posible_match
						&& old_data[(location + matching_depth) as usize] == new_data[(search_start_location + matching_depth) as usize]
					{
						matching_depth += 1;
					}
					return (location, matching_depth);
				}
				Node::Fork(fork_id) => {
					let fork = &self.forks[fork_id as usize];
					let deepest_posible_match =
						std::cmp::min((old_data.len() as Position) - fork.first_location, (new_data.len() as Position) - search_start_location);
					while matching_depth < fork.fork_at_depth
						&& matching_depth < deepest_posible_match
						&& old_data[(fork.first_location + matching_depth) as usize] == new_data[(search_start_location + matching_depth) as usize]
					{
						matching_depth += 1;
					}
					if matching_depth == fork.fork_at_depth && matching_depth < deepest_posible_match.saturating_sub(3) {
						if let Some(new_node_id) = fork.tines.get(symbol_at(new_data, search_start_location + matching_depth)) {
							matching_depth += POSITION_BYTES as Position;
							node_id = new_node_id;
						} else {
							return (fork.first_location, matching_depth);
						}
					} else {
						return (fork.first_location, matching_depth);
					}
				}
			}
		}
	}

	pub fn diff(&self, old_data: &[u8], new_data: &[u8], scan_shift: Position, cursors_max_count: usize) -> Result<(Vec<u8>, State), DiffError> {
		if scan_shift == 0 || scan_shift % POSITION_BYTES as Position != 0 || scan_shift > 1024 * 2024 {
			return Err(DiffError::InvalidScanShift);
		};
		if old_data.len() > (Position::MAX as usize - POSITION_BYTES) {
			return Err(DiffError::TooBigInput);
		};
		if new_data.len() > (Position::MAX as usize - POSITION_BYTES) {
			return Err(DiffError::TooBigInput);
		};

		let mut cursors: Vec<TreeCursor> = Vec::with_capacity(cursors_max_count);
		let mut new_state = State::new();

		let mut max_cursor_count_encountered = 0;
		let mut pick_next_escape_after_location = 0;

		let mut output = vec![];
		let mut escape_byte = 255;
		let mut next_output_symbol_at_index: Position = 0;

		for index in 0..new_data.len() as Position {
			if index == next_output_symbol_at_index {
				let (old_data_match_position_absolute, old_data_match_depth) = self.longest_match(old_data, new_data, index as Position);
				let old_data_match_position = if old_data_match_position_absolute <= index {
					(index - old_data_match_position_absolute) * 2
				} else {
					(old_data_match_position_absolute - index - 1) * 2 + 1
				};
				let bytes_saved_on_old_data_reference = old_data_match_depth as i64
					- (1 + Self::encoding_length(old_data_match_position) as i64 + Self::encoding_length(old_data_match_depth) as i64);
				let (new_data_match_position, new_data_match_depth, bytes_saved_on_new_data_reference) = if index > 0 {
					let (new_data_match_position_absolute, new_data_match_depth) = new_state.longest_match(new_data, new_data, index as Position);
					let new_data_match_position = index - new_data_match_position_absolute;
					let bytes_saved_on_new_data_reference = new_data_match_depth as i64
						- (1 + Self::encoding_length(new_data_match_position) as i64 + Self::encoding_length(new_data_match_depth) as i64);
					assert_ne!(new_data_match_position_absolute, index);
					(new_data_match_position, new_data_match_depth, bytes_saved_on_new_data_reference)
				} else {
					(0, 0, 0)
				};
				if bytes_saved_on_old_data_reference > 0 || bytes_saved_on_new_data_reference > 0 {
					if bytes_saved_on_old_data_reference > bytes_saved_on_new_data_reference {
						Self::push_chunk(&mut output, escape_byte, true, old_data_match_position, old_data_match_depth);
						next_output_symbol_at_index += old_data_match_depth;
					} else {
						Self::push_chunk(&mut output, escape_byte, false, new_data_match_position, new_data_match_depth);
						next_output_symbol_at_index += new_data_match_depth;
					}
				} else {
					if DEBUG {
						println!("{}", new_data[index as usize] as char);
					}
					if new_data[index as usize] == escape_byte {
						output.push(escape_byte);
						if index < pick_next_escape_after_location {
							output.push(escape_byte);
						} else {
							let (new_escape, new_pick_nex_escape_after_location) = Self::pick_escape_byte(escape_byte, new_data, index);
							if new_escape == escape_byte {
								output.push(255);
							} else {
								output.push(127);
								output.push(new_escape);
								escape_byte = new_escape;
								pick_next_escape_after_location = new_pick_nex_escape_after_location;
							}
						}
					} else {
						output.push(new_data[index as usize]);
					};
					next_output_symbol_at_index += 1;
				}
			}

			if output.len() > (Position::MAX as usize - POSITION_BYTES) {
				return Err(DiffError::TooBigOutput);
			}

			if (index + POSITION_BYTES as Position) < new_data.len() as Position && index % (POSITION_BYTES as Position) == 0 {
				let symbol = symbol_at(new_data, index);
				if DEBUG {
					println!("-------------------------------------------------------------- {} - {:?}", index, symbol_to_str(symbol));
				};
				let mut i = 0;
				let mut len = cursors.len();
				//TODO: check if keeping vec always sorted speeds things up
				while i < len {
					if DEBUG {
						println!("{}", cursors[i].inspect());
					};
					let depth = index - cursors[i].location;
					if cursors[i].push(&mut new_state, depth, new_data, symbol)? {
						i += 1;
					} else {
						len -= 1;
						cursors.swap_remove(i);
					};
				}
				if index % scan_shift == 0 {
					if cursors.len() == cursors_max_count {
						let mut deepest_cursor_index = None;
						let mut deepest_cursor_location = Position::MAX;
						for (i, cursor) in cursors.iter().enumerate() {
							if cursor.location < deepest_cursor_location {
								deepest_cursor_location = cursor.location;
								deepest_cursor_index = Some(i as Position);
							}
						}
						//println!("removing {:?}", deepest_cursor_index);
						cursors.swap_remove(deepest_cursor_index.unwrap() as usize);
					}
					let mut new_cursor = TreeCursor { location: index, node_id: 0 };
					if new_cursor.push(&mut new_state, index - new_cursor.location, new_data, symbol)? {
						cursors.push(new_cursor);
					}
				}
				if DEBUG {
					println!("---------------------------------");
					for cursor in cursors.iter() {
						println!("{}", cursor.inspect());
					}
				}
				if STATS && cursors.len() > max_cursor_count_encountered {
					max_cursor_count_encountered = cursors.len()
				}
			}
		}


		if STATS {
			//println!("\n\n{:?}", output);
			println!("\n\nOutput length: {}", output.len());
			println!("Max cursors count: {}", max_cursor_count_encountered);
			println!("Allocated forks count: {}", new_state.allocated_forks);
			println!("Allocated nodes count: {}", new_state.nodes.len());
		}

		Ok((output, new_state))
	}

	fn pick_escape_byte(current: u8, data: &[u8], start_at: Position) -> (u8, Position) {
		let mut occurences: [bool; 256] = [false; 256];
		let till = std::cmp::min(start_at + 255, data.len() as u32);
		for i in start_at..till {
			occurences[data[i as usize] as usize] = true;
		}
		if occurences[current as usize] {
			for (i, occurs) in occurences.iter().enumerate() {
				if !occurs {
					return (i as u8, start_at + 255);
				}
			}
			unreachable!();
		} else {
			(current, start_at + 255)
		}
	}

	fn encoding_length(value: Position) -> usize {
		if value < 124 {
			1
		} else if value < 2u32.pow(16) {
			3
		} else if value < 2u32.pow(24) {
			4
		} else {
			5
		}
	}

	fn push_chunk(output: &mut Vec<u8>, escape: u8, use_data1: bool, location: Position, length: Position) {
		output.push(escape);
		let use_data1_bit: u8 = if use_data1 { 128 } else { 0 };
		if location < 124 {
			output.push(location as u8 + use_data1_bit);
		} else if location < 2u32.pow(16) {
			output.push(126 + use_data1_bit);
			output.extend_from_slice(&location.to_be_bytes()[2..4]);
		} else if location < 2u32.pow(24) {
			output.push(125 + use_data1_bit);
			output.extend_from_slice(&location.to_be_bytes()[1..4]);
		} else {
			output.push(124 + use_data1_bit);
			output.extend_from_slice(&location.to_be_bytes());
		};
		if length < 253 {
			output.push(length as u8);
		} else if length < 2u32.pow(16) {
			output.push(255);
			output.extend_from_slice(&length.to_be_bytes()[2..4]);
		} else if length < 2u32.pow(24) {
			output.push(254);
			output.extend_from_slice(&length.to_be_bytes()[1..4]);
		} else {
			output.push(253);
			output.extend_from_slice(&length.to_be_bytes());
		};
	}
}


pub fn patch(old_data: &[u8], diff: &[u8]) -> Result<Vec<u8>, DiffError> {
	let mut escape_byte = 0xff;
	let mut output = vec![];
	let mut index = 0;
	while index < diff.len() {
		if diff[index] == escape_byte {
			index += 1;
			let reference_is_old_data = diff[index] / 128 == 1;
			let (position_bytes, position_bytes_count) = match diff[index] {
				127 => {
					output.push(escape_byte);
					escape_byte = diff[index + 1];
					index += 2;
					continue;
				}
				255 => {
					output.push(escape_byte);
					index += 1;
					continue;
				}
				124 | 252 => ([diff[index + 1], diff[index + 2], diff[index + 3], diff[index + 4]], 5),
				125 | 253 => ([0, diff[index + 1], diff[index + 2], diff[index + 3]], 4),
				126 | 254 => ([0, 0, diff[index + 1], diff[index + 2]], 3),
				_ => ([0, 0, 0, diff[index] - if reference_is_old_data { 128 } else { 0 }], 1),
			};
			index += position_bytes_count;
			let (length_bytes, consume) = match diff[index] {
				253 => ([diff[index + 1], diff[index + 2], diff[index + 3], diff[index + 4]], 5),
				254 => ([0, diff[index + 1], diff[index + 2], diff[index + 3]], 4),
				255 => ([0, 0, diff[index + 1], diff[index + 2]], 3),
				_ => ([0, 0, 0, diff[index]], 1),
			};
			index += consume;
			let position = Position::from_be_bytes(position_bytes); //- if reference_is_data1 { 128 } else { 0 };
			let length = Position::from_be_bytes(length_bytes);
			let reference_position = output.len();
			if reference_is_old_data {
				if position % 2 == 0 {
					output.extend_from_slice(
						&old_data[(reference_position - position as usize / 2)..(reference_position - position as usize / 2 + length as usize)],
					)
				} else {
					output.extend_from_slice(
						&old_data[(reference_position + position as usize / 2 + 1)..(reference_position + position as usize / 2 + length as usize + 1)],
					)
				}
			} else {
				//TODO: extend_from_within
				for i in (reference_position - position as usize)..(reference_position - position as usize + length as usize) {
					output.push(output[i]);
				}
			}
		} else {
			output.push(diff[index]);
			index += 1;
		}
	}
	Ok(output)
}


#[derive(Debug, Clone)]
pub struct NodeHash {
	tines: FnvHashMap<u32, NodeID>,
}

impl NodeHash {
	fn new() -> NodeHash {
		NodeHash { tines: FnvHashMap::default() }
	}

	fn get(&self, symbol: u32) -> Option<NodeID> {
		self.tines.get(&symbol).cloned()
	}

	fn set(&mut self, symbol: u32, node_id: NodeID) {
		self.tines.insert(symbol, node_id);
	}

	fn iter(&self) -> impl Iterator<Item = (&u32, &NodeID)> {
		//->only usize
		self.tines.iter()
	}
}



#[derive(Debug, Copy, Clone)]
enum Node {
	Location(Position),
	Fork(ForkID),
}

#[derive(Debug, Clone)]
struct Fork {
	first_location: Position,
	fork_at_depth: Position,
	tines: NodeHash,
}


impl Fork {
	// only useful for debugging
	#[allow(dead_code)]
	fn inspect(&self, tree: &State, symbol: u32, depth: Position) -> String {
		let mut ret = "\t".repeat(depth as usize);
		ret += &format!("{:?} {} @ {}\n", symbol_to_str(symbol), self.fork_at_depth, self.first_location);
		for (symbol, node_id) in self.tines.iter() {
			ret += &match &tree.nodes[*node_id as usize] {
				Node::Location(location) => {
					format!("{}{:?} @ {}\n", "\t".repeat(depth as usize + 1), symbol_to_str(*symbol), location)
				}
				Node::Fork(fork_id) => tree.forks[*fork_id as usize].inspect(tree, *symbol, depth + 1),
			}
		}
		ret
	}
}


#[derive(Debug)]
struct TreeCursor {
	location: Position,
	node_id: NodeID,
}

fn symbol_at(input: &[u8], location: Position) -> u32 {
	u32::from_be_bytes(input[location as usize..(location as usize + POSITION_BYTES)].try_into().unwrap())
}

fn symbol_to_str(symbol: u32) -> String {
	let bytes: [u8; POSITION_BYTES] = symbol.to_ne_bytes();
	String::from_utf8_lossy(&bytes).into_owned()
}


impl TreeCursor {
	fn push(&mut self, tree: &mut State, depth: Position, input: &[u8], symbol: u32) -> Result<bool, DiffError> {
		match tree.nodes[self.node_id as usize] {
			Node::Location(location) =>
				if symbol_at(input, location + depth) == symbol {
					if DEBUG {
						println!("location-continue");
					};
					Ok(true)
				} else {
					if DEBUG {
						println!("location-slice");
					};
					let old_branch_node_id = tree.allocate_node(Node::Location(location))?;
					let new_branch_node_id = tree.allocate_node(Node::Location(self.location))?;
					let new_common_fork_id = tree.allocate_fork(location, depth)?;
					let new_common_fork = &mut tree.forks[new_common_fork_id as usize];
					let old_symbol_at_fork = symbol_at(input, new_common_fork.first_location + new_common_fork.fork_at_depth);
					new_common_fork.tines.set(old_symbol_at_fork, old_branch_node_id);
					new_common_fork.tines.set(symbol, new_branch_node_id);
					tree.nodes[self.node_id as usize] = Node::Fork(new_common_fork_id);
					Ok(false)
				},
			Node::Fork(fork_id) => {
				let fork_at_depth = tree.forks[fork_id as usize].fork_at_depth;
				match depth.cmp(&fork_at_depth) {
					std::cmp::Ordering::Less =>
						if symbol_at(input, tree.forks[fork_id as usize].first_location + depth) == symbol {
							if DEBUG {
								println!("continue");
							};
							Ok(true)
						} else {
							if DEBUG {
								println!("slice");
							};
							let old_branch_node_id = tree.allocate_node(Node::Fork(fork_id))?;
							let new_branch_node_id = tree.allocate_node(Node::Location(self.location))?;
							let new_common_fork_id = tree.allocate_fork(tree.forks[fork_id as usize].first_location, depth)?;
							let new_common_fork = &mut tree.forks[new_common_fork_id as usize];
							let old_symbol_at_fork = symbol_at(input, new_common_fork.first_location + new_common_fork.fork_at_depth);
							new_common_fork.tines.set(old_symbol_at_fork, old_branch_node_id);
							new_common_fork.tines.set(symbol, new_branch_node_id);
							tree.nodes[self.node_id as usize] = Node::Fork(new_common_fork_id);
							Ok(false)
						},
					std::cmp::Ordering::Equal =>
						if let Some(new_node_id) = tree.forks[fork_id as usize].tines.get(symbol) {
							if DEBUG {
								println!("forward");
							};
							self.node_id = new_node_id;
							Ok(true)
						} else {
							if DEBUG {
								println!("fork");
							};
							let new_node_id = tree.allocate_node(Node::Location(self.location))?;
							tree.forks[fork_id as usize].tines.set(symbol, new_node_id);
							Ok(false)
						},
					std::cmp::Ordering::Greater => {
						if DEBUG {
							println!("beyond");
						};
						let node = &mut tree.forks[fork_id as usize];
						let symbol_at_fork = symbol_at(input, node.first_location + node.fork_at_depth);
						let new_node_id = node.tines.get(symbol_at_fork).unwrap();
						self.node_id = new_node_id;
						self.push(tree, depth, input, symbol)
					}
				}
			}
		}
	}

	fn inspect(&self) -> String {
		format!("Cursor:  location:{:?}  node_id:{:?}", self.location, self.node_id)
	}
}



#[cfg(test)]
mod tests {

	use super::*;

	#[test]
	fn can_diff_and_patch_select_data_sets() {
		let data_sets: Vec<Vec<Vec<u8>>> = vec![
			vec![b"abc1abc2ab3".to_vec()],
			vec![vec![141, 98, 73, 247, 16, 37, 141, 225, 200, 168, 58, 195, 113, 218, 78, 232, 36, 240, 81, 168]],
			vec![vec![124, 194, 53, 107, 161, 53]],
			vec![vec![14, 207, 135, 86, 14]],
			vec![vec![45, 100, 45, 100, 45, 36, 152, 182, 116, 102]],
			vec![vec![206, 131, 206, 131, 206, 179, 122]],
			vec![b"ababacd".to_vec()],
			vec![b"abcdef".to_vec()],
			vec![vec![
				116, 112, 118, 98, 101, 107, 120, 107, 116, 117, 104, 119, 112, 105, 120, 110, 107, 106, 118, 108, 122, 107, 119, 113, 101, 120, 107, 116, 115,
				115, 108, 100, 111, 112, 109, 97, 109, 120, 107, 107, 104, 111, 111, 101, 97, 109, 117, 99, 115, 110, 101, 121, 106, 115, 103, 120, 109, 101,
				102, 101, 107, 101, 103, 113, 101, 117, 117, 108, 123, 104, 105, 108, 105, 120, 110, 102, 98, 103, 102, 112, 110, 97, 99, 117, 102, 120, 112,
				114, 98, 108, 115, 118, 117, 103, 97, 112, 120, 107, 116, 104,
			]],
			vec![b"abcdaabcababcd".to_vec()],
			vec![b"abcdaabcaba".to_vec()],
			vec![b"abcdaabcababcd".to_vec(), b"#abcdaab#cababcd##".to_vec()],
			vec![b"aaaabbbbcccc1111aaaabbbbcccc2222aaaabbbb3333".to_vec(), b"aaaabbbbcccc1111aaaab##bcccc2222aaaabbbb3333".to_vec()],
			vec![b"a".to_vec(), b"a".to_vec(), b"a".to_vec(), b"a".to_vec(), b"a".to_vec()],
			vec![b"a".to_vec(), b"b".to_vec(), b"c".to_vec(), b"d".to_vec(), b"e".to_vec()],
			vec![(0..=255).into_iter().collect::<Vec<u8>>().repeat(5)],
		];

		for data_set in data_sets {
			let mut state = State::new();
			let mut old_data = vec![];
			for new_data in data_set {
				let (diff, new_state) = state.diff(&old_data, &new_data, 8, 6).unwrap();
				let patched = patch(&old_data, &diff).unwrap();
				assert_eq!(&patched, &new_data);
				old_data = patched;
				state = new_state;
			}
		}
	}

	#[test]
	fn can_diff_and_patch_random_data_sets_of_fixed_length() {
		let mut state = State::new();
		let mut old_data = vec![];
		for _ in 0..10 {
			let new_data: Vec<u8> =
				if cfg!(debug_assertions) { (0..1000).map(|_| rand::random::<u8>()).collect() } else { (0..1000000).map(|_| rand::random::<u8>()).collect() };
			let (diff, new_state) = state.diff(&old_data, &new_data, 8, 6).unwrap();
			let patched = patch(&old_data, &diff).unwrap();
			assert_eq!(&patched, &new_data);
			old_data = patched;
			state = new_state;
		}
	}

	#[test]
	fn can_diff_and_patch_random_data_sets_of_varying_length() {
		let mut state = State::new();
		let mut old_data = vec![];
		for _ in 0..10 {
			let length = if cfg!(debug_assertions) { rand::random::<usize>() % 1000 } else { rand::random::<usize>() % 1000000 };
			let new_data: Vec<u8> = (0..length).map(|_| rand::random::<u8>()).collect();
			let (diff, new_state) = state.diff(&old_data, &new_data, 8, 6).unwrap();
			let patched = patch(&old_data, &diff).unwrap();
			assert_eq!(&patched, &new_data);
			old_data = patched;
			state = new_state;
		}
	}

	#[test]
	fn can_diff_and_patch_data_sets_contstructed_by_copy_pasting_from_the_origin() {
		let mut state = State::new();
		let mut old_data = vec![];
		for _ in 0..10 {
			let new_data: Vec<u8> = if old_data.is_empty() {
				if cfg!(debug_assertions) {
					(0..100).map(|_| 97 + rand::random::<u8>() % 25).collect()
				} else {
					(0..1000000).map(|_| rand::random::<u8>()).collect()
				}
			} else {
				let mut ret = old_data.clone();
				for _ in 0..100 {
					let locations = (rand::random::<usize>() % old_data.len(), rand::random::<usize>() % old_data.len());
					let from = std::cmp::min(locations.0, locations.1);
					let to = std::cmp::max(locations.0, locations.1);
					let target = rand::random::<usize>() % (old_data.len() - (to - from));
					for i in from..to {
						ret[target + i - from] = old_data[i];
					}
				}
				ret
			};
			let (diff, new_state) = state.diff(&old_data, &new_data, 8, 6).unwrap();
			let patched = patch(&old_data, &diff).unwrap();
			assert_eq!(&patched, &new_data);
			old_data = patched;
			state = new_state;
		}
	}


	#[test]
	#[ignore]
	fn can_diff_and_patch_wiki_titles() {
		let old_data = vec![];
		let new_data = std::fs::read("test-data/enwiki-latest-all-titles-in-ns0").unwrap();
		let state = State::new();
		let (diff, _new_state) = state.diff(&old_data, &new_data, 8, 6).unwrap();
		println!("{}", diff.len());
		let patched = patch(&old_data, &diff).unwrap();
		assert_eq!(&patched, &new_data);
	}
}