use std::borrow::Borrow;
use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use crate::ChunkOffset;
use crate::{chunk_location_map::ChunkLocationMap, HashSum};
/// One step of a chunk reordering plan produced by [`ChunkIndex::reorder_ops`].
///
/// `hash` borrows from the `ChunkIndex` the ops were generated from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ReorderOp<'a> {
    /// Copy the chunk found at `source` to every offset in `dest`.
    Copy {
        hash: &'a HashSum,
        size: usize,
        source: u64,
        dest: Vec<u64>,
    },
    /// Read the chunk at `source` and keep it in memory; its source range
    /// overlaps a copy destination, so the on-disk data is about to be
    /// overwritten before the chunk reaches its own destination.
    StoreInMem {
        hash: &'a HashSum,
        size: usize,
        source: u64,
    },
}
/// Internal work item: a chunk that needs moving, identified by its hash,
/// with its size and the source offset it will be read from.
#[derive(Eq, PartialEq)]
struct MoveChunk<'a> {
    hash: &'a HashSum,
    size: usize,
    source: u64,
}
impl Ord for MoveChunk<'_> {
    /// Order chunks primarily by source offset (the order in which
    /// `reorder_ops` wants to process them).
    ///
    /// Tie-breakers on `size` and the hash bytes make `cmp` agree with the
    /// derived `PartialEq`/`Eq` (`cmp(a, b) == Equal` iff `a == b`), as the
    /// `Ord` contract requires, and make the sort fully deterministic even
    /// if two distinct chunks ever share a source offset.
    fn cmp(&self, other: &Self) -> Ordering {
        self.source
            .cmp(&other.source)
            .then_with(|| self.size.cmp(&other.size))
            .then_with(|| self.hash.slice().cmp(other.hash.slice()))
    }
}
impl PartialOrd for MoveChunk<'_> {
    // Delegate to `Ord` so the partial and total orderings can never diverge.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
/// A chunk's size together with the sorted, de-duplicated list of offsets at
/// which the chunk occurs in a file.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ChunkLocation {
    size: usize,
    offsets: Vec<u64>,
}

impl ChunkLocation {
    /// Insert `offset` keeping the list sorted; an already-present offset is
    /// left untouched.
    fn add_offset_sorted(&mut self, offset: u64) {
        match self.offsets.binary_search(&offset) {
            // Already recorded — nothing to do.
            Ok(_) => {}
            // `idx` is the insertion point that keeps the list sorted.
            Err(idx) => self.offsets.insert(idx, offset),
        }
    }
    /// Chunk size in bytes.
    pub fn size(&self) -> usize {
        self.size
    }
    /// All recorded offsets, in ascending order.
    pub fn offsets(&self) -> &[u64] {
        self.offsets.as_slice()
    }
}
/// Index mapping chunk hashes to their size and occurrence offsets.
///
/// Keys are hashes truncated to `hash_length` bytes (see `add_chunk`), and
/// lookups by longer hashes are truncated the same way.
#[derive(Clone, Debug)]
pub struct ChunkIndex {
    map: HashMap<HashSum, ChunkLocation>,
    hash_length: usize,
}
/// Lookup-key abstraction that lets the index map be probed either by a full
/// `HashSum` (the stored key type) or by a length-limited view of one,
/// both compared and hashed via their byte slice.
pub trait HashSumKey {
    /// The bytes used for equality and hashing.
    fn sum(&self) -> &[u8];
}

impl HashSumKey for HashSum {
    fn sum(&self) -> &[u8] {
        self.slice()
    }
}

// Enables `HashMap<HashSum, _>` lookups with a `&dyn HashSumKey` probe
// (used by `ChunkIndex::contains`/`remove` via `TruncatedHashSum`).
impl<'a> Borrow<dyn HashSumKey + 'a> for HashSum {
    fn borrow(&self) -> &(dyn HashSumKey + 'a) {
        self
    }
}
/// A view of a `HashSum` limited to its first `truncate_len` bytes, used as a
/// map probe so that a full-length hash can match a truncated stored key.
struct TruncatedHashSum<'a> {
    hash: &'a HashSum,
    truncate_len: usize,
}

impl HashSumKey for TruncatedHashSum<'_> {
    fn sum(&self) -> &[u8] {
        // Never slice past the end: a hash shorter than the truncation
        // length is returned whole.
        let bytes = self.hash.slice();
        let end = self.truncate_len.min(bytes.len());
        &bytes[..end]
    }
}
// Equality for key trait objects is defined over `sum()`, so a stored
// `HashSum` and a `TruncatedHashSum` probe with the same (truncated) bytes
// compare equal during map lookup.
impl Eq for dyn HashSumKey + '_ {}
impl PartialEq for dyn HashSumKey + '_ {
    fn eq(&self, other: &dyn HashSumKey) -> bool {
        self.sum() == other.sum()
    }
}
// Hash must cover exactly the bytes that `PartialEq` compares, otherwise
// `Borrow`-based map lookups would be incoherent.
impl std::hash::Hash for dyn HashSumKey + '_ {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        self.sum().hash(state)
    }
}
impl ChunkIndex {
/// Create an index with no chunks, keyed by hashes truncated to
/// `hash_length` bytes.
pub fn new_empty(hash_length: usize) -> Self {
    let map = HashMap::new();
    Self { map, hash_length }
}
/// Record a chunk occurrence. The hash is truncated to the index key length,
/// and `offsets` are merged (sorted, de-duplicated) into any existing entry.
///
/// Note: `size` is only used when the chunk is first inserted; a differing
/// size on a later call for the same hash is ignored.
pub fn add_chunk(&mut self, mut hash: HashSum, size: usize, offsets: &[u64]) {
    hash.truncate(self.hash_length);
    // `or_insert_with` defers building the default location until we know
    // the entry is vacant (clippy: or_fun_call).
    let location = self.map.entry(hash).or_insert_with(|| ChunkLocation {
        size,
        offsets: Vec::new(),
    });
    for &offset in offsets {
        location.add_offset_sorted(offset);
    }
}
/// Remove a chunk, probing with the hash truncated to the index key length.
/// Returns the removed location, if the chunk was present.
pub fn remove(&mut self, hash: &HashSum) -> Option<ChunkLocation> {
    let key = TruncatedHashSum {
        hash,
        truncate_len: self.hash_length,
    };
    self.map.remove(&key as &dyn HashSumKey)
}
/// Whether a chunk with this (possibly longer) hash is indexed; the probe is
/// truncated to the index key length before comparison.
pub fn contains(&self, hash: &HashSum) -> bool {
    let key = TruncatedHashSum {
        hash,
        truncate_len: self.hash_length,
    };
    self.map.contains_key(&key as &dyn HashSumKey)
}
fn get(&self, hash: &HashSum) -> Option<&ChunkLocation> {
self.map.get(hash)
}
/// First (lowest) offset recorded for `hash`; `None` if the chunk is absent
/// or has no recorded offsets.
fn get_first_offset(&self, hash: &HashSum) -> Option<u64> {
    // `first()` instead of `offsets[0]` avoids a panic for a location with
    // an empty offset list (possible via `add_chunk(.., &[])`).
    self.get(hash)
        .and_then(|ChunkLocation { offsets, .. }| offsets.first().copied())
}
/// Number of distinct chunks in the index.
pub fn len(&self) -> usize {
    self.map.len()
}
/// Whether the index holds no chunks at all.
pub fn is_empty(&self) -> bool {
    self.map.is_empty()
}
/// Iterator over the offsets recorded for `hash`, or `None` if the chunk is
/// not present. Note: this probes with the full, untruncated hash.
pub fn offsets<'a>(&'a self, hash: &HashSum) -> Option<impl Iterator<Item = u64> + 'a> {
    let location = self.map.get(hash)?;
    Some(location.offsets.iter().copied())
}
/// Iterate over all chunk hashes in the index (arbitrary map order).
pub fn keys(&self) -> impl Iterator<Item = &HashSum> {
    self.map.keys()
}
/// Remove from `chunk_set` every (chunk, offset) pair that already sits at
/// the same offset in `self`; a chunk whose offsets are all in place is
/// dropped from `chunk_set` entirely.
///
/// Returns `(number of offsets already in place, total bytes of those
/// offsets)`.
pub fn strip_chunks_already_in_place(&self, chunk_set: &mut ChunkIndex) -> (usize, u64) {
    let mut num_already_in_place = 0;
    let mut total_size = 0;
    let new_set: HashMap<HashSum, ChunkLocation> = chunk_set
        .map
        .iter()
        .filter_map(|(hash, cd)| {
            let mut cd = cd.clone();
            if let Some(ChunkLocation { offsets, size }) = self.get(hash) {
                // Drop every target offset that already holds this chunk in
                // the source layout. Offsets are de-duplicated, so removing
                // all matches equals the old remove-first-match scan.
                let offsets_before = cd.offsets.len();
                cd.offsets.retain(|offset| !offsets.contains(offset));
                // Count how many target offsets were stripped. The previous
                // `offsets.len() - cd.offsets.len()` used the *source*
                // offset count and underflowed `usize` whenever the source
                // had fewer offsets than remained in the target.
                let offsets_in_place = offsets_before - cd.offsets.len();
                num_already_in_place += offsets_in_place;
                total_size += (*size * offsets_in_place) as u64;
                if cd.offsets.is_empty() {
                    return None;
                }
            }
            Some((hash.clone(), cd))
        })
        .collect();
    chunk_set.map = new_set;
    (num_already_in_place, total_size)
}
/// Build the ops needed to move `root_chunk` — and, recursively, every chunk
/// whose source data one of the planned copies would overwrite — to their
/// target offsets in `new_order`.
///
/// Uses an explicit stack (iterative depth-first traversal). A chunk's
/// overlapping "children" are pushed after its own `Copy` op is prepared, so
/// children are popped (and their ops emitted) before the parent's op:
/// chunks are moved out of the way before being overwritten. A chunk
/// encountered again after it has already been visited is emitted as a
/// `StoreInMem` op instead of being expanded a second time.
fn build_reorder_ops<'a>(
    &'a self,
    root_chunk: MoveChunk<'a>,
    new_order: &ChunkIndex,
    source_layout: &ChunkLocationMap<&'a HashSum>,
    visited: &mut HashSet<&'a HashSum>,
    ops: &mut Vec<ReorderOp<'a>>,
) {
    // Each stack entry holds the chunk plus the `Copy` op to emit once all
    // of its children have been handled (filled in on first visit).
    let mut stack: Vec<(MoveChunk, Option<ReorderOp>)> = vec![(root_chunk, None)];
    while let Some((chunk, op)) = stack.last_mut() {
        if !visited.contains(chunk.hash) {
            // First visit: mark, gather overlapping chunks, prepare the op.
            visited.insert(chunk.hash);
            let mut child_stack = Vec::new();
            let mut destinations = Vec::new();
            if let Some(ChunkLocation {
                size,
                offsets: target_offsets,
            }) = new_order.get(chunk.hash)
            {
                target_offsets.iter().for_each(|&target_offset| {
                    // Any chunk still occupying the target range (other than
                    // this chunk itself) must be dealt with first.
                    source_layout
                        .iter_overlapping(ChunkOffset::new(target_offset, *size))
                        .filter(|(_, &ovhash)| chunk.hash != ovhash)
                        .for_each(|(location, &ovhash)| {
                            let first_offset = self.get_first_offset(ovhash).unwrap();
                            if visited.contains(ovhash) {
                                // Already visited: buffer it in memory
                                // before its source bytes get clobbered.
                                ops.push(ReorderOp::StoreInMem {
                                    hash: ovhash,
                                    size: location.size,
                                    source: first_offset,
                                });
                            } else {
                                // Not yet visited: process it before this
                                // chunk's op is emitted.
                                child_stack.push((
                                    MoveChunk {
                                        hash: ovhash,
                                        size: location.size,
                                        source: first_offset,
                                    },
                                    None,
                                ));
                            }
                        });
                    destinations.push(target_offset);
                });
            }
            *op = Some(ReorderOp::Copy {
                hash: chunk.hash,
                size: chunk.size,
                source: chunk.source,
                dest: destinations,
            });
            // Children go on top of this entry so they are popped first.
            stack.append(&mut child_stack);
        } else if let Some((_chunk, Some(op))) = stack.pop() {
            // Revisit: all children done — emit this chunk's op. An entry
            // whose op is still `None` (visited via another branch before
            // its turn) is popped silently.
            ops.push(op);
        }
    }
}
/// Generate the operations that rearrange a file laid out as described by
/// `self` into the layout described by `new_order`.
///
/// Only chunks present in both indexes need moving; chunks are processed in
/// ascending source-offset order (`MoveChunk`'s `Ord`).
pub fn reorder_ops(&self, new_order: &ChunkIndex) -> Vec<ReorderOp<'_>> {
    // Maps each movable chunk's current (source) range to its hash, so
    // build_reorder_ops can find which chunks a copy would overwrite.
    let mut source_layout: ChunkLocationMap<&HashSum> = ChunkLocationMap::new();
    let chunks_to_move: Vec<MoveChunk> = {
        let mut chunks = self
            .map
            .iter()
            .filter_map(|(hash, location)| {
                if new_order.contains(hash) {
                    let size = location.size;
                    // Each chunk is read from its first occurrence only.
                    let first_offset = self.get_first_offset(hash).unwrap();
                    source_layout.insert(
                        ChunkOffset {
                            size,
                            offset: first_offset,
                        },
                        hash,
                    );
                    Some(MoveChunk {
                        hash,
                        size,
                        source: first_offset,
                    })
                } else {
                    // Not wanted in the target layout — nothing to move.
                    None
                }
            })
            .collect::<Vec<MoveChunk>>();
        // Sort by source offset for a deterministic processing order.
        chunks.sort();
        chunks
    };
    let mut ops: Vec<ReorderOp> = Vec::new();
    let mut chunks_processed: HashSet<&HashSum> = HashSet::new();
    chunks_to_move.into_iter().for_each(|chunk| {
        if !chunks_processed.contains(&chunk.hash) {
            let mut chunks_in_tree = HashSet::new();
            self.build_reorder_ops(
                chunk,
                new_order,
                &source_layout,
                &mut chunks_in_tree,
                &mut ops,
            );
            // Every chunk handled in this tree now has its ops emitted:
            // remove it from the source layout so later trees no longer see
            // it as occupying its old range, and mark it processed.
            chunks_in_tree.into_iter().for_each(|hash| {
                let ChunkLocation { size, offsets } = &self.get(hash).unwrap();
                offsets.iter().for_each(|offset| {
                    source_layout.remove(&ChunkOffset::new(*offset, *size));
                });
                chunks_processed.insert(hash);
            });
        }
    });
    ops
}
/// Iterate over all (hash, location) pairs in the index (arbitrary order).
pub fn iter_chunks(&self) -> impl Iterator<Item = (&HashSum, &ChunkLocation)> {
    self.map.iter()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Test-only convenience for building a ChunkLocation from (size, offsets).
    // NOTE(review): offsets are copied verbatim, not sorted — callers should
    // pass them sorted if the value is compared against index-built locations.
    impl From<(usize, &[u64])> for ChunkLocation {
        fn from((size, offsets): (usize, &[u64])) -> Self {
            Self {
                size,
                offsets: offsets.to_vec(),
            }
        }
    }

    // Offsets end up sorted regardless of insertion order.
    #[test]
    fn location_sorted_offsets() {
        let mut l = ChunkLocation {
            size: 10,
            offsets: vec![],
        };
        l.add_offset_sorted(20);
        l.add_offset_sorted(10);
        l.add_offset_sorted(5);
        l.add_offset_sorted(8);
        l.add_offset_sorted(0);
        assert_eq!(&l.offsets[..], &[0, 5, 8, 10, 20]);
    }

    // Inserting the same offset twice keeps a single entry.
    #[test]
    fn location_no_duplicate_offset() {
        let mut l = ChunkLocation {
            size: 10,
            offsets: vec![],
        };
        l.add_offset_sorted(5);
        l.add_offset_sorted(5);
        assert_eq!(&l.offsets[..], &[5]);
    }

    // Re-adding a chunk merges the new offsets into the existing location.
    #[test]
    fn add_chunk_multiple_offsets() {
        let mut ci = ChunkIndex::new_empty(HashSum::MAX_LEN);
        ci.add_chunk(HashSum::from(&[1]), 10, &[5]);
        ci.add_chunk(HashSum::from(&[1]), 10, &[15]);
        let mut it = ci.iter_chunks();
        assert_eq!(
            it.next(),
            Some((
                &HashSum::from(&[1]),
                &ChunkLocation {
                    size: 10,
                    offsets: vec![5, 15]
                }
            ))
        );
    }

    // Ops must be ordered so chunks are copied away before their source
    // ranges are overwritten by other copies.
    #[test]
    fn reorder_with_overlap_and_loop() {
        let mut current_index = ChunkIndex::new_empty(HashSum::MAX_LEN);
        current_index.add_chunk(HashSum::from(&[1]), 10, &[0]);
        current_index.add_chunk(HashSum::from(&[2]), 20, &[10]);
        current_index.add_chunk(HashSum::from(&[3]), 20, &[30, 50]);
        let mut target_index = ChunkIndex::new_empty(HashSum::MAX_LEN);
        target_index.add_chunk(HashSum::from(&[1]), 10, &[60]);
        target_index.add_chunk(HashSum::from(&[2]), 20, &[50]);
        target_index.add_chunk(HashSum::from(&[3]), 20, &[10, 30]);
        target_index.add_chunk(HashSum::from(&[4]), 5, &[0, 5]);
        let ops = current_index.reorder_ops(&target_index);
        assert_eq!(
            ops[0],
            ReorderOp::Copy {
                hash: &HashSum::from(&[1]),
                size: 10,
                source: 0,
                dest: vec![60],
            }
        );
        assert_eq!(
            ops[1],
            ReorderOp::Copy {
                hash: &HashSum::from(&[2]),
                size: 20,
                source: 10,
                dest: vec![50],
            }
        );
        assert_eq!(
            ops[2],
            ReorderOp::Copy {
                hash: &HashSum::from(&[3]),
                size: 20,
                source: 30,
                dest: vec![10, 30],
            }
        );
    }

    // An offset already holding the chunk is stripped; only the remaining
    // destination appears in the Copy op.
    #[test]
    fn reorder_but_do_not_copy_to_self() {
        let mut current_index = ChunkIndex::new_empty(HashSum::MAX_LEN);
        current_index.add_chunk(HashSum::from(&[1]), 10, &[0, 20]);
        let mut target_index = ChunkIndex::new_empty(HashSum::MAX_LEN);
        target_index.add_chunk(HashSum::from(&[1]), 10, &[20, 40]);
        current_index.strip_chunks_already_in_place(&mut target_index);
        let ops = current_index.reorder_ops(&target_index);
        assert_eq!(
            ops[0],
            ReorderOp::Copy {
                hash: &HashSum::from(&[1]),
                size: 10,
                source: 0,
                dest: vec![40],
            }
        );
    }

    // A chunk fully in place disappears from the target set.
    #[test]
    fn filter_one_chunk_in_place() {
        let mut current_index = ChunkIndex::new_empty(HashSum::MAX_LEN);
        current_index.add_chunk(HashSum::from(&[1]), 10, &[0]);
        let mut target_index = ChunkIndex::new_empty(HashSum::MAX_LEN);
        target_index.add_chunk(HashSum::from(&[1]), 10, &[0]);
        current_index.strip_chunks_already_in_place(&mut target_index);
        assert_eq!(target_index.len(), 0);
    }

    // Offset order in the input must not matter (offsets are kept sorted).
    #[test]
    fn filter_one_chunk_multiple_offsets_in_place() {
        let mut current_index = ChunkIndex::new_empty(HashSum::MAX_LEN);
        current_index.add_chunk(HashSum::from(&[1]), 10, &[20, 0]);
        let mut target_index = ChunkIndex::new_empty(HashSum::MAX_LEN);
        target_index.add_chunk(HashSum::from(&[1]), 10, &[0, 20]);
        current_index.strip_chunks_already_in_place(&mut target_index);
        assert_eq!(target_index.len(), 0);
    }

    // Partially in place: only the not-in-place offset remains.
    #[test]
    fn filter_one_chunk_multiple_offsets_not_in_place() {
        let mut current_index = ChunkIndex::new_empty(HashSum::MAX_LEN);
        current_index.add_chunk(HashSum::from(&[1]), 10, &[0, 20]);
        let mut target_index = ChunkIndex::new_empty(HashSum::MAX_LEN);
        target_index.add_chunk(HashSum::from(&[1]), 10, &[20, 30]);
        current_index.strip_chunks_already_in_place(&mut target_index);
        assert_eq!(target_index.len(), 1);
        assert_eq!(
            target_index.map.get(&HashSum::from(&[1])).unwrap().offsets,
            &[30]
        );
    }

    // Mixed set: fully in-place chunks removed, others keep only the
    // offsets that still need writing.
    #[test]
    fn filter_multiple_chunks_in_place() {
        let mut current_index = ChunkIndex::new_empty(HashSum::MAX_LEN);
        current_index.add_chunk(HashSum::from(&[1]), 10, &[0]);
        current_index.add_chunk(HashSum::from(&[2]), 20, &[20]);
        current_index.add_chunk(HashSum::from(&[3]), 20, &[30, 50]);
        current_index.add_chunk(HashSum::from(&[4]), 5, &[70, 80]);
        let mut target_index = ChunkIndex::new_empty(HashSum::MAX_LEN);
        target_index.add_chunk(HashSum::from(&[1]), 10, &[10]);
        target_index.add_chunk(HashSum::from(&[2]), 20, &[20]);
        target_index.add_chunk(HashSum::from(&[3]), 20, &[30, 50]);
        target_index.add_chunk(HashSum::from(&[4]), 5, &[80, 90]);
        current_index.strip_chunks_already_in_place(&mut target_index);
        assert_eq!(target_index.len(), 2);
        assert_eq!(
            target_index.get(&HashSum::from(&[1])).unwrap(),
            &ChunkLocation {
                size: 10,
                offsets: vec![10],
            }
        );
        assert_eq!(
            target_index.get(&HashSum::from(&[4])).unwrap(),
            &ChunkLocation {
                size: 5,
                offsets: vec![90],
            }
        );
    }

    // Lookups with a hash longer than the index key length match on the
    // truncated prefix.
    #[test]
    fn lookup_truncated_hash_sum() {
        let mut index = ChunkIndex::new_empty(4);
        index.add_chunk(HashSum::from([1, 2, 3, 4, 99, 99]), 10, &[0]);
        assert!(index.contains(&HashSum::from([1, 2, 3, 4, 5, 6])));
        index.remove(&HashSum::from([1, 2, 3, 4, 5, 6])).unwrap();
    }
}