librsyncr 0.1.1

librsyncr is a Rust library to calculate and apply deltas between two files without having access to both files on the same system.
Documentation
/*
 * Copyright 2018 Jordan Miner
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 */

use std::cmp::min;
use std::io::{self, Read, Write};
use std::ops::Range;
use std::ptr;

use blake2_rfc::blake2b::Blake2b;
use byteorder::{BE, WriteBytesExt};

use super::rolling_checksum::RollingChecksum;
use super::signature_lookup::SignatureLookup;
use super::DELTA_MAGIC;

pub struct DeltaCommandWriter<W: Write> {
    writer: W,
    block_size: u32,
    // (start, count)
    matched_blocks: Option<(u64, u64)>,
    literal_buffer: Vec<u8>,
}

impl<W: Write> DeltaCommandWriter<W> {
    pub fn new(mut writer: W, block_size: u32) -> io::Result<Self> {
        writer.write_u32::<BE>(DELTA_MAGIC)?; // header
        Ok(DeltaCommandWriter {
            writer,
            block_size,
            matched_blocks: None,
            literal_buffer: vec![],
        })
    }

  // `block_index` is the index of the block in the old file that matched the current location
  // in the new file.
    pub fn record_matched_block(&mut self, block_index: u64) -> io::Result<()> {
        self.write_literal_command()?;
        if let Some((start, count)) = self.matched_blocks {
            if start + count == block_index {
                self.matched_blocks = Some((start, count + 1));
            } else {
                self.write_current_copy_command()?;
                self.matched_blocks = Some((block_index, 1));
            }
        } else {
            self.matched_blocks = Some((block_index, 1));
        }
        Ok(())
    }

    pub fn add_literal_data(&mut self, data: &[u8]) -> io::Result<()> {
        self.write_current_copy_command()?;
        self.literal_buffer.extend(data);
        Ok(())
    }

    pub fn finish(&mut self) -> io::Result<()> {
        self.write_literal_command()?;
        self.write_current_copy_command()?;
        self.writer.write_u8(0)?;
        Ok(())
    }

    fn write_current_copy_command(&mut self) -> io::Result<()> {
        let block_size = self.block_size as u64;
        if let Some((start, count)) = self.matched_blocks {
            self.write_copy_command(start * block_size, count * block_size)?;
            self.matched_blocks = None;
        }
        Ok(())
    }

    fn write_literal_command(&mut self) -> io::Result<()> {
        if self.literal_buffer.is_empty() {
            return Ok(());
        }
        let length = self.literal_buffer.len() as u64;
        let len_size_class = get_uint_size_class(length);
        if length <= 64 {
            self.writer.write_u8(length as u8)?;
        } else {
            self.writer.write_u8(65 + len_size_class)?;
            self.write_uint_big_endian(length, len_size_class)?;
        }
        self.writer.write(&self.literal_buffer)?;
        self.literal_buffer.clear();
        Ok(())
    }

    fn write_copy_command(&mut self, start: u64, length: u64) -> io::Result<()> {
        let start_size_class = get_uint_size_class(start);
        let len_size_class = get_uint_size_class(length);
        self.writer.write_u8(69 + start_size_class * 4 + len_size_class)?;
        self.write_uint_big_endian(start, start_size_class)?;
        self.write_uint_big_endian(length, len_size_class)?;
        Ok(())
    }

    fn write_uint_big_endian(&mut self, n: u64, size_class: u8) -> io::Result<()> {
        match size_class {
            0 => self.writer.write_u8(n as u8),
            1 => self.writer.write_u16::<BE>(n as u16),
            2 => self.writer.write_u32::<BE>(n as u32),
            3 => self.writer.write_u64::<BE>(n as u64),
            _ => panic!(),
        }
    }
}

// Returns the number of bytes needed to store the specified unsigned integer.
fn get_uint_size_class(n: u64) -> u8 {
    return if n <= 0xFF {
        0
    } else if n <= 0xFFFF {
        1
    } else if n <= 0xFFFFFFFF {
        2
    } else /*if n <= 0xFFFFFFFFFFFFFFFF*/ {
        3
    };
}

pub struct DeltaGenerator<W: Write> {
    pub buffer: Vec<u8>,
    pub buffer_end: usize,
    sig_lookup: SignatureLookup,
    block_size: usize,
    processed_count: usize,
    cmd_writer: DeltaCommandWriter<W>,
    rolling_checksum: RollingChecksum,
}

impl<W: Write> DeltaGenerator<W> {
    pub fn new(writer: W, sig_lookup: SignatureLookup) -> io::Result<Self> {
        let block_size = sig_lookup.block_size();
        Ok(DeltaGenerator {
            buffer: vec![0u8; block_size as usize * 10],
            buffer_end: 0,
            sig_lookup,
            block_size: block_size as usize,
            processed_count: 0,
            cmd_writer: DeltaCommandWriter::new(writer, block_size)?,
            rolling_checksum: RollingChecksum::new(),
        })
    }

    pub fn generate<R: Read>(mut self, mut new_file: R) -> io::Result<()> {
        loop {
            let buffer_len = self.buffer.len();
            let num_read = new_file.read(&mut self.buffer[self.buffer_end..buffer_len])?;
            self.buffer_end += num_read;
            if num_read == 0 {
                self.process_buffer(true)?;
                break;
            }
            if self.buffer_end == self.buffer.len() {
                self.process_buffer(false)?;
            }
        }
        Ok(())
    }

    fn process_buffer(&mut self, finish: bool) -> io::Result<()> {
        if self.buffer_end < self.block_size {
            assert!(finish);
            self.cmd_writer.add_literal_data(&self.buffer[0..self.buffer_end])?;
            self.cmd_writer.finish()?;
            return Ok(());
        }

        // There are two cases where processed_count is less than block_size. One is the first time
        // the buffer is processed and the other is if a match was found within a block size of the
        // end of the buffer.
        self.rolling_checksum.add_slice(
            &self.buffer[self.processed_count..self.block_size]);
        let mut i = self.block_size;
        let mut literal_start = 0;

        loop {
            let matching_blocks = self.sig_lookup.get_blocks(self.rolling_checksum.get());
            let mut matching_block = None;
            if let Some(matching_blocks) = matching_blocks {
                if !matching_blocks.is_empty() {
                    let mut hash = Blake2b::new(32);
                    hash.update(&self.buffer[i - self.block_size..i]);
                    let res = hash.finalize();
                    let hash = &res.as_bytes()[0..self.sig_lookup.strong_sum_size() as usize];
                    matching_block =
                        matching_blocks.iter().find(|block| { return block.strong_sum == hash; });
                }
            }

            if let Some(matching_block) = matching_block {
                self.cmd_writer.add_literal_data(
                    &self.buffer[literal_start..i - self.block_size])?;
                self.cmd_writer.record_matched_block(matching_block.block_index)?;
                literal_start = i;

                self.rolling_checksum = RollingChecksum::new();
                self.rolling_checksum.add_slice(
                    &self.buffer[i..min(i + self.block_size, self.buffer_end)]);
                i += self.block_size;
                if i > self.buffer_end {
                    break;
                }
            } else {
                if i >= self.buffer_end {
                    break;
                }
                self.rolling_checksum.rotate(self.buffer[i], self.buffer[i - self.block_size]);
                i += 1;
            }
        }
        self.cmd_writer.add_literal_data(&self.buffer[literal_start..i - self.block_size])?;

        // Remove data that has been processed (remove 0..i - block_size)
        let rem_data_range = i - self.block_size..self.buffer_end;
        copy_within_slice(&mut self.buffer, rem_data_range.clone(), 0);
        self.buffer_end = rem_data_range.end - rem_data_range.start;
        self.processed_count = self.buffer_end;

        if finish {
            self.cmd_writer.add_literal_data(&self.buffer[0..self.buffer_end])?;
            self.cmd_writer.finish()?;
        }

        Ok(())
    }
}

fn copy_within_slice<T: Copy>(slice: &mut [T], src_range: Range<usize>, dest: usize) {
    unsafe {
        let src_len = src_range.end - src_range.start;
        assert!(src_range.start <= slice.len());
        assert!(src_range.end <= slice.len());
        assert!(dest <= slice.len() - src_len);

        ptr::copy(slice.as_ptr().offset(src_range.start as isize),
            slice.as_mut_ptr().offset(dest as isize),
            src_len);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_delta_cmd_gen_simple() {
        // Has a literal at the beginning and end.
        let mut cmds = vec![];
        {
            let mut cmd_writer = DeltaCommandWriter::new(&mut cmds, 30).unwrap();
            cmd_writer.add_literal_data(&[0xF8]).unwrap();
            cmd_writer.record_matched_block(3).unwrap();
            cmd_writer.add_literal_data(&[0x3D, 0xF8]).unwrap();
            cmd_writer.finish().unwrap();
        }
        assert_eq!(cmds, &[
            0x72, 0x73, 0x02, 0x36, // header
            0x01, 0xF8, // literal cmd
            0x45, 0x5A, 0x1E, // copy cmd for block 3
            0x02, 0x3D, 0xF8, // literal cmd
            0x00, // end of file
        ]);
    }

    #[test]
    fn test_delta_cmd_gen() {
        // Has a record at the beginning and end and tests coalescing matched blocks
        let mut cmds = vec![];
        {
            let mut cmd_writer = DeltaCommandWriter::new(&mut cmds, 2000).unwrap();
            cmd_writer.record_matched_block(3).unwrap();
            cmd_writer.add_literal_data(&[0x83]).unwrap();
            cmd_writer.add_literal_data(&[0x3D, 0xF8, 0x23]).unwrap();
            cmd_writer.record_matched_block(4).unwrap();
            cmd_writer.record_matched_block(5).unwrap();
            cmd_writer.record_matched_block(6).unwrap();
            cmd_writer.record_matched_block(42).unwrap();
            cmd_writer.record_matched_block(1).unwrap();
            cmd_writer.finish().unwrap();
        }
        assert_eq!(cmds, &[
            0x72, 0x73, 0x02, 0x36, // header
            0x4A, 0x17, 0x70, 0x07, 0xD0, // copy cmd for block 3
            0x04, 0x83, 0x3D, 0xF8, 0x23, // literal cmd
            0x4A, 0x1F, 0x40, 0x17, 0x70, // copy cmd for blocks 4, 5, 6
            0x4E, 0x00, 0x01, 0x48, 0x20, 0x07, 0xD0, // copy cmd for block 42
            0x4A, 0x07, 0xD0, 0x07, 0xD0, // copy cmd for block 1
            0x00, // end of file
        ]);
    }
}