librsyncr 0.1.1

librsyncr is a Rust library to calculate and apply deltas between two files without having access to both files on the same system.
Documentation
/*
 * Copyright 2018 Jordan Miner
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 */

use std::fmt::{self, Debug};
use std::io::{self, BufReader, BufWriter, Read, Seek, SeekFrom, Write};

use blake2_rfc::blake2b::Blake2b;
use byteorder::{BE, ReadBytesExt, WriteBytesExt};

use delta_generator::DeltaGenerator;
use rolling_checksum::RollingChecksum;
use signature_lookup::SignatureLookup;

mod delta_generator;
mod rolling_checksum;
mod signature_lookup;

// TODO: move ProgressReader to its own crate

///  A `Read` adaptor that calls a callback every so often as data is read from it so that the
///  progress of an operation can be reported.
pub struct ProgressReader<R: Read> {
    inner: R,
    total_len: u64,
    read_len: u64,
    // The number of bytes to read between calls to the callback. Precalculated to avoid a division
    // every read call.
    progress_step: u64,
    // The value of `read_len` the last time the callback was called
    last_read_len: u64,
    callback: Box<dyn FnMut(f32)>,
}

impl<R: Read> ProgressReader<R> {
    /// Creates a new `ProgressReader`. The `total_len` is the total number of bytes in the
    /// operation and the `callback` is the function to be called after every (currently) 1% has
    /// been read.
    pub fn new(inner: R, total_len: u64, callback: Box<dyn FnMut(f32)>) -> Self {
        ProgressReader {
            inner,
            total_len,
            read_len: 0,
            progress_step: total_len / 100,
            last_read_len: 0,
            callback,
        }
    }

    /// Returns the number of bytes that have been read so far.
    pub fn read_len(&self) -> u64 {
        self.read_len
    }
}

impl<R: Read> Read for ProgressReader<R> {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // Call the callback first because now the previously read data has been processed.
        if self.read_len - self.last_read_len >= self.progress_step {
            (self.callback)(self.read_len as f32 / self.total_len as f32);
            self.last_read_len = self.read_len;
        }

        let res = self.inner.read(buf);
        if let Ok(len) = res {
            self.read_len += len as u64;
        }
        res
    }
}

#[derive(Eq, PartialEq)]
struct HexPrinter<'a>(&'a [u8]);

impl<'a> Debug for HexPrinter<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        write!(f, "[")?;
        let mut first = true;
        for b in self.0 {
            if !first {
                write!(f, " ")?;
            }
            write!(f, "{:02X}", b)?;
            first = false;
        }
        write!(f, "]")?;
        Ok(())
    }
}


// It should be fine to truncate the strong hash to 128 bits. Even 128 bits is overkill. I don't
// know why they used 256. If you had a 2 GB file (using 2KB blocks) containing 1,000,000 blocks and
// you synced 1 billion such files, there is a 99.999999999999999+% chance that there would never be
// a collision.
//
// 1 - e^(-1000000^2/(2*2^128)) = 1.47 * 10^-27
// (1.47 * 10^-27) ^ 1000000000 = 99.999999+%

//const SIG_MAGIC_MD4: u32 = 0x72730136;
const SIG_MAGIC_BLAKE2: u32 = 0x72730137;
const DELTA_MAGIC: u32 = 0x72730236;

//const MD4_SUM_SIZE: u32 = 16;
const BLAKE2_SUM_SIZE: u32 = 32;

#[derive(Debug)]
pub enum Error {
    UnsupportedSignatureFormat(u32),
    UnsupportedDeltaFormat(u32),
    UnsupportedDeltaCommand(u8),
    Io(io::Error),
}

impl From<io::Error> for Error {
    fn from(err: io::Error) -> Self {
        Error::Io(err)
    }
}

/// Calculates the signature file from an old version of a file using the default block size
/// (currently 2048) and strong sum size (currently 32).
pub fn calculate_signature<R: Read, W: Write>(old_file: R, sig_file: W) -> io::Result<()> {
    calculate_signature_with_sizes(old_file, sig_file, 2048, None)
}

/// Calculates the signature file from an old version of a file.
///
/// The signature file contains a weak sum and strong sum (currently Blake2b) for each block in the
/// old file.
///
/// Using a smaller `block_size` results in more blocks and a larger signature file. However, with
/// randomly distributed differences, it will also make it more likely that matching blocks can be
/// found when generating the delta file, producing a smaller delta file.
///
/// Using a smaller `strong_sum_size` results in a smaller signature file, but if it is small enough
/// that it results in a collision, then an invalid delta file could be produced that results in a
/// corrupt new file. If `None`, a default of 32 is used. Using 16 is fine, but you should check the
/// odds of a collision before using smaller than that.
pub fn calculate_signature_with_sizes<R: Read, W: Write>(
    mut old_file: R,
    sig_file: W,
    block_size: u32,
    strong_sum_size: Option<u32>
) -> io::Result<()> {
    let strong_sum_size = strong_sum_size.unwrap_or(BLAKE2_SUM_SIZE);
    debug_assert!(strong_sum_size >= 1 && strong_sum_size <= BLAKE2_SUM_SIZE);

    // No BufReader for old_file because we're already buffering it to a block size below.
    let mut sig_file = BufWriter::new(sig_file);

    sig_file.write_u32::<BE>(SIG_MAGIC_BLAKE2)?;
    sig_file.write_u32::<BE>(block_size)?;
    sig_file.write_u32::<BE>(strong_sum_size)?;

    let block_size = block_size as usize;
    let strong_sum_size = strong_sum_size as usize;

    let mut buffer = vec![0u8; block_size];
    let mut buffer_end = 0;
    loop {
        let num_read = old_file.read(&mut buffer[buffer_end..block_size])?;
        buffer_end += num_read;
        if num_read == 0 || buffer_end == block_size {
            let mut rolling_checksum = RollingChecksum::new();
            rolling_checksum.add_slice(&buffer[0..buffer_end]);
            sig_file.write_u32::<BE>(rolling_checksum.get())?;

            // Setting the Blake2b digest size changes the hash, so it has to be 32 for
            // compatibility with librsync.
            let mut hash = Blake2b::new(32);
            hash.update(&buffer[0..buffer_end]);
            let res = hash.finalize();
            sig_file.write(&res.as_bytes()[0..strong_sum_size])?;

            buffer_end = 0;
        }
        if num_read == 0 {
            break;
        }
    }

    Ok(())
}

/// Calculates the delta file from the new file, referencing the signature file.
pub fn calculate_delta<SR: Read, NR: Read, W: Write>(
    sig_file: SR,
    new_file: NR,
    delta_file: W
) -> Result<(), Error> {
    let sig_lookup = SignatureLookup::new(sig_file)?;

    let delta_gen = DeltaGenerator::new(delta_file, sig_lookup)?;
    delta_gen.generate(new_file)?;

    Ok(())
}

/// Creates the new file from the old file and the delta file.
pub fn apply_delta<OR: Read + Seek, DR: Read, W: Write>(
    mut old_file: OR,
    delta_file: DR,
    new_file: W
) -> Result<(), Error> {
    let mut delta_file = BufReader::new(delta_file);
    let mut new_file = BufWriter::new(new_file);

    fn read_uint<DR: Read>(mut reader: DR, size_class: u8) -> io::Result<u64> {
        Ok(match size_class {
            0 => reader.read_u8()? as u64,
            1 => reader.read_u16::<BE>()? as u64,
            2 => reader.read_u32::<BE>()? as u64,
            3 => reader.read_u64::<BE>()?,
            _ => panic!(),
        })
    };

    let magic = delta_file.read_u32::<BE>()?;
    if magic != DELTA_MAGIC {
        return Err(Error::UnsupportedDeltaFormat(magic));
    }
    loop {
        let cmd = delta_file.read_u8()?;
        match cmd {
            0 => break,
            1..=64 => { // literal
                let literal_len = cmd as u64;

                let mut take = delta_file.take(literal_len);
                io::copy(&mut take, &mut new_file)?;
                delta_file = take.into_inner();
            }
            65..=68 => { // literal
                let len_size_class = cmd - 65;
                let literal_len = read_uint(&mut delta_file, len_size_class)?;

                let mut take = delta_file.take(literal_len);
                io::copy(&mut take, &mut new_file)?;
                delta_file = take.into_inner();
            }
            69..=84 => { // copy
                let cmd_offset = cmd - 69;
                let start_size_class = cmd_offset / 4;
                let len_size_class = cmd_offset % 4;
                let copy_start = read_uint(&mut delta_file, start_size_class)?;
                let copy_len = read_uint(&mut delta_file, len_size_class)?;

                old_file.seek(SeekFrom::Start(copy_start))?;
                let mut take = old_file.take(copy_len);
                io::copy(&mut take, &mut new_file)?;
                old_file = take.into_inner();
            }
            _ => {
                return Err(Error::UnsupportedDeltaCommand(cmd));
            }
        }
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn test_small_signature() {
        let old_file: &[u8] = &[
            // "Red car"
            0x52, 0x65, 0x64, 0x20, 0x63, 0x61, 0x72
        ];
        let mut sig_file = vec![];
        calculate_signature_with_sizes(old_file, &mut sig_file, 2, Some(8)).unwrap();
        assert_eq!(sig_file, &[
            // Copied from rdiff's output.
            0x72, 0x73, 0x01, 0x37, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x08, // header
            0x01, 0x66, 0x00, 0xF5, 0xAF, 0x6B, 0x83, 0x63, 0xC7, 0x6D, 0x47, 0x51, // block 0
            0x01, 0x45, 0x00, 0xC2, 0x84, 0x10, 0xCD, 0x9D, 0xB3, 0x5F, 0x5F, 0x04, // block 1
            0x01, 0x84, 0x01, 0x02, 0xE4, 0xCF, 0x4F, 0xC9, 0xCE, 0xB1, 0x3D, 0xB0, // block 2
            0x00, 0x91, 0x00, 0x91, 0xCE, 0xA4, 0x07, 0xCA, 0x10, 0x6D, 0xCF, 0x64, // block 3
        ][..]);
    }

    #[test]
    fn test_signature_lookup() {
        let sig_file: &[u8] = &[
            // Copied from the test above.
            0x72, 0x73, 0x01, 0x37, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x08, // header
            0x01, 0x66, 0x00, 0xF5, 0xAF, 0x6B, 0x83, 0x63, 0xC7, 0x6D, 0x47, 0x51, // block 0
            0x01, 0x45, 0x00, 0xC2, 0x84, 0x10, 0xCD, 0x9D, 0xB3, 0x5F, 0x5F, 0x04, // block 1
            0x01, 0x84, 0x01, 0x02, 0xE4, 0xCF, 0x4F, 0xC9, 0xCE, 0xB1, 0x3D, 0xB0, // block 2
            0x00, 0x91, 0x00, 0x91, 0xCE, 0xA4, 0x07, 0xCA, 0x10, 0x6D, 0xCF, 0x64, // block 3
        ];

        let sig_lookup = SignatureLookup::new(sig_file).expect("failed to create SignatureLookup");

        let blocks0 = sig_lookup.get_blocks(0x016600F5);
        assert!(blocks0.is_some());
        let blocks0 = blocks0.unwrap();
        assert_eq!(blocks0.len(), 1);
        assert_eq!(blocks0[0].strong_sum, &[
            0xAF, 0x6B, 0x83, 0x63, 0xC7, 0x6D, 0x47, 0x51
        ]);

        let blocks3 = sig_lookup.get_blocks(0x00910091);
        assert!(blocks3.is_some());
        let blocks3 = blocks3.unwrap();
        assert_eq!(blocks3.len(), 1);
        assert_eq!(blocks3[0].strong_sum, &[
            0xCE, 0xA4, 0x07, 0xCA, 0x10, 0x6D, 0xCF, 0x64
        ]);
    }

    #[test]
    fn test_small_delta_gen() {
        let old_file: &[u8] = &[
            // "car ice "
            0x63, 0x61, 0x72, 0x20, 0x69, 0x63, 0x65, 0x20
        ];
        let new_file: &[u8] = &[
            // "mug nice jog car try lip "
            0x6D, 0x75, 0x67, 0x20,
            0x6E, 0x69, 0x63, 0x65, 0x20, // "nice "
            0x6A, 0x6F, 0x67, 0x20, // "jog "
            0x63, 0x61, 0x72, 0x20,
            0x74, 0x72, 0x79, 0x20, // "try "
            0x6C, 0x69, 0x70, 0x20
        ];
        let mut sig_file = vec![];
        calculate_signature_with_sizes(old_file, &mut sig_file, 4, Some(8)).unwrap();
        let mut delta_file = vec![];
        calculate_delta(&sig_file[..], new_file, &mut delta_file).unwrap();
        assert_eq!(HexPrinter(&delta_file[..]), HexPrinter(&[
            0x72, 0x73, 0x02, 0x36, // header
            0x05, 0x6D, 0x75, 0x67, 0x20, 0x6E, // literal cmd "mug n"
            0x45, 0x04, 0x04, // copy cmd for block 1 "ice "
            0x04, 0x6A, 0x6F, 0x67, 0x20, // literal cmd "jog "
            0x45, 0x00, 0x04, // copy cmd for block 0 "car "
            0x08, 0x74, 0x72, 0x79, 0x20, 0x6C, 0x69, 0x70, 0x20, // literal cmd "try lip "
            0x00, // end of file
        ]));
    }

    #[test]
    fn test_applying_delta() {
        // This is the same data as the above test.
        let old_file: &[u8] = &[
            // "car ice "
            0x63, 0x61, 0x72, 0x20, 0x69, 0x63, 0x65, 0x20
        ];
        let delta_file: &[u8] = &[
            0x72, 0x73, 0x02, 0x36, // header
            0x05, 0x6D, 0x75, 0x67, 0x20, 0x6E, // literal cmd "mug n"
            0x45, 0x04, 0x04, // copy cmd for block 1 "ice "
            0x04, 0x6A, 0x6F, 0x67, 0x20, // literal cmd "jog "
            0x45, 0x00, 0x04, // copy cmd for block 0 "car "
            0x08, 0x74, 0x72, 0x79, 0x20, 0x6C, 0x69, 0x70, 0x20, // literal cmd "try lip "
            0x00, // end of file
        ];
        let mut new_file = vec![];
        apply_delta(Cursor::new(old_file), delta_file, &mut new_file).unwrap();
        assert_eq!(HexPrinter(&new_file[..]), HexPrinter(&[
            // "mug nice jog car try lip "
            0x6D, 0x75, 0x67, 0x20,
            0x6E, 0x69, 0x63, 0x65, 0x20, // "nice "
            0x6A, 0x6F, 0x67, 0x20, // "jog "
            0x63, 0x61, 0x72, 0x20,
            0x74, 0x72, 0x79, 0x20, // "try "
            0x6C, 0x69, 0x70, 0x20
        ]));
    }
}