//! This module contains functionality for doing lz77 compression of data.
use std::cmp;
use std::ops::Range;
use input_buffer::InputBuffer;
use matching::longest_match;
use lzvalue::LZValue;
use huffman_table;
use chained_hash_table::ChainedHashTable;
use compression_options::{HIGH_MAX_HASH_CHECKS, HIGH_LAZY_IF_LESS_THAN};
use output_writer::{OutputWriter, FixedWriter};
use compress::Flush;
const MAX_MATCH: usize = huffman_table::MAX_MATCH as usize;
const MIN_MATCH: usize = huffman_table::MIN_MATCH as usize;
/// An enum describing whether we use lazy or greedy matching.
#[derive(Clone, Copy, Debug)]
pub enum MatchingType {
/// Use lazy matching: after finding a match, the next input byte is checked to see
/// if there is a better match starting at that byte.
Lazy,
/// Use greedy matching: the matching algorithm simply uses a match right away
/// if found.
Greedy,
}
/// A struct that contains the hash table and keeps track of where we are in the input data.
pub struct LZ77State {
/// Struct containing hash chains that will be used to find matches.
hash_table: ChainedHashTable,
/// True if this is the first window that is being processed.
is_first_window: bool,
/// Set to true when the last block has been processed.
is_last_block: bool,
/// How many bytes the last match in the previous window extended into the current one.
overlap: usize,
/// The maximum number of hash entries to search.
max_hash_checks: u16,
/// Only lazy match if we have a match length less than this.
lazy_if_less_than: u16,
/// Whether to use greedy or lazy parsing
matching_type: MatchingType,
}
impl LZ77State {
fn from_starting_values(b0: u8,
b1: u8,
max_hash_checks: u16,
lazy_if_less_than: u16,
matching_type: MatchingType)
-> LZ77State {
LZ77State {
hash_table: ChainedHashTable::from_starting_values(b0, b1),
is_first_window: true,
is_last_block: false,
overlap: 0,
max_hash_checks: max_hash_checks,
lazy_if_less_than: lazy_if_less_than,
matching_type: matching_type,
}
}
/// Creates a new LZ77 state, adding the first two bytes of the data to the hash
/// to warm it up.
pub fn _new_warmup(data: &[u8],
max_hash_checks: u16,
lazy_if_less_than: u16,
matching_type: MatchingType)
-> LZ77State {
LZ77State::from_starting_values(data[0],
data[1],
max_hash_checks,
lazy_if_less_than,
matching_type)
}
/// Creates a new LZ77 state
/// Uses two arbitrary values to warm up the hash
pub fn new(max_hash_checks: u16,
lazy_if_less_than: u16,
matching_type: MatchingType)
-> LZ77State {
// Not sure if warming up the hash is actually needed.
LZ77State::from_starting_values(55, 23, max_hash_checks, lazy_if_less_than, matching_type)
}
/// Resets the state, excluding max_hash_checks, lazy_if_less_than and matching_type.
pub fn reset(&mut self) {
self.hash_table.reset();
self.is_first_window = true;
self.is_last_block = false;
self.overlap = 0;
}
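/// Marks that the last block of the stream has been reached.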
pub fn set_last(&mut self) {
self.is_last_block = true;
}
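/// Returns true if the last block has been processed.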
pub fn is_last_block(&self) -> bool {
self.is_last_block
}
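/// Returns true if this is the first window that is being processed.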
pub fn is_first_window(&self) -> bool {
self.is_first_window
}
}
/// A structure representing either a literal, length or distance value
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
pub enum LDPair {
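/// A literal byte.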
Literal(u8),
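/// The length of a match.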
Length(u16),
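/// The distance of a match.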
Distance(u16),
}
const DEFAULT_WINDOW_SIZE: usize = 32768;
fn process_chunk<W: OutputWriter>(data: &[u8],
iterated_data: Range<usize>,
hash_table: &mut ChainedHashTable,
writer: &mut W,
max_hash_checks: u16,
lazy_if_less_than: usize,
matching_type: MatchingType)
-> usize {
match matching_type {
MatchingType::Greedy => {
process_chunk_greedy(data, iterated_data, hash_table, writer, max_hash_checks)
}
MatchingType::Lazy => {
process_chunk_lazy(data,
iterated_data,
hash_table,
writer,
max_hash_checks,
lazy_if_less_than)
}
}
}
fn process_chunk_lazy<W: OutputWriter>(data: &[u8],
iterated_data: Range<usize>,
hash_table: &mut ChainedHashTable,
writer: &mut W,
max_hash_checks: u16,
lazy_if_less_than: usize)
-> usize {
let end = cmp::min(data.len(), iterated_data.end);
let start = iterated_data.start;
let current_chunk = &data[start..end];
let mut insert_it = current_chunk.iter().enumerate();
let mut hash_it = {
let hash_start = if end - start > 2 {
start + 2
} else {
data.len()
};
(&data[hash_start..]).iter()
};
const NO_LENGTH: usize = MIN_MATCH as usize - 1;
// The byte before the currently read one in the stream.
let mut prev_byte = 0u8;
// The previous match length, if any.
let mut prev_length = NO_LENGTH;
// The distance of the previous match if any.
let mut prev_distance = 0;
// Whether prev_byte should be output if we move one byte forward to find a better match
// (or at the end of the stream).
let mut add = false;
// The number of bytes past end that were added due to finding a match that extends into
// the lookahead window.
let mut overlap = 0;
// Set to true if we found a match that is equal to or longer than `lazy_if_less_than`,
// indicating that we won't lazy match (check for a better match at the next byte).
let mut ignore_next = false;
// Iterate through the slice, adding literals or length/distance pairs
while let Some((n, &b)) = insert_it.next() {
if let Some(&hash_byte) = hash_it.next() {
let position = n + start;
hash_table.add_hash_value(position, hash_byte);
// Only lazy match if we have a match shorter than a set value
// TODO: This should be cleaned up a bit
let (match_len, match_dist) = if !ignore_next {
let (match_len, match_dist) = {
// If there already was a decent match at the previous byte
// and we are lazy matching, do less match checks in this step.
let max_hash_checks = if prev_length >= 32 {
max_hash_checks >> 2
} else {
max_hash_checks
};
// Check if we can find a better match here than the one we had at
// the previous byte.
longest_match(data, hash_table, position, prev_length, max_hash_checks)
};
if match_len > lazy_if_less_than {
// We found a decent match, so we won't check for a better one at the next byte.
ignore_next = true;
}
(match_len, match_dist)
} else {
// We already had a decent match, so we don't bother checking for another one.
(NO_LENGTH, 0)
};
if prev_length >= match_len && prev_length >= MIN_MATCH as usize && prev_distance > 0 {
// The previous match was better so we add it
// Casting note: length and distance are already bounded by the longest match
// function, so usize is just used for convenience.
writer.write_length_distance(prev_length as u16, prev_distance as u16);
// We add the bytes to the hash table and checksum.
// Since we've already added two of them, we need to add two less than
// the length
let bytes_to_add = prev_length - 2;
let taker = insert_it.by_ref().take(bytes_to_add);
let mut hash_taker = hash_it.by_ref().take(bytes_to_add);
// Advance the iterators and add the bytes we jump over to the hash table and
// checksum
for (ipos, _) in taker {
if let Some(&i_hash_byte) = hash_taker.next() {
hash_table.add_hash_value(ipos + start, i_hash_byte);
}
}
// If the match extends past the end of the current window, we have to note how many
// bytes we overlap, since we don't need to do any matching on these bytes
// in the next call of this function.
if position + prev_length > end {
// We subtract 1 since the match started at the previous byte (position - 1).
overlap = position + prev_length - end - 1;
};
add = false;
ignore_next = false;
} else if add {
// We found a better match (or there was no previous match)
// so output the previous byte
writer.write_literal(prev_byte);
} else {
add = true
}
prev_length = match_len;
prev_distance = match_dist;
prev_byte = b;
} else {
if add {
// We may still have a leftover byte at this point, so we add it here if needed.
writer.write_literal(prev_byte);
add = false;
}
// We are at the last two bytes we want to add, so there is no point
// searching for matches here.
writer.write_literal(b);
}
}
if add {
// We may still have a leftover byte at this point, so we add it here if needed.
writer.write_literal(prev_byte);
}
overlap
}
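/// Process a chunk using greedy matching: a match of at least MIN_MATCH bytes is output
/// as soon as it is found.
///
/// Returns the number of bytes of overlap into the lookahead window.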
fn process_chunk_greedy<W: OutputWriter>(data: &[u8],
iterated_data: Range<usize>,
hash_table: &mut ChainedHashTable,
writer: &mut W,
max_hash_checks: u16)
-> usize {
let end = cmp::min(data.len(), iterated_data.end);
let start = iterated_data.start;
let current_chunk = &data[start..end];
let mut insert_it = current_chunk.iter().enumerate();
let mut hash_it = {
let hash_start = if end - start > 2 {
start + 2
} else {
data.len()
};
(&data[hash_start..]).iter()
};
const NO_LENGTH: usize = MIN_MATCH as usize - 1;
// The byte before the currently read one in the stream.
let mut prev_byte = 0u8;
// Whether prev_byte should be output if we move one byte forward to find a better match
// (or at the end of the stream).
let mut add = false;
// The number of bytes past end that were added due to finding a match that extends into
// the lookahead window.
let mut overlap = 0;
// Iterate through the slice, adding literals or length/distance pairs
while let Some((n, &b)) = insert_it.next() {
if let Some(&hash_byte) = hash_it.next() {
let position = n + start;
hash_table.add_hash_value(position, hash_byte);
// TODO: This should be cleaned up a bit
let (match_len, match_dist) = {
longest_match(data, hash_table, position, NO_LENGTH, max_hash_checks)
};
if match_len >= MIN_MATCH as usize && match_dist > 0 {
// Casting note: length and distance are already bounded by the longest match
// function, so usize is just used for convenience.
writer.write_length_distance(match_len as u16, match_dist as u16);
// We add the bytes to the hash table and checksum.
// Since we've already added one of them, we need to add one less than
// the length
let bytes_to_add = match_len - 1;
let taker = insert_it.by_ref().take(bytes_to_add);
let mut hash_taker = hash_it.by_ref().take(bytes_to_add);
// Advance the iterators and add the bytes we jump over to the hash table and
// checksum
for (ipos, _) in taker {
if let Some(&i_hash_byte) = hash_taker.next() {
hash_table.add_hash_value(ipos + start, i_hash_byte);
}
}
// If the match extends past the end of the current window, we have to note how many
// bytes we overlap, since we don't need to do any matching on these bytes
// in the next call of this function.
if position + match_len > end {
// Here the match starts at the current position, so no adjustment is needed.
overlap = position + match_len - end;
};
add = false;
} else {
// There was no match, so output the current byte as a literal.
writer.write_literal(b);
}
prev_byte = b;
} else {
if add {
// We may still have a leftover byte at this point, so we add it here if needed.
writer.write_literal(prev_byte);
add = false;
}
// We are at the last two bytes we want to add, so there is no point
// searching for matches here.
writer.write_literal(b);
}
}
if add {
// We may still have a leftover byte at this point, so we add it here if needed.
writer.write_literal(prev_byte);
}
overlap
}
#[derive(Eq, PartialEq, Clone, Copy, Debug)]
pub enum LZ77Status {
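/// More input is needed before the next window can be processed.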
NeedInput,
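/// It's time to end the current block.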
EndBlock,
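/// All the provided input data has been processed.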
Finished,
}
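/// Convenience function that compresses a block using `Flush::Finish`, i.e. treating the
/// provided data as the final part of the input.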
pub fn lz77_compress_block_finish<W: OutputWriter>(data: &[u8],
state: &mut LZ77State,
buffer: &mut InputBuffer,
mut writer: &mut W)
-> (usize, LZ77Status) {
lz77_compress_block::<W>(data, state, buffer, &mut writer, Flush::Finish)
}
/// Compress a slice with lz77 compression.
///
/// This function processes one window at a time, and returns when there is no input left,
/// or it determines it's time to end a block.
///
/// Returns the number of bytes of the input that were consumed (added to the buffer), and a
/// status describing whether more input is needed, it's time to finish, or it's time to end
/// the block.
pub fn lz77_compress_block<W: OutputWriter>(data: &[u8],
state: &mut LZ77State,
buffer: &mut InputBuffer,
mut writer: &mut W,
flush: Flush)
-> (usize, LZ77Status) {
// Currently we only support the maximum window size
let window_size = DEFAULT_WINDOW_SIZE;
let finish = flush == Flush::Finish || flush == Flush::Sync;
let sync = flush == Flush::Sync;
let mut status = LZ77Status::EndBlock;
let mut remaining_data = buffer.add_data(data);
while writer.buffer_length() < (window_size * 2) {
if state.is_first_window {
// Don't do anything until we are either flushing, or we have at least one window of
// data.
if buffer.current_end() >= (window_size * 2) + MAX_MATCH || finish {
let first_chunk_end = if finish && remaining_data.is_none() {
// If we are finishing, make sure we include data in the lookahead area
buffer.current_end()
} else {
cmp::min(window_size, buffer.current_end())
};
state.overlap = process_chunk::<W>(buffer.get_buffer(),
0..first_chunk_end,
&mut state.hash_table,
&mut writer,
state.max_hash_checks,
state.lazy_if_less_than as usize,
state.matching_type);
// We are at the first window so we don't need to slide the hash table yet.
if first_chunk_end >= data.len() && finish {
if !sync {
state.set_last();
}
status = LZ77Status::Finished;
} else {
status = LZ77Status::EndBlock;
}
state.is_first_window = false;
break;
} else {
status = LZ77Status::NeedInput;
break;
}
} else if buffer.current_end() >= (window_size * 2) + MAX_MATCH || finish {
// This isn't the first chunk, so we start reading one window into the
// buffer, plus any additional overlap from earlier.
let start = window_size + state.overlap;
// Determine where we have to stop iterating to slide the buffer and hash,
// or stop because we are at the end of the input data.
let end = if remaining_data.is_none() && finish {
// If we are finishing, make sure we include the lookahead data
buffer.current_end()
} else {
// Otherwise we process one window size of data.
cmp::min(window_size * 2, buffer.current_end())
};
state.overlap = process_chunk::<W>(buffer.get_buffer(),
start..end,
&mut state.hash_table,
&mut writer,
state.max_hash_checks,
state.lazy_if_less_than as usize,
state.matching_type);
if remaining_data.is_none() && finish {
// We stopped before or at the window size, so we are at the end.
if !sync {
state.set_last();
} else {
// For sync flushing we need to slide the buffer and the hash chains so that the
// next call to this function starts at the right place.
state.overlap = 0;
let n = buffer.move_down();
state.hash_table.slide(n);
}
status = LZ77Status::Finished;
break;
} else {
// We are not at the end, so slide and continue
// We slide the hash table back to make space for new hash values
// We only need to remember 32k bytes back (the maximum distance allowed by the
// deflate spec)
state.hash_table.slide(window_size);
// Slide the buffer
remaining_data = buffer.slide(remaining_data.unwrap_or(&[]));
status = LZ77Status::EndBlock;
}
} else {
status = LZ77Status::NeedInput;
break;
}
}
(data.len() - remaining_data.unwrap_or(&[]).len(), status)
}
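/// A convenience struct bundling the state, input buffer and writer used by the test
/// functions below.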
#[allow(dead_code)]
pub struct TestStruct {
state: LZ77State,
buffer: InputBuffer,
writer: FixedWriter,
}
#[allow(dead_code)]
impl TestStruct {
fn new() -> TestStruct {
TestStruct {
state: LZ77State::new(HIGH_MAX_HASH_CHECKS,
HIGH_LAZY_IF_LESS_THAN,
MatchingType::Lazy),
buffer: InputBuffer::empty(),
writer: FixedWriter::new(),
}
}
fn compress_block(&mut self, data: &[u8], flush: bool) -> (usize, LZ77Status) {
lz77_compress_block(data,
&mut self.state,
&mut self.buffer,
&mut self.writer,
if flush { Flush::Finish } else { Flush::None })
}
}
/// Compress a slice, not storing frequency information
///
/// This is a convenience function for compression with fixed huffman values
/// Only used in tests for now
#[allow(dead_code)]
pub fn lz77_compress(data: &[u8]) -> Option<Vec<LZValue>> {
let mut test_boxed = Box::new(TestStruct::new());
let mut out = Vec::<LZValue>::with_capacity(data.len() / 3);
{
let mut test = test_boxed.as_mut();
let mut slice = data;
while !test.state.is_last_block {
let bytes_written = lz77_compress_block_finish(slice,
&mut test.state,
&mut test.buffer,
&mut test.writer)
.0;
slice = &slice[bytes_written..];
out.extend(test.writer.get_buffer());
test.writer.clear_buffer();
}
}
Some(out)
}
#[cfg(test)]
mod test {
use super::*;
use lzvalue::LZValue;
use chained_hash_table::WINDOW_SIZE;
use compression_options::DEFAULT_LAZY_IF_LESS_THAN;
use test_utils::get_test_data;
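/// Helper function that decompresses a slice of lz77-encoded values back into bytes,
/// used to verify the compression output in the tests below.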
fn decompress_lz77(input: &[LZValue]) -> Vec<u8> {
let mut output = Vec::new();
let mut last_length = 0;
for p in input {
match p.value() {
LDPair::Literal(l) => output.push(l),
LDPair::Length(l) => last_length = l,
LDPair::Distance(d) => {
let start = output.len() - d as usize;
let mut n = 0;
while n < last_length as usize {
let b = output[start + n];
output.push(b);
n += 1;
}
}
}
}
output
}
/// Helper function to print the output from the lz77 compression function
fn print_output(input: &[LZValue]) {
let mut output = vec![];
for l in input {
match l.value() {
LDPair::Literal(l) => output.push(l),
LDPair::Length(l) => output.extend(format!("<L {}>", l).into_bytes()),
LDPair::Distance(d) => output.extend(format!("<D {}>", d).into_bytes()),
}
}
println!("\"{}\"", String::from_utf8(output).unwrap());
}
/// Test that a short string from an example on SO compresses correctly
#[test]
fn compress_short() {
use std::str;
let test_bytes = String::from("Deflate late").into_bytes();
let res = lz77_compress(&test_bytes).unwrap();
// println!("{:?}", res);
// TODO: Check that compression is correct
// print_output(&res);
let decompressed = decompress_lz77(&res);
let d_str = str::from_utf8(&decompressed).unwrap();
println!("{}", d_str);
assert_eq!(test_bytes, decompressed);
// assert_eq!(res[8],
// LDPair::LengthDistance {
// distance: 5,
// length: 4,
// });
}
/// Test that compression is working for a longer file
#[test]
fn compress_long() {
use std::str;
let input = get_test_data();
let compressed = lz77_compress(&input).unwrap();
assert!(compressed.len() < input.len());
// print_output(&compressed);
let decompressed = decompress_lz77(&compressed);
// println!("{}", str::from_utf8(&decompressed).unwrap());
// This is to check where the compression fails, if it does.
for (n, (&a, &b)) in input.iter().zip(decompressed.iter()).enumerate() {
if a != b {
println!("First difference at {}, input: {}, output: {}", n, a, b);
break;
}
}
assert_eq!(input.len(), decompressed.len());
assert!(&decompressed == &input);
}
/// Check that lazy matching is working as intended
#[test]
fn lazy() {
// We want to match on `badger` rather than `nba` as it is longer
// let data = b" nba nbadg badger nbadger";
let data = b"nba badger nbadger";
let compressed = lz77_compress(data).unwrap();
let test = compressed[compressed.len() - 2];
if let LDPair::Length(n) = test.value() {
assert_eq!(n, 6);
} else {
print_output(&compressed);
panic!();
}
}
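/// Check that compressing and then decompressing `data` gives back the original data.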
fn roundtrip(data: &[u8]) {
let compressed = super::lz77_compress(&data).unwrap();
let decompressed = decompress_lz77(&compressed);
assert!(decompressed == data);
}
/// Check that data with the exact window size is working properly
#[test]
#[allow(unused)]
fn exact_window_size() {
use std::io::Write;
let mut data = vec![0; WINDOW_SIZE];
roundtrip(&data);
{
data.write(&[22; WINDOW_SIZE]);
}
roundtrip(&data);
{
data.write(&[55; WINDOW_SIZE]);
}
roundtrip(&data);
}
/// Test that matches at the window border are working correctly
#[test]
fn border() {
use chained_hash_table::WINDOW_SIZE;
let mut data = vec![35; WINDOW_SIZE];
data.extend(b"Test");
let compressed = super::lz77_compress(&data).unwrap();
assert!(compressed.len() < data.len());
let decompressed = decompress_lz77(&compressed);
// print_output(&compressed);
assert_eq!(decompressed.len(), data.len());
assert!(decompressed == data);
}
#[test]
fn border_multiple_blocks() {
use chained_hash_table::WINDOW_SIZE;
let mut data = vec![0; (WINDOW_SIZE * 2) + 50];
data.push(1);
let compressed = super::lz77_compress(&data).unwrap();
assert!(compressed.len() < data.len());
let decompressed = decompress_lz77(&compressed);
assert!(decompressed == data);
}
#[test]
fn compress_block_status() {
use input_buffer::InputBuffer;
use output_writer::FixedWriter;
let data = b"Test data data";
let mut writer = FixedWriter::new();
let mut buffer = InputBuffer::empty();
let mut state = LZ77State::new(4096, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy);
let status = lz77_compress_block_finish(data, &mut state, &mut buffer, &mut writer);
assert_eq!(status.1, LZ77Status::Finished);
assert!(&buffer.get_buffer()[..data.len()] == data);
assert_eq!(buffer.current_end(), data.len());
}
#[test]
fn compress_block_multiple_windows() {
use input_buffer::InputBuffer;
use output_writer::{OutputWriter, FixedWriter};
let data = get_test_data();
assert!(data.len() > (WINDOW_SIZE * 2) + super::MAX_MATCH);
let mut writer = FixedWriter::new();
let mut buffer = InputBuffer::empty();
let mut state = LZ77State::new(0, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy);
let (bytes_consumed, status) =
lz77_compress_block_finish(&data, &mut state, &mut buffer, &mut writer);
assert_eq!(buffer.get_buffer().len(),
(WINDOW_SIZE * 2) + super::MAX_MATCH);
assert_eq!(status, LZ77Status::EndBlock);
let buf_len = buffer.get_buffer().len();
assert!(buffer.get_buffer()[..] == data[..buf_len]);
// print_output(writer.get_buffer());
writer.clear_buffer();
let (_, status) = lz77_compress_block_finish(&data[bytes_consumed..],
&mut state,
&mut buffer,
&mut writer);
assert_eq!(status, LZ77Status::EndBlock);
// print_output(writer.get_buffer());
}
#[test]
fn multiple_inputs() {
use output_writer::OutputWriter;
let data = b"Badger badger bababa test data 25 asfgestghresjkgh";
let comp1 = lz77_compress(data).unwrap();
let comp2 = {
const SPLIT: usize = 25;
let first_part = &data[..SPLIT];
let second_part = &data[SPLIT..];
let mut state = TestStruct::new();
let (bytes_written, status) = state.compress_block(first_part, false);
assert_eq!(bytes_written, first_part.len());
assert_eq!(status, LZ77Status::NeedInput);
let (bytes_written, status) = state.compress_block(second_part, true);
assert_eq!(bytes_written, second_part.len());
assert_eq!(status, LZ77Status::Finished);
Vec::from(state.writer.get_buffer())
};
assert!(comp1 == comp2);
}
}