zstd 0.10.1+zstd.1.5.2

Binding for the zstd compression library.
Documentation
//! Train a dictionary from various sources.
//!
//! A dictionary can help improve the compression of small files.
//! The dictionary must be present during decompression,
//! but can be shared across multiple "similar" files.
//!
//! Creating a dictionary using the `zstd` C library,
//! using the `zstd` command-line interface, using this library,
//! or using the `train` binary provided, should give the same result,
//! so the resulting dictionaries are completely compatible with each other.
//!
//! To use, see [`Encoder::with_dictionary`] or [`Decoder::with_dictionary`].
//!
//! [`Encoder::with_dictionary`]: ../struct.Encoder.html#method.with_dictionary
//! [`Decoder::with_dictionary`]: ../struct.Decoder.html#method.with_dictionary

#[cfg(feature = "zdict_builder")]
use std::io::{self, Read};

pub use zstd_safe::{CDict, DDict};

/// Prepared dictionary for compression.
///
/// This is a thin wrapper around a `zstd_safe::CDict`.
///
/// A dictionary can include its own copy of the data (if it is `'static`), or it can merely point
/// to a separate buffer (if it has another lifetime).
pub struct EncoderDictionary<'a> {
    // The wrapped prepared dictionary; handed out by reference through `as_cdict`.
    cdict: CDict<'a>,
}

impl EncoderDictionary<'static> {
    /// Creates a prepared dictionary for compression.
    ///
    /// This will copy the dictionary internally.
    pub fn copy(dictionary: &[u8], level: i32) -> Self {
        Self {
            cdict: zstd_safe::create_cdict(dictionary, level),
        }
    }
}

impl<'a> EncoderDictionary<'a> {
    /// Builds a prepared compression dictionary that refers to `dictionary`
    /// instead of copying it.
    ///
    /// The returned value is tied to the lifetime `'a` of the given slice.
    ///
    /// A level of `0` uses zstd's default (currently `3`).
    ///
    /// Only available with the `experimental` feature. Use `EncoderDictionary::copy` otherwise.
    #[cfg(feature = "experimental")]
    #[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "experimental")))]
    pub fn new(dictionary: &'a [u8], level: i32) -> Self {
        let cdict = zstd_safe::create_cdict_by_reference(dictionary, level);
        EncoderDictionary { cdict }
    }

    /// Gives shared access to the wrapped `CDict`.
    pub fn as_cdict(&self) -> &CDict<'a> {
        let Self { cdict } = self;
        cdict
    }
}

/// Prepared dictionary for decompression.
///
/// This is a thin wrapper around a `zstd_safe::DDict`. The lifetime `'a`
/// is the lifetime of the wrapped `DDict`.
pub struct DecoderDictionary<'a> {
    // The wrapped prepared dictionary; handed out by reference through `as_ddict`.
    ddict: DDict<'a>,
}

impl DecoderDictionary<'static> {
    /// Create a prepared dictionary for decompression.
    ///
    /// This will copy the dictionary internally.
    pub fn copy(dictionary: &[u8]) -> Self {
        Self {
            ddict: zstd_safe::DDict::create(dictionary),
        }
    }
}

impl<'a> DecoderDictionary<'a> {
    /// Builds a prepared decompression dictionary that refers to `dict`
    /// instead of copying it.
    ///
    /// The returned value is tied to the lifetime `'a` of the given slice.
    ///
    /// Only available with the `experimental` feature. Use `DecoderDictionary::copy` otherwise.
    #[cfg(feature = "experimental")]
    #[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "experimental")))]
    pub fn new(dict: &'a [u8]) -> Self {
        let ddict = zstd_safe::create_ddict_by_reference(dict);
        DecoderDictionary { ddict }
    }

    /// Gives shared access to the wrapped `DDict`.
    pub fn as_ddict(&self) -> &DDict<'a> {
        let Self { ddict } = self;
        ddict
    }
}

/// Train a dictionary from a big continuous chunk of data.
///
/// This is the most efficient way to train a dictionary,
/// since this is directly fed into `zstd`.
///
/// * `sample_data` holds all the samples, concatenated back to back.
/// * `sample_sizes` gives the length of each sample, in order. The lengths
///   must add up to exactly `sample_data.len()`.
/// * `max_size` is the capacity of the buffer the dictionary is written to.
///
/// # Errors
///
/// Returns an `io::Error` of kind `io::ErrorKind::Other` if the sample
/// sizes do not add up to the length of `sample_data` (this includes the
/// case where their sum does not fit in a `usize`).
///
/// Any error reported by `zstd_safe::train_from_buffer` is converted with
/// `map_error_code` and returned as well.
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_continuous(
    sample_data: &[u8],
    sample_sizes: &[usize],
    max_size: usize,
) -> io::Result<Vec<u8>> {
    use crate::map_error_code;

    // Add the sizes up with overflow checking: a plain `sum()` panics in
    // debug builds and silently wraps in release builds, where a wrapped
    // total could happen to match the data length and let sizes that
    // overrun the buffer through to the C library.
    let total_size = sample_sizes
        .iter()
        .try_fold(0usize, |total, &size| total.checked_add(size));

    // Complain if the lengths don't add up to the entire data.
    // `None` (overflow) can never equal `Some(len)`, so it is rejected too.
    if total_size != Some(sample_data.len()) {
        return Err(io::Error::new(
            io::ErrorKind::Other,
            "sample sizes don't add up",
        ));
    }

    let mut result = Vec::with_capacity(max_size);
    zstd_safe::train_from_buffer(&mut result, sample_data, sample_sizes)
        .map_err(map_error_code)?;
    Ok(result)
}

/// Train a dictionary from multiple samples.
///
/// The samples will internally be copied to a single continuous buffer,
/// so make sure you have enough memory available.
///
/// If you need to stretch your system's limits,
/// [`from_continuous`] directly uses the given slice.
///
/// [`from_continuous`]: ./fn.from_continuous.html
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_samples<S: AsRef<[u8]>>(
    samples: &[S],
    max_size: usize,
) -> io::Result<Vec<u8>> {
    // Lay the samples out back to back, remembering the length of each one
    let mut joined: Vec<u8> = Vec::new();
    let mut lengths: Vec<usize> = Vec::new();

    for sample in samples {
        let bytes = sample.as_ref();
        joined.extend_from_slice(bytes);
        lengths.push(bytes.len());
    }

    from_continuous(&joined, &lengths, max_size)
}

/// Train a dictionary using the content of a list of files as samples.
///
/// Every file is read in full into one shared buffer, in the order given,
/// and each file counts as one sample. The buffer is then passed to
/// `from_continuous` together with `max_size`.
///
/// # Errors
///
/// Stops at the first file that cannot be opened or read and returns that
/// error. Errors from `from_continuous` are returned unchanged.
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_files<I, P>(filenames: I, max_size: usize) -> io::Result<Vec<u8>>
where
    P: AsRef<std::path::Path>,
    I: IntoIterator<Item = P>,
{
    let mut contents: Vec<u8> = Vec::new();

    // `read_to_end` appends to `contents` and reports how many bytes it
    // added, which is exactly the size of that sample.
    // Collecting into a `Result` bails out on the first I/O error.
    let lengths = filenames
        .into_iter()
        .map(|path| std::fs::File::open(path)?.read_to_end(&mut contents))
        .collect::<io::Result<Vec<usize>>>()?;

    from_continuous(&contents, &lengths, max_size)
}

#[cfg(test)]
#[cfg(feature = "zdict_builder")]
mod tests {
    use std::fs;
    use std::io;

    use walkdir;

    /// Trains a dictionary on the `.rs` files found under `src`, then checks
    /// that each of those files survives a compress/decompress round trip
    /// using that dictionary.
    #[test]
    fn test_dict_training() {
        // Gather the training samples: every `.rs` file below `src`
        let paths: Vec<_> = walkdir::WalkDir::new("src")
            .into_iter()
            .map(|entry| entry.unwrap().into_path())
            .filter(|path| path.to_str().unwrap().ends_with(".rs"))
            .collect();

        // Train a dictionary
        let dict = super::from_files(&paths, 4000).unwrap();

        for path in paths {
            let content = fs::read(path).unwrap();

            // Compress with the dictionary. The encoder is scoped so that
            // `auto_finish` completes the frame before `compressed` is read.
            let mut compressed = Vec::new();
            {
                let mut encoder = crate::stream::Encoder::with_dictionary(
                    &mut compressed,
                    1,
                    &dict,
                )
                .unwrap()
                .auto_finish();
                io::copy(&mut &content[..], &mut encoder).unwrap();
            }

            // Decompress with the same dictionary
            let mut decoder = crate::stream::Decoder::with_dictionary(
                &compressed[..],
                &dict[..],
            )
            .unwrap();
            let mut result = Vec::new();
            io::copy(&mut decoder, &mut result).unwrap();

            assert_eq!(&content, &result);
        }
    }
}