1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
//! This crate implements the
//! [package-merge](https://en.wikipedia.org/wiki/Package-merge_algorithm)
//! algorithm. The package-merge
//! algorithm is able to compute an optimal length-limited prefix-free code.
//! As such, it might be useful for data compression purposes much like the
//! Huffman algorithm. But Huffman's algorithm does not allow you to
//! constrain the maximum length of all code words.

extern crate itertools;

use std::cmp;
use std::error;
use std::fmt;
use std::mem;

use itertools::Itertools;

use Error::*;

/// Compares two floats, yielding `Equal` whenever neither `a < b` nor
/// `a > b` holds (which includes the case where either value is NaN).
fn order_non_nan(a: f64, b: f64) -> cmp::Ordering {
    // `partial_cmp` is `None` exactly when the values are incomparable;
    // those cases are folded into `Equal`.
    a.partial_cmp(&b).unwrap_or(cmp::Ordering::Equal)
}

/// Iterates over `slice` in chunks of exactly `csize` elements, ignoring
/// any trailing elements that do not fill a whole chunk.
///
/// Panics if `csize` is zero (the remainder computation divides by it).
fn complete_chunks<T>(slice: &[T], csize: usize) -> std::slice::Chunks<T> {
    // Length of the longest prefix that is a multiple of `csize`.
    let usable = slice.len() - slice.len() % csize;
    slice[..usable].chunks(csize)
}

/// The error type for the package-merge algorithm.
///
/// Every variant is a plain unit value; the matching human-readable
/// message is available through the `Display` implementation.
#[derive(Copy,Clone,PartialEq,Eq,Debug)]
pub enum Error {
    /// The given frequencies slice was empty.
    NoSymbols,
    /// The given `max_len` constraint was too small: the number of
    /// symbols exceeds `2^max_len`.
    MaxLenTooSmall,
    /// The given `max_len` constraint was too large (greater than 32).
    MaxLenTooLarge,
}

impl Error {
    fn descr(&self) -> &str {
        match *self {
            NoSymbols =>
                "package-merge error: frequencies slice was empty",
            MaxLenTooSmall =>
                "package-merge error: max_len parameter was chosen too small",
            MaxLenTooLarge =>
                "package-merge error: max_len parameter was chosen too large",
        }
    }
}

impl fmt::Display for Error {
    /// Writes the variant's message text verbatim to the formatter.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(self.descr())
    }
}

impl error::Error for Error {
    fn description(&self) -> &str {
        self.descr()
    }
}

/// Given all symbol frequencies (or probabilities) and a limit on the
/// maximum length of code words (up to 32), this function will apply
/// the package merge algorithm to compute optimal code word lengths
/// for the symbols so that the expected code word length is minimized.
///
/// The returned vector has one entry per element of `frequencies`, in the
/// same order: entry `i` is the code word length assigned to symbol `i`.
/// A single symbol is assigned length 0.
///
/// Frequencies are ordered with a comparison that treats incomparable
/// values as equal, so the result for inputs containing NaN is not
/// meaningful.
///
/// # Errors
///
/// * `Error::NoSymbols` if `frequencies` is empty.
/// * `Error::MaxLenTooLarge` if `max_len` is greater than 32.
/// * `Error::MaxLenTooSmall` if there are more than `2^max_len` symbols,
///   i.e. no prefix-free code with that length limit exists.
pub fn package_merge(frequencies: &[f64], max_len: u32) -> Result<Vec<u32>, Error> {

    if frequencies.is_empty() {
        return Err(Error::NoSymbols);
    }
    // Check the upper bound before shifting by `max_len`: shifting by the
    // bit width or more overflows (a panic in debug builds, a wrapped and
    // therefore wrong comparison in release builds).
    if max_len > 32 {
        return Err(Error::MaxLenTooLarge);
    }
    // Compare in u64 so that `1 << 32` is representable even where `usize`
    // is only 32 bits wide.
    if (frequencies.len() as u64) > (1u64 << max_len) {
        return Err(Error::MaxLenTooSmall);
    }

    // Symbol indices ordered by ascending frequency. The sort is stable, so
    // equal frequencies keep their input order.
    let sorted = {
        let mut tmp: Vec<usize> = (0..frequencies.len()).collect();
        tmp.sort_by( |&a, &b| order_non_nan(frequencies[a],frequencies[b]) );
        tmp
    };

    let capa = frequencies.len() * 2 - 1;
    // `list` holds the weights of the previous level, `merged` is the
    // scratch buffer the current level is built into.
    let mut list: Vec<f64> = Vec::with_capacity(capa);
    // Bit `d` of `flags[p]` is set when position `p` of level `d` holds a
    // package (a sum of two items of the level before) rather than a plain
    // symbol. One bit per level in a `u32` is why `max_len` is capped at 32.
    let mut flags: Vec<u32> = vec![0; capa];
    let mut merged: Vec<f64> = Vec::with_capacity(capa);

    // Forward pass: build one level per allowed code word bit.
    for depth in 0..max_len {
        {
            merged.clear();
            let mask = 1u32 << depth;
            // Package adjacent pairs of the previous level (an unpaired
            // trailing item is dropped) ...
            let pairs = complete_chunks(&list, 2).map( |s| (s[0] + s[1], true) );
            // ... and merge them with the original symbols by weight. A
            // package is only taken first when it is strictly lighter.
            let srted = sorted.iter().map( |&i| (frequencies[i], false) );
            for (p, m) in pairs.merge_by(srted, |a, b| a.0 < b.0 ) {
                if m { // was this a merged item?
                    flags[merged.len()] |= mask;
                }
                merged.push(p);
            }
        }
        mem::swap(&mut merged, &mut list);
    }

    // Backward pass: the first `2 * len - 2` items of the last level are
    // selected. Every selected plain symbol adds one bit to that symbol's
    // code word; every selected package selects its two constituents on
    // the level before.
    let mut n = frequencies.len() * 2 - 2;
    debug_assert!(list.len() >= n);
    let mut code_lens = vec![0u32; frequencies.len()];
    let mut depth = max_len;
    while depth > 0 && n > 0 {
        depth -= 1;
        let mask = 1u32 << depth;
        let mut packages = 0;
        for (i, &flag) in flags[..n].iter().enumerate() {
            if (flag & mask) == 0 {
                // Plain symbols appear in ascending order, so the number of
                // non-package items seen so far indexes into `sorted`.
                code_lens[sorted[i - packages]] += 1;
            } else {
                packages += 1;
            }
        }
        n = packages * 2;
    }

    Ok(code_lens)
}

#[cfg(test)]
mod tests {
    use super::package_merge;

    /// Frequencies shared by all tests in this module.
    const FREQS: [f64; 7] = [1.0, 32.0, 16.0, 4.0, 8.0, 2.0, 1.0];

    #[test]
    fn it_works() {
        // A limit of 8 is larger than the longest resulting code word (6).
        let relaxed = package_merge(&FREQS, 8).unwrap();
        assert_eq!(relaxed, vec![6, 1, 2, 4, 3, 5, 6]);

        // A limit of 5 forces the rarest symbols onto shorter code words.
        let limited = package_merge(&FREQS, 5).unwrap();
        assert_eq!(limited, vec![5, 1, 2, 5, 3, 5, 5]);
    }

    #[test]
    #[should_panic]
    fn it_fails() {
        // Seven symbols cannot be encoded with at most 2 bits each, so the
        // call yields an error and `unwrap` panics.
        let outcome = package_merge(&FREQS, 2);
        outcome.unwrap();
    }
}