1//! LZ78 dictionary compression.
2//!
3//! Symbols are compressed into [`Token`]s - a prefix index into the dictionary paired with a trailing literal.
4//! The [`Encoder`] builds tokens from an input stream and
5//! the [`Decoder`] reconstructs the original data.
6
7use std::{
8    io,
9    iter::{Fuse, FusedIterator},
10};
11
12use crate::{
13    bits::{ReadBits, WriteBits},
14    encode::VarCode,
15    math::Delta,
16    vle,
17};
18
/// An LZ78 token: a dictionary prefix index paired with a trailing literal.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Token<T> {
    /// One-based index into the dictionary; `0` means no prefix (standalone literal).
    ///
    /// Indices are 1-based: the encoder hands out `slot + 1` and the decoder
    /// resolves a non-zero prefix via `entries[prefix - 1]`.
    pub prefix: usize,
    /// The literal symbol that follows the prefix string.
    pub literal: T,
}
27
28impl<T> Delta for Token<T>
29where
30    T: Delta,
31{
32    fn delta(self, base: Self) -> Self {
33        if self.prefix == base.prefix {
34            Token {
35                prefix: 0,
36                literal: self.literal.delta(base.literal),
37            }
38        } else {
39            Token {
40                prefix: self.prefix - base.prefix,
41                literal: self.literal,
42            }
43        }
44    }
45
46    fn from_delta(base: Self, delta: Self) -> Self {
47        if delta.prefix == 0 {
48            Token {
49                prefix: base.prefix,
50                literal: T::from_delta(base.literal, delta.literal),
51            }
52        } else {
53            Token {
54                prefix: base.prefix + delta.prefix,
55                literal: delta.literal,
56            }
57        }
58    }
59}
60
61impl<T> VarCode for Token<T>
62where
63    T: VarCode,
64{
65    fn var_bit_len(&self) -> usize {
66        vle::encode_bit_len(self.prefix) + self.literal.var_bit_len()
67    }
68
69    fn var_write(&self, writer: &mut WriteBits<impl io::Write>) -> io::Result<()> {
70        vle::encode(self.prefix, writer)?;
71        self.literal.var_write(writer)
72    }
73
74    fn var_read(reader: &mut ReadBits<impl io::Read>) -> io::Result<Self> {
75        let prefix = vle::decode::<usize, _>(reader)?;
76        let literal = T::var_read(reader)?;
77
78        Ok(Token { prefix, literal })
79    }
80}
81
/// Internal dictionary entry; mirrors [`Token`]'s layout (prefix index plus
/// trailing literal) so encoder lookups can compare whole entries at once.
#[derive(Clone, Copy, PartialEq, Eq)]
struct Entry<T> {
    // 1-based prefix index (0 = no prefix), matching `Token::prefix`.
    prefix: usize,
    literal: T,
}
87
/// LZ78 encoder that builds a dictionary on the fly and emits [`Token`]s.
pub struct Encoder<T> {
    // Dictionary of completed phrases; the entry at slot `i` is referenced
    // by tokens as index `i + 1`.
    // NOTE(review): `entires` is a typo for `entries`; kept as-is because
    // renaming would have to touch every use site in this file.
    entires: Vec<Entry<T>>,
    // 1-based index of the dictionary entry matching the symbols consumed so
    // far; 0 when no partial match is pending.
    prefix: usize,
}
93
94impl<T> Default for Encoder<T> {
95    fn default() -> Self {
96        Self::new()
97    }
98}
99
100impl<T> Encoder<T> {
101    /// Creates a new, empty LZ78 encoder.
102    pub fn new() -> Self {
103        Encoder {
104            entires: Vec::new(),
105            prefix: 0,
106        }
107    }
108}
109
110impl<T> Encoder<T>
111where
112    T: Copy + Eq,
113{
114    fn lookup(&self, entry: Entry<T>) -> Option<usize> {
115        for i in entry.prefix..self.entires.len() {
116            if self.entires[i] == entry {
117                return Some(i + 1);
118            }
119        }
120
121        None
122    }
123
124    /// Feeds one symbol to the encoder; returns a token when a dictionary
125    /// entry is completed.
126    pub fn encode(&mut self, symbol: T) -> Option<Token<T>> {
127        let entry = Entry {
128            prefix: self.prefix,
129            literal: symbol,
130        };
131        let index = self.lookup(entry);
132
133        match index {
134            None => {
135                debug_assert!(u32::try_from(self.prefix).is_ok());
136
137                let token = Token {
138                    prefix: self.prefix,
139                    literal: symbol,
140                };
141                self.entires.push(entry);
142                self.prefix = 0;
143                Some(token)
144            }
145            Some(index) => {
146                self.prefix = index;
147                None
148            }
149        }
150    }
151
152    /// Flushes the encoder, returning any final pending token.
153    pub fn finish(self) -> Option<Token<T>> {
154        { self }.finish_mut()
155    }
156
157    fn finish_mut(&mut self) -> Option<Token<T>> {
158        if self.prefix == 0 {
159            return None;
160        }
161
162        let last = &self.entires[self.prefix - 1];
163        self.prefix = 0;
164
165        Some(Token {
166            prefix: last.prefix,
167            literal: last.literal,
168        })
169    }
170
171    /// Wraps this encoder with an input iterator, producing a lazy token
172    /// stream.
173    pub fn stream<I>(self, input: I) -> EncodeStream<T, I::IntoIter>
174    where
175        I: IntoIterator,
176    {
177        EncodeStream {
178            encoder: self,
179            input: input.into_iter().fuse(),
180        }
181    }
182}
183
/// A lazy iterator adapter that feeds an input iterator through an
/// [`Encoder`] and yields [`Token`]s.
///
/// Created by [`Encoder::stream`].
pub struct EncodeStream<T, I> {
    // Encoder state carried across `next` calls.
    encoder: Encoder<T>,
    // Fused so `next` can keep polling after exhaustion without the
    // underlying iterator ever yielding again.
    input: Fuse<I>,
}
190
191impl<T, I> Iterator for EncodeStream<T, I>
192where
193    I: Iterator<Item = T>,
194    T: Eq + Copy,
195{
196    type Item = Token<T>;
197
198    fn next(&mut self) -> Option<Token<T>> {
199        for input in self.input.by_ref() {
200            if let Some(token) = self.encoder.encode(input) {
201                return Some(token);
202            }
203        }
204
205        self.encoder.finish_mut()
206    }
207
208    fn fold<B, F>(mut self, init: B, mut f: F) -> B
209    where
210        Self: Sized,
211        F: FnMut(B, Self::Item) -> B,
212    {
213        self.input
214            .fold(init, |state, input| match self.encoder.encode(input) {
215                None => state,
216                Some(token) => f(state, token),
217            })
218    }
219}
220
// Fused-ness holds because `input` is itself fused (keeps returning `None`)
// and `finish_mut` returns `None` once `prefix` has been reset to 0, so
// after the first `None` every later `next` call also returns `None`.
impl<T, I> FusedIterator for EncodeStream<T, I>
where
    I: Iterator<Item = T>,
    T: Eq + Copy,
{
}
227
/// LZ78 decoder that rebuilds the dictionary and expands tokens back into symbols.
pub struct Decoder<T> {
    // Concatenation of every symbol decoded so far; dictionary entries are
    // ranges into this buffer.
    scratch: Vec<T>,
    // Per-entry `(start, end)` ranges into `scratch`, in dictionary order.
    // NOTE(review): `entires` is a typo for `entries`; kept for consistency
    // with the use sites in this file.
    entires: Vec<(usize, usize)>,
}
233
234impl<T> Default for Decoder<T> {
235    fn default() -> Self {
236        Self::new()
237    }
238}
239
240impl<T> Decoder<T> {
241    /// Creates a new, empty LZ78 decoder.
242    pub fn new() -> Self {
243        Decoder {
244            scratch: Vec::new(),
245            entires: Vec::new(),
246        }
247    }
248}
249
/// Errors that can occur while decoding an LZ78 stream.
// Extra derives and the `Display`/`Error` impls below are backward-compatible
// additions: a public error type should be comparable, printable, and usable
// with `Box<dyn Error>` / `?`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DecodeError {
    /// A token referenced a dictionary index that does not exist.
    InvalidIndex,
}

impl std::fmt::Display for DecodeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            DecodeError::InvalidIndex => {
                write!(f, "token referenced a dictionary index that does not exist")
            }
        }
    }
}

impl std::error::Error for DecodeError {}
256
257impl<T> Decoder<T>
258where
259    T: Copy + Eq,
260{
261    fn decode_next_range(&mut self, token: Token<T>) -> Result<(usize, usize), DecodeError> {
262        // Add the new substring to the cache.
263        let (prefix_start, prefix_end) = if token.prefix > 0 {
264            if token.prefix > self.entires.len() {
265                return Err(DecodeError::InvalidIndex);
266            }
267            self.entires[token.prefix - 1]
268        } else {
269            (0, 0)
270        };
271
272        debug_assert!(prefix_end >= prefix_start);
273        let prefix_len = prefix_end - prefix_start;
274
275        let element = token.literal;
276
277        let end = if self.entires.is_empty() {
278            0
279        } else {
280            let (_start, end) = *self.entires.last().unwrap();
281            end
282        };
283
284        debug_assert_eq!(end, self.scratch.len());
285
286        self.scratch.extend_from_within(prefix_start..prefix_end);
287        self.scratch.push(element);
288
289        let new_start = end;
290        let new_end = new_start + prefix_len + 1;
291
292        self.entires.push((new_start, new_end));
293
294        Ok((new_start, new_end))
295    }
296
297    /// Decodes a single token, returning the expanded symbol slice.
298    pub fn decode_next_slice(&mut self, token: Token<T>) -> Result<&[T], DecodeError> {
299        let output = self.decode_next_range(token)?;
300
301        let slice = &self.scratch[output.0..output.1];
302        Ok(slice)
303    }
304}
305
#[test]
fn test_u16() {
    let data = [
        1, 1, 2, 1, 1, 2, 3, 1, 2, 1, 1, 1, 2, 1, 1, 3, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 3, 1, 2,
        1, 1, 2, 1, 1, 2, 3, 1, 2, 1, 1, 1, 2, 1, 1, 3, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 3, 1, 2,
        1, 1, 2, 1, 1, 2, 3, 1, 2, 1, 1, 1, 2, 1, 1, 3, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 3, 1, 2,
        1, 1, 2, 1, 1, 2, 3, 1, 2, 1, 1, 1, 2, 1, 1, 3, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 3, 1, 2,
        1, 1, 2, 1, 1, 2, 3, 1, 2, 1, 1, 1, 2, 1, 1, 3, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 3, 1, 2,
        1, 1, 1, 2, 1, 1, 3, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 3, 3, 1, 1, 1, 2, 1, 3, 1, 1, 1, 2, 2,
        1, 1, 3, 3, 1, 1, 1, 2, 1, 1, 3, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 3, 3, 1, 1, 1, 2, 1, 3, 1,
        1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 1, 2, 1, 1, 3, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 3,
        3, 1, 1, 1, 2, 1, 3, 1, 1, 1, 2, 2, 1, 1, 3, 3,
    ];

    // Compress: collect every completed token, then flush the pending one.
    let mut encoder = Encoder::<u16>::new();
    let mut compressed: Vec<Token<u16>> = data
        .iter()
        .copied()
        .filter_map(|symbol| encoder.encode(symbol))
        .collect();
    compressed.extend(encoder.finish());

    // Decompress and compare slice-by-slice against the original stream.
    let mut decoder = Decoder::<u16>::new();
    let mut decoded = 0;

    for token in compressed {
        let slice = decoder.decode_next_slice(token).unwrap();
        assert_eq!(data[decoded..][..slice.len()], *slice);
        decoded += slice.len();
    }

    // BUG FIX: the original test never checked that the whole input round-
    // tripped, so a decoder that produced a truncated stream would pass.
    assert_eq!(decoded, data.len());
}