weezl/
encode.rs

1//! A module for all encoding needs.
2use crate::error::{BufferResult, LzwError, LzwStatus, VectorResult};
3use crate::{BitOrder, Code, StreamBuf, MAX_CODESIZE, MAX_ENTRIES, STREAM_BUF_SIZE};
4
5use crate::alloc::{boxed::Box, vec::Vec};
6#[cfg(feature = "std")]
7use crate::error::StreamResult;
8#[cfg(feature = "std")]
9use std::io::{self, BufRead, Write};
10
11/// The state for encoding data with an LZW algorithm.
12///
13/// The same structure can be utilized with streams as well as your own buffers and driver logic.
14/// It may even be possible to mix them if you are sufficiently careful not to lose any written
15/// data in the process.
16///
17/// This is a sans-IO implementation, meaning that it only contains the state of the encoder and
18/// the caller will provide buffers for input and output data when calling the basic
19/// [`encode_bytes`] method. Nevertheless, a number of _adapters_ are provided in the `into_*`
20/// methods for enoding with a particular style of common IO.
21///
22/// * [`encode`] for encoding once without any IO-loop.
23/// * [`into_async`] for encoding with the `futures` traits for asynchronous IO.
24/// * [`into_stream`] for encoding with the standard `io` traits.
25/// * [`into_vec`] for in-memory encoding.
26///
27/// [`encode_bytes`]: #method.encode_bytes
28/// [`encode`]: #method.encode
29/// [`into_async`]: #method.into_async
30/// [`into_stream`]: #method.into_stream
31/// [`into_vec`]: #method.into_vec
32pub struct Encoder {
33    /// Internally dispatch via a dynamic trait object. This did not have any significant
34    /// performance impact as we batch data internally and this pointer does not change after
35    /// creation!
36    state: Box<dyn Stateful + Send + 'static>,
37}
38
39/// A encoding stream sink.
40///
41/// See [`Encoder::into_stream`] on how to create this type.
42///
43/// [`Encoder::into_stream`]: struct.Encoder.html#method.into_stream
44#[cfg_attr(
45    not(feature = "std"),
46    deprecated = "This type is only useful with the `std` feature."
47)]
48#[cfg_attr(not(feature = "std"), allow(dead_code))]
49pub struct IntoStream<'d, W> {
50    encoder: &'d mut Encoder,
51    writer: W,
52    buffer: Option<StreamBuf<'d>>,
53    default_size: usize,
54}
55
56/// An async decoding sink.
57///
58/// See [`Encoder::into_async`] on how to create this type.
59///
60/// [`Encoder::into_async`]: struct.Encoder.html#method.into_async
61#[cfg(feature = "async")]
62pub struct IntoAsync<'d, W> {
63    encoder: &'d mut Encoder,
64    writer: W,
65    buffer: Option<StreamBuf<'d>>,
66    default_size: usize,
67}
68
69/// A encoding sink into a vector.
70///
71/// See [`Encoder::into_vec`] on how to create this type.
72///
73/// [`Encoder::into_vec`]: struct.Encoder.html#method.into_vec
74pub struct IntoVec<'d> {
75    encoder: &'d mut Encoder,
76    vector: &'d mut Vec<u8>,
77}
78
79trait Stateful {
80    fn advance(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult;
81    fn mark_ended(&mut self) -> bool;
82    /// Reset the state tracking if end code has been written.
83    fn restart(&mut self);
84    /// Reset the encoder to the beginning, dropping all buffers etc.
85    fn reset(&mut self);
86}
87
88struct EncodeState<B: Buffer> {
89    /// The configured minimal code size.
90    min_size: u8,
91    /// The current encoding symbol tree.
92    tree: Tree,
93    /// If we have pushed the end code.
94    has_ended: bool,
95    /// If tiff then bumps are a single code sooner.
96    is_tiff: bool,
97    /// The code corresponding to the currently read characters.
98    current_code: Code,
99    /// The clear code for resetting the dictionary.
100    clear_code: Code,
101    /// The bit buffer for encoding.
102    buffer: B,
103}
104
105struct MsbBuffer {
106    /// The current code length.
107    code_size: u8,
108    /// The buffer bits.
109    buffer: u64,
110    /// The number of valid buffer bits.
111    bits_in_buffer: u8,
112}
113
114struct LsbBuffer {
115    /// The current code length.
116    code_size: u8,
117    /// The buffer bits.
118    buffer: u64,
119    /// The number of valid buffer bits.
120    bits_in_buffer: u8,
121}
122
123trait Buffer {
124    fn new(size: u8) -> Self;
125    /// Reset the code size in the buffer.
126    fn reset(&mut self, min_size: u8);
127    /// Apply effects of a Clear Code.
128    fn clear(&mut self, min_size: u8);
129    /// Insert a code into the buffer.
130    fn buffer_code(&mut self, code: Code);
131    /// Push bytes if the buffer space is getting small.
132    fn push_out(&mut self, out: &mut &mut [u8]) -> bool;
133    /// Flush all full bytes, returning if at least one more byte remains.
134    fn flush_out(&mut self, out: &mut &mut [u8]) -> bool;
135    /// Pad the buffer to a full byte.
136    fn buffer_pad(&mut self);
137    /// Increase the maximum code size.
138    fn bump_code_size(&mut self);
139    /// Return the maximum code with the current code size.
140    fn max_code(&self) -> Code;
141    /// Return the current code size in bits.
142    fn code_size(&self) -> u8;
143}
144
145/// One tree node for at most each code.
146/// To avoid using too much memory we keep nodes with few successors in optimized form. This form
147/// doesn't offer lookup by indexing but instead does a linear search.
148#[derive(Default)]
149struct Tree {
150    simples: Vec<Simple>,
151    complex: Vec<Full>,
152    keys: Vec<CompressedKey>,
153}
154
155#[derive(Clone, Copy)]
156enum FullKey {
157    NoSuccessor,
158    Simple(u16),
159    Full(u16),
160}
161
162#[derive(Clone, Copy)]
163struct CompressedKey(u16);
164
165const SHORT: usize = 16;
166
167#[derive(Clone, Copy)]
168struct Simple {
169    codes: [Code; SHORT],
170    chars: [u8; SHORT],
171    count: u8,
172}
173
174#[derive(Clone, Copy)]
175struct Full {
176    char_continuation: [Code; 256],
177}
178
179impl Encoder {
180    /// Create a new encoder with the specified bit order and symbol size.
181    ///
182    /// The algorithm for dynamically increasing the code symbol bit width is compatible with the
183    /// original specification. In particular you will need to specify an `Lsb` bit oder to encode
184    /// the data portion of a compressed `gif` image.
185    ///
186    /// # Panics
187    ///
188    /// The `size` needs to be in the interval `2..=12`.
189    pub fn new(order: BitOrder, size: u8) -> Self {
190        type Boxed = Box<dyn Stateful + Send + 'static>;
191        super::assert_encode_size(size);
192        let state = match order {
193            BitOrder::Lsb => Box::new(EncodeState::<LsbBuffer>::new(size)) as Boxed,
194            BitOrder::Msb => Box::new(EncodeState::<MsbBuffer>::new(size)) as Boxed,
195        };
196
197        Encoder { state }
198    }
199
200    /// Create a TIFF compatible encoder with the specified bit order and symbol size.
201    ///
202    /// The algorithm for dynamically increasing the code symbol bit width is compatible with the
203    /// TIFF specification, which is a misinterpretation of the original algorithm for increasing
204    /// the code size. It switches one symbol sooner.
205    ///
206    /// # Panics
207    ///
208    /// The `size` needs to be in the interval `2..=12`.
209    pub fn with_tiff_size_switch(order: BitOrder, size: u8) -> Self {
210        type Boxed = Box<dyn Stateful + Send + 'static>;
211        super::assert_encode_size(size);
212        let state = match order {
213            BitOrder::Lsb => {
214                let mut state = Box::new(EncodeState::<LsbBuffer>::new(size));
215                state.is_tiff = true;
216                state as Boxed
217            }
218            BitOrder::Msb => {
219                let mut state = Box::new(EncodeState::<MsbBuffer>::new(size));
220                state.is_tiff = true;
221                state as Boxed
222            }
223        };
224
225        Encoder { state }
226    }
227
228    /// Encode some bytes from `inp` into `out`.
229    ///
230    /// See [`into_stream`] for high-level functions (this interface is only available with the
231    /// `std` feature) and [`finish`] for marking the input data as complete.
232    ///
233    /// When some input byte is invalid, i.e. is not smaller than `1 << size`, then that byte and
234    /// all following ones will _not_ be consumed and the `status` of the result will signal an
235    /// error. The result will also indicate that all bytes up to but not including the offending
236    /// byte have been consumed. You may try again with a fixed byte.
237    ///
238    /// [`into_stream`]: #method.into_stream
239    /// [`finish`]: #method.finish
240    pub fn encode_bytes(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult {
241        self.state.advance(inp, out)
242    }
243
244    /// Encode a single chunk of data.
245    ///
246    /// This method will add an end marker to the encoded chunk.
247    ///
248    /// This is a convenience wrapper around [`into_vec`]. Use the `into_vec` adapter to customize
249    /// buffer size, to supply an existing vector, to control whether an end marker is required, or
250    /// to preserve partial data in the case of a decoding error.
251    ///
252    /// [`into_vec`]: #into_vec
253    ///
254    /// # Example
255    ///
256    /// ```
257    /// use weezl::{BitOrder, encode::Encoder};
258    ///
259    /// let data = b"Hello, world";
260    /// let encoded = Encoder::new(BitOrder::Msb, 9)
261    ///     .encode(data)
262    ///     .expect("All bytes valid for code size");
263    /// ```
264    pub fn encode(&mut self, data: &[u8]) -> Result<Vec<u8>, LzwError> {
265        let mut output = Vec::new();
266        self.into_vec(&mut output).encode_all(data).status?;
267        Ok(output)
268    }
269
270    /// Construct a encoder into a writer.
271    #[cfg(feature = "std")]
272    pub fn into_stream<W: Write>(&mut self, writer: W) -> IntoStream<'_, W> {
273        IntoStream {
274            encoder: self,
275            writer,
276            buffer: None,
277            default_size: STREAM_BUF_SIZE,
278        }
279    }
280
281    /// Construct a encoder into an async writer.
282    #[cfg(feature = "async")]
283    pub fn into_async<W: futures::io::AsyncWrite>(&mut self, writer: W) -> IntoAsync<'_, W> {
284        IntoAsync {
285            encoder: self,
286            writer,
287            buffer: None,
288            default_size: STREAM_BUF_SIZE,
289        }
290    }
291
292    /// Construct an encoder into a vector.
293    ///
294    /// All encoded data is appended and the vector is __not__ cleared.
295    ///
296    /// Compared to `into_stream` this interface allows a high-level access to encoding without
297    /// requires the `std`-feature. Also, it can make full use of the extra buffer control that the
298    /// special target exposes.
299    pub fn into_vec<'lt>(&'lt mut self, vec: &'lt mut Vec<u8>) -> IntoVec<'lt> {
300        IntoVec {
301            encoder: self,
302            vector: vec,
303        }
304    }
305
306    /// Mark the encoding as in the process of finishing.
307    ///
308    /// The next following call to `encode_bytes` which is able to consume the complete input will
309    /// also try to emit an end code. It's not recommended, but also not unsound, to use different
310    /// byte slices in different calls from this point forward and thus to 'delay' the actual end
311    /// of the data stream. The behaviour after the end marker has been written is unspecified but
312    /// sound.
313    pub fn finish(&mut self) {
314        self.state.mark_ended();
315    }
316
317    /// Undo marking this data stream as ending.
318    /// FIXME: clarify how this interacts with padding introduced after end code.
319    #[allow(dead_code)]
320    pub(crate) fn restart(&mut self) {
321        self.state.restart()
322    }
323
324    /// Reset all internal state.
325    ///
326    /// This produce an encoder as if just constructed with `new` but taking slightly less work. In
327    /// particular it will not deallocate any internal allocations. It will also avoid some
328    /// duplicate setup work.
329    pub fn reset(&mut self) {
330        self.state.reset()
331    }
332}
333
334#[cfg(feature = "std")]
335impl<'d, W: Write> IntoStream<'d, W> {
336    /// Encode data from a reader.
337    ///
338    /// This will drain the supplied reader. It will not encode an end marker after all data has
339    /// been processed.
340    pub fn encode(&mut self, read: impl BufRead) -> StreamResult {
341        self.encode_part(read, false)
342    }
343
344    /// Encode data from a reader and an end marker.
345    pub fn encode_all(mut self, read: impl BufRead) -> StreamResult {
346        self.encode_part(read, true)
347    }
348
349    /// Set the size of the intermediate encode buffer.
350    ///
351    /// A buffer of this size is allocated to hold one part of the encoded stream when no buffer is
352    /// available and any encoding method is called. No buffer is allocated if `set_buffer` has
353    /// been called. The buffer is reused.
354    ///
355    /// # Panics
356    /// This method panics if `size` is `0`.
357    pub fn set_buffer_size(&mut self, size: usize) {
358        assert_ne!(size, 0, "Attempted to set empty buffer");
359        self.default_size = size;
360    }
361
362    /// Use a particular buffer as an intermediate encode buffer.
363    ///
364    /// Calling this sets or replaces the buffer. When a buffer has been set then it is used
365    /// instead of a dynamically allocating a buffer. Note that the size of the buffer is relevant
366    /// for efficient encoding as there is additional overhead from `write` calls each time the
367    /// buffer has been filled.
368    ///
369    /// # Panics
370    /// This method panics if the `buffer` is empty.
371    pub fn set_buffer(&mut self, buffer: &'d mut [u8]) {
372        assert_ne!(buffer.len(), 0, "Attempted to set empty buffer");
373        self.buffer = Some(StreamBuf::Borrowed(buffer));
374    }
375
376    fn encode_part(&mut self, mut read: impl BufRead, finish: bool) -> StreamResult {
377        let IntoStream {
378            encoder,
379            writer,
380            buffer,
381            default_size,
382        } = self;
383        enum Progress {
384            Ok,
385            Done,
386        }
387
388        let mut bytes_read = 0;
389        let mut bytes_written = 0;
390
391        let read_bytes = &mut bytes_read;
392        let write_bytes = &mut bytes_written;
393
394        let outbuf: &mut [u8] =
395            match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } {
396                StreamBuf::Borrowed(slice) => &mut *slice,
397                StreamBuf::Owned(vec) => &mut *vec,
398            };
399        assert!(!outbuf.is_empty());
400
401        let once = move || {
402            let data = read.fill_buf()?;
403
404            if data.is_empty() {
405                if finish {
406                    encoder.finish();
407                } else {
408                    return Ok(Progress::Done);
409                }
410            }
411
412            let result = encoder.encode_bytes(data, &mut outbuf[..]);
413            *read_bytes += result.consumed_in;
414            *write_bytes += result.consumed_out;
415            read.consume(result.consumed_in);
416
417            let done = result.status.map_err(|err| {
418                io::Error::new(io::ErrorKind::InvalidData, &*format!("{:?}", err))
419            })?;
420
421            if let LzwStatus::Done = done {
422                writer.write_all(&outbuf[..result.consumed_out])?;
423                return Ok(Progress::Done);
424            }
425
426            if let LzwStatus::NoProgress = done {
427                return Err(io::Error::new(
428                    io::ErrorKind::UnexpectedEof,
429                    "No more data but no end marker detected",
430                ));
431            }
432
433            writer.write_all(&outbuf[..result.consumed_out])?;
434            Ok(Progress::Ok)
435        };
436
437        let status = core::iter::repeat_with(once)
438            // scan+fuse can be replaced with map_while
439            .scan((), |(), result| match result {
440                Ok(Progress::Ok) => Some(Ok(())),
441                Err(err) => Some(Err(err)),
442                Ok(Progress::Done) => None,
443            })
444            .fuse()
445            .collect();
446
447        StreamResult {
448            bytes_read,
449            bytes_written,
450            status,
451        }
452    }
453}
454
455impl IntoVec<'_> {
456    /// Encode data from a slice.
457    pub fn encode(&mut self, read: &[u8]) -> VectorResult {
458        self.encode_part(read, false)
459    }
460
461    /// Decode data from a reader, adding an end marker.
462    pub fn encode_all(mut self, read: &[u8]) -> VectorResult {
463        self.encode_part(read, true)
464    }
465
466    fn grab_buffer(&mut self) -> (&mut [u8], &mut Encoder) {
467        const CHUNK_SIZE: usize = 1 << 12;
468        let decoder = &mut self.encoder;
469        let length = self.vector.len();
470
471        // Use the vector to do overflow checks and w/e.
472        self.vector.reserve(CHUNK_SIZE);
473        // FIXME: encoding into uninit buffer?
474        self.vector.resize(length + CHUNK_SIZE, 0u8);
475
476        (&mut self.vector[length..], decoder)
477    }
478
479    fn encode_part(&mut self, part: &[u8], finish: bool) -> VectorResult {
480        let mut result = VectorResult {
481            consumed_in: 0,
482            consumed_out: 0,
483            status: Ok(LzwStatus::Ok),
484        };
485
486        enum Progress {
487            Ok,
488            Done,
489        }
490
491        // Converting to mutable refs to move into the `once` closure.
492        let read_bytes = &mut result.consumed_in;
493        let write_bytes = &mut result.consumed_out;
494        let mut data = part;
495
496        // A 64 MB buffer is quite large but should get alloc_zeroed.
497        // Note that the decoded size can be up to quadratic in code block.
498        let once = move || {
499            // Grab a new output buffer.
500            let (outbuf, encoder) = self.grab_buffer();
501
502            if finish {
503                encoder.finish();
504            }
505
506            // Decode as much of the buffer as fits.
507            let result = encoder.encode_bytes(data, &mut outbuf[..]);
508            // Do the bookkeeping and consume the buffer.
509            *read_bytes += result.consumed_in;
510            *write_bytes += result.consumed_out;
511            data = &data[result.consumed_in..];
512
513            let unfilled = outbuf.len() - result.consumed_out;
514            let filled = self.vector.len() - unfilled;
515            self.vector.truncate(filled);
516
517            // Handle the status in the result.
518            let done = result.status?;
519            if let LzwStatus::Done = done {
520                Ok(Progress::Done)
521            } else {
522                Ok(Progress::Ok)
523            }
524        };
525
526        // Decode chunks of input data until we're done.
527        let status: Result<(), _> = core::iter::repeat_with(once)
528            // scan+fuse can be replaced with map_while
529            .scan((), |(), result| match result {
530                Ok(Progress::Ok) => Some(Ok(())),
531                Err(err) => Some(Err(err)),
532                Ok(Progress::Done) => None,
533            })
534            .fuse()
535            .collect();
536
537        if let Err(err) = status {
538            result.status = Err(err);
539        }
540
541        result
542    }
543}
544
545// This is implemented in a separate file, so that 1.34.2 does not parse it. Otherwise, it would
546// trip over the usage of await, which is a reserved keyword in that edition/version. It only
547// contains an impl block.
548#[cfg(feature = "async")]
549#[path = "encode_into_async.rs"]
550mod impl_encode_into_async;
551
552impl<B: Buffer> EncodeState<B> {
553    fn new(min_size: u8) -> Self {
554        let clear_code = 1 << min_size;
555        let mut tree = Tree::default();
556        tree.init(min_size);
557        let mut state = EncodeState {
558            min_size,
559            tree,
560            has_ended: false,
561            is_tiff: false,
562            current_code: clear_code,
563            clear_code,
564            buffer: B::new(min_size),
565        };
566        state.buffer_code(clear_code);
567        state
568    }
569}
570
571impl<B: Buffer> Stateful for EncodeState<B> {
572    fn advance(&mut self, mut inp: &[u8], mut out: &mut [u8]) -> BufferResult {
573        let c_in = inp.len();
574        let c_out = out.len();
575        let mut status = Ok(LzwStatus::Ok);
576
577        'encoding: loop {
578            if self.push_out(&mut out) {
579                break;
580            }
581
582            if inp.is_empty() && self.has_ended {
583                let end = self.end_code();
584                if self.current_code != end {
585                    if self.current_code != self.clear_code {
586                        self.buffer_code(self.current_code);
587
588                        // When reading this code, the decoder will add an extra entry to its table
589                        // before reading th end code. Thusly, it may increase its code size based
590                        // on this additional entry.
591                        if self.tree.keys.len() + usize::from(self.is_tiff)
592                            > usize::from(self.buffer.max_code())
593                            && self.buffer.code_size() < MAX_CODESIZE
594                        {
595                            self.buffer.bump_code_size();
596                        }
597                    }
598                    self.buffer_code(end);
599                    self.current_code = end;
600                    self.buffer_pad();
601                }
602
603                break;
604            }
605
606            let mut next_code = None;
607            let mut bytes = inp.iter();
608            while let Some(&byte) = bytes.next() {
609                if self.min_size < 8 && byte >= 1 << self.min_size {
610                    status = Err(LzwError::InvalidCode);
611                    break 'encoding;
612                }
613
614                inp = bytes.as_slice();
615                match self.tree.iterate(self.current_code, byte) {
616                    Ok(code) => self.current_code = code,
617                    Err(_) => {
618                        next_code = Some(self.current_code);
619
620                        self.current_code = u16::from(byte);
621                        break;
622                    }
623                }
624            }
625
626            match next_code {
627                // No more bytes, no code produced.
628                None => break,
629                Some(code) => {
630                    self.buffer_code(code);
631
632                    if self.tree.keys.len() + usize::from(self.is_tiff)
633                        > usize::from(self.buffer.max_code()) + 1
634                        && self.buffer.code_size() < MAX_CODESIZE
635                    {
636                        self.buffer.bump_code_size();
637                    }
638
639                    if self.tree.keys.len() > MAX_ENTRIES {
640                        self.buffer_code(self.clear_code);
641                        self.tree.reset(self.min_size);
642                        self.buffer.clear(self.min_size);
643                    }
644                }
645            }
646        }
647
648        if inp.is_empty() && self.current_code == self.end_code() {
649            if !self.flush_out(&mut out) {
650                status = Ok(LzwStatus::Done);
651            }
652        }
653
654        BufferResult {
655            consumed_in: c_in - inp.len(),
656            consumed_out: c_out - out.len(),
657            status,
658        }
659    }
660
661    fn mark_ended(&mut self) -> bool {
662        core::mem::replace(&mut self.has_ended, true)
663    }
664
665    fn restart(&mut self) {
666        self.has_ended = false;
667    }
668
669    fn reset(&mut self) {
670        self.restart();
671        self.current_code = self.clear_code;
672        self.tree.reset(self.min_size);
673        self.buffer.reset(self.min_size);
674        self.buffer_code(self.clear_code);
675    }
676}
677
678impl<B: Buffer> EncodeState<B> {
679    fn push_out(&mut self, out: &mut &mut [u8]) -> bool {
680        self.buffer.push_out(out)
681    }
682
683    fn flush_out(&mut self, out: &mut &mut [u8]) -> bool {
684        self.buffer.flush_out(out)
685    }
686
687    fn end_code(&self) -> Code {
688        self.clear_code + 1
689    }
690
691    fn buffer_pad(&mut self) {
692        self.buffer.buffer_pad();
693    }
694
695    fn buffer_code(&mut self, code: Code) {
696        self.buffer.buffer_code(code);
697    }
698}
699
700impl Buffer for MsbBuffer {
701    fn new(min_size: u8) -> Self {
702        MsbBuffer {
703            code_size: min_size + 1,
704            buffer: 0,
705            bits_in_buffer: 0,
706        }
707    }
708
709    fn reset(&mut self, min_size: u8) {
710        self.code_size = min_size + 1;
711        self.buffer = 0;
712        self.bits_in_buffer = 0;
713    }
714
715    fn clear(&mut self, min_size: u8) {
716        self.code_size = min_size + 1;
717    }
718
719    fn buffer_code(&mut self, code: Code) {
720        let shift = 64 - self.bits_in_buffer - self.code_size;
721        self.buffer |= u64::from(code) << shift;
722        self.bits_in_buffer += self.code_size;
723    }
724
725    fn push_out(&mut self, out: &mut &mut [u8]) -> bool {
726        if self.bits_in_buffer + 2 * self.code_size < 64 {
727            return false;
728        }
729
730        self.flush_out(out)
731    }
732
733    fn flush_out(&mut self, out: &mut &mut [u8]) -> bool {
734        let want = usize::from(self.bits_in_buffer / 8);
735        let count = want.min((*out).len());
736        let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count);
737        *out = tail;
738
739        for b in bytes {
740            *b = ((self.buffer & 0xff00_0000_0000_0000) >> 56) as u8;
741            self.buffer <<= 8;
742            self.bits_in_buffer -= 8;
743        }
744
745        count < want
746    }
747
748    fn buffer_pad(&mut self) {
749        let to_byte = self.bits_in_buffer.wrapping_neg() & 0x7;
750        self.bits_in_buffer += to_byte;
751    }
752
753    fn bump_code_size(&mut self) {
754        self.code_size += 1;
755    }
756
757    fn max_code(&self) -> Code {
758        (1 << self.code_size) - 1
759    }
760
761    fn code_size(&self) -> u8 {
762        self.code_size
763    }
764}
765
766impl Buffer for LsbBuffer {
767    fn new(min_size: u8) -> Self {
768        LsbBuffer {
769            code_size: min_size + 1,
770            buffer: 0,
771            bits_in_buffer: 0,
772        }
773    }
774
775    fn reset(&mut self, min_size: u8) {
776        self.code_size = min_size + 1;
777        self.buffer = 0;
778        self.bits_in_buffer = 0;
779    }
780
781    fn clear(&mut self, min_size: u8) {
782        self.code_size = min_size + 1;
783    }
784
785    fn buffer_code(&mut self, code: Code) {
786        self.buffer |= u64::from(code) << self.bits_in_buffer;
787        self.bits_in_buffer += self.code_size;
788    }
789
790    fn push_out(&mut self, out: &mut &mut [u8]) -> bool {
791        if self.bits_in_buffer + 2 * self.code_size < 64 {
792            return false;
793        }
794
795        self.flush_out(out)
796    }
797
798    fn flush_out(&mut self, out: &mut &mut [u8]) -> bool {
799        let want = usize::from(self.bits_in_buffer / 8);
800        let count = want.min((*out).len());
801        let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count);
802        *out = tail;
803
804        for b in bytes {
805            *b = (self.buffer & 0x0000_0000_0000_00ff) as u8;
806            self.buffer >>= 8;
807            self.bits_in_buffer -= 8;
808        }
809
810        count < want
811    }
812
813    fn buffer_pad(&mut self) {
814        let to_byte = self.bits_in_buffer.wrapping_neg() & 0x7;
815        self.bits_in_buffer += to_byte;
816    }
817
818    fn bump_code_size(&mut self) {
819        self.code_size += 1;
820    }
821
822    fn max_code(&self) -> Code {
823        (1 << self.code_size) - 1
824    }
825
826    fn code_size(&self) -> u8 {
827        self.code_size
828    }
829}
830
831impl Tree {
832    fn init(&mut self, min_size: u8) {
833        // We need a way to represent the state of a currently empty buffer. We use the clear code
834        // for this, thus create one complex mapping that leads to the one-char base codes.
835        self.keys
836            .resize((1 << min_size) + 2, FullKey::NoSuccessor.into());
837        self.complex.push(Full {
838            char_continuation: [0; 256],
839        });
840        let map_of_begin = self.complex.last_mut().unwrap();
841        for ch in 0u16..256 {
842            map_of_begin.char_continuation[usize::from(ch)] = ch;
843        }
844        self.keys[1 << min_size] = FullKey::Full(0).into();
845    }
846
847    fn reset(&mut self, min_size: u8) {
848        self.simples.clear();
849        self.keys.truncate((1 << min_size) + 2);
850        // Keep entry for clear code.
851        self.complex.truncate(1);
852        // The first complex is not changed..
853        for k in self.keys[..(1 << min_size) + 2].iter_mut() {
854            *k = FullKey::NoSuccessor.into();
855        }
856        self.keys[1 << min_size] = FullKey::Full(0).into();
857    }
858
859    fn at_key(&self, code: Code, ch: u8) -> Option<Code> {
860        let key = self.keys[usize::from(code)];
861        match FullKey::from(key) {
862            FullKey::NoSuccessor => None,
863            FullKey::Simple(idx) => {
864                let nexts = &self.simples[usize::from(idx)];
865                let successors = nexts
866                    .codes
867                    .iter()
868                    .zip(nexts.chars.iter())
869                    .take(usize::from(nexts.count));
870                for (&scode, &sch) in successors {
871                    if sch == ch {
872                        return Some(scode);
873                    }
874                }
875
876                None
877            }
878            FullKey::Full(idx) => {
879                let full = &self.complex[usize::from(idx)];
880                let precode = full.char_continuation[usize::from(ch)];
881                if usize::from(precode) < MAX_ENTRIES {
882                    Some(precode)
883                } else {
884                    None
885                }
886            }
887        }
888    }
889
890    /// Iterate to the next char.
891    /// Return Ok when it was already in the tree or creates a new entry for it and returns Err.
892    fn iterate(&mut self, code: Code, ch: u8) -> Result<Code, Code> {
893        if let Some(next) = self.at_key(code, ch) {
894            Ok(next)
895        } else {
896            Err(self.append(code, ch))
897        }
898    }
899
900    fn append(&mut self, code: Code, ch: u8) -> Code {
901        let next: Code = self.keys.len() as u16;
902        let key = self.keys[usize::from(code)];
903        // TODO: with debug assertions, check for non-existence
904        match FullKey::from(key) {
905            FullKey::NoSuccessor => {
906                let new_key = FullKey::Simple(self.simples.len() as u16);
907                self.simples.push(Simple::default());
908                let simples = self.simples.last_mut().unwrap();
909                simples.codes[0] = next;
910                simples.chars[0] = ch;
911                simples.count = 1;
912                self.keys[usize::from(code)] = new_key.into();
913            }
914            FullKey::Simple(idx) if usize::from(self.simples[usize::from(idx)].count) < SHORT => {
915                let nexts = &mut self.simples[usize::from(idx)];
916                let nidx = usize::from(nexts.count);
917                nexts.chars[nidx] = ch;
918                nexts.codes[nidx] = next;
919                nexts.count += 1;
920            }
921            FullKey::Simple(idx) => {
922                let new_key = FullKey::Full(self.complex.len() as u16);
923                let simples = &self.simples[usize::from(idx)];
924                self.complex.push(Full {
925                    char_continuation: [Code::max_value(); 256],
926                });
927                let full = self.complex.last_mut().unwrap();
928                for (&pch, &pcont) in simples.chars.iter().zip(simples.codes.iter()) {
929                    full.char_continuation[usize::from(pch)] = pcont;
930                }
931                self.keys[usize::from(code)] = new_key.into();
932            }
933            FullKey::Full(idx) => {
934                let full = &mut self.complex[usize::from(idx)];
935                full.char_continuation[usize::from(ch)] = next;
936            }
937        }
938        self.keys.push(FullKey::NoSuccessor.into());
939        next
940    }
941}
942
943impl Default for FullKey {
944    fn default() -> Self {
945        FullKey::NoSuccessor
946    }
947}
948
949impl Default for Simple {
950    fn default() -> Self {
951        Simple {
952            codes: [0; SHORT],
953            chars: [0; SHORT],
954            count: 0,
955        }
956    }
957}
958
959impl From<CompressedKey> for FullKey {
960    fn from(CompressedKey(key): CompressedKey) -> Self {
961        match (key >> MAX_CODESIZE) & 0xf {
962            0 => FullKey::Full(key & 0xfff),
963            1 => FullKey::Simple(key & 0xfff),
964            _ => FullKey::NoSuccessor,
965        }
966    }
967}
968
969impl From<FullKey> for CompressedKey {
970    fn from(full: FullKey) -> Self {
971        CompressedKey(match full {
972            FullKey::NoSuccessor => 0x2000,
973            FullKey::Simple(code) => 0x1000 | code,
974            FullKey::Full(code) => code,
975        })
976    }
977}
978
979#[cfg(test)]
980mod tests {
981    use super::{BitOrder, Encoder, LzwError, LzwStatus};
982    use crate::alloc::vec::Vec;
983    use crate::decode::Decoder;
984    #[cfg(feature = "std")]
985    use crate::StreamBuf;
986
987    #[test]
988    fn invalid_input_rejected() {
989        const BIT_LEN: u8 = 2;
990        let ref input = [0, 1 << BIT_LEN /* invalid */, 0];
991        let ref mut target = [0u8; 128];
992        let mut encoder = Encoder::new(BitOrder::Msb, BIT_LEN);
993
994        encoder.finish();
995        // We require simulation of normality, that is byte-for-byte compression.
996        let result = encoder.encode_bytes(input, target);
997        assert!(if let Err(LzwError::InvalidCode) = result.status {
998            true
999        } else {
1000            false
1001        });
1002        assert_eq!(result.consumed_in, 1);
1003
1004        let fixed = encoder.encode_bytes(&[1, 0], &mut target[result.consumed_out..]);
1005        assert!(if let Ok(LzwStatus::Done) = fixed.status {
1006            true
1007        } else {
1008            false
1009        });
1010        assert_eq!(fixed.consumed_in, 2);
1011
1012        // Okay, now test we actually fixed it.
1013        let ref mut compare = [0u8; 4];
1014        let mut todo = &target[..result.consumed_out + fixed.consumed_out];
1015        let mut free = &mut compare[..];
1016        let mut decoder = Decoder::new(BitOrder::Msb, BIT_LEN);
1017
1018        // Decode with up to 16 rounds, far too much but inconsequential.
1019        for _ in 0..16 {
1020            if decoder.has_ended() {
1021                break;
1022            }
1023
1024            let result = decoder.decode_bytes(todo, free);
1025            assert!(result.status.is_ok());
1026            todo = &todo[result.consumed_in..];
1027            free = &mut free[result.consumed_out..];
1028        }
1029
1030        let remaining = { free }.len();
1031        let len = compare.len() - remaining;
1032        assert_eq!(todo, &[]);
1033        assert_eq!(compare[..len], [0, 1, 0]);
1034    }
1035
1036    #[test]
1037    #[should_panic]
1038    fn invalid_code_size_low() {
1039        let _ = Encoder::new(BitOrder::Msb, 1);
1040    }
1041
1042    #[test]
1043    #[should_panic]
1044    fn invalid_code_size_high() {
1045        let _ = Encoder::new(BitOrder::Msb, 14);
1046    }
1047
1048    fn make_decoded() -> Vec<u8> {
1049        const FILE: &'static [u8] =
1050            include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/Cargo.lock"));
1051        return Vec::from(FILE);
1052    }
1053
1054    #[test]
1055    #[cfg(feature = "std")]
1056    fn into_stream_buffer_no_alloc() {
1057        let encoded = make_decoded();
1058        let mut encoder = Encoder::new(BitOrder::Msb, 8);
1059
1060        let mut output = vec![];
1061        let mut buffer = [0; 512];
1062        let mut istream = encoder.into_stream(&mut output);
1063        istream.set_buffer(&mut buffer[..]);
1064        istream.encode(&encoded[..]).status.unwrap();
1065
1066        match istream.buffer {
1067            Some(StreamBuf::Borrowed(_)) => {}
1068            None => panic!("Decoded without buffer??"),
1069            Some(StreamBuf::Owned(_)) => panic!("Unexpected buffer allocation"),
1070        }
1071    }
1072
1073    #[test]
1074    #[cfg(feature = "std")]
1075    fn into_stream_buffer_small_alloc() {
1076        struct WriteTap<W: std::io::Write>(W);
1077        const BUF_SIZE: usize = 512;
1078
1079        impl<W: std::io::Write> std::io::Write for WriteTap<W> {
1080            fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
1081                assert!(buf.len() <= BUF_SIZE);
1082                self.0.write(buf)
1083            }
1084            fn flush(&mut self) -> std::io::Result<()> {
1085                self.0.flush()
1086            }
1087        }
1088
1089        let encoded = make_decoded();
1090        let mut encoder = Encoder::new(BitOrder::Msb, 8);
1091
1092        let mut output = vec![];
1093        let mut istream = encoder.into_stream(WriteTap(&mut output));
1094        istream.set_buffer_size(512);
1095        istream.encode(&encoded[..]).status.unwrap();
1096
1097        match istream.buffer {
1098            Some(StreamBuf::Owned(vec)) => assert!(vec.len() <= BUF_SIZE),
1099            Some(StreamBuf::Borrowed(_)) => panic!("Unexpected borrowed buffer, where from?"),
1100            None => panic!("Decoded without buffer??"),
1101        }
1102    }
1103
1104    #[test]
1105    #[cfg(feature = "std")]
1106    fn reset() {
1107        let encoded = make_decoded();
1108        let mut encoder = Encoder::new(BitOrder::Msb, 8);
1109        let mut reference = None;
1110
1111        for _ in 0..2 {
1112            let mut output = vec![];
1113            let mut buffer = [0; 512];
1114            let mut istream = encoder.into_stream(&mut output);
1115            istream.set_buffer(&mut buffer[..]);
1116            istream.encode_all(&encoded[..]).status.unwrap();
1117
1118            encoder.reset();
1119            if let Some(reference) = &reference {
1120                assert_eq!(output, *reference);
1121            } else {
1122                reference = Some(output);
1123            }
1124        }
1125    }
1126}
weezl/encode.rs

weezl/
encode.rs