// preflate_rs/stream_processor.rs

1/*---------------------------------------------------------------------------------------------
2 *  Copyright (c) Microsoft Corporation. All rights reserved.
3 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
4 *  This software incorporates material from third parties. See NOTICE.txt for details.
5 *--------------------------------------------------------------------------------------------*/
6
7//! Responsible for performing preflate and recreation on a chunk by chunk basis
8
9use std::io::{BufRead, Cursor};
10
11use bitcode::{Decode, Encode};
12use cabac::vp8::{VP8Reader, VP8Writer};
13
14use crate::{
15    Result,
16    cabac_codec::{PredictionDecoderCabac, PredictionEncoderCabac},
17    deflate::{
18        deflate_reader::DeflateParser, deflate_token::DeflateTokenBlock,
19        deflate_writer::DeflateWriter,
20    },
21    estimator::preflate_parameter_estimator::{
22        TokenPredictorParameters, estimate_preflate_parameters,
23    },
24    preflate_error::{AddContext, ExitCode, PreflateError},
25    preflate_input::{PlainText, PreflateInput},
26    statistical_codec::{CodecCorrection, PredictionDecoder, PredictionEncoder},
27    token_predictor::TokenPredictor,
28};
29
/// the data required to reconstruct the deflate stream exactly the way that it was
#[derive(Encode, Decode)]
struct ReconstructionData {
    /// compression parameters estimated from the first chunk of the stream
    pub parameters: TokenPredictorParameters,
    /// cabac-encoded corrections for the places where prediction differed from the real stream
    pub corrections: Vec<u8>,
}
36
37impl ReconstructionData {
38    pub fn read(data: &[u8]) -> Result<Self> {
39        bitcode::decode(data).map_err(|e| {
40            PreflateError::new(
41                ExitCode::InvalidCompressedWrapper,
42                format!("{:?}", e).as_str(),
43            )
44        })
45    }
46}
47
/// Result of a call to PreflateStreamProcessor::decompress
pub struct PreflateStreamChunkResult {
    /// the extra data that is needed to reconstruct the deflate stream exactly as it was written
    pub corrections: Vec<u8>,

    /// the number of bytes that were processed from the compressed stream (this will be exactly the
    /// data that will be recreated using the cabac_encoded data)
    pub compressed_size: usize,

    /// the parameters that were used to compress the stream. Only returned for the first
    /// chunk that is passed in.
    pub parameters: Option<TokenPredictorParameters>,

    /// the deflate token blocks that were parsed out of this chunk of the stream
    pub blocks: Vec<DeflateTokenBlock>,
}
63
/// Takes a stream of deflate compressed data removes the deflate compression, recording the data
/// that can be used to reconstruct it along with the plain-text.
#[derive(Debug)]
pub struct PreflateStreamProcessor {
    /// token predictor carried across chunks; None until the first chunk has been processed
    predictor: Option<TokenPredictor>,
    /// when present, every chunk is recompressed and compared against the original bytes
    validator: Option<RecreateStreamProcessor>,
    /// incremental parser for the raw deflate stream
    parser: DeflateParser,
}
72
73impl PreflateStreamProcessor {
74    /// Creates a new PreflateStreamProcessor
75    /// plain_text_limit: the maximum size of the plain text that will decompressed to memory
76    /// verify: if true, the decompressed data will be recompressed and compared to the original as it is run
77    pub fn new(plain_text_limit: usize, verify: bool) -> Self {
78        Self {
79            predictor: None,
80            parser: DeflateParser::new(plain_text_limit),
81            validator: if verify {
82                Some(RecreateStreamProcessor::new())
83            } else {
84                None
85            },
86        }
87    }
88
89    pub fn is_done(&self) -> bool {
90        self.parser.is_done()
91    }
92
93    pub fn plain_text(&self) -> &PlainText {
94        &self.parser.plain_text()
95    }
96
97    pub fn shrink_to_dictionary(&mut self) {
98        self.parser.shrink_to_dictionary();
99    }
100
101    pub fn detach_plain_text(self) -> PlainText {
102        self.parser.detach_plain_text()
103    }
104
105    /// decompresses a deflate stream and returns the plaintext and cabac_encoded data that can be used to reconstruct it
106    pub fn decompress(&mut self, compressed_data: &[u8]) -> Result<PreflateStreamChunkResult> {
107        let contents = self.parser.parse(compressed_data)?;
108
109        let mut cabac_encoded = Vec::new();
110
111        let mut cabac_encoder =
112            PredictionEncoderCabac::new(VP8Writer::new(&mut cabac_encoded).unwrap());
113
114        if let Some(predictor) = &mut self.predictor {
115            let mut input = PreflateInput::new(&self.parser.plain_text());
116
117            // we are missing the last couple hashes in the dictionary since we didn't
118            // have the full plaintext yet.
119            predictor.add_missing_previous_hash(&input);
120
121            predict_blocks(&contents.blocks, predictor, &mut cabac_encoder, &mut input)?;
122
123            cabac_encoder.finish();
124
125            if let Some(validator) = &mut self.validator {
126                let (recompressed, _rec_blocks) = validator.recompress(
127                    &mut Cursor::new(self.parser.plain_text().text()),
128                    &cabac_encoded,
129                )?;
130
131                #[cfg(test)]
132                for i in 0..contents.blocks.len() {
133                    crate::utils::assert_block_eq(&contents.blocks[i], &_rec_blocks[i]);
134                }
135
136                // we should always succeed here in test code
137                #[cfg(test)]
138                crate::utils::assert_eq_array(
139                    &recompressed,
140                    &compressed_data[..contents.compressed_size],
141                );
142
143                if recompressed[..] != compressed_data[..contents.compressed_size] {
144                    return Err(PreflateError::new(
145                        ExitCode::RoundtripMismatch,
146                        "recompressed data does not match original",
147                    ));
148                }
149            }
150
151            Ok(PreflateStreamChunkResult {
152                corrections: cabac_encoded,
153                compressed_size: contents.compressed_size,
154                parameters: None,
155                blocks: contents.blocks,
156            })
157        } else {
158            let params =
159                estimate_preflate_parameters(&contents, &self.parser.plain_text()).context()?;
160
161            let mut input = PreflateInput::new(&self.parser.plain_text());
162
163            let mut token_predictor = TokenPredictor::new(&params);
164
165            predict_blocks(
166                &contents.blocks,
167                &mut token_predictor,
168                &mut cabac_encoder,
169                &mut input,
170            )?;
171
172            cabac_encoder.finish();
173
174            let reconstruction_data = bitcode::encode(&ReconstructionData {
175                parameters: params,
176                corrections: cabac_encoded,
177            });
178
179            self.predictor = Some(token_predictor);
180
181            if let Some(validator) = &mut self.validator {
182                let (recompressed, _rec_blocks) = validator.recompress(
183                    &mut Cursor::new(self.parser.plain_text().text()),
184                    &reconstruction_data,
185                )?;
186
187                #[cfg(test)]
188                for i in 0..contents.blocks.len() {
189                    crate::utils::assert_block_eq(&contents.blocks[i], &_rec_blocks[i]);
190                }
191
192                // we should always succeed here in test code
193                #[cfg(test)]
194                crate::utils::assert_eq_array(
195                    &recompressed,
196                    &compressed_data[..contents.compressed_size],
197                );
198
199                if recompressed[..] != compressed_data[..contents.compressed_size] {
200                    return Err(PreflateError::new(
201                        ExitCode::RoundtripMismatch,
202                        "recompressed data does not match original",
203                    ));
204                }
205            }
206
207            Ok(PreflateStreamChunkResult {
208                corrections: reconstruction_data,
209                compressed_size: contents.compressed_size,
210                parameters: Some(params),
211                blocks: contents.blocks,
212            })
213        }
214    }
215}
216
217/// Decompresses a deflate stream and returns the plaintext and diff data that can be used to reconstruct it
218/// via recreate_whole_deflate_stream
219pub fn preflate_whole_deflate_stream(
220    compressed_data: &[u8],
221    verify: bool,
222    plain_text_limit: usize,
223) -> Result<(PreflateStreamChunkResult, PlainText)> {
224    let mut state = PreflateStreamProcessor::new(plain_text_limit, verify);
225    let r = state.decompress(compressed_data)?;
226
227    Ok((r, state.parser.detach_plain_text()))
228}
229
/// recreates the original deflate stream, piece-by-piece
#[derive(Debug)]
pub struct RecreateStreamProcessor {
    /// token predictor carried across chunks; None until the first chunk has been processed
    predictor: Option<TokenPredictor>,
    /// accumulates the recreated deflate stream
    writer: DeflateWriter,
    /// plain text accumulated so far (shrunk back to dictionary size between chunks)
    plain_text: PlainText,
}
237
238impl RecreateStreamProcessor {
239    pub fn new() -> Self {
240        Self {
241            predictor: None,
242            writer: DeflateWriter::new(),
243            plain_text: PlainText::new(),
244        }
245    }
246
247    pub fn recompress(
248        &mut self,
249        plain_text: &mut impl BufRead,
250        corrections: &[u8],
251    ) -> Result<(Vec<u8>, Vec<DeflateTokenBlock>)> {
252        loop {
253            let buf = plain_text.fill_buf().context()?;
254            let buf_len = buf.len();
255            if buf_len == 0 {
256                break;
257            }
258
259            self.plain_text.append(&buf);
260
261            plain_text.consume(buf_len);
262        }
263
264        let mut input = PreflateInput::new(&self.plain_text);
265
266        if let Some(predictor) = &mut self.predictor {
267            let mut cabac_decoder =
268                PredictionDecoderCabac::new(VP8Reader::new(Cursor::new(corrections)).unwrap());
269
270            predictor.add_missing_previous_hash(&input);
271
272            let blocks =
273                recreate_blocks(predictor, &mut cabac_decoder, &mut self.writer, &mut input)
274                    .context()?;
275
276            self.plain_text.shrink_to_dictionary();
277
278            self.writer.flush();
279
280            Ok((self.writer.detach_output(), blocks))
281        } else {
282            let r = ReconstructionData::read(corrections)?;
283
284            let mut predictor = TokenPredictor::new(&r.parameters);
285
286            let mut cabac_decoder =
287                PredictionDecoderCabac::new(VP8Reader::new(Cursor::new(r.corrections)).unwrap());
288
289            let blocks = recreate_blocks(
290                &mut predictor,
291                &mut cabac_decoder,
292                &mut self.writer,
293                &mut input,
294            )
295            .context()?;
296
297            self.predictor = Some(predictor);
298
299            self.plain_text.shrink_to_dictionary();
300
301            self.writer.flush();
302
303            Ok((self.writer.detach_output(), blocks))
304        }
305    }
306}
307
308/// recompresses a deflate stream using the cabac_encoded data that was returned from decompress_deflate_stream
309pub fn recreate_whole_deflate_stream(
310    plain_text: &[u8],
311    prediction_corrections: &[u8],
312) -> Result<Vec<u8>> {
313    let mut state = RecreateStreamProcessor::new();
314
315    let (recompressed, _) =
316        state.recompress(&mut Cursor::new(&plain_text), prediction_corrections)?;
317
318    Ok(recompressed)
319}
320
/// takes a deflate compressed stream, analyzes it, decompresses it, and records
/// any differences in the encoder codec
#[cfg(test)]
fn encode_mispredictions(
    deflate: &crate::deflate::deflate_reader::DeflateContents,
    plain_text: &PlainText,
    params: &TokenPredictorParameters,
    encoder: &mut impl PredictionEncoder,
) -> Result<()> {
    let mut predictor = TokenPredictor::new(params);
    let mut input = PreflateInput::new(plain_text);

    predict_blocks(&deflate.blocks, &mut predictor, encoder, &mut input)
}
338
339fn predict_blocks(
340    blocks: &[DeflateTokenBlock],
341    token_predictor: &mut TokenPredictor,
342    encoder: &mut impl PredictionEncoder,
343    input: &mut PreflateInput,
344) -> Result<()> {
345    for i in 0..blocks.len() {
346        token_predictor.predict_block(&blocks[i], encoder, input, i == blocks.len() - 1)?;
347        // end of stream normally is the last block
348        encoder.encode_correction_bool(
349            CodecCorrection::EndOfChunk,
350            i == blocks.len() - 1,
351            input.remaining() == 0,
352        );
353    }
354    assert!(input.remaining() == 0);
355    Ok(())
356}
357
/// Rebuilds the deflate byte stream from the prediction parameters plus the
/// correction stream, returning the compressed bytes and the recreated blocks.
#[cfg(test)]
fn decode_mispredictions(
    params: &TokenPredictorParameters,
    input: &mut PreflateInput,
    decoder: &mut impl crate::statistical_codec::PredictionDecoder,
) -> Result<(Vec<u8>, Vec<DeflateTokenBlock>)> {
    let mut writer = DeflateWriter::new();
    let mut predictor = TokenPredictor::new(params);

    let blocks = recreate_blocks(&mut predictor, decoder, &mut writer, input)?;

    writer.flush();

    Ok((writer.detach_output(), blocks))
}
373
374fn recreate_blocks<D: PredictionDecoder>(
375    token_predictor: &mut TokenPredictor,
376    decoder: &mut D,
377    deflate_writer: &mut DeflateWriter,
378    input: &mut PreflateInput,
379) -> Result<Vec<DeflateTokenBlock>> {
380    let mut output_blocks = Vec::new();
381    loop {
382        let block = token_predictor.recreate_block(decoder, input)?;
383
384        deflate_writer.encode_block(&block)?;
385
386        output_blocks.push(block);
387
388        // end of stream normally is the last block
389        let last =
390            decoder.decode_correction_bool(CodecCorrection::EndOfChunk, input.remaining() == 0);
391
392        if last {
393            break;
394        }
395    }
396    Ok(output_blocks)
397}
398
/// decompresses a deflate stream and returns the plaintext and cabac_encoded data that can be used to reconstruct it
/// This version uses DebugWriter and DebugReader, which are slower but can be used to debug the cabac encoding errors.
#[cfg(test)]
fn decompress_deflate_stream_assert(
    compressed_data: &[u8],
    verify: bool,
) -> Result<(PreflateStreamChunkResult, PlainText)> {
    use crate::deflate::deflate_reader::parse_deflate_whole;
    use crate::preflate_error::AddContext;
    use cabac::debug::{DebugReader, DebugWriter};

    let (contents, plain_text) = parse_deflate_whole(compressed_data)?;
    let params = estimate_preflate_parameters(&contents, &plain_text).context()?;

    let mut encoded = Vec::new();
    let mut encoder = PredictionEncoderCabac::new(DebugWriter::new(&mut encoded).unwrap());

    encode_mispredictions(&contents, &plain_text, &params, &mut encoder)?;
    assert_eq!(contents.compressed_size, compressed_data.len());
    encoder.finish();

    let reconstruction_data = bitcode::encode(&ReconstructionData {
        parameters: params,
        corrections: encoded,
    });

    if verify {
        // decode everything again through the debug reader and make sure
        // the stream round-trips to the original bytes
        let r = ReconstructionData::read(&reconstruction_data)?;

        let mut decoder =
            PredictionDecoderCabac::new(DebugReader::new(Cursor::new(&r.corrections)).unwrap());

        let mut input = PreflateInput::new(&plain_text);
        let (recompressed, _recreated_blocks) =
            decode_mispredictions(&r.parameters, &mut input, &mut decoder)?;

        if recompressed[..] != compressed_data[..] {
            return Err(PreflateError::new(
                ExitCode::RoundtripMismatch,
                "recompressed data does not match original",
            ));
        }
    }

    Ok((
        PreflateStreamChunkResult {
            corrections: reconstruction_data,
            compressed_size: contents.compressed_size,
            parameters: Some(params),
            blocks: contents.blocks,
        },
        plain_text,
    ))
}
458
#[test]
fn verify_roundtrip_assert() {
    crate::init_logging();

    let compressed = crate::utils::read_file("compressed_zlib_level1.deflate");

    let (result, plain_text) = decompress_deflate_stream_assert(&compressed, true).unwrap();
    let recompressed =
        recompress_deflate_stream_assert(&plain_text, &result.corrections).unwrap();
    assert!(compressed == recompressed);
}
471
#[test]
fn verify_roundtrip_zlib() {
    crate::init_logging();

    // round-trips zlib-produced streams at levels 0-8.
    // NOTE(review): `0..9` excludes level 9, even though verify_partial_blocks
    // reads compressed_zlib_level9.deflate — confirm whether the exclusion is intentional.
    for i in 0..9 {
        verify_file(&format!("compressed_zlib_level{}.deflate", i));
    }
}
480
#[test]
fn verify_roundtrip_flate2() {
    crate::init_logging();

    // round-trips flate2-produced streams at levels 0-8.
    // NOTE(review): `0..9` excludes level 9 — confirm whether a level-9 test file exists.
    for i in 0..9 {
        verify_file(&format!("compressed_flate2_level{}.deflate", i));
    }
}
489
#[test]
fn verify_roundtrip_libdeflate() {
    crate::init_logging();

    // round-trips libdeflate-produced streams at levels 0-8.
    // NOTE(review): `0..9` excludes level 9 — confirm whether a level-9 test file exists.
    for i in 0..9 {
        verify_file(&format!("compressed_libdeflate_level{}.deflate", i));
    }
}
498
/// Round-trips a single test file: preflate to plain text + corrections, then
/// recreate the deflate stream and check it matches the original bytes.
#[cfg(test)]
fn verify_file(filename: &str) {
    let compressed = crate::utils::read_file(filename);

    let (result, plain_text) =
        preflate_whole_deflate_stream(&compressed, true, usize::MAX).unwrap();
    let recompressed =
        recreate_whole_deflate_stream(plain_text.text(), &result.corrections).unwrap();
    assert!(compressed == recompressed);
}
508
/// recompresses a deflate stream using the cabac_encoded data that was returned from decompress_deflate_stream
/// This version uses DebugWriter and DebugReader, which are slower and don't compress but can be used to debug the cabac encoding errors.
#[cfg(test)]
fn recompress_deflate_stream_assert(
    plain_text: &PlainText,
    prediction_corrections: &[u8],
) -> Result<Vec<u8>> {
    use cabac::debug::DebugReader;

    let reconstruction = ReconstructionData::read(prediction_corrections)?;

    let mut decoder = PredictionDecoderCabac::new(
        DebugReader::new(Cursor::new(&reconstruction.corrections)).unwrap(),
    );

    let mut input = PreflateInput::new(plain_text);
    let (recompressed, _recreated_blocks) =
        decode_mispredictions(&reconstruction.parameters, &mut input, &mut decoder)?;

    Ok(recompressed)
}
528
/// Round-trips `compressed_data` through the fast (VP8 cabac) encode/decode path
/// and asserts the recompressed bytes are identical to the original.
///
/// `header_crc32`: if present, the crc32 of the decompressed plain text is
/// asserted against it.
/// `uncompressed_size`: receives the plain text length in bytes.
#[cfg(test)]
fn analyze_compressed_data_fast(
    compressed_data: &[u8],
    header_crc32: Option<u32>,
    uncompressed_size: &mut u64,
) {
    use crate::{
        cabac_codec::{PredictionDecoderCabac, PredictionEncoderCabac},
        deflate::deflate_reader::parse_deflate_whole,
    };
    use std::io::Cursor;

    use cabac::vp8::{VP8Reader, VP8Writer};

    let mut buffer = Vec::new();

    let mut cabac_encoder = PredictionEncoderCabac::new(VP8Writer::new(&mut buffer).unwrap());

    let (contents, plain_text) = parse_deflate_whole(compressed_data).unwrap();

    let params = estimate_preflate_parameters(&contents, &plain_text).unwrap();

    println!("params: {:?}", params);

    // record any mispredictions into the cabac stream
    encode_mispredictions(&contents, &plain_text, &params, &mut cabac_encoder).unwrap();

    // if the caller supplied a crc32 (e.g. from a wrapper header), check the plain text against it
    if let Some(crc) = header_crc32 {
        let result_crc = crc32fast::hash(&plain_text.text());
        assert_eq!(result_crc, crc);
    }

    // the whole input should have been consumed
    assert_eq!(contents.compressed_size, compressed_data.len());

    cabac_encoder.finish();

    cabac_encoder.print();

    println!("buffer size: {}", buffer.len());

    // now decode the corrections back into the original deflate stream
    let mut cabac_decoder =
        PredictionDecoderCabac::new(VP8Reader::new(Cursor::new(&buffer)).unwrap());

    let mut input = PreflateInput::new(&plain_text);

    let (recompressed, _recreated_blocks) =
        decode_mispredictions(&params, &mut input, &mut cabac_decoder).unwrap();

    assert!(recompressed[..] == compressed_data[..]);

    *uncompressed_size = plain_text.text().len() as u64;
}
580
/// Round-trips `compressed_data` through a combined codec (verifying encoder +
/// debug cabac encoder) and checks block-by-block that the recreated stream
/// matches the original, including the exact token blocks.
///
/// `header_crc32`: if present, asserted against the crc32 of the plain text.
/// `_deflate_info_dump_level`: currently unused.
/// `uncompressed_size`: receives the plain text length in bytes.
#[cfg(test)]
fn analyze_compressed_data_verify(
    compressed_data: &[u8],
    header_crc32: Option<u32>,
    _deflate_info_dump_level: i32,
    uncompressed_size: &mut u64,
) {
    use crate::{
        cabac_codec::{PredictionDecoderCabac, PredictionEncoderCabac},
        deflate::{deflate_reader::parse_deflate_whole, deflate_token::DeflateTokenBlockType},
        statistical_codec::{VerifyPredictionDecoder, VerifyPredictionEncoder},
        utils::assert_eq_array,
    };
    use cabac::debug::{DebugReader, DebugWriter};
    use std::io::Cursor;

    let mut buffer = Vec::new();

    // encode through both codecs at once so the recorded actions can be replayed
    // and compared against the cabac stream on decode
    let cabac_encoder = PredictionEncoderCabac::new(DebugWriter::new(&mut buffer).unwrap());
    let debug_encoder = VerifyPredictionEncoder::new();

    let mut combined_encoder = (debug_encoder, cabac_encoder);

    let (contents, plain_text) = parse_deflate_whole(compressed_data).unwrap();

    let params = estimate_preflate_parameters(&contents, &plain_text).unwrap();

    println!("params: {:?}", params);

    encode_mispredictions(&contents, &plain_text, &params, &mut combined_encoder).unwrap();

    assert_eq!(contents.compressed_size, compressed_data.len());

    combined_encoder.finish();

    combined_encoder.0.print();

    let actions = combined_encoder.0.actions();

    println!("buffer size: {}", buffer.len());

    // decode with the matching combined decoder pair
    let debug_decoder = VerifyPredictionDecoder::new(actions);
    let cabac_decoder =
        PredictionDecoderCabac::new(DebugReader::new(Cursor::new(&buffer)).unwrap());

    let mut combined_decoder = (debug_decoder, cabac_decoder);
    let mut input = PreflateInput::new(&plain_text);

    let (recompressed, recreated_blocks) =
        decode_mispredictions(&params, &mut input, &mut combined_decoder).unwrap();

    // every recreated block must match the originally parsed block exactly
    assert_eq!(contents.blocks.len(), recreated_blocks.len());
    contents
        .blocks
        .iter()
        .zip(recreated_blocks)
        .enumerate()
        .for_each(|(index, (a, b))| match (&a.block_type, &b.block_type) {
            (
                DeflateTokenBlockType::Stored { uncompressed: a },
                DeflateTokenBlockType::Stored { uncompressed: c },
            ) => {
                assert_eq!(a, c, "uncompressed data differs {index}");
            }
            (
                DeflateTokenBlockType::Huffman {
                    tokens: t1,
                    huffman_type: h1,
                },
                DeflateTokenBlockType::Huffman {
                    tokens: t2,
                    huffman_type: h2,
                },
            ) => {
                assert_eq_array(t1, t2);
                assert_eq!(h1, h2, "huffman type differs {index}");
            }
            _ => panic!("block type differs {index}"),
        });

    assert_eq!(
        recompressed.len(),
        compressed_data.len(),
        "re-compressed version should be same (length)"
    );
    assert!(
        &recompressed[..] == compressed_data,
        "re-compressed version should be same (content)"
    );

    let result_crc = crc32fast::hash(&plain_text.text());

    if let Some(crc) = header_crc32 {
        assert_eq!(crc, result_crc, "crc mismatch");
    }

    *uncompressed_size = plain_text.text().len() as u64;
}
679
/// Runs both the verifying (debug codec) and the fast (VP8 cabac) analysis
/// paths over the same compressed data.
#[cfg(test)]
fn do_analyze(crc: Option<u32>, compressed_data: &[u8]) {
    let mut plain_size = 0;

    analyze_compressed_data_verify(compressed_data, crc, 1, &mut plain_size);
    analyze_compressed_data_fast(compressed_data, crc, &mut plain_size);
}
687
/// verify that levels 1-6 of zlib are compressed without any correction data
///
/// Future work: figure out why level 7 and above are not perfect
#[test]
fn verify_zlib_perfect_compression() {
    crate::init_logging();

    use crate::deflate::deflate_reader::parse_deflate_whole;
    use crate::utils::read_file;

    // NOTE(review): `1..6` only covers levels 1-5, but the header comment above
    // says levels 1-6 — confirm whether this should be `1..=6`.
    for i in 1..6 {
        println!("iteration {}", i);
        let compressed_data: &[u8] =
            &read_file(format!("compressed_zlib_level{i}.deflate").as_str());

        let compressed_data = compressed_data;

        let (contents, plain_text) = parse_deflate_whole(compressed_data).unwrap();

        let params = estimate_preflate_parameters(&contents, &plain_text).unwrap();

        println!("params: {:?}", params);

        // this "encoder" just asserts if anything gets passed to it
        let mut verify_encoder = crate::statistical_codec::AssertDefaultOnlyEncoder {};
        encode_mispredictions(&contents, &plain_text, &params, &mut verify_encoder).unwrap();

        println!("params buffer length {}", bitcode::encode(&params).len());
    }
}
718
#[test]
fn verify_longmatch() {
    crate::init_logging();

    let data = crate::utils::read_file("compressed_flate2_level1_longmatch.deflate");
    do_analyze(None, &data);
}
729
#[test]
fn verify_zlibng() {
    crate::init_logging();

    let data = crate::utils::read_file("compressed_zlibng_level1.deflate");
    do_analyze(None, &data);
}
738
#[test]
fn verify_miniz() {
    crate::init_logging();

    let data = crate::utils::read_file("compressed_minizoxide_level1.deflate");
    do_analyze(None, &data);
}
747
/// this is the deflate stream extracted out of the png file (minus the idat wrapper)
#[test]
fn verify_png_deflate() {
    crate::init_logging();

    let data = crate::utils::read_file("treegdi.extract.deflate");
    do_analyze(None, &data);
}
756
/// Feeds `compressed_data` to a PreflateStreamProcessor in deliberately odd-sized
/// partial chunks (growing the window on ShortRead), then reconstructs the stream
/// chunk-by-chunk with a RecreateStreamProcessor and asserts the result is
/// byte-identical to the original, block by block.
///
/// `plain_text_limit`: maximum plain text held in memory by the processor.
#[cfg(test)]
pub fn analyze_compressed_data_verify_incremental(compressed_data: &[u8], plain_text_limit: usize) {
    use crate::{deflate::deflate_reader::parse_deflate_whole, utils::assert_eq_array};

    // parse the whole stream up front so we can compare blocks at the end
    let (original_con, _) = parse_deflate_whole(compressed_data).unwrap();

    // window of compressed bytes currently offered to the processor;
    // the odd sizes (100001/10001) exercise partial-read handling
    let mut start_offset = 0;
    let mut end_offset = compressed_data.len().min(100001);

    let mut stream = PreflateStreamProcessor::new(plain_text_limit, true);

    let mut plain_text_offset = 0;

    // collected (corrections, plain text) pairs, one per successfully decoded chunk
    let mut expanded_contents = Vec::new();
    while !stream.is_done() {
        let result = stream.decompress(&compressed_data[start_offset..end_offset]);
        match result {
            Ok(r) => {
                println!(
                    "chunk cmp_start={} cmp_size={} blocks={} pt_off={}({})",
                    start_offset,
                    r.compressed_size,
                    r.blocks.len(),
                    plain_text_offset,
                    stream.plain_text().len()
                );
                // advance past the consumed bytes and offer the next window
                start_offset += r.compressed_size;
                end_offset = (start_offset + 10001).min(compressed_data.len());

                plain_text_offset += stream.plain_text().len();
                expanded_contents.push((r.corrections, stream.plain_text().text().to_vec()));

                stream.shrink_to_dictionary();
            }
            Err(e) => {
                if e.exit_code() == ExitCode::PredictionFailure {
                    println!(
                        "Prediction failure for {:?} not great, but some corner cases where the initial estimator isn't totaly right",
                        e
                    );
                    return;
                }
                // only a short read is acceptable here; grow the window and retry
                assert_eq!(
                    e.exit_code(),
                    ExitCode::ShortRead,
                    "unexpected error {:?}",
                    e
                );
                end_offset = (end_offset + 10001).min(compressed_data.len());
            }
        }
    }

    // now reconstruct the data and make sure it is identical
    let mut recompressed = Vec::new();
    let mut reconstructed_blocks = Vec::new();

    let mut reconstruct = RecreateStreamProcessor::new();
    for i in 0..expanded_contents.len() {
        let (mut r, mut b) = reconstruct
            .recompress(
                &mut Cursor::new(&expanded_contents[i].1),
                &expanded_contents[i].0,
            )
            .unwrap();

        println!(
            "reconstruct block offset={} blocks={} pt={}",
            i,
            b.len(),
            expanded_contents[i].1.len()
        );

        recompressed.append(&mut r);
        reconstructed_blocks.append(&mut b);
    }

    //assert_eq!(original_con.blocks.len(), reconstructed_blocks.len());
    for i in 0..original_con.blocks.len() {
        println!("block {}", i);
        crate::utils::assert_block_eq(&original_con.blocks[i], &reconstructed_blocks[i]);
    }

    assert_eq_array(compressed_data, &recompressed);
}
842
#[test]
fn verify_plain_text_limit() {
    crate::init_logging();

    let data = crate::utils::read_file("compressed_zlib_level3.deflate");
    analyze_compressed_data_verify_incremental(&data, 1 * 1024 * 1024);
}
852
/// test partial chunked reading of the stream
#[test]
fn verify_partial_blocks() {
    crate::init_logging();

    for level in 0..=9 {
        let name = format!("compressed_zlib_level{}.deflate", level);
        analyze_compressed_data_verify_incremental(&crate::utils::read_file(&name), usize::MAX);
    }
}