struct_compression_analyzer/utils/
analyze_utils.rs

1//! Utility functions for analyzing and processing bit-packed data.
2//!
3//! This module provides low-level utilities for:
4//! - Size estimation and compression
5//! - Bit manipulation and ordering
6//! - Bitstream reader/writer creation and management
7//!
8//! # Core Functions
9//!
10//! - [`size_estimate`]: Estimates compressed data size based on LZ matches and entropy
11//! - [`get_zstd_compressed_size`]: Calculates actual compressed size using zstandard
12//! - [`calculate_file_entropy`]: Computes Shannon entropy of input data
13//! - [`reverse_bits`]: Reverses bits in a u64 value
14//!
15//! # Bitstream Utilities
16//!
17//! - [`create_bit_reader`]: Creates a [`BitReaderContainer`] with specified endianness
18//! - [`create_bit_writer`]: Creates a [`BitWriterContainer`] with specified endianness
19//! - [`create_bit_writer_with_owned_data`]: Creates writer containing copied data
20//! - [`get_writer_buffer`]: Retrieves underlying buffer from a writer
21//! - [`bit_writer_to_reader`]: Converts a writer into a reader
22//!
23//! # Types
24//!
25//! - [`BitReaderContainer`]: Wrapper around bit readers supporting both endians
26//! - [`BitWriterContainer`]: Wrapper around bit writers supporting both endians
27
28use crate::{analyzer::SizeEstimationParameters, schema::BitOrder};
29use bitstream_io::{BigEndian, BitRead, BitReader, BitWrite, BitWriter, LittleEndian};
30use lossless_transform_utils::{
31    entropy::code_length_of_histogram32,
32    histogram::{histogram32_from_bytes, Histogram32},
33};
34use std::io::{self, Cursor, SeekFrom};
35
36/// Estimate size of a compressed data based on precalculated LZ matches and entropy
37///
38/// # Arguments
39///
40/// * `params` - [`SizeEstimationParameters`] containing:
41///     * `data_len` - The uncompressed data length
42///     * `num_lz_matches` - The number of LZ matches
43///     * `entropy` - The estimated entropy of the data
44///     * `lz_match_multiplier` - Multiplier for LZ matches
45///     * `entropy_multiplier` - Multiplier for entropy
46///
47/// # Returns
48///
49/// This is a rough estimation based on very limited testing on DXT1, only, you'll want to
50/// replace this function with something more suitable for your use case, possibly.
51pub fn size_estimate(params: SizeEstimationParameters) -> usize {
52    // Calculate expected bytes after LZ
53    let bytes_after_lz =
54        params.data_len - (params.num_lz_matches as f64 * params.lz_match_multiplier) as usize;
55
56    // Calculate expected bits and convert to bytes
57    (bytes_after_lz as f64 * params.entropy * params.entropy_multiplier).ceil() as usize / 8
58}
59
60/// Determines the actual size of the compressed data by compressing with a realistic compressor.
61pub fn get_zstd_compressed_size(data: &[u8], level: i32) -> u64 {
62    zstd::bulk::compress(data, level)
63        .ok()
64        .map(|compressed| compressed.len())
65        .unwrap() as u64
66}
67
68/// Calculates the entropy of a given input
69pub fn calculate_file_entropy(bytes: &[u8]) -> f64 {
70    let mut histogram = Histogram32::default();
71    histogram32_from_bytes(bytes, &mut histogram);
72    code_length_of_histogram32(&histogram, bytes.len() as u64)
73}
74
/// Reverses the lowest `max_bits` bits of a u64 value.
///
/// Bit `x` of the input (for `x < max_bits`) ends up at position `max_bits - 1 - x`
/// of the result. Input bits at or above `max_bits` are ignored.
///
/// # Arguments
/// * `max_bits` - The number of bits to reverse (0 to 64 inclusive)
/// * `bits` - The bits to reverse
///
/// # Returns
/// The reversed bits; `0` when `max_bits` is `0`.
///
/// # Panics
/// Panics if `max_bits` is greater than 64, since such a window does not fit in a `u64`.
pub fn reverse_bits(max_bits: u32, bits: u64) -> u64 {
    /// Width of the value being reversed.
    const WORD_BITS: u32 = 64;

    assert!(
        max_bits <= WORD_BITS,
        "max_bits must be at most 64, got {}",
        max_bits
    );

    // A zero-width window has nothing to reverse; it also avoids a shift by 64 below.
    if max_bits == 0 {
        return 0;
    }

    // Reversing the whole word moves the low `max_bits` bits to the top; shifting right
    // brings them back down and discards the (reversed) bits that were above the window.
    bits.reverse_bits() >> (WORD_BITS - max_bits)
}
91
/// Wrapper around the `BitReader` type that allows it to be used with either endian.
///
/// Both variants wrap a reader over a [`Cursor`] on a borrowed byte slice; they differ
/// only in the endianness type parameter of the wrapped reader.
pub enum BitReaderContainer<'a> {
    /// Reader parameterized with [`BigEndian`].
    Msb(BitReader<Cursor<&'a [u8]>, BigEndian>),
    /// Reader parameterized with [`LittleEndian`].
    Lsb(BitReader<Cursor<&'a [u8]>, LittleEndian>),
}
97
98impl BitReaderContainer<'_> {
99    pub fn read(&mut self, bits: u32) -> io::Result<u64> {
100        match self {
101            BitReaderContainer::Msb(reader) => reader.read(bits),
102            BitReaderContainer::Lsb(reader) => reader.read(bits),
103        }
104    }
105
106    pub fn seek_bits(&mut self, seekfrom: SeekFrom) -> io::Result<u64> {
107        match self {
108            BitReaderContainer::Msb(reader) => reader.seek_bits(seekfrom),
109            BitReaderContainer::Lsb(reader) => reader.seek_bits(seekfrom),
110        }
111    }
112}
113
114/// Creates a [`BitReaderContainer`] instance based on the given [`BitOrder`].
115///
116/// # Arguments
117///
118/// * `data` - The data to create the bit reader from.
119/// * `bit_order` - The endianness of the bit stream.
120///
121/// # Returns
122/// A [`BitReaderContainer`] instance with the specified endianness.
123pub fn create_bit_reader(data: &[u8], bit_order: BitOrder) -> BitReaderContainer<'_> {
124    match bit_order {
125        BitOrder::Default | BitOrder::Msb => {
126            BitReaderContainer::Msb(BitReader::endian(Cursor::new(data), BigEndian))
127        }
128        BitOrder::Lsb => {
129            BitReaderContainer::Lsb(BitReader::endian(Cursor::new(data), LittleEndian))
130        }
131    }
132}
133
/// Wrapper around the `BitWriter` type that allows it to be used with either endian.
///
/// Both variants wrap a writer over a [`Cursor`] on an owned `Vec<u8>`; they differ
/// only in the endianness type parameter of the wrapped writer.
pub enum BitWriterContainer {
    /// Writer parameterized with [`BigEndian`].
    Msb(BitWriter<Cursor<Vec<u8>>, BigEndian>),
    /// Writer parameterized with [`LittleEndian`].
    Lsb(BitWriter<Cursor<Vec<u8>>, LittleEndian>),
}
142
143/// Creates a [`BitWriterContainer`] instance based on the given [`BitOrder`].
144///
145/// # Arguments
146///
147/// * `bit_order` - The endianness of the bit stream.
148///
149/// # Returns
150/// A [`BitWriterContainer`] instance with the specified endianness.
151pub fn create_bit_writer(bit_order: BitOrder) -> BitWriterContainer {
152    match bit_order {
153        BitOrder::Default | BitOrder::Msb => {
154            BitWriterContainer::Msb(BitWriter::endian(Cursor::new(Vec::new()), BigEndian))
155        }
156        BitOrder::Lsb => {
157            BitWriterContainer::Lsb(BitWriter::endian(Cursor::new(Vec::new()), LittleEndian))
158        }
159    }
160}
161
162/// Creates a [`BitWriterContainer`] instance based on the given [`BitOrder`].
163/// This copies the supplied data into a new buffer, which is then owned by the container.
164///
165/// # Arguments
166///
167/// * `data` - The data to create the bit reader from.
168/// * `bit_order` - The endianness of the bit stream.
169///
170/// # Returns
171/// A [`BitWriterContainer`] instance with the specified endianness.
172pub fn create_bit_writer_with_owned_data(data: &[u8], bit_order: BitOrder) -> BitWriterContainer {
173    match bit_order {
174        BitOrder::Default | BitOrder::Msb => {
175            let mut cursor = Cursor::new(data.to_vec());
176            cursor.set_position(data.len() as u64);
177            BitWriterContainer::Msb(BitWriter::endian(cursor, BigEndian))
178        }
179        BitOrder::Lsb => {
180            let mut cursor = Cursor::new(data.to_vec());
181            cursor.set_position(data.len() as u64);
182            BitWriterContainer::Lsb(BitWriter::endian(cursor, LittleEndian))
183        }
184    }
185}
186
187/// Retrieves the buffer behind a [`BitWriterContainer`] instance.
188///
189/// # Arguments
190///
191/// * `writer` - The [`BitWriterContainer`] instance to retrieve the buffer from.
192///
193/// # Returns
194/// A reference to the buffer behind the [`BitWriterContainer`] instance.
195pub fn get_writer_buffer(writer: &mut BitWriterContainer) -> &[u8] {
196    match writer {
197        BitWriterContainer::Msb(writer) => {
198            writer.byte_align().unwrap();
199            writer.writer().unwrap().get_ref()
200        }
201        BitWriterContainer::Lsb(writer) => {
202            writer.byte_align().unwrap();
203            writer.writer().unwrap().get_ref()
204        }
205    }
206}
207
208/// Converts a [`BitWriterContainer`] instance into a [`BitReaderContainer`] instance.
209///
210/// # Arguments
211///
212/// * `writer` - The [`BitWriterContainer`] instance to convert.
213///
214/// # Returns
215/// A [`BitReaderContainer`] instance containing the same data as the input [`BitWriterContainer`].
216pub fn bit_writer_to_reader(writer: &mut BitWriterContainer) -> BitReaderContainer {
217    match writer {
218        BitWriterContainer::Msb(writer) => {
219            writer.byte_align().unwrap();
220            let array = writer.writer().unwrap().get_ref();
221            BitReaderContainer::Msb(BitReader::endian(Cursor::new(array), BigEndian))
222        }
223        BitWriterContainer::Lsb(writer) => {
224            writer.byte_align().unwrap();
225            let array = writer.writer().unwrap().get_ref();
226            BitReaderContainer::Lsb(BitReader::endian(Cursor::new(array), LittleEndian))
227        }
228    }
229}
230
#[cfg(test)]
mod tests {
    use super::*;

    /// Repetitive text should come out of zstd smaller than it went in.
    #[test]
    fn zstd_compression_estimate() {
        let input: &[u8] = b"This is a test string that should compress well with zstandard zstandard zstandard zstandard zstandard zstandard";
        let original_len = input.len() as u64;

        let compressed_len = get_zstd_compressed_size(input, 16);

        assert!(compressed_len < original_len);
    }
}