struct_compression_analyzer/utils/analyze_utils.rs
1//! Utility functions for analyzing and processing bit-packed data.
2//!
3//! This module provides low-level utilities for:
4//! - Size estimation and compression
5//! - Bit manipulation and ordering
6//! - Bitstream reader/writer creation and management
7//!
8//! # Core Functions
9//!
10//! - [`size_estimate`]: Estimates compressed data size based on LZ matches and entropy
11//! - [`get_zstd_compressed_size`]: Calculates actual compressed size using zstandard
12//! - [`calculate_file_entropy`]: Computes Shannon entropy of input data
13//! - [`reverse_bits`]: Reverses bits in a u64 value
14//!
15//! # Bitstream Utilities
16//!
17//! - [`create_bit_reader`]: Creates a [`BitReaderContainer`] with specified endianness
18//! - [`create_bit_writer`]: Creates a [`BitWriterContainer`] with specified endianness
19//! - [`create_bit_writer_with_owned_data`]: Creates writer containing copied data
20//! - [`get_writer_buffer`]: Retrieves underlying buffer from a writer
21//! - [`bit_writer_to_reader`]: Converts a writer into a reader
22//!
23//! # Types
24//!
25//! - [`BitReaderContainer`]: Wrapper around bit readers supporting both endians
26//! - [`BitWriterContainer`]: Wrapper around bit writers supporting both endians
27
28use crate::{analyzer::SizeEstimationParameters, schema::BitOrder};
29use bitstream_io::{BigEndian, BitRead, BitReader, BitWrite, BitWriter, LittleEndian};
30use lossless_transform_utils::{
31 entropy::code_length_of_histogram32,
32 histogram::{histogram32_from_bytes, Histogram32},
33};
34use std::io::{self, Cursor, SeekFrom};
35
36/// Estimate size of a compressed data based on precalculated LZ matches and entropy
37///
38/// # Arguments
39///
40/// * `params` - [`SizeEstimationParameters`] containing:
41/// * `data_len` - The uncompressed data length
42/// * `num_lz_matches` - The number of LZ matches
43/// * `entropy` - The estimated entropy of the data
44/// * `lz_match_multiplier` - Multiplier for LZ matches
45/// * `entropy_multiplier` - Multiplier for entropy
46///
47/// # Returns
48///
49/// This is a rough estimation based on very limited testing on DXT1, only, you'll want to
50/// replace this function with something more suitable for your use case, possibly.
51pub fn size_estimate(params: SizeEstimationParameters) -> usize {
52 // Calculate expected bytes after LZ
53 let bytes_after_lz =
54 params.data_len - (params.num_lz_matches as f64 * params.lz_match_multiplier) as usize;
55
56 // Calculate expected bits and convert to bytes
57 (bytes_after_lz as f64 * params.entropy * params.entropy_multiplier).ceil() as usize / 8
58}
59
60/// Determines the actual size of the compressed data by compressing with a realistic compressor.
61pub fn get_zstd_compressed_size(data: &[u8], level: i32) -> u64 {
62 zstd::bulk::compress(data, level)
63 .ok()
64 .map(|compressed| compressed.len())
65 .unwrap() as u64
66}
67
68/// Calculates the entropy of a given input
69pub fn calculate_file_entropy(bytes: &[u8]) -> f64 {
70 let mut histogram = Histogram32::default();
71 histogram32_from_bytes(bytes, &mut histogram);
72 code_length_of_histogram32(&histogram, bytes.len() as u64)
73}
74
/// Reverses the order of the lowest `max_bits` bits of a `u64` value.
///
/// Bit `0` of `bits` becomes bit `max_bits - 1` of the result, bit `1` becomes bit
/// `max_bits - 2`, and so on. Bits of `bits` at position `max_bits` or above are ignored.
///
/// # Arguments
///
/// * `max_bits` - The number of bits to reverse. Values above 64 are treated as 64,
///   since a `u64` has no more bits to reverse.
/// * `bits` - The bits to reverse
///
/// # Returns
///
/// The reversed bits; `0` when `max_bits` is `0`.
pub fn reverse_bits(max_bits: u32, bits: u64) -> u64 {
    // Clamp so the shift amount below can never exceed the width of a u64.
    let width = max_bits.min(u64::BITS);
    if width == 0 {
        // Nothing to reverse; also avoids shifting a u64 by 64, which would overflow.
        return 0;
    }

    // Mirror all 64 bits, then shift the mirrored low `width` bits back down to bit 0.
    // Input bits at or above `width` end up below bit 0 and are discarded by the shift.
    bits.reverse_bits() >> (u64::BITS - width)
}
91
/// Wrapper around the `BitReader` type that allows it to be used with either endian.
///
/// Each variant holds a [`BitReader`] over a [`Cursor`] of a borrowed byte slice,
/// so the container cannot outlive the data it reads from.
pub enum BitReaderContainer<'a> {
    /// Reader parameterised with [`BigEndian`].
    Msb(BitReader<Cursor<&'a [u8]>, BigEndian>),
    /// Reader parameterised with [`LittleEndian`].
    Lsb(BitReader<Cursor<&'a [u8]>, LittleEndian>),
}
97
98impl BitReaderContainer<'_> {
99 pub fn read(&mut self, bits: u32) -> io::Result<u64> {
100 match self {
101 BitReaderContainer::Msb(reader) => reader.read(bits),
102 BitReaderContainer::Lsb(reader) => reader.read(bits),
103 }
104 }
105
106 pub fn seek_bits(&mut self, seekfrom: SeekFrom) -> io::Result<u64> {
107 match self {
108 BitReaderContainer::Msb(reader) => reader.seek_bits(seekfrom),
109 BitReaderContainer::Lsb(reader) => reader.seek_bits(seekfrom),
110 }
111 }
112}
113
114/// Creates a [`BitReaderContainer`] instance based on the given [`BitOrder`].
115///
116/// # Arguments
117///
118/// * `data` - The data to create the bit reader from.
119/// * `bit_order` - The endianness of the bit stream.
120///
121/// # Returns
122/// A [`BitReaderContainer`] instance with the specified endianness.
123pub fn create_bit_reader(data: &[u8], bit_order: BitOrder) -> BitReaderContainer<'_> {
124 match bit_order {
125 BitOrder::Default | BitOrder::Msb => {
126 BitReaderContainer::Msb(BitReader::endian(Cursor::new(data), BigEndian))
127 }
128 BitOrder::Lsb => {
129 BitReaderContainer::Lsb(BitReader::endian(Cursor::new(data), LittleEndian))
130 }
131 }
132}
133
/// Wrapper around the `BitWriter` type that allows it to be used with either endian.
///
/// Each variant holds a [`BitWriter`] over a [`Cursor`] that owns its `Vec<u8>` buffer.
pub enum BitWriterContainer {
    /// Writer parameterised with [`BigEndian`].
    Msb(BitWriter<Cursor<Vec<u8>>, BigEndian>),
    /// Writer parameterised with [`LittleEndian`].
    Lsb(BitWriter<Cursor<Vec<u8>>, LittleEndian>),
}
142
143/// Creates a [`BitWriterContainer`] instance based on the given [`BitOrder`].
144///
145/// # Arguments
146///
147/// * `bit_order` - The endianness of the bit stream.
148///
149/// # Returns
150/// A [`BitWriterContainer`] instance with the specified endianness.
151pub fn create_bit_writer(bit_order: BitOrder) -> BitWriterContainer {
152 match bit_order {
153 BitOrder::Default | BitOrder::Msb => {
154 BitWriterContainer::Msb(BitWriter::endian(Cursor::new(Vec::new()), BigEndian))
155 }
156 BitOrder::Lsb => {
157 BitWriterContainer::Lsb(BitWriter::endian(Cursor::new(Vec::new()), LittleEndian))
158 }
159 }
160}
161
162/// Creates a [`BitWriterContainer`] instance based on the given [`BitOrder`].
163/// This copies the supplied data into a new buffer, which is then owned by the container.
164///
165/// # Arguments
166///
167/// * `data` - The data to create the bit reader from.
168/// * `bit_order` - The endianness of the bit stream.
169///
170/// # Returns
171/// A [`BitWriterContainer`] instance with the specified endianness.
172pub fn create_bit_writer_with_owned_data(data: &[u8], bit_order: BitOrder) -> BitWriterContainer {
173 match bit_order {
174 BitOrder::Default | BitOrder::Msb => {
175 let mut cursor = Cursor::new(data.to_vec());
176 cursor.set_position(data.len() as u64);
177 BitWriterContainer::Msb(BitWriter::endian(cursor, BigEndian))
178 }
179 BitOrder::Lsb => {
180 let mut cursor = Cursor::new(data.to_vec());
181 cursor.set_position(data.len() as u64);
182 BitWriterContainer::Lsb(BitWriter::endian(cursor, LittleEndian))
183 }
184 }
185}
186
187/// Retrieves the buffer behind a [`BitWriterContainer`] instance.
188///
189/// # Arguments
190///
191/// * `writer` - The [`BitWriterContainer`] instance to retrieve the buffer from.
192///
193/// # Returns
194/// A reference to the buffer behind the [`BitWriterContainer`] instance.
195pub fn get_writer_buffer(writer: &mut BitWriterContainer) -> &[u8] {
196 match writer {
197 BitWriterContainer::Msb(writer) => {
198 writer.byte_align().unwrap();
199 writer.writer().unwrap().get_ref()
200 }
201 BitWriterContainer::Lsb(writer) => {
202 writer.byte_align().unwrap();
203 writer.writer().unwrap().get_ref()
204 }
205 }
206}
207
208/// Converts a [`BitWriterContainer`] instance into a [`BitReaderContainer`] instance.
209///
210/// # Arguments
211///
212/// * `writer` - The [`BitWriterContainer`] instance to convert.
213///
214/// # Returns
215/// A [`BitReaderContainer`] instance containing the same data as the input [`BitWriterContainer`].
216pub fn bit_writer_to_reader(writer: &mut BitWriterContainer) -> BitReaderContainer {
217 match writer {
218 BitWriterContainer::Msb(writer) => {
219 writer.byte_align().unwrap();
220 let array = writer.writer().unwrap().get_ref();
221 BitReaderContainer::Msb(BitReader::endian(Cursor::new(array), BigEndian))
222 }
223 BitWriterContainer::Lsb(writer) => {
224 writer.byte_align().unwrap();
225 let array = writer.writer().unwrap().get_ref();
226 BitReaderContainer::Lsb(BitReader::endian(Cursor::new(array), LittleEndian))
227 }
228 }
229}
230
#[cfg(test)]
mod tests {
    use super::*;

    /// Highly repetitive input must come out smaller after zstd compression.
    #[test]
    fn zstd_compression_estimate() {
        let input = b"This is a test string that should compress well with zstandard zstandard zstandard zstandard zstandard zstandard";
        let original_len = input.len() as u64;

        let compressed_len = get_zstd_compressed_size(input, 16);

        assert!(compressed_len < original_len);
    }
}