lossless-transform-utils 0.1.3

General purpose utility methods for creating lossless transforms for various file formats.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
//! Match Estimator
//!
//! This module provides functions for estimating the number of matches in the data, once LZ
//! compression is applied to a given byte array.
use core::alloc::Layout;
use safe_allocator_api::RawAlloc;
#[cfg(all(
    any(target_arch = "x86", target_arch = "x86_64"),
    any(feature = "estimator-avx512", feature = "estimator-avx2")
))]
use std::is_x86_feature_detected;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "estimator-avx2")]
mod avx2;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "estimator-avx512")]
mod avx512;

/// # Golden Ratio constant used for better hash scattering
/// https://softwareengineering.stackexchange.com/a/402543
/// It's a very 'irrational' number, the most, dare I say.
const GOLDEN_RATIO: u32 = 0x9E3779B1_u32;

// These values are based around CPU cache data sizes:
//
// - AMD Ryzen 1000/2000/3000/5000/7000: 32K L1
// - Intel 4790k/6700k/7700k/9900k: 32K L1
//
// Newer CPUs can have slightly larger L1:
// - Intel 11700k: 48K
// - Intel 12700k, 13700k, 14700k, 265k (2024): 48K (PCore), 32K (ECore)
// - AMD Ryzen 9000: 48K
//
// Since this workload can last a longer time, we're going to completely fill up our L1
// cache; hoping that increasing accuracy will give us better results.
//
// # Hardware prefetching of data
//
// Exact info on 'how much to prefetch' is hard to come by, but in the 'Intel Optimization Manual'
// I found this for L3.
//
// > Automatic hardware prefetch can bring cache lines into the unified last-level cache
// > based on prior data misses. It will attempt to prefetch ***two cache lines ahead**** of
// > the prefetch stream.
//
// For L1, this quote may be relevant:
//
// > This prefetcher, also known as the streaming prefetcher, is
// > triggered by an ascending access to very recently loaded data. The processor assumes that this
// > access is part of a streaming algorithm and automatically fetches ***the next line***.
//
// In this case it says (1 line).
// In other words, space contention between read in data and the hash table is not going to be too
// big, perf shouldn't change much.
//
// # What does testing say?
//
// On my 5900X with 32K of L1, using a HASH_BITS == 14 == 32K table (of u16s) yields optimal results
// when using `u16` as the value type. However, I decided to extend to the `u32` type, to speed up
// computation a little as we can skip a shift and zero extend.
//
// When table fits in L1 cache, doing compares with u32(s) is faster (any data):
// - u16 -> 1.45GiB/s
// - u32 -> 1.77GiB/s
//
// However, because we overrun the L1 cache, we do suffer a speed penalty (HASH_BITS == 14):
// - u32, Random Data -> 1.48GiB/s
// - u32, Repeating Data -> 1.77GiB/s
//
// When data is compressible, penalty is less. When not compressible, it is more.
// This should be fairly natural, as repeating numbers means accessing same slots in hash table,
// making parts of the table rarely accessed.
//
// For exact numbers, run benchmark.
// In any case, it is still faster, AND more accurate, because we compare more bits; it's a win-win.
//
// # When to change this value?
//
// When *common* CPUs hit the next power of 2 on L1, we'll lift this, but this probably wouldn't
// happen for quite a while. That said, the lift to next power of 2 only brings marginal improvements
// accuracy wise.
//
// Fun, semi-related reading: https://en.algorithmica.org/hpc/cpu-cache/associativity/#hardware-caches
// And I found this after writing all this code: https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
const HASH_BITS: usize = 15; // 2^15 = 32k (of u32s) == 128KBytes
const HASH_SIZE: usize = 1 << HASH_BITS;
#[allow(dead_code)]
const HASH_MASK: u32 = (HASH_SIZE - 1) as u32;

/// Estimates the number of >=3 byte LZ matches in a given input data stream.
/// This implementation sacrifices a bit of accuracy for speed, i.e. it focuses more on shorter
/// range matches.
///
/// # Arguments
///
/// * `bytes` - The input data stream.
///
/// # Returns
///
/// The estimate number of >=3 byte LZ matches.
/// This number is an estimate, it is not an exact amount.
///
/// # Remarks
///
/// This function is optimized around more modern speedy LZ compressors; namely, those which
/// match 3 or more bytes at a time.
///
/// Do note that this is an estimator; it is not an exact number; but the number should be accurate-ish
/// given that we use 32-bit hashes (longer than 24-bit source). Think of this as equivalent to a
/// 'fast mode'/low compression level mode.
pub fn estimate_num_lz_matches_fast(bytes: &[u8]) -> usize {
    // This table stores 3 byte hashes, each hash is transformed
    let layout = unsafe { Layout::from_size_align_unchecked(size_of::<u32>() * HASH_SIZE, 64) };
    let mut alloc = RawAlloc::new_zeroed(layout).unwrap();
    let hash_table = unsafe { &mut *(alloc.as_mut_ptr() as *mut [u32; HASH_SIZE]) };

    let mut matches = 0;
    let begin_ptr = bytes.as_ptr();
    unsafe {
        // 7 == (4) u32 match (4 bytes), using hash
        //      +3 bytes for offset
        // We're dropping it, this is an estimation, after all.
        let end_ptr = begin_ptr.add(bytes.len().saturating_sub(7)); // min 0
        calculate_matches_impl(hash_table, &mut matches, begin_ptr, end_ptr);
    }

    matches
}

#[inline(always)]
fn calculate_matches_impl(
    hash_table: &mut [u32; HASH_SIZE],
    matches: &mut usize,
    begin_ptr: *const u8,
    end_ptr: *const u8,
) {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        #[cfg(feature = "estimator-avx512")]
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            unsafe {
                avx512::calculate_matches_avx512(hash_table, matches, begin_ptr, end_ptr);
                return;
            }
        }

        #[cfg(feature = "estimator-avx2")]
        if is_x86_feature_detected!("avx2") {
            unsafe {
                avx2::calculate_matches_avx2(hash_table, matches, begin_ptr, end_ptr);
                return;
            }
        }
    }

    unsafe {
        calculate_matches_generic(hash_table, matches, begin_ptr, end_ptr);
    }
}

// Generic, for any CPU.
#[inline(never)] // try reduce register pressure
pub(crate) unsafe fn calculate_matches_generic(
    hash_table: &mut [u32; HASH_SIZE],
    matches: &mut usize,
    mut begin_ptr: *const u8,
    end_ptr: *const u8,
) {
    // We're doing a little 'trick' here.
    // Because doing a lookup earlier in the buffer is a bit expensive, cache wise, and because
    // this is an estimate, rather than an accurate lookup.

    while begin_ptr < end_ptr {
        // Note: I had this in nicer form
        // - hash_u32(read_3_byte_le_unaligned(begin_ptr, ofs))
        // But LLVM failed to optimize properly when `target-cpu != native`

        // Get the values at x+0, x+1, x+2, x+3
        let d0 = read_4_byte_le_unaligned(begin_ptr, 0);
        let d1 = read_4_byte_le_unaligned(begin_ptr, 1);
        let d2 = read_4_byte_le_unaligned(begin_ptr, 2);
        let d3 = read_4_byte_le_unaligned(begin_ptr, 3);
        begin_ptr = begin_ptr.add(4);

        // Drop the byte we're not accounting for
        let d0 = reduce_to_3byte(d0);
        let d1 = reduce_to_3byte(d1);
        let d2 = reduce_to_3byte(d2);
        let d3 = reduce_to_3byte(d3);

        // Convert to hashes.
        let h0 = hash_u32(d0);
        let h1 = hash_u32(d1);
        let h2 = hash_u32(d2);
        let h3 = hash_u32(d3);

        // Good reading: https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
        // In this case, I create a multiplicative hash which creates a large 'sea of red'
        // as I call it https://probablydance.com/wp-content/uploads/2018/06/avalanche_fibonacci1.png
        // near the top bits.

        // We get rid of this 'sea of red' by shifting right `32 - HASH_BITS`, and then AND-ing
        // to fit our value in the mask. I'm not sure if it's the article not specifying bit
        // order here, or whether my results are 'backwards',

        // Use HASH_MASK bits for index into HASH_SIZE table
        // Note: We don't need to AND with HASH_MASK because we're only taking upper bits.
        let index0 = (h0 >> (32 - HASH_BITS)) as usize;
        let index1 = (h1 >> (32 - HASH_BITS)) as usize;
        let index2 = (h2 >> (32 - HASH_BITS)) as usize;
        let index3 = (h3 >> (32 - HASH_BITS)) as usize;

        // Increment matches if the 32-bit data at the table matches
        // (which indicates a very likely LZ match)
        *matches += (hash_table[index0] == d0) as usize;
        *matches += (hash_table[index1] == d1) as usize;
        *matches += (hash_table[index2] == d2) as usize;
        *matches += (hash_table[index3] == d3) as usize;

        // Update the data at the given index.
        hash_table[index0] = d0;
        hash_table[index1] = d1;
        hash_table[index2] = d2;
        hash_table[index3] = d3;
    }
}

/// Hashes a 32-bit value by multiplying it with the golden ratio,
/// ensuring that it
#[inline(always)]
#[allow(dead_code)]
pub(crate) fn hash_u32(value: u32) -> u32 {
    value.wrapping_mul(GOLDEN_RATIO)
}

/// Reads a 3 byte value from a 32-bit unaligned pointer.
///
/// # Safety
///
/// This function dereferences a raw pointer
#[inline(always)]
#[allow(dead_code)]
pub(crate) unsafe fn read_4_byte_le_unaligned(ptr: *const u8, offset: usize) -> u32 {
    (ptr.add(offset) as *const u32).read_unaligned().to_le()
}

/// Drops the upper 8 bits of a 32-bit value.
#[inline(always)]
#[allow(dead_code)]
pub(crate) fn reduce_to_3byte(value: u32) -> u32 {
    value & 0xFFFFFF
}

#[cfg(test)]
mod tests {
    use rstest::rstest;

    use super::*;
    use core::slice;
    use std::borrow::ToOwned;
    use std::format;
    use std::vec::Vec;
    use std::{println, vec};

    #[test]
    fn can_hash_u32() {
        // Test that different inputs produce different hashes
        assert_ne!(
            hash_u32(1),
            hash_u32(2),
            "Different inputs should produce different hashes"
        );
    }

    #[test]
    fn is_zero_on_empty_input() {
        let empty: Vec<u8> = vec![];
        assert_eq!(
            estimate_num_lz_matches_fast(&empty),
            0,
            "Empty input should return 0 matches"
        );
    }

    #[test]
    fn is_0_on_small_input() {
        // Test input smaller than minimum required length (7 bytes)
        let small = vec![1, 2, 3, 4, 5, 6];
        assert_eq!(
            estimate_num_lz_matches_fast(&small),
            0,
            "Input smaller than 7 bytes should return 0 matches"
        );
    }

    #[rstest]
    #[case(1 << 17, 0.001)] // 128K data (zstd block size), 0.1% error margin
    #[case(16777215, 0.001)] // 16.8MiB data, 0.1% error margin
    fn with_no_repetition_should_have_no_matches(
        #[case] test_size: usize,
        #[case] allowed_error: f32,
    ) {
        let expected = (test_size as f32 * allowed_error) as usize;

        // Use u16 sequence for 128K test, 3-byte sequence for larger tests
        let data = if test_size == 1 << 17 {
            // Create sequence with no repetitions using u16
            let unique: Vec<u16> = (0..u16::MAX).collect();
            cast_u16_slice_to_u8_slice(&unique).to_vec()
        } else {
            // Generate a sequence of all unique 3-byte integers
            // Since the estimator matches for >= 3 bytes, this should ideally return
            // a number as close to 0 as possible.
            generate_unique_3byte_sequence(test_size / 3)
        };

        // There should actually be 0 matches, but there's always going to be a bit of
        // error with hash collisions.
        let matches = estimate_num_lz_matches_fast(&data);
        println!(
            "[res:no_matches_{}] matches: {}, expected: < {}, allowed_error: {:.1}%, actual_error: {:.3}%",
            if test_size == 1 << 17 {
                "128k".to_owned()
            } else {
                format!("long_distance_{test_size}")
            },
            matches,
            expected,
            allowed_error * 100.0,
            (matches as f32 / test_size as f32) * 100.0
        ); // cargo test -- --nocapture | grep -i "^\[res:"
        assert!(
            matches < expected,
            "Sequence with no repetitions should have very few matches, \
             but got {matches} matches, expected at most {expected}"
        );
    }

    fn generate_unique_3byte_sequence(length: usize) -> Vec<u8> {
        let mut result = Vec::with_capacity(length * 3);

        // Generate sequence of unique 3-byte integers
        for x in 0..length {
            // Extract individual bytes from a 24-bit number
            let b0 = (x & 0xFF) as u8; // Least significant byte
            let b1 = ((x >> 8) & 0xFF) as u8; // Middle byte
            let b2 = ((x >> 16) & 0xFF) as u8; // Most significant byte

            result.push(b0);
            result.push(b1);
            result.push(b2);
        }

        result
    }

    #[rstest]
    #[case(1 << 17, 1 << 12, 113000)] // 128K size, 4K offset, expect at least 90K matches (found 89.4%)
    #[case(1 << 17, 1 << 13, 95000)] // 128K size, 8K offset, expect at least 90K matches (found 78.3%)
    #[case(1 << 17, 1 << 14, 60000)] // 128K size, 16K offset, expect at least 60K matches (found 53.8%)
    #[case(1 << 17, 1 << 15, 13000)] // 128K size, 32K offset, expect at least 13K matches (found 13.7%)
    #[case(1 << 17, 1 << 16, 450)] // 128K size, 64K offset, expect at least 450 matches (found 0.7%)
    fn estimate_num_lz_matches_at_various_offsets(
        #[case] test_size: usize,
        #[case] match_interval: usize,
        #[case] min_matches: usize,
    ) {
        assert!(
            match_interval <= 1 << 16,
            "Match interval must be <= 64K due to u16 limits"
        );
        assert!(match_interval > 0, "Match interval must be positive");
        assert!(
            test_size >= match_interval,
            "Test size must be >= match interval"
        );
        assert!(
            test_size.is_multiple_of(2) && match_interval.is_multiple_of(2),
            "Test size and match interval must be even due to u16 alignment"
        );

        // Create sequence that repeats every match_interval bytes
        let mut unique: Vec<u16> = Vec::with_capacity(test_size / 2);
        for x in 0..test_size / 2 {
            let val = (x % (match_interval / 2)) as u16;
            unique.push(val);
        }

        // Example of pattern:
        // Bytes 0-16383:     [00 00, 01 00, 02 00, ..., FF 3F]  First cycle
        // Bytes 16384-32767: [00 00, 01 00, 02 00, ..., FF 3F]  Second cycle (matches first)
        // Bytes 32768-49151: [00 00, 01 00, 02 00, ..., FF 3F]  Third cycle
        // And so on...

        let matches = estimate_num_lz_matches_fast(cast_u16_slice_to_u8_slice(&unique));

        // After first match_interval bytes, every position matches with match_interval bytes before it
        let expected = test_size - match_interval;

        // Calculate percentage of matches found vs expected
        let percentage = (matches as f32 / expected as f32) * 100.0;

        // Assert against minimum matches based on empirical results
        assert!(
            matches >= min_matches,
            "Got {matches} matches, which is below minimum threshold of {min_matches}"
        );

        println!(
            "[res:matches_{match_interval}_intervals_{test_size}] matches: {matches}, expected: < {expected}, minimum: {min_matches}, found: {percentage:.1}%"
        ); // cargo test -- --nocapture | grep -i "^\[res:"
    }

    fn cast_u16_slice_to_u8_slice(u16_slice: &[u16]) -> &[u8] {
        let ptr = u16_slice.as_ptr() as *const u8;
        let len = u16_slice.len() * 2; // Each u16 is 2 bytes
        unsafe { slice::from_raw_parts(ptr, len) }
    }
}