1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
//! GPU LZ4 Compression Kernel (Pure Rust PTX Generation)
//!
//! Implements Warp-per-Page architecture for high-throughput LZ4 compression.
//! Each 4KB page is processed by a single warp (32 threads) cooperatively.
//!
//! ## Algorithm Overview (from LZ4 Block Format)
//!
//! LZ4 encodes data as sequences of:
//! - **Literals**: Raw uncompressed bytes
//! - **Matches**: Back-references to previously seen data (offset + length)
//!
//! Token format: `[4-bit literal length][4-bit match length]`
//! - Minimum match length is 4 bytes (MINMATCH)
//!
//! ## Warp-Cooperative Strategy
//!
//! 1. **Shared Memory Load**: All 32 threads load 128 bytes each (4KB total)
//! 2. **Hash Table**: Hash table in shared memory for match finding
//! 3. **Parallel Match Search**: Each thread checks different positions
//! 4. **Leader Encoding**: Lane 0 encodes tokens sequentially
pub use Lz4WarpCompressKernel;
pub use ;
pub use Lz4WarpDecompressKernel;
/// Shortest match, in bytes, that can be encoded as a back-reference
/// (MINMATCH in the LZ4 block format).
pub const LZ4_MIN_MATCH: u32 = 4;
/// Longest match, in bytes, used by this module: 274.
///
/// Built from three terms: [`LZ4_MIN_MATCH`] (4), plus 15 (the largest value
/// of the 4-bit match-length field in the token), plus 255.
///
/// NOTE(review): the 255 term presumably corresponds to a single length
/// extension byte; confirm against the encoder that this cap is intended.
pub const LZ4_MAX_MATCH: u32 = LZ4_MIN_MATCH + 15 + 255;
/// Width, in bits, of a hash table index (12 bits address 4096 entries).
pub const LZ4_HASH_BITS: u32 = 12;
/// Number of entries in the hash table: `2^LZ4_HASH_BITS` = 4096.
pub const LZ4_HASH_SIZE: u32 = 1u32 << LZ4_HASH_BITS;
/// Size, in bytes, of one page as used for ZRAM compression (4 KB).
pub const PAGE_SIZE: u32 = 4 * 1024;
/// Multiplier for the LZ4 hash (Knuth multiplicative hash constant).
///
/// Written in hexadecimal; the decimal value is 2654435761.
pub const LZ4_HASH_MULT: u32 = 0x9E37_79B1;
/// Largest back-reference distance allowed for an LZ4 match: 65535
/// (64 KB - 1), i.e. the largest value representable in 16 bits.
pub const LZ4_MAX_OFFSET: u32 = (1 << 16) - 1;