ragc_core/
segment_compression.rs

1// Segment Compression
2// ZSTD compression/decompression for segments with tuple packing
3
4use crate::tuple_packing::{bytes_to_tuples, tuples_to_bytes};
5use crate::zstd_pool;
6use anyhow::Result;
7use ragc_common::types::{Contig, PackedBlock};
8
/// Default ZSTD compression level for delta packs.
/// Use level 17 to match C++ AGC's delta pack compression (segment.h:279).
/// This is the level `compress_segment` passes on when the caller supplies none.
const DELTA_COMPRESSION_LEVEL: i32 = 17;

/// ZSTD compression level for reference segments with tuple packing.
/// C++ AGC uses level 13 for tuple-packed references (segment.h:252).
const REF_TUPLES_COMPRESSION_LEVEL: i32 = 13;

/// ZSTD compression level for reference segments without tuple packing.
/// C++ AGC uses level 19 for plain references (segment.h:254).
const REF_PLAIN_COMPRESSION_LEVEL: i32 = 19;

/// Repetitiveness threshold for choosing the reference compression method.
/// C++ AGC uses 0.5 (segment.h:225).
/// Scores strictly below this value select tuple packing; scores at or above it
/// select plain ZSTD.
const REPETITIVENESS_THRESHOLD: f64 = 0.5;
24
25/// Check repetitiveness of data to decide compression method
26/// Matches C++ AGC implementation in segment.h:224-249
27fn check_repetitiveness(data: &[u8]) -> f64 {
28    let mut best_frac = 0.0;
29
30    for offset in 4..32 {
31        let mut cnt = 0;
32        let mut cur_size = 0;
33
34        for j in 0..data.len() {
35            if j + offset < data.len() {
36                if data[j] == data[j + offset] {
37                    cnt += 1;
38                }
39                // Only count ACGT bases (values < 4)
40                if data[j] < 4 {
41                    cur_size += 1;
42                }
43            }
44        }
45
46        let frac = if cur_size > 0 {
47            cnt as f64 / cur_size as f64
48        } else {
49            0.0
50        };
51
52        if frac > best_frac {
53            best_frac = frac;
54            // Early exit if we've reached threshold
55            if best_frac >= REPETITIVENESS_THRESHOLD {
56                break;
57            }
58        }
59    }
60
61    best_frac
62}
63
64/// Compress a segment using ZSTD with default compression level (for delta packs)
65pub fn compress_segment(data: &Contig) -> Result<PackedBlock> {
66    compress_segment_plain(data, DELTA_COMPRESSION_LEVEL)
67}
68
69/// Compress a segment using ZSTD with configured level (for delta packs)
70pub fn compress_segment_configured(data: &Contig, level: i32) -> Result<PackedBlock> {
71    compress_segment_plain(data, level)
72}
73
74/// Compress a reference segment with automatic tuple packing decision
75///
76/// **IMPORTANT**: Matches C++ AGC's store_in_archive() logic!
77/// - Checks repetitiveness of data
78/// - If repetitiveness < 0.5: use tuple packing (marker 1, level 13)
79/// - If repetitiveness >= 0.5: use plain ZSTD (marker 0, level 19)
80///
81/// Returns (compressed_data, marker_byte)
82pub fn compress_reference_segment(data: &Contig) -> Result<(PackedBlock, u8)> {
83    let repetitiveness = check_repetitiveness(data);
84
85    // Debug logging for reference compression decisions
86    let debug_ref = crate::env_cache::debug_ref();
87    if debug_ref {
88        eprintln!(
89            "RAGC_REF_COMPRESS: len={} rep={:.4} threshold={:.4}",
90            data.len(),
91            repetitiveness,
92            REPETITIVENESS_THRESHOLD
93        );
94    }
95
96    if repetitiveness < REPETITIVENESS_THRESHOLD {
97        // Low repetitiveness: use tuple packing
98        let tuples = bytes_to_tuples(data);
99        let compressed = zstd_pool::compress_segment_pooled(&tuples, REF_TUPLES_COMPRESSION_LEVEL)?;
100        if debug_ref {
101            eprintln!(
102                "RAGC_REF_DECISION: TUPLE_PACK marker=1 level={} tuple_len={} compressed_len={}",
103                REF_TUPLES_COMPRESSION_LEVEL,
104                tuples.len(),
105                compressed.len()
106            );
107        }
108        Ok((compressed, 1)) // Marker 1 = tuple-packed
109    } else {
110        // High repetitiveness: use plain ZSTD
111        let compressed = zstd_pool::compress_segment_pooled(data, REF_PLAIN_COMPRESSION_LEVEL)?;
112        if debug_ref {
113            eprintln!(
114                "RAGC_REF_DECISION: PLAIN marker=0 level={} compressed_len={}",
115                REF_PLAIN_COMPRESSION_LEVEL,
116                compressed.len()
117            );
118        }
119        Ok((compressed, 0)) // Marker 0 = plain
120    }
121}
122
123/// Compress a segment using plain ZSTD (for delta packs)
124pub fn compress_segment_plain(data: &Contig, level: i32) -> Result<PackedBlock> {
125    zstd_pool::compress_segment_pooled(data, level)
126}
127
128/// Decompress a segment based on marker byte
129///
130/// **IMPORTANT**: Checks marker byte to determine decompression method!
131/// - Marker 0: plain ZSTD decompression
132/// - Marker 1 (or any non-zero): ZSTD + tuple unpacking
133pub fn decompress_segment_with_marker(compressed: &[u8], marker: u8) -> Result<Contig> {
134    if compressed.is_empty() {
135        return Ok(Vec::new());
136    }
137
138    if marker == 0 {
139        // Plain ZSTD
140        zstd_pool::decompress_segment_pooled(&compressed.to_vec())
141    } else {
142        // Tuple-packed: decompress then unpack
143        let tuples = zstd_pool::decompress_segment_pooled(&compressed.to_vec())?;
144        Ok(tuples_to_bytes(&tuples))
145    }
146}
147
148/// Decompress a segment using plain ZSTD (for old code compatibility)
149pub fn decompress_segment(compressed: &[u8]) -> Result<Contig> {
150    zstd_pool::decompress_segment_pooled(&compressed.to_vec())
151}
152
#[cfg(test)]
mod tests {
    use super::*;

    /// Eight ACGT symbols with a hand-checked repetitiveness score of 1/3:
    /// offset 4 -> 0 matches of 4, offset 5 -> 1 match of 3, offsets 6 and 7 -> 0 matches,
    /// larger offsets have no positions to compare. That is below the 0.5 threshold, so
    /// this input selects the tuple-packing path.
    const LOW_REPETITIVENESS: [u8; 8] = [0, 1, 2, 3, 1, 2, 3, 2];

    #[test]
    fn test_compress_decompress_roundtrip() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];

        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();

        assert_eq!(original, decompressed);
    }

    #[test]
    fn test_compress_empty() {
        let original = vec![];

        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();

        assert_eq!(original, decompressed);
    }

    #[test]
    fn test_compress_large() {
        // Create a large sequence with some repetition
        let mut original = Vec::new();
        for i in 0..1000 {
            original.push((i % 4) as u8);
        }

        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();

        assert_eq!(original, decompressed);

        // Compressed should be smaller than original
        assert!(compressed.len() < original.len());
    }

    #[test]
    fn test_different_compression_levels() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];

        for level in [1, 3, 9, 19].iter() {
            let compressed = compress_segment_configured(&original, *level).unwrap();
            let decompressed = decompress_segment(&compressed).unwrap();
            assert_eq!(original, decompressed);
        }
    }

    #[test]
    fn test_repetitiveness_check() {
        // Highly repetitive sequence (all zeros) - should get score of 1.0
        let repetitive = vec![0; 100];
        let rep1 = check_repetitiveness(&repetitive);
        assert_eq!(
            rep1, 1.0,
            "All-zero sequence should have perfect repetitiveness"
        );

        // Test that the function returns a value between 0 and 1
        let mixed = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1];
        let rep2 = check_repetitiveness(&mixed);
        assert!(
            (0.0..=1.0).contains(&rep2),
            "Repetitiveness should be in [0, 1]"
        );

        // The low-repetitiveness fixture must stay below the decision threshold,
        // otherwise the tuple-packing test below would silently test the wrong path.
        let rep3 = check_repetitiveness(&LOW_REPETITIVENESS);
        assert!(
            rep3 < REPETITIVENESS_THRESHOLD,
            "Fixture should score below the threshold, got {rep3}"
        );

        // Inputs too short for any offset score zero
        assert_eq!(check_repetitiveness(&[0, 1, 2, 3]), 0.0);
    }

    #[test]
    fn test_reference_compression_with_tuple_packing() {
        // Low repetitiveness: compress_reference_segment should pick tuple packing
        // (marker 1) and the result must round-trip through the marker-aware decoder.
        let low_rep = LOW_REPETITIVENESS.to_vec();
        let (compressed, marker) = compress_reference_segment(&low_rep).unwrap();
        assert_eq!(marker, 1, "Low repetitiveness should use tuple packing");
        let decompressed = decompress_segment_with_marker(&compressed, marker).unwrap();
        assert_eq!(low_rep, decompressed);
    }

    #[test]
    fn test_reference_compression_without_tuple_packing() {
        // High repetitiveness: should use plain ZSTD
        let high_rep = vec![0; 100];
        let (compressed, marker) = compress_reference_segment(&high_rep).unwrap();
        assert_eq!(marker, 0, "High repetitiveness should use plain ZSTD");
        let decompressed = decompress_segment_with_marker(&compressed, marker).unwrap();
        assert_eq!(high_rep, decompressed);
    }

    #[test]
    fn test_tuple_packing_compression_path() {
        // Test the tuple packing decode path directly (marker 1), independent of the
        // repetitiveness decision made by compress_reference_segment.
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3];

        // Manually invoke tuple packing path
        let tuples = bytes_to_tuples(&original);
        let compressed = zstd_pool::compress_segment_pooled(&tuples, 13).unwrap();

        // Test decompression with marker 1
        let decompressed = decompress_segment_with_marker(&compressed, 1).unwrap();
        assert_eq!(
            original, decompressed,
            "Tuple packing roundtrip should work"
        );
    }

    #[test]
    fn test_decompress_with_marker_empty_input() {
        // Empty payloads short-circuit to an empty contig for either marker.
        assert!(decompress_segment_with_marker(&[], 0).unwrap().is_empty());
        assert!(decompress_segment_with_marker(&[], 1).unwrap().is_empty());
    }
}