ragc_core/
segment_compression.rs

// Segment Compression
// ZSTD compression/decompression for segments with tuple packing
3
4use crate::tuple_packing::{bytes_to_tuples, tuples_to_bytes};
5use crate::zstd_pool;
6use anyhow::Result;
7use ragc_common::types::{Contig, PackedBlock};
8
/// Default ZSTD compression level for delta packs
/// Use level 17 to match C++ AGC's delta pack compression (segment.h:279)
///
/// Passed by `compress_segment`; `compress_segment_configured` takes a
/// caller-supplied level instead.
const DELTA_COMPRESSION_LEVEL: i32 = 17;

/// ZSTD compression level for reference segments with tuple packing
/// C++ AGC uses level 13 for tuple-packed references (segment.h:252)
///
/// Used by `compress_reference_segment` on its tuple-packed (marker 1) path.
const REF_TUPLES_COMPRESSION_LEVEL: i32 = 13;

/// ZSTD compression level for reference segments without tuple packing
/// C++ AGC uses level 19 for plain references (segment.h:254)
///
/// Used by `compress_reference_segment` on its plain (marker 0) path.
const REF_PLAIN_COMPRESSION_LEVEL: i32 = 19;

/// Repetitiveness threshold for choosing compression method
/// C++ AGC uses 0.5 (segment.h:225)
///
/// Scores strictly below this value select tuple packing; scores at or above
/// it select plain ZSTD. `check_repetitiveness` also stops scanning further
/// offsets once its best score reaches this value.
const REPETITIVENESS_THRESHOLD: f64 = 0.5;
24
25/// Check repetitiveness of data to decide compression method
26/// Matches C++ AGC implementation in segment.h:224-249
27fn check_repetitiveness(data: &[u8]) -> f64 {
28    let mut best_frac = 0.0;
29
30    for offset in 4..32 {
31        let mut cnt = 0;
32        let mut cur_size = 0;
33
34        for j in 0..data.len() {
35            if j + offset < data.len() {
36                if data[j] == data[j + offset] {
37                    cnt += 1;
38                }
39                // Only count ACGT bases (values < 4)
40                if data[j] < 4 {
41                    cur_size += 1;
42                }
43            }
44        }
45
46        let frac = if cur_size > 0 {
47            cnt as f64 / cur_size as f64
48        } else {
49            0.0
50        };
51
52        if frac > best_frac {
53            best_frac = frac;
54            // Early exit if we've reached threshold
55            if best_frac >= REPETITIVENESS_THRESHOLD {
56                break;
57            }
58        }
59    }
60
61    best_frac
62}
63
/// Compress a segment using ZSTD with default compression level (for delta packs)
///
/// Thin wrapper: forwards `data` to `compress_segment_plain` together with
/// `DELTA_COMPRESSION_LEVEL`.
///
/// # Errors
/// Returns whatever error `compress_segment_plain` reports.
pub fn compress_segment(data: &Contig) -> Result<PackedBlock> {
    compress_segment_plain(data, DELTA_COMPRESSION_LEVEL)
}
68
/// Compress a segment using ZSTD with configured level (for delta packs)
///
/// Thin wrapper: forwards `data` and the caller-chosen `level` unchanged to
/// `compress_segment_plain`. The level is not validated here.
///
/// # Errors
/// Returns whatever error `compress_segment_plain` reports.
pub fn compress_segment_configured(data: &Contig, level: i32) -> Result<PackedBlock> {
    compress_segment_plain(data, level)
}
73
74/// Compress a reference segment with automatic tuple packing decision
75///
76/// **IMPORTANT**: Matches C++ AGC's store_in_archive() logic!
77/// - Checks repetitiveness of data
78/// - If repetitiveness < 0.5: use tuple packing (marker 1, level 13)
79/// - If repetitiveness >= 0.5: use plain ZSTD (marker 0, level 19)
80///
81/// Returns (compressed_data, marker_byte)
82pub fn compress_reference_segment(data: &Contig) -> Result<(PackedBlock, u8)> {
83    let repetitiveness = check_repetitiveness(data);
84
85    if repetitiveness < REPETITIVENESS_THRESHOLD {
86        // Low repetitiveness: use tuple packing
87        let tuples = bytes_to_tuples(data);
88        let compressed = zstd_pool::compress_segment_pooled(&tuples, REF_TUPLES_COMPRESSION_LEVEL)?;
89        Ok((compressed, 1)) // Marker 1 = tuple-packed
90    } else {
91        // High repetitiveness: use plain ZSTD
92        let compressed = zstd_pool::compress_segment_pooled(data, REF_PLAIN_COMPRESSION_LEVEL)?;
93        Ok((compressed, 0)) // Marker 0 = plain
94    }
95}
96
/// Compress a segment using plain ZSTD (for delta packs)
///
/// Hands `data` and `level` straight to `zstd_pool::compress_segment_pooled`;
/// no tuple packing is applied and no marker byte is produced here.
///
/// # Errors
/// Returns whatever error `zstd_pool::compress_segment_pooled` reports.
pub fn compress_segment_plain(data: &Contig, level: i32) -> Result<PackedBlock> {
    zstd_pool::compress_segment_pooled(data, level)
}
101
102/// Decompress a segment based on marker byte
103///
104/// **IMPORTANT**: Checks marker byte to determine decompression method!
105/// - Marker 0: plain ZSTD decompression
106/// - Marker 1 (or any non-zero): ZSTD + tuple unpacking
107pub fn decompress_segment_with_marker(compressed: &[u8], marker: u8) -> Result<Contig> {
108    if compressed.is_empty() {
109        return Ok(Vec::new());
110    }
111
112    if marker == 0 {
113        // Plain ZSTD
114        zstd_pool::decompress_segment_pooled(&compressed.to_vec())
115    } else {
116        // Tuple-packed: decompress then unpack
117        let tuples = zstd_pool::decompress_segment_pooled(&compressed.to_vec())?;
118        Ok(tuples_to_bytes(&tuples))
119    }
120}
121
122/// Decompress a segment using plain ZSTD (for old code compatibility)
123pub fn decompress_segment(compressed: &[u8]) -> Result<Contig> {
124    zstd_pool::decompress_segment_pooled(&compressed.to_vec())
125}
126
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain compress/decompress must return the original bytes.
    #[test]
    fn test_compress_decompress_roundtrip() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];

        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();

        assert_eq!(original, decompressed);
    }

    /// An empty segment survives a compress/decompress roundtrip.
    #[test]
    fn test_compress_empty() {
        let original = vec![];

        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();

        assert_eq!(original, decompressed);
    }

    /// A longer periodic input roundtrips and actually shrinks.
    #[test]
    fn test_compress_large() {
        // Create a large sequence with some repetition
        let original: Vec<u8> = (0..1000).map(|i| (i % 4) as u8).collect();

        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();

        assert_eq!(original, decompressed);

        // Compressed should be smaller than original
        assert!(compressed.len() < original.len());
    }

    /// Every tested level must produce data the plain decompressor can read.
    #[test]
    fn test_different_compression_levels() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];

        for &level in &[1, 3, 9, 19] {
            let compressed = compress_segment_configured(&original, level).unwrap();
            let decompressed = decompress_segment(&compressed).unwrap();
            assert_eq!(original, decompressed);
        }
    }

    /// Pins the repetitiveness score for hand-checkable inputs.
    #[test]
    fn test_repetitiveness_check() {
        // Highly repetitive sequence (all zeros) - should get score of 1.0
        let repetitive = vec![0u8; 100];
        assert_eq!(
            check_repetitiveness(&repetitive),
            1.0,
            "All-zero sequence should have perfect repetitiveness"
        );

        // Period-4 pattern: every base equals the one 4 positions ahead
        let periodic = vec![0u8, 1, 2, 3, 0, 1, 2, 3, 0, 1];
        assert_eq!(
            check_repetitiveness(&periodic),
            1.0,
            "Period-4 sequence should match itself at offset 4"
        );

        // No base equals any base 4..=7 positions ahead
        let low_rep = vec![0u8, 0, 1, 1, 2, 2, 3, 3];
        assert_eq!(
            check_repetitiveness(&low_rep),
            0.0,
            "Sequence without matches at any offset should score 0"
        );

        // Too short to compare at the smallest offset (4)
        assert_eq!(check_repetitiveness(&[0u8, 1, 2, 3]), 0.0);
        assert_eq!(check_repetitiveness(&[]), 0.0);
    }

    /// Low repetitiveness must select tuple packing (marker 1) and roundtrip.
    #[test]
    fn test_reference_compression_with_tuple_packing() {
        // Scores 0.0 (see test_repetitiveness_check), i.e. below the threshold
        let low_rep = vec![0, 0, 1, 1, 2, 2, 3, 3];
        assert!(check_repetitiveness(&low_rep) < REPETITIVENESS_THRESHOLD);

        let (compressed, marker) = compress_reference_segment(&low_rep).unwrap();
        assert_eq!(marker, 1, "Low repetitiveness should use tuple packing");

        let decompressed = decompress_segment_with_marker(&compressed, marker).unwrap();
        assert_eq!(low_rep, decompressed);
    }

    /// High repetitiveness must select plain ZSTD (marker 0) and roundtrip.
    #[test]
    fn test_reference_compression_without_tuple_packing() {
        // High repetitiveness: should use plain ZSTD
        let high_rep = vec![0; 100];
        let (compressed, marker) = compress_reference_segment(&high_rep).unwrap();
        assert_eq!(marker, 0, "High repetitiveness should use plain ZSTD");
        let decompressed = decompress_segment_with_marker(&compressed, marker).unwrap();
        assert_eq!(high_rep, decompressed);
    }

    /// Exercises the marker-1 decoding path with a hand-built payload.
    #[test]
    fn test_tuple_packing_compression_path() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3];

        // Manually invoke tuple packing path
        let tuples = bytes_to_tuples(&original);
        let compressed =
            zstd_pool::compress_segment_pooled(&tuples, REF_TUPLES_COMPRESSION_LEVEL).unwrap();

        // Test decompression with marker 1
        let decompressed = decompress_segment_with_marker(&compressed, 1).unwrap();
        assert_eq!(
            original, decompressed,
            "Tuple packing roundtrip should work"
        );
    }

    /// An empty payload decodes to an empty segment for either marker.
    #[test]
    fn test_decompress_with_marker_empty_input() {
        assert!(decompress_segment_with_marker(&[], 0).unwrap().is_empty());
        assert!(decompress_segment_with_marker(&[], 1).unwrap().is_empty());
    }
}