ragc_core/
segment_compression.rs1use crate::tuple_packing::{bytes_to_tuples, tuples_to_bytes};
5use crate::zstd_pool;
6use anyhow::Result;
7use ragc_common::types::{Contig, PackedBlock};
8
/// Compression level passed to the compressor by [`compress_segment`].
const DELTA_COMPRESSION_LEVEL: i32 = 17;

/// Compression level used by [`compress_reference_segment`] when the segment
/// is tuple-packed before compression (marker `1`).
const REF_TUPLES_COMPRESSION_LEVEL: i32 = 13;

/// Compression level used by [`compress_reference_segment`] when the segment
/// is compressed as-is (marker `0`).
const REF_PLAIN_COMPRESSION_LEVEL: i32 = 19;

/// Repetitiveness score at or above which a reference segment is compressed
/// as-is instead of being tuple-packed. Also the early-exit bound of the scan
/// in [`check_repetitiveness`].
const REPETITIVENESS_THRESHOLD: f64 = 0.5;

/// Scores how strongly `data` repeats itself at short distances.
///
/// For every offset in `4..32` the function pairs each position `j` with
/// `j + offset` and computes
///
/// ```text
/// matching pairs / pairs whose first symbol is < 4
/// ```
///
/// An offset with no countable pair contributes `0.0`. The largest ratio seen
/// is returned; the scan stops at the first offset whose ratio reaches
/// [`REPETITIVENESS_THRESHOLD`].
///
/// Because symbols `>= 4` can match without being counted in the denominator,
/// the result may exceed `1.0` for inputs that contain such symbols.
/// Inputs of four symbols or fewer always score `0.0`.
fn check_repetitiveness(data: &[u8]) -> f64 {
    let mut best_frac = 0.0_f64;

    for offset in 4..32 {
        // Walk only the valid (j, j + offset) pairs. `skip` yields nothing when
        // the slice is shorter than the offset, so no bounds test is needed.
        // Counters are `usize` so they cannot overflow before the slice length does.
        let (matches, counted) = data
            .iter()
            .zip(data.iter().skip(offset))
            .fold((0_usize, 0_usize), |(matches, counted), (&cur, &ahead)| {
                (
                    matches + usize::from(cur == ahead),
                    // Only symbols below 4 enter the denominator
                    // (presumably the four base codes; confirm against the encoder).
                    counted + usize::from(cur < 4),
                )
            });

        let frac = if counted > 0 {
            matches as f64 / counted as f64
        } else {
            0.0
        };

        if frac > best_frac {
            best_frac = frac;
            // The caller only needs to know whether the threshold is reached.
            if best_frac >= REPETITIVENESS_THRESHOLD {
                break;
            }
        }
    }

    best_frac
}
63
64pub fn compress_segment(data: &Contig) -> Result<PackedBlock> {
66 compress_segment_plain(data, DELTA_COMPRESSION_LEVEL)
67}
68
69pub fn compress_segment_configured(data: &Contig, level: i32) -> Result<PackedBlock> {
71 compress_segment_plain(data, level)
72}
73
74pub fn compress_reference_segment(data: &Contig) -> Result<(PackedBlock, u8)> {
83 let repetitiveness = check_repetitiveness(data);
84
85 let debug_ref = crate::env_cache::debug_ref();
87 if debug_ref {
88 eprintln!(
89 "RAGC_REF_COMPRESS: len={} rep={:.4} threshold={:.4}",
90 data.len(),
91 repetitiveness,
92 REPETITIVENESS_THRESHOLD
93 );
94 }
95
96 if repetitiveness < REPETITIVENESS_THRESHOLD {
97 let tuples = bytes_to_tuples(data);
99 let compressed = zstd_pool::compress_segment_pooled(&tuples, REF_TUPLES_COMPRESSION_LEVEL)?;
100 if debug_ref {
101 eprintln!(
102 "RAGC_REF_DECISION: TUPLE_PACK marker=1 level={} tuple_len={} compressed_len={}",
103 REF_TUPLES_COMPRESSION_LEVEL,
104 tuples.len(),
105 compressed.len()
106 );
107 }
108 Ok((compressed, 1)) } else {
110 let compressed = zstd_pool::compress_segment_pooled(data, REF_PLAIN_COMPRESSION_LEVEL)?;
112 if debug_ref {
113 eprintln!(
114 "RAGC_REF_DECISION: PLAIN marker=0 level={} compressed_len={}",
115 REF_PLAIN_COMPRESSION_LEVEL,
116 compressed.len()
117 );
118 }
119 Ok((compressed, 0)) }
121}
122
123pub fn compress_segment_plain(data: &Contig, level: i32) -> Result<PackedBlock> {
125 zstd_pool::compress_segment_pooled(data, level)
126}
127
128pub fn decompress_segment_with_marker(compressed: &[u8], marker: u8) -> Result<Contig> {
134 if compressed.is_empty() {
135 return Ok(Vec::new());
136 }
137
138 if marker == 0 {
139 zstd_pool::decompress_segment_pooled(&compressed.to_vec())
141 } else {
142 let tuples = zstd_pool::decompress_segment_pooled(&compressed.to_vec())?;
144 Ok(tuples_to_bytes(&tuples))
145 }
146}
147
148pub fn decompress_segment(compressed: &[u8]) -> Result<Contig> {
150 zstd_pool::decompress_segment_pooled(&compressed.to_vec())
151}
152
#[cfg(test)]
mod tests {
    use super::*;

    /// A short segment survives a compress/decompress round trip.
    #[test]
    fn test_compress_decompress_roundtrip() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];

        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();

        assert_eq!(original, decompressed);
    }

    /// An empty segment survives a compress/decompress round trip.
    #[test]
    fn test_compress_empty() {
        let original = vec![];

        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();

        assert_eq!(original, decompressed);
    }

    /// A periodic 1000-symbol segment round-trips and actually shrinks.
    #[test]
    fn test_compress_large() {
        let original: Vec<u8> = (0..1000).map(|i| (i % 4) as u8).collect();

        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();

        assert_eq!(original, decompressed);
        assert!(compressed.len() < original.len());
    }

    /// Every tested compression level produces a decodable block.
    #[test]
    fn test_different_compression_levels() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];

        for &level in &[1, 3, 9, 19] {
            let compressed = compress_segment_configured(&original, level).unwrap();
            let decompressed = decompress_segment(&compressed).unwrap();
            assert_eq!(original, decompressed);
        }
    }

    /// The repetitiveness score is 1.0 for uniform input and stays in range
    /// for a short mixed sequence.
    #[test]
    fn test_repetitiveness_check() {
        let repetitive = vec![0; 100];
        let rep1 = check_repetitiveness(&repetitive);
        assert_eq!(
            rep1, 1.0,
            "All-zero sequence should have perfect repetitiveness"
        );

        let mixed = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1];
        let rep2 = check_repetitiveness(&mixed);
        assert!(
            (0.0..=1.0).contains(&rep2),
            "Repetitiveness should be in [0, 1]"
        );
    }

    /// A low-repetitiveness segment is routed through tuple packing (marker 1)
    /// and decodes back to the original bytes.
    #[test]
    fn test_reference_compression_with_tuple_packing() {
        // Per-offset match ratios: offset 4 -> 0/4, 5 -> 1/3, 6 -> 0/2, 7 -> 0/1,
        // so the score is 1/3, below the threshold.
        let low_rep = vec![0, 1, 2, 3, 3, 2, 3, 2];
        assert!(
            check_repetitiveness(&low_rep) < REPETITIVENESS_THRESHOLD,
            "Input must score below the repetitiveness threshold"
        );

        let (compressed, marker) = compress_reference_segment(&low_rep).unwrap();
        assert_eq!(marker, 1, "Low repetitiveness should use tuple packing");
        let decompressed = decompress_segment_with_marker(&compressed, marker).unwrap();
        assert_eq!(low_rep, decompressed);
    }

    /// A highly repetitive segment is compressed as-is (marker 0) and decodes
    /// back to the original bytes.
    #[test]
    fn test_reference_compression_without_tuple_packing() {
        let high_rep = vec![0; 100];
        let (compressed, marker) = compress_reference_segment(&high_rep).unwrap();
        assert_eq!(marker, 0, "High repetitiveness should use plain ZSTD");
        let decompressed = decompress_segment_with_marker(&compressed, marker).unwrap();
        assert_eq!(high_rep, decompressed);
    }

    /// A manually tuple-packed block is decoded correctly when marker 1 is
    /// supplied.
    #[test]
    fn test_tuple_packing_compression_path() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3];

        let tuples = bytes_to_tuples(&original);
        let compressed =
            zstd_pool::compress_segment_pooled(&tuples, REF_TUPLES_COMPRESSION_LEVEL).unwrap();

        let decompressed = decompress_segment_with_marker(&compressed, 1).unwrap();
        assert_eq!(
            original, decompressed,
            "Tuple packing roundtrip should work"
        );
    }
}