ragc_core/
segment_compression.rs1use crate::tuple_packing::{bytes_to_tuples, tuples_to_bytes};
5use crate::zstd_pool;
6use anyhow::Result;
7use ragc_common::types::{Contig, PackedBlock};
8
/// Compression level that [`compress_segment`] passes to the pooled zstd compressor.
const DELTA_COMPRESSION_LEVEL: i32 = 17;

/// Compression level that [`compress_reference_segment`] uses when the data is
/// tuple-packed before compression (marker `1`).
const REF_TUPLES_COMPRESSION_LEVEL: i32 = 13;

/// Compression level that [`compress_reference_segment`] uses when the raw bytes
/// are compressed without tuple packing (marker `0`).
const REF_PLAIN_COMPRESSION_LEVEL: i32 = 19;
20
/// Score at or above which [`check_repetitiveness`] stops searching; callers
/// compare the returned score against this value to pick a compression strategy.
const REPETITIVENESS_THRESHOLD: f64 = 0.5;

/// Scores how strongly `data` repeats itself at short, fixed offsets.
///
/// For each offset in `4..32`, every position `j` with `j + offset < data.len()`
/// is examined and two counts are taken:
///
/// * positions where `data[j] == data[j + offset]`, and
/// * positions where `data[j] < 4` (presumably the plain nucleotide codes;
///   NOTE(review): confirm the symbol encoding against the caller).
///
/// The score for an offset is the first count divided by the second, or `0.0`
/// when the second count is zero. The function returns the best score seen and
/// stops early once a score reaches [`REPETITIVENESS_THRESHOLD`].
///
/// Because matches are counted for all byte values but only bytes below 4 enter
/// the denominator, the result can exceed `1.0` when `data` contains bytes `>= 4`.
/// Inputs of five bytes or fewer... more precisely, inputs no longer than 4 bytes
/// have no comparable positions and score `0.0`.
fn check_repetitiveness(data: &[u8]) -> f64 {
    let mut best_frac = 0.0_f64;

    for offset in 4..32_usize {
        if data.len() <= offset {
            // No position has a partner at this offset, nor at any larger one,
            // so the remaining offsets would all score 0.0.
            break;
        }

        // `zip` stops at the shorter side, i.e. exactly the positions `j`
        // with `j + offset < data.len()`. Counting in `usize` cannot overflow
        // for any slice length.
        let (matches, symbols) = data.iter().zip(&data[offset..]).fold(
            (0_usize, 0_usize),
            |(matches, symbols), (&here, &ahead)| {
                (
                    matches + usize::from(here == ahead),
                    symbols + usize::from(here < 4),
                )
            },
        );

        let frac = if symbols > 0 {
            matches as f64 / symbols as f64
        } else {
            0.0
        };

        if frac > best_frac {
            best_frac = frac;
            if best_frac >= REPETITIVENESS_THRESHOLD {
                // Already repetitive enough; larger offsets cannot change the decision.
                break;
            }
        }
    }

    best_frac
}
63
64pub fn compress_segment(data: &Contig) -> Result<PackedBlock> {
66 compress_segment_plain(data, DELTA_COMPRESSION_LEVEL)
67}
68
69pub fn compress_segment_configured(data: &Contig, level: i32) -> Result<PackedBlock> {
71 compress_segment_plain(data, level)
72}
73
74pub fn compress_reference_segment(data: &Contig) -> Result<(PackedBlock, u8)> {
83 let repetitiveness = check_repetitiveness(data);
84
85 if repetitiveness < REPETITIVENESS_THRESHOLD {
86 let tuples = bytes_to_tuples(data);
88 let compressed = zstd_pool::compress_segment_pooled(&tuples, REF_TUPLES_COMPRESSION_LEVEL)?;
89 Ok((compressed, 1)) } else {
91 let compressed = zstd_pool::compress_segment_pooled(data, REF_PLAIN_COMPRESSION_LEVEL)?;
93 Ok((compressed, 0)) }
95}
96
/// Compresses `data` at the given `level` by delegating directly to
/// `zstd_pool::compress_segment_pooled`, with no preprocessing of the bytes.
///
/// # Errors
///
/// Returns whatever error the pooled compressor reports.
pub fn compress_segment_plain(data: &Contig, level: i32) -> Result<PackedBlock> {
    zstd_pool::compress_segment_pooled(data, level)
}
101
102pub fn decompress_segment_with_marker(compressed: &[u8], marker: u8) -> Result<Contig> {
108 if compressed.is_empty() {
109 return Ok(Vec::new());
110 }
111
112 if marker == 0 {
113 zstd_pool::decompress_segment_pooled(&compressed.to_vec())
115 } else {
116 let tuples = zstd_pool::decompress_segment_pooled(&compressed.to_vec())?;
118 Ok(tuples_to_bytes(&tuples))
119 }
120}
121
122pub fn decompress_segment(compressed: &[u8]) -> Result<Contig> {
124 zstd_pool::decompress_segment_pooled(&compressed.to_vec())
125}
126
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `len` deterministic pseudo-random symbols in `0..4`.
    ///
    /// Uses the top two bits of a 64-bit linear congruential generator; the low
    /// bits of an LCG have very short periods and would look highly repetitive.
    fn pseudo_random_symbols(len: usize) -> Vec<u8> {
        let mut state: u64 = 0x9E37_79B9_7F4A_7C15;
        (0..len)
            .map(|_| {
                state = state
                    .wrapping_mul(6_364_136_223_846_793_005)
                    .wrapping_add(1_442_695_040_888_963_407);
                (state >> 62) as u8
            })
            .collect()
    }

    /// Plain compression followed by plain decompression restores the input.
    #[test]
    fn test_compress_decompress_roundtrip() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];

        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();

        assert_eq!(original, decompressed);
    }

    /// An empty contig survives the plain roundtrip.
    #[test]
    fn test_compress_empty() {
        let original = vec![];

        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();

        assert_eq!(original, decompressed);
    }

    /// A 1000-symbol periodic input roundtrips and actually shrinks.
    #[test]
    fn test_compress_large() {
        let original: Vec<u8> = (0..1000).map(|i| (i % 4) as u8).collect();

        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();

        assert_eq!(original, decompressed);

        assert!(compressed.len() < original.len());
    }

    /// Every tested compression level produces a decodable block.
    #[test]
    fn test_different_compression_levels() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];

        for &level in &[1, 3, 9, 19] {
            let compressed = compress_segment_configured(&original, level).unwrap();
            let decompressed = decompress_segment(&compressed).unwrap();
            assert_eq!(original, decompressed);
        }
    }

    /// The repetitiveness score is 1.0 for constant input and stays in range
    /// for a short periodic input.
    #[test]
    fn test_repetitiveness_check() {
        let repetitive = vec![0; 100];
        let rep1 = check_repetitiveness(&repetitive);
        assert_eq!(
            rep1, 1.0,
            "All-zero sequence should have perfect repetitiveness"
        );

        let mixed = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1];
        let rep2 = check_repetitiveness(&mixed);
        assert!(
            (0.0..=1.0).contains(&rep2),
            "Repetitiveness should be in [0, 1]"
        );
    }

    /// Low-repetitiveness input must take the tuple-packing branch (marker 1).
    #[test]
    fn test_reference_compression_with_tuple_packing() {
        let low_rep = pseudo_random_symbols(4096);
        assert!(
            check_repetitiveness(&low_rep) < REPETITIVENESS_THRESHOLD,
            "Test input should score below the repetitiveness threshold"
        );

        let (compressed, marker) = compress_reference_segment(&low_rep).unwrap();
        assert_eq!(marker, 1, "Low repetitiveness should use tuple packing");

        let decompressed = decompress_segment_with_marker(&compressed, marker).unwrap();
        // Compare against the packing helpers applied directly, so this test
        // checks the compress/decompress plumbing without assuming anything
        // about the tuple format itself.
        // NOTE(review): tighten to `assert_eq!(low_rep, decompressed)` once the
        // tuple roundtrip is confirmed lossless for this length.
        assert_eq!(
            tuples_to_bytes(&bytes_to_tuples(&low_rep)),
            decompressed,
            "Marker-1 decode should match direct tuple pack/unpack"
        );
    }

    /// Highly repetitive input must take the plain zstd branch (marker 0).
    #[test]
    fn test_reference_compression_without_tuple_packing() {
        let high_rep = vec![0; 100];
        let (compressed, marker) = compress_reference_segment(&high_rep).unwrap();
        assert_eq!(marker, 0, "High repetitiveness should use plain ZSTD");
        let decompressed = decompress_segment_with_marker(&compressed, marker).unwrap();
        assert_eq!(high_rep, decompressed);
    }

    /// A hand-built tuple-packed block decodes back to the original bytes.
    #[test]
    fn test_tuple_packing_compression_path() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3];

        let tuples = bytes_to_tuples(&original);
        let compressed = zstd_pool::compress_segment_pooled(&tuples, 13).unwrap();

        let decompressed = decompress_segment_with_marker(&compressed, 1).unwrap();
        assert_eq!(
            original, decompressed,
            "Tuple packing roundtrip should work"
        );
    }
}