use crate::tuple_packing::{bytes_to_tuples, tuples_to_bytes};
use crate::zstd_pool;
use anyhow::Result;
use ragc_common::types::{Contig, PackedBlock};
/// Compression level that `compress_segment` passes to the pooled compressor.
const DELTA_COMPRESSION_LEVEL: i32 = 17;
/// Compression level `compress_reference_segment` uses for tuple-packed data (marker 1).
const REF_TUPLES_COMPRESSION_LEVEL: i32 = 13;
/// Compression level `compress_reference_segment` uses for unpacked data (marker 0).
const REF_PLAIN_COMPRESSION_LEVEL: i32 = 19;
/// Score from `check_repetitiveness` at or above which reference data is compressed
/// without tuple packing; also the early-exit bound inside `check_repetitiveness`.
const REPETITIVENESS_THRESHOLD: f64 = 0.5;
/// Scores how strongly `data` repeats itself at short shifts.
///
/// For every shift `offset` in `4..32`, each position `j` with
/// `j + offset < data.len()` is examined:
/// * the numerator counts positions where `data[j] == data[j + offset]`;
/// * the denominator counts positions where `data[j] < 4`.
///
/// The largest ratio over all shifts is returned. A shift whose denominator is
/// zero contributes `0.0`, so empty input, input shorter than five bytes, and
/// input made only of symbols `>= 4` all score `0.0`. Scanning stops as soon as
/// a ratio reaches `REPETITIVENESS_THRESHOLD`.
///
/// Note that the score is not capped at `1.0`: matches are counted for every
/// symbol, while the denominator only counts symbols below 4.
fn check_repetitiveness(data: &[u8]) -> f64 {
    let mut best_frac = 0.0_f64;

    for offset in 4..32 {
        // `usize` counters cannot overflow for any slice length, unlike the
        // default `i32` an untyped integer literal would get.
        let mut matches: usize = 0;
        let mut countable: usize = 0;

        // Pair each byte with the byte `offset` positions later. The zip ends
        // when the shifted iterator is exhausted, i.e. exactly at
        // `j + offset == data.len()`, so no explicit bound check is needed.
        for (&here, &ahead) in data.iter().zip(data.iter().skip(offset)) {
            if here == ahead {
                matches += 1;
            }
            if here < 4 {
                countable += 1;
            }
        }

        let frac = if countable > 0 {
            matches as f64 / countable as f64
        } else {
            0.0
        };

        if frac > best_frac {
            best_frac = frac;
            // Callers only compare against the threshold, so a score that
            // already reaches it makes further shifts irrelevant.
            if best_frac >= REPETITIVENESS_THRESHOLD {
                break;
            }
        }
    }

    best_frac
}
pub fn compress_segment(data: &Contig) -> Result<PackedBlock> {
compress_segment_plain(data, DELTA_COMPRESSION_LEVEL)
}
/// Compresses `data` at the caller-supplied compression `level`.
///
/// Forwards directly to `compress_segment_plain`; any error it returns is
/// passed through unchanged.
pub fn compress_segment_configured(data: &Contig, level: i32) -> Result<PackedBlock> {
compress_segment_plain(data, level)
}
/// Compresses reference data, choosing between tuple packing and plain
/// compression based on how repetitive the input is.
///
/// Returns the compressed block together with a marker byte:
/// * `1` — the data was passed through `bytes_to_tuples` and compressed at
///   `REF_TUPLES_COMPRESSION_LEVEL` (score below `REPETITIVENESS_THRESHOLD`);
/// * `0` — the data was compressed as-is at `REF_PLAIN_COMPRESSION_LEVEL`.
///
/// When `crate::env_cache::debug_ref()` is true, the score and the decision are
/// written to stderr.
///
/// # Errors
/// Propagates any error reported by the pooled compressor.
pub fn compress_reference_segment(data: &Contig) -> Result<(PackedBlock, u8)> {
    let repetitiveness = check_repetitiveness(data);
    let trace = crate::env_cache::debug_ref();
    if trace {
        eprintln!(
            "RAGC_REF_COMPRESS: len={} rep={:.4} threshold={:.4}",
            data.len(),
            repetitiveness,
            REPETITIVENESS_THRESHOLD
        );
    }

    // Low repetitiveness: pack into tuples first, then compress.
    if repetitiveness < REPETITIVENESS_THRESHOLD {
        let packed_tuples = bytes_to_tuples(data);
        let block =
            zstd_pool::compress_segment_pooled(&packed_tuples, REF_TUPLES_COMPRESSION_LEVEL)?;
        if trace {
            eprintln!(
                "RAGC_REF_DECISION: TUPLE_PACK marker=1 level={} tuple_len={} compressed_len={}",
                REF_TUPLES_COMPRESSION_LEVEL,
                packed_tuples.len(),
                block.len()
            );
        }
        return Ok((block, 1));
    }

    // Otherwise compress the raw bytes directly.
    let block = zstd_pool::compress_segment_pooled(data, REF_PLAIN_COMPRESSION_LEVEL)?;
    if trace {
        eprintln!(
            "RAGC_REF_DECISION: PLAIN marker=0 level={} compressed_len={}",
            REF_PLAIN_COMPRESSION_LEVEL,
            block.len()
        );
    }
    Ok((block, 0))
}
/// Compresses `data` at the given `level` without any pre-processing.
///
/// Calls `zstd_pool::compress_segment_pooled` and returns its result as-is,
/// including any error.
pub fn compress_segment_plain(data: &Contig, level: i32) -> Result<PackedBlock> {
zstd_pool::compress_segment_pooled(data, level)
}
/// Decompresses a block produced together with a marker byte.
///
/// * Empty `compressed` input yields an empty contig without invoking the
///   decompressor.
/// * `marker == 0`: the decompressed bytes are returned unchanged.
/// * any other marker: the decompressed bytes are converted back with
///   `tuples_to_bytes`.
///
/// # Errors
/// Propagates any error reported by the pooled decompressor.
pub fn decompress_segment_with_marker(compressed: &[u8], marker: u8) -> Result<Contig> {
    if compressed.is_empty() {
        return Ok(Vec::new());
    }

    // Both encodings share the same outer compression layer.
    let payload = zstd_pool::decompress_segment_pooled(&compressed.to_vec())?;
    match marker {
        0 => Ok(payload),
        _ => Ok(tuples_to_bytes(&payload)),
    }
}
/// Decompresses a block that was compressed without tuple packing.
///
/// # Errors
/// Propagates any error reported by the pooled decompressor.
pub fn decompress_segment(compressed: &[u8]) -> Result<Contig> {
    // The pooled decompressor is handed an owned copy of the input.
    let owned = compressed.to_vec();
    zstd_pool::decompress_segment_pooled(&owned)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Default-level compression round-trips a short sequence.
    #[test]
    fn test_compress_decompress_roundtrip() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();
        assert_eq!(original, decompressed);
    }

    /// Empty input round-trips to empty output.
    #[test]
    fn test_compress_empty() {
        let original = vec![];
        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();
        assert_eq!(original, decompressed);
    }

    /// A longer periodic sequence round-trips and actually shrinks.
    #[test]
    fn test_compress_large() {
        let mut original = Vec::new();
        for i in 0..1000 {
            original.push((i % 4) as u8);
        }
        let compressed = compress_segment(&original).unwrap();
        let decompressed = decompress_segment(&compressed).unwrap();
        assert_eq!(original, decompressed);
        assert!(compressed.len() < original.len());
    }

    /// Every tested compression level round-trips.
    #[test]
    fn test_different_compression_levels() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
        for level in [1, 3, 9, 19].iter() {
            let compressed = compress_segment_configured(&original, *level).unwrap();
            let decompressed = decompress_segment(&compressed).unwrap();
            assert_eq!(original, decompressed);
        }
    }

    /// Sanity checks on the repetitiveness score.
    #[test]
    fn test_repetitiveness_check() {
        let repetitive = vec![0; 100];
        let rep1 = check_repetitiveness(&repetitive);
        assert_eq!(
            rep1, 1.0,
            "All-zero sequence should have perfect repetitiveness"
        );
        let mixed = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1];
        let rep2 = check_repetitiveness(&mixed);
        assert!(
            (0.0..=1.0).contains(&rep2),
            "Repetitiveness should be in [0, 1]"
        );
    }

    /// Low-repetitiveness input must take the tuple-packing branch (marker 1)
    /// of `compress_reference_segment` and round-trip through it.
    #[test]
    fn test_reference_compression_with_tuple_packing() {
        // Deterministic pseudo-random symbols in 0..4 from a 32-bit LCG. High
        // bits are used because the low bits of a power-of-two LCG have very
        // short periods and would look highly repetitive.
        let mut state: u32 = 0x1234_5678;
        let low_rep: Vec<u8> = (0..1200)
            .map(|_| {
                state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
                ((state >> 24) & 3) as u8
            })
            .collect();
        assert!(
            check_repetitiveness(&low_rep) < REPETITIVENESS_THRESHOLD,
            "Test input must score below the repetitiveness threshold"
        );

        let (compressed1, marker1) = compress_reference_segment(&low_rep).unwrap();
        assert_eq!(marker1, 1, "Low repetitiveness should use tuple packing");
        let decompressed1 = decompress_segment_with_marker(&compressed1, marker1).unwrap();
        assert_eq!(low_rep, decompressed1);
    }

    /// Highly repetitive input must take the plain branch (marker 0).
    #[test]
    fn test_reference_compression_without_tuple_packing() {
        let high_rep = vec![0; 100];
        let (compressed, marker) = compress_reference_segment(&high_rep).unwrap();
        assert_eq!(marker, 0, "High repetitiveness should use plain ZSTD");
        let decompressed = decompress_segment_with_marker(&compressed, marker).unwrap();
        assert_eq!(high_rep, decompressed);
    }

    /// Manually tuple-packed and compressed data decodes with marker 1.
    #[test]
    fn test_tuple_packing_compression_path() {
        let original = vec![0, 1, 2, 3, 0, 1, 2, 3];
        let tuples = bytes_to_tuples(&original);
        let compressed = zstd_pool::compress_segment_pooled(&tuples, 13).unwrap();
        let decompressed = decompress_segment_with_marker(&compressed, 1).unwrap();
        assert_eq!(
            original, decompressed,
            "Tuple packing roundtrip should work"
        );
    }
}