use crate::kmer::{Kmer, KmerMode};
use crate::segment_buffer::BufferedSegments;
use ahash::{AHashMap, AHashSet};
use std::sync::{Arc, Mutex};
pub type Contig = Vec<u8>;
pub const MISSING_KMER: u64 = u64::MAX;
#[derive(Debug, Clone)]
pub struct SegmentPart {
pub kmer1: u64,
pub kmer2: u64,
pub sample_name: String,
pub contig_name: String,
pub seg_data: Vec<u8>,
pub is_rev_comp: bool,
pub seg_part_no: u32,
}
pub struct CompressionContext {
pub splitters: Arc<Mutex<AHashSet<u64>>>,
pub bloom_splitters: Arc<Mutex<crate::bloom_filter::BloomFilter>>,
pub buffered_segments: Arc<Mutex<BufferedSegments>>,
pub kmer_length: usize,
pub adaptive_mode: bool,
pub map_segments: Arc<Mutex<AHashMap<(u64, u64), u32>>>,
pub map_segments_terminators: Arc<Mutex<AHashMap<u64, Vec<u64>>>>,
pub concatenated_genomes: bool,
}
pub fn compress_contig(
sample_name: &str,
contig_name: &str,
contig: &Contig,
ctx: &CompressionContext,
) -> bool {
let kmer_length = ctx.kmer_length as u32;
let mut kmer = Kmer::new(kmer_length, KmerMode::Canonical);
let mut split_pos: usize = 0; let mut split_kmer = Kmer::new(kmer_length, KmerMode::Canonical); let mut seg_part_no: u32 = 0;
for (pos, &base) in contig.iter().enumerate() {
kmer.insert(base as u64);
if !kmer.is_full() {
continue;
}
let kmer_val = kmer.data_canonical();
if is_splitter(kmer_val, ctx) {
let new_split_pos = pos + 1 - kmer_length as usize;
if new_split_pos > split_pos {
let seg_start = split_pos;
let seg_end = pos + 1; let segment = contig[seg_start..seg_end].to_vec();
add_segment(
sample_name,
contig_name,
seg_part_no,
segment,
&split_kmer,
&kmer,
ctx,
);
seg_part_no += 1;
}
split_pos = new_split_pos;
split_kmer = kmer.clone();
}
}
if ctx.adaptive_mode && seg_part_no == 0 {
return false; }
if split_pos < contig.len() && contig.len() - split_pos > ctx.kmer_length {
let segment = contig[split_pos..].to_vec();
add_segment(
sample_name,
contig_name,
seg_part_no,
segment,
&split_kmer,
&Kmer::new(kmer_length, KmerMode::Canonical), ctx,
);
}
true }
fn add_segment(
sample_name: &str,
contig_name: &str,
seg_part_no: u32,
mut segment: Vec<u8>,
kmer_front: &Kmer,
kmer_back: &Kmer,
ctx: &CompressionContext,
) {
let k1 = if kmer_front.is_full() {
kmer_front.data_canonical()
} else {
MISSING_KMER
};
let k2 = if kmer_back.is_full() {
kmer_back.data_canonical()
} else {
MISSING_KMER
};
let pk = if k1 <= k2 { (k1, k2) } else { (k2, k1) };
let map_segments = ctx.map_segments.lock().unwrap();
let group_id = map_segments.get(&pk).copied();
drop(map_segments);
let mut segment2: Option<Vec<u8>> = None;
let mut pk2: Option<(u64, u64)> = None;
if group_id.is_none() && !ctx.concatenated_genomes {
if k1 != MISSING_KMER && k2 != MISSING_KMER {
let terminators = ctx.map_segments_terminators.lock().unwrap();
let k1_in_map = terminators.contains_key(&k1);
let k2_in_map = terminators.contains_key(&k2);
drop(terminators);
if k1_in_map && k2_in_map {
if let Some((k_middle, left_size, right_size)) =
find_cand_segment_with_missing_middle_splitter(kmer_front, kmer_back, ctx)
{
if left_size == 0 || right_size == 0 {
} else {
let kmer_length = ctx.kmer_length;
let seg2_start_pos = left_size - kmer_length / 2;
segment2 = Some(segment[seg2_start_pos..].to_vec());
segment.resize(seg2_start_pos + kmer_length, 0);
pk2 = Some(if k_middle <= k2 {
(k_middle, k2)
} else {
(k2, k_middle)
});
}
}
}
}
}
let compressed_seg = segment.clone();
if group_id.is_some() {
let buffered = ctx.buffered_segments.lock().unwrap();
buffered.add_known(
group_id.unwrap(),
MISSING_KMER,
MISSING_KMER,
sample_name.to_string(),
contig_name.to_string(),
compressed_seg,
false, seg_part_no,
);
} else {
let buffered = ctx.buffered_segments.lock().unwrap();
buffered.add_new(
pk.0,
pk.1,
sample_name.to_string(),
contig_name.to_string(),
compressed_seg,
false, seg_part_no,
);
}
if let (Some(seg2), Some(pk2_key)) = (segment2, pk2) {
let compressed_seg2 = seg2.clone();
let map_segments = ctx.map_segments.lock().unwrap();
let group_id2 = map_segments.get(&pk2_key).copied();
drop(map_segments);
let buffered = ctx.buffered_segments.lock().unwrap();
if let Some(gid2) = group_id2 {
buffered.add_known(
gid2,
MISSING_KMER,
MISSING_KMER,
sample_name.to_string(),
contig_name.to_string(),
compressed_seg2,
false, seg_part_no + 1,
);
} else {
buffered.add_new(
pk2_key.0,
pk2_key.1,
sample_name.to_string(),
contig_name.to_string(),
compressed_seg2,
false, seg_part_no + 1,
);
}
}
}
fn is_splitter(kmer: u64, ctx: &CompressionContext) -> bool {
let bloom = ctx.bloom_splitters.lock().unwrap();
if !bloom.check(kmer) {
return false; }
drop(bloom);
let splitters = ctx.splitters.lock().unwrap();
splitters.contains(&kmer)
}
fn find_cand_segment_with_missing_middle_splitter(
kmer_front: &Kmer,
kmer_back: &Kmer,
ctx: &CompressionContext,
) -> Option<(u64, usize, usize)> {
let k1 = kmer_front.data_canonical();
let k2 = kmer_back.data_canonical();
let terminators = ctx.map_segments_terminators.lock().unwrap();
let k1_terminators = terminators.get(&k1)?;
let k2_terminators = terminators.get(&k2)?;
let mut shared_splitters = Vec::new();
let mut i = 0;
let mut j = 0;
while i < k1_terminators.len() && j < k2_terminators.len() {
if k1_terminators[i] == k2_terminators[j] {
shared_splitters.push(k1_terminators[i]);
i += 1;
j += 1;
} else if k1_terminators[i] < k2_terminators[j] {
i += 1;
} else {
j += 1;
}
}
if shared_splitters.is_empty() {
return None;
}
drop(terminators);
let map_segments = ctx.map_segments.lock().unwrap();
for &k_middle in &shared_splitters {
let pk_left = if k1 <= k_middle {
(k1, k_middle)
} else {
(k_middle, k1)
};
let pk_right = if k_middle <= k2 {
(k_middle, k2)
} else {
(k2, k_middle)
};
if map_segments.contains_key(&pk_left) && map_segments.contains_key(&pk_right) {
drop(map_segments);
let left_size = ctx.kmer_length; let right_size = ctx.kmer_length;
return Some((k_middle, left_size, right_size));
}
}
None
}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
#[test]
fn test_missing_kmer_constant() {
assert_eq!(MISSING_KMER, u64::MAX);
}
#[test]
fn test_segment_part_creation() {
let part = SegmentPart {
kmer1: 12345,
kmer2: 67890,
sample_name: "sample1".to_string(),
contig_name: "chr1".to_string(),
seg_data: vec![0, 1, 2, 3],
is_rev_comp: false,
seg_part_no: 0,
};
assert_eq!(part.kmer1, 12345);
assert_eq!(part.kmer2, 67890);
assert_eq!(part.sample_name, "sample1");
assert_eq!(part.seg_part_no, 0);
}
#[test]
fn test_compress_contig_no_splitters() {
let ctx = CompressionContext {
splitters: Arc::new(Mutex::new(AHashSet::new())),
bloom_splitters: Arc::new(Mutex::new(crate::bloom_filter::BloomFilter::new(1024))),
buffered_segments: Arc::new(Mutex::new(BufferedSegments::new(0))),
kmer_length: 3, adaptive_mode: false,
map_segments: Arc::new(Mutex::new(AHashMap::new())),
map_segments_terminators: Arc::new(Mutex::new(AHashMap::new())),
concatenated_genomes: false,
};
let contig = vec![0, 1, 2, 3, 0, 1, 2, 3];
let result = compress_contig("sample1", "chr1", &contig, &ctx);
assert!(result);
let buffered = ctx.buffered_segments.lock().unwrap();
assert_eq!(buffered.get_num_new(), 1);
}
#[test]
fn test_compress_contig_adaptive_failure() {
let ctx = CompressionContext {
splitters: Arc::new(Mutex::new(AHashSet::new())),
bloom_splitters: Arc::new(Mutex::new(crate::bloom_filter::BloomFilter::new(1024))),
buffered_segments: Arc::new(Mutex::new(BufferedSegments::new(0))),
kmer_length: 3, adaptive_mode: true, map_segments: Arc::new(Mutex::new(AHashMap::new())),
map_segments_terminators: Arc::new(Mutex::new(AHashMap::new())),
concatenated_genomes: false,
};
let contig = vec![0, 1, 2, 3, 0, 1, 2, 3];
let result = compress_contig("sample1", "chr1", &contig, &ctx);
assert!(!result);
}
}