use crate::kmer::{Kmer, KmerMode};
use ahash::AHashSet;
use ragc_common::Contig;
/// Sentinel k-mer value (`u64::MAX`) stored in a segment's `front_kmer` /
/// `back_kmer` when no splitter k-mer bounds the segment on that side.
pub const MISSING_KMER: u64 = u64::MAX;
/// One piece of a contig, as produced by the splitter-based segmentation
/// functions in this file, together with the splitter k-mers that bound it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Segment {
/// The bases of this segment (an owned copy of a contig slice).
pub data: Contig,
/// Value of the splitter k-mer that ended the previous segment, or
/// `MISSING_KMER` when there was none.
pub front_kmer: u64,
/// Value of the splitter k-mer that ends this segment, or `MISSING_KMER`
/// when the segment runs to the end of the contig without hitting a splitter.
pub back_kmer: u64,
/// Result of `Kmer::is_dir_oriented()` recorded when `front_kmer` was seen;
/// `false` when the segmentation functions in this file had no front k-mer.
pub front_kmer_is_dir: bool,
/// Result of `Kmer::is_dir_oriented()` recorded when `back_kmer` was seen;
/// `false` when the segmentation functions in this file had no back k-mer.
pub back_kmer_is_dir: bool,
}
impl Segment {
pub fn new(
data: Contig,
front_kmer: u64,
back_kmer: u64,
front_kmer_is_dir: bool,
back_kmer_is_dir: bool,
) -> Self {
Segment {
data,
front_kmer,
back_kmer,
front_kmer_is_dir,
back_kmer_is_dir,
}
}
pub fn len(&self) -> usize {
self.data.len()
}
pub fn is_empty(&self) -> bool {
self.data.is_empty()
}
}
/// Splits `contig` into overlapping [`Segment`]s at every splitter k-mer,
/// optionally tracing the process on stderr.
///
/// Bases are pushed one at a time into a canonical-mode [`Kmer`] of length `k`.
/// A base value greater than 3 resets the k-mer. Whenever the k-mer is full and
/// its value is contained in `splitters`:
///
/// * the bytes from the current segment start up to and including the current
///   position are emitted as a segment whose `back_kmer` is the splitter,
/// * the next segment is set to start `k` bytes before the end of the segment
///   just emitted, so adjacent segments share exactly `k` bytes,
/// * the splitter becomes the `front_kmer` of the next segment, and
/// * the k-mer is reset, so the next splitter can only be recognised after `k`
///   further valid bases.
///
/// Whatever remains after the last split is emitted as a final segment with
/// `back_kmer == MISSING_KMER`.
///
/// # Arguments
/// * `contig` - the bases to segment.
/// * `splitters` - set of k-mer values at which to split.
/// * `k` - k-mer length.
/// * `_min_segment_size` - accepted for interface compatibility; not used by
///   this implementation.
///
/// # Returns
/// At least one segment. If `contig` is shorter than `k`, or no segment was
/// produced otherwise, the whole contig is returned as a single segment with
/// both k-mers set to `MISSING_KMER`.
///
/// # Diagnostics
/// Output to stderr is controlled by the `crate::env_cache` flags
/// `debug_segment_coverage`, `debug_endpos`, `trace_all_splits`, `debug_is_dir`
/// and, when the `verbose_debug` feature is enabled, `debug_overlap`.
pub fn split_at_splitters_with_size(
    contig: &Contig,
    splitters: &AHashSet<u64>,
    k: usize,
    _min_segment_size: usize,
) -> Vec<Segment> {
    let debug = crate::env_cache::debug_segment_coverage();
    if debug {
        eprintln!(
            "\n=== SEGMENTATION START: contig_len={} k={} ===",
            contig.len(),
            k
        );
    }
    let mut segments = Vec::new();
    // A contig shorter than k cannot contain a full k-mer: return it unsplit.
    if contig.len() < k {
        if debug {
            eprintln!("Contig too short for k-mers, returning as single segment");
        }
        return vec![Segment::new(
            contig.clone(),
            MISSING_KMER,
            MISSING_KMER,
            false,
            false,
        )];
    }
    let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
    let mut segment_start = 0;
    // Splitter that ended the previous segment (none yet).
    let mut front_kmer = MISSING_KMER;
    let mut front_kmer_is_dir = false;
    for (pos, &base) in contig.iter().enumerate() {
        if base > 3 {
            // Values above 3 are not fed to the k-mer; start accumulating afresh.
            kmer.reset();
        } else {
            kmer.insert(base as u64);
            if kmer.is_full() {
                let kmer_value = kmer.data();
                let is_dir = kmer.is_dir_oriented();
                // Optional trace of every full k-mer within the last 50 positions.
                if crate::env_cache::debug_endpos() && pos + 50 >= contig.len() {
                    let is_splitter = splitters.contains(&kmer_value);
                    let bytes_left = contig.len() - (pos + 1);
                    eprintln!("ENDPOS_TRACE: pos={} kmer={:#x} is_splitter={} bytes_left={} k={} contig_len={}",
                        pos, kmer_value, is_splitter, bytes_left, k, contig.len());
                }
                if splitters.contains(&kmer_value) {
                    if crate::env_cache::trace_all_splits() {
                        let segment_len = (pos + 1) - segment_start;
                        eprintln!("RAGC_SPLIT: pos={} kmer={} segment_start={} segment_len={} contig_len={}",
                            pos, kmer_value, segment_start, segment_len, contig.len());
                    }
                    // The segment ends right after the last base of the splitter.
                    let segment_end = pos + 1;
                    let segment_data = contig[segment_start..segment_end].to_vec();
                    if !segment_data.is_empty() {
                        let (seg_front, seg_back, seg_front_is_dir, seg_back_is_dir) =
                            if front_kmer == MISSING_KMER {
                                (MISSING_KMER, kmer_value, false, is_dir)
                            } else {
                                (front_kmer, kmer_value, front_kmer_is_dir, is_dir)
                            };
                        if debug {
                            eprintln!(
                                " MAIN_LOOP_SPLIT: segment=[{}..{}) len={}",
                                segment_start,
                                segment_end,
                                segment_data.len()
                            );
                        }
                        #[cfg(feature = "verbose_debug")]
                        if crate::env_cache::debug_overlap() {
                            let first_5: Vec<u8> = segment_data.iter().take(5).copied().collect();
                            let last_5: Vec<u8> =
                                segment_data.iter().rev().take(5).rev().copied().collect();
                            eprintln!("RAGC_SEG_SPLIT: pos={} splitter={} segment=[{}..{}) len={} front={} back={} first_5={:?} last_5={:?}",
                                pos, kmer_value, segment_start, segment_end, segment_data.len(),
                                if seg_front == MISSING_KMER { "MISSING".to_string() } else { seg_front.to_string() },
                                if seg_back == MISSING_KMER { "MISSING".to_string() } else { seg_back.to_string() },
                                first_5, last_5);
                        } else {
                            #[cfg(feature = "verbose_debug")]
                            eprintln!("RAGC_SEG_SPLIT: pos={} splitter={} segment=[{}..{}) len={} front={} back={}",
                                pos, kmer_value, segment_start, segment_end, segment_data.len(),
                                if seg_front == MISSING_KMER { "MISSING".to_string() } else { seg_front.to_string() },
                                if seg_back == MISSING_KMER { "MISSING".to_string() } else { seg_back.to_string() });
                        }
                        segments.push(Segment::new(
                            segment_data,
                            seg_front,
                            seg_back,
                            seg_front_is_dir,
                            seg_back_is_dir,
                        ));
                    }
                    // The next segment starts k bytes before the end of this one,
                    // i.e. it begins with the splitter k-mer itself.
                    let new_start = (pos + 1).saturating_sub(k);
                    if debug {
                        eprintln!(
                            " Setting segment_start: {} -> {} (overlap of {} bytes)",
                            segment_start,
                            new_start,
                            (pos + 1) - new_start
                        );
                    }
                    segment_start = new_start;
                    front_kmer = kmer_value;
                    front_kmer_is_dir = is_dir;
                    // Require k fresh bases before another splitter can match.
                    kmer.reset();
                }
            }
        }
    }
    if debug {
        eprintln!("\n=== FINAL SEGMENT ===");
        eprintln!(
            " segment_start={}, contig.len()={}",
            segment_start,
            contig.len()
        );
    }
    // Emit whatever follows the last split (or the whole contig if none occurred).
    if segment_start < contig.len() {
        let segment_data = contig[segment_start..].to_vec();
        if !segment_data.is_empty() {
            let (final_front, final_back, final_front_is_dir, final_back_is_dir) =
                if front_kmer == MISSING_KMER {
                    (MISSING_KMER, MISSING_KMER, false, false)
                } else {
                    (front_kmer, MISSING_KMER, front_kmer_is_dir, false)
                };
            if debug {
                eprintln!(
                    " FINAL: segment=[{}..{}) len={}",
                    segment_start,
                    contig.len(),
                    segment_data.len()
                );
            }
            #[cfg(feature = "verbose_debug")]
            eprintln!(
                "RAGC_SEG_FINAL: segment=[{}..{}) len={} front={} back={}",
                segment_start,
                contig.len(),
                segment_data.len(),
                if final_front == MISSING_KMER {
                    "MISSING".to_string()
                } else {
                    final_front.to_string()
                },
                if final_back == MISSING_KMER {
                    "MISSING".to_string()
                } else {
                    final_back.to_string()
                }
            );
            if crate::env_cache::debug_is_dir()
                && final_back == MISSING_KMER
                && final_front != MISSING_KMER
            {
                eprintln!(
                    "RAGC_FINAL_SEG_IS_DIR: front_kmer={} front_kmer_is_dir={}",
                    final_front, final_front_is_dir
                );
            }
            segments.push(Segment::new(
                segment_data,
                final_front,
                final_back,
                final_front_is_dir,
                final_back_is_dir,
            ));
        }
    }
    // Fallback so callers always receive at least one segment.
    if segments.is_empty() {
        if debug {
            eprintln!(" NO_SPLIT: Returning entire contig as single segment");
        }
        #[cfg(feature = "verbose_debug")]
        eprintln!(
            "RAGC_SEG_NOSPLIT: len={} front=MISSING back=MISSING",
            contig.len()
        );
        segments.push(Segment::new(
            contig.clone(),
            MISSING_KMER,
            MISSING_KMER,
            false,
            false,
        ));
    }
    if debug {
        eprintln!("\n=== SEGMENTATION SUMMARY ===");
        eprintln!(" Original contig length: {}", contig.len());
        eprintln!(" Number of segments: {}", segments.len());
        let total_segment_bytes: usize = segments.iter().map(|s| s.len()).sum();
        eprintln!(" Total segment bytes: {}", total_segment_bytes);
        // Each segment after the first begins exactly k bytes before the end of
        // its predecessor (see `new_start` above), so it contributes `len - k`
        // new bytes. The overlap used here must therefore be k, not k - 1.
        let expected_size = if segments.is_empty() {
            0
        } else if segments.len() == 1 {
            segments[0].len()
        } else {
            segments[0].len()
                + segments[1..]
                    .iter()
                    .map(|s| s.len().saturating_sub(k))
                    .sum::<usize>()
        };
        eprintln!(
            " Expected reconstructed size: {} (with {} overlaps of {} bytes)",
            expected_size,
            segments.len().saturating_sub(1),
            k
        );
        if expected_size != contig.len() {
            eprintln!(
                " ⚠️ SIZE MISMATCH: Expected {} but contig is {} (diff: {})",
                expected_size,
                contig.len(),
                contig.len() as i64 - expected_size as i64
            );
        } else {
            eprintln!(" ✓ Size matches!");
        }
        eprintln!("\n Segment coverage:");
        for (i, seg) in segments.iter().enumerate() {
            eprintln!(" Segment {}: len={}", i, seg.len());
        }
    }
    segments
}
pub fn split_at_splitters(contig: &Contig, splitters: &AHashSet<u64>, k: usize) -> Vec<Segment> {
let mut segments = Vec::new();
if contig.len() < k {
return vec![Segment::new(
contig.clone(),
MISSING_KMER,
MISSING_KMER,
false,
false,
)];
}
let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
let mut segment_start = 0;
let mut front_kmer = MISSING_KMER;
let mut front_kmer_is_dir = false;
for (pos, &base) in contig.iter().enumerate() {
if base > 3 {
kmer.reset();
} else {
kmer.insert(base as u64);
if kmer.is_full() {
let kmer_value = kmer.data();
let is_dir = kmer.is_dir_oriented();
if splitters.contains(&kmer_value) {
let segment_end = pos + 1; let segment_data = contig[segment_start..segment_end].to_vec();
if !segment_data.is_empty() {
#[cfg(feature = "verbose_debug")]
eprintln!("RAGC_SEG_SPLIT: pos={} splitter={} segment=[{}..{}) len={} front={} back={}",
pos, kmer_value, segment_start, segment_end, segment_data.len(),
if front_kmer == MISSING_KMER { "MISSING".to_string() } else { front_kmer.to_string() },
kmer_value);
segments.push(Segment::new(
segment_data,
front_kmer,
kmer_value,
front_kmer_is_dir,
is_dir,
));
}
segment_start = (pos + 1).saturating_sub(k);
front_kmer = kmer_value;
front_kmer_is_dir = is_dir;
}
}
}
}
if segment_start < contig.len() {
let segment_data = contig[segment_start..].to_vec();
if !segment_data.is_empty() {
#[cfg(feature = "verbose_debug")]
eprintln!(
"RAGC_SEG_FINAL: segment=[{}..{}) len={} front={} back=MISSING",
segment_start,
contig.len(),
segment_data.len(),
if front_kmer == MISSING_KMER {
"MISSING".to_string()
} else {
front_kmer.to_string()
}
);
segments.push(Segment::new(
segment_data,
front_kmer,
MISSING_KMER,
front_kmer_is_dir,
false,
));
}
}
if segments.is_empty() {
#[cfg(feature = "verbose_debug")]
eprintln!(
"RAGC_SEG_NOSPLIT: len={} front=MISSING back=MISSING",
contig.len()
);
segments.push(Segment::new(
contig.clone(),
MISSING_KMER,
MISSING_KMER,
false,
false,
));
}
segments
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `Segment::new` stores its arguments and `len`/`is_empty` reflect `data`.
    #[test]
    fn test_segment_new() {
        let segment = Segment::new(vec![0, 1, 2, 3], 123, 456, true, false);
        assert_eq!(segment.len(), 4);
        assert_eq!(segment.front_kmer, 123);
        assert_eq!(segment.back_kmer, 456);
        assert!(!segment.is_empty());
    }

    /// With an empty splitter set the contig comes back as one unbounded segment.
    #[test]
    fn test_split_no_splitters() {
        let bases = vec![0, 1, 2, 3, 0, 1, 2, 3];
        let no_splitters = AHashSet::new();
        let result = split_at_splitters(&bases, &no_splitters, 3);
        assert_eq!(result.len(), 1);
        let only = &result[0];
        assert_eq!(only.data, bases);
        assert_eq!(only.front_kmer, MISSING_KMER);
        assert_eq!(only.back_kmer, MISSING_KMER);
    }

    /// Splitting at the first k-mer still lets the contig length be recovered
    /// by discounting the k-byte overlap of every segment after the first.
    #[test]
    fn test_split_with_splitters() {
        let k = 3;
        let bases = vec![0, 0, 0, 1, 1, 1, 2, 2, 2];

        // Enumerate every full k-mer value along the contig.
        let mut window = Kmer::new(3, KmerMode::Canonical);
        let mut seen = Vec::new();
        for &b in &bases {
            window.insert(b as u64);
            if window.is_full() {
                seen.push(window.data());
            }
        }

        let mut splitters = AHashSet::new();
        if let Some(&first) = seen.first() {
            splitters.insert(first);
        }

        let result = split_at_splitters(&bases, &splitters, 3);
        assert!(!result.is_empty());

        let reconstructed_len: usize = match result.split_first() {
            None => 0,
            Some((head, rest)) => {
                head.len()
                    + rest
                        .iter()
                        .map(|seg| seg.len().saturating_sub(k))
                        .sum::<usize>()
            }
        };
        assert_eq!(reconstructed_len, bases.len());
    }

    /// A contig shorter than k is returned untouched.
    #[test]
    fn test_split_short_contig() {
        let bases = vec![0, 1];
        let no_splitters = AHashSet::new();
        let result = split_at_splitters(&bases, &no_splitters, 3);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].data, bases);
    }

    /// A homopolymer whose only k-mer is a splitter still yields segments.
    #[test]
    fn test_split_consecutive_splitters() {
        let bases = vec![0, 0, 0, 0, 0, 0];

        let mut window = Kmer::new(3, KmerMode::Canonical);
        for _ in 0..3 {
            window.insert(0);
        }
        let aaa = window.data();

        let mut splitters = AHashSet::new();
        splitters.insert(aaa);

        let result = split_at_splitters(&bases, &splitters, 3);
        assert!(!result.is_empty());
    }

    /// A base value above 3 in the middle of the contig does not break splitting.
    #[test]
    fn test_split_with_n_bases() {
        let bases = vec![0, 0, 0, 4, 1, 1, 1];
        let mut splitters = AHashSet::new();
        splitters.insert(12345);
        let result = split_at_splitters(&bases, &splitters, 3);
        assert!(!result.is_empty());
    }
}