use crate::genome_io::GenomeIO;
use crate::kmer_extract::{enumerate_kmers, remove_non_singletons};
use ahash::AHashSet;
use anyhow::Result;
use ragc_common::Contig;
use rayon::prelude::*;
use rdst::RadixSort;
use std::io::Read;
use std::path::Path;
pub fn determine_splitters(
contigs: &[Contig],
k: usize,
segment_size: usize,
) -> (AHashSet<u64>, AHashSet<u64>, AHashSet<u64>) {
let total_bases: usize = contigs.iter().map(|c| c.len()).sum();
eprintln!(
"DEBUG: determine_splitters() received {} contigs with {} total bases",
contigs.len(),
total_bases
);
let all_kmers_vec: Vec<Vec<u64>> = contigs
.par_iter()
.map(|contig| enumerate_kmers(contig, k))
.collect();
let mut all_kmers: Vec<u64> = all_kmers_vec.into_iter().flatten().collect();
eprintln!(
"DEBUG: Extracted {} k-mers from reference contigs",
all_kmers.len()
);
all_kmers.radix_sort_unstable();
let mut duplicates = AHashSet::new();
let mut i = 0;
while i < all_kmers.len() {
let kmer = all_kmers[i];
let mut count = 1;
let mut j = i + 1;
while j < all_kmers.len() && all_kmers[j] == kmer {
count += 1;
j += 1;
}
if count > 1 {
duplicates.insert(kmer);
}
i = j;
}
remove_non_singletons(&mut all_kmers, 0);
let candidates: AHashSet<u64> = all_kmers.into_iter().collect();
eprintln!(
"DEBUG: Found {} candidate singleton k-mers from reference",
candidates.len()
);
eprintln!(
"DEBUG: Found {} duplicate k-mers from reference",
duplicates.len()
);
let splitter_vecs: Vec<Vec<u64>> = contigs
.par_iter()
.map(|contig| find_actual_splitters_in_contig(contig, &candidates, k, segment_size))
.collect();
let splitters: AHashSet<u64> = splitter_vecs.into_iter().flatten().collect();
eprintln!(
"DEBUG: {} actually-used splitters (after distance check)",
splitters.len()
);
(splitters, candidates, duplicates)
}
pub fn determine_splitters_streaming(
fasta_path: &Path,
k: usize,
segment_size: usize,
) -> Result<(AHashSet<u64>, AHashSet<u64>, AHashSet<u64>)> {
#[cfg(feature = "verbose_debug")]
eprintln!("DEBUG: Pass 1 - Collecting k-mers from reference (streaming)...");
let mut all_kmers = Vec::new();
let mut reference_sample = String::new();
{
let mut reader = GenomeIO::<Box<dyn Read>>::open(fasta_path)?;
while let Some((_full_header, sample_name, _contig_name, sequence)) =
reader.read_contig_with_sample()?
{
if !sequence.is_empty() {
if reference_sample.is_empty() {
reference_sample = sample_name.clone();
#[cfg(feature = "verbose_debug")]
eprintln!(
"DEBUG: Collecting k-mers from ALL contigs (first sample: {})",
reference_sample
);
}
let contig_kmers = enumerate_kmers(&sequence, k);
all_kmers.extend(contig_kmers);
}
}
}
#[cfg(feature = "verbose_debug")]
eprintln!(
"DEBUG: Collected {} k-mers (with duplicates)",
all_kmers.len()
);
#[cfg(feature = "verbose_debug")]
eprintln!(
"DEBUG: Vec memory usage: ~{} MB",
all_kmers.len() * 8 / 1_000_000
);
#[cfg(feature = "verbose_debug")]
eprintln!("DEBUG: Sorting k-mers...");
all_kmers.radix_sort_unstable();
let mut duplicates = AHashSet::new();
let mut i = 0;
while i < all_kmers.len() {
let kmer = all_kmers[i];
let mut count = 1;
let mut j = i + 1;
while j < all_kmers.len() && all_kmers[j] == kmer {
count += 1;
j += 1;
}
if count > 1 {
duplicates.insert(kmer);
}
i = j;
}
#[cfg(feature = "verbose_debug")]
eprintln!("DEBUG: Found {} duplicate k-mers", duplicates.len());
remove_non_singletons(&mut all_kmers, 0);
let candidates: AHashSet<u64> = all_kmers.into_iter().collect();
#[cfg(feature = "verbose_debug")]
eprintln!(
"DEBUG: Found {} candidate singleton k-mers",
candidates.len()
);
#[cfg(feature = "verbose_debug")]
eprintln!("DEBUG: Pass 2 - Finding actually-used splitters (streaming)...");
let mut splitters = AHashSet::new();
{
let mut reader = GenomeIO::<Box<dyn Read>>::open(fasta_path)?;
while let Some((_full_header, _sample_name, _contig_name, sequence)) =
reader.read_contig_with_sample()?
{
if !sequence.is_empty() {
let used = find_actual_splitters_in_contig_named(
&sequence,
&_contig_name,
&candidates,
k,
segment_size,
);
splitters.extend(used);
}
}
}
#[cfg(feature = "verbose_debug")]
eprintln!("DEBUG: {} actually-used splitters", splitters.len());
Ok((splitters, candidates, duplicates))
}
pub fn determine_splitters_streaming_first_sample(
fasta_path: &Path,
k: usize,
segment_size: usize,
) -> Result<(AHashSet<u64>, AHashSet<u64>, AHashSet<u64>)> {
eprintln!("DEBUG: Pass 1 - Collecting k-mers from first sample only (PanSN mode)...");
let mut all_kmers = Vec::new();
let mut first_sample: Option<String> = None;
{
let mut reader = GenomeIO::<Box<dyn Read>>::open(fasta_path)?;
while let Some((_full_header, sample_name, _contig_name, sequence)) =
reader.read_contig_with_sample()?
{
if first_sample.is_none() {
first_sample = Some(sample_name.clone());
eprintln!("DEBUG: First sample detected: {}", sample_name);
}
if first_sample.as_ref() != Some(&sample_name) {
eprintln!("DEBUG: Stopping at sample boundary (saw {})", sample_name);
break;
}
if !sequence.is_empty() {
let contig_kmers = enumerate_kmers(&sequence, k);
all_kmers.extend(contig_kmers);
}
}
}
eprintln!(
"DEBUG: Collected {} k-mers from first sample",
all_kmers.len()
);
all_kmers.radix_sort_unstable();
let mut duplicates = AHashSet::new();
let mut i = 0;
while i < all_kmers.len() {
let kmer = all_kmers[i];
let mut count = 1;
let mut j = i + 1;
while j < all_kmers.len() && all_kmers[j] == kmer {
count += 1;
j += 1;
}
if count > 1 {
duplicates.insert(kmer);
}
i = j;
}
eprintln!("DEBUG: Found {} duplicate k-mers", duplicates.len());
remove_non_singletons(&mut all_kmers, 0);
let candidates: AHashSet<u64> = all_kmers.into_iter().collect();
eprintln!(
"DEBUG: Found {} candidate singleton k-mers",
candidates.len()
);
eprintln!("DEBUG: Pass 2 - Finding actually-used splitters from first sample...");
let mut splitters = AHashSet::new();
{
let mut reader = GenomeIO::<Box<dyn Read>>::open(fasta_path)?;
let mut first_sample_pass2: Option<String> = None;
while let Some((_full_header, sample_name, _contig_name, sequence)) =
reader.read_contig_with_sample()?
{
if first_sample_pass2.is_none() {
first_sample_pass2 = Some(sample_name.clone());
}
if first_sample_pass2.as_ref() != Some(&sample_name) {
break;
}
if !sequence.is_empty() {
let used = find_actual_splitters_in_contig_named(
&sequence,
&_contig_name,
&candidates,
k,
segment_size,
);
splitters.extend(used);
}
}
}
eprintln!(
"DEBUG: {} actually-used splitters from first sample",
splitters.len()
);
Ok((splitters, candidates, duplicates))
}
fn find_actual_splitters_in_contig_named(
contig: &Contig,
contig_name: &str,
candidates: &AHashSet<u64>,
k: usize,
segment_size: usize,
) -> Vec<u64> {
use crate::kmer::{Kmer, KmerMode};
let mut used_splitters = Vec::new();
let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
let mut current_len = segment_size; let mut recent_kmers = Vec::new();
let mut pos = 0usize;
for &base in contig {
let current_pos = pos;
if base > 3 {
kmer.reset();
recent_kmers.clear();
} else {
kmer.insert(base as u64);
if kmer.is_full() {
let kmer_value = kmer.data();
recent_kmers.push(kmer_value);
#[cfg(feature = "verbose_debug")]
if kmer_value == 4991190226639519744 || kmer_value == 1518275220618608640 {
eprintln!("DEBUG_RAGC_FOUND_KMER: contig={} pos={} kmer={} current_len={} is_candidate={} will_mark={}",
contig_name, current_pos, kmer_value, current_len,
candidates.contains(&kmer_value),
current_len >= segment_size && candidates.contains(&kmer_value));
}
if current_len >= segment_size && candidates.contains(&kmer_value) {
#[cfg(feature = "verbose_debug")]
eprintln!(
"DEBUG_RAGC_SPLITTER_USED: contig={} pos={} kmer={} current_len={}",
contig_name, current_pos, kmer_value, current_len
);
used_splitters.push(kmer_value);
current_len = 0;
kmer.reset();
recent_kmers.clear();
}
}
}
current_len += 1;
pos += 1;
}
for &kmer_value in recent_kmers.iter().rev() {
if candidates.contains(&kmer_value) {
eprintln!(
"RAGC_END_SPLITTER: contig={} kmer={} recent_kmers_len={} pos={}",
contig_name,
kmer_value,
recent_kmers.len(),
pos
);
used_splitters.push(kmer_value);
break;
}
}
used_splitters
}
fn find_actual_splitters_in_contig(
contig: &Contig,
candidates: &AHashSet<u64>,
k: usize,
segment_size: usize,
) -> Vec<u64> {
use crate::kmer::{Kmer, KmerMode};
let mut used_splitters = Vec::new();
let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
let mut current_len = segment_size; let mut recent_kmers = Vec::new();
let mut pos = 0usize;
for &base in contig {
let current_pos = pos;
if base > 3 {
kmer.reset();
recent_kmers.clear();
} else {
kmer.insert(base as u64);
if kmer.is_full() {
let kmer_value = kmer.data();
recent_kmers.push(kmer_value);
if kmer_value == 4991190226639519744 {
eprintln!("DEBUG_RAGC_FOUND_KMER: pos={} kmer={} current_len={} is_candidate={} will_mark={}",
current_pos, kmer_value, current_len,
candidates.contains(&kmer_value),
current_len >= segment_size && candidates.contains(&kmer_value));
}
if current_len >= segment_size && candidates.contains(&kmer_value) {
#[cfg(feature = "verbose_debug")]
eprintln!(
"DEBUG_RAGC_SPLITTER_USED: pos={} kmer={} current_len={}",
current_pos, kmer_value, current_len
);
used_splitters.push(kmer_value);
current_len = 0;
kmer.reset();
recent_kmers.clear();
}
}
}
current_len += 1;
pos += 1;
}
for &kmer_value in recent_kmers.iter().rev() {
if candidates.contains(&kmer_value) {
#[cfg(feature = "verbose_debug")]
eprintln!("DEBUG_RAGC_SPLITTER_USED_RIGHTMOST: kmer={}", kmer_value);
used_splitters.push(kmer_value);
break;
}
}
used_splitters
}
pub fn find_candidate_kmers_multi(contigs: &[Contig], k: usize) -> Vec<u64> {
let all_kmers_vec: Vec<Vec<u64>> = contigs
.par_iter()
.map(|contig| enumerate_kmers(contig, k))
.collect();
let mut all_kmers: Vec<u64> = all_kmers_vec.into_iter().flatten().collect();
all_kmers.radix_sort_unstable();
remove_non_singletons(&mut all_kmers, 0);
all_kmers
}
#[inline]
pub fn is_splitter(kmer: u64, splitters: &AHashSet<u64>) -> bool {
splitters.contains(&kmer)
}
pub fn is_hard_contig(contig: &Contig, k: usize, splitters: &AHashSet<u64>) -> bool {
use crate::kmer::{Kmer, KmerMode};
if contig.len() < k {
return true; }
let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
for &base in contig.iter() {
if base > 3 {
kmer.reset();
continue;
}
kmer.insert(base as u64);
if kmer.is_full() {
let kmer_value = kmer.data_canonical();
if splitters.contains(&kmer_value) {
return false;
}
}
}
true
}
pub fn find_new_splitters_for_contig(
contig: &Contig,
k: usize,
segment_size: usize,
ref_singletons: &[u64],
ref_duplicates: &AHashSet<u64>,
) -> Vec<u64> {
use crate::kmer_extract::{enumerate_kmers, remove_non_singletons};
let mut contig_kmers = enumerate_kmers(contig, k);
contig_kmers.sort_unstable();
remove_non_singletons(&mut contig_kmers, 0);
let mut tmp: Vec<u64> = Vec::with_capacity(contig_kmers.len());
let mut i = 0;
let mut j = 0;
while i < contig_kmers.len() && j < ref_singletons.len() {
match contig_kmers[i].cmp(&ref_singletons[j]) {
std::cmp::Ordering::Less => {
tmp.push(contig_kmers[i]);
i += 1;
}
std::cmp::Ordering::Equal => {
i += 1;
j += 1;
}
std::cmp::Ordering::Greater => {
j += 1;
}
}
}
tmp.extend_from_slice(&contig_kmers[i..]);
tmp.retain(|kmer| !ref_duplicates.contains(kmer));
let candidates: AHashSet<u64> = tmp.into_iter().collect();
find_actual_splitters_in_contig(contig, &candidates, k, segment_size)
}
pub fn two_pass_splitter_discovery(
input_files: &[std::path::PathBuf],
k: usize,
segment_size: usize,
verbosity: usize,
) -> Result<AHashSet<u64>> {
use crate::genome_io::GenomeIO;
use std::io::Read;
if input_files.is_empty() {
anyhow::bail!("No input files provided");
}
if verbosity > 0 {
eprintln!("Two-pass splitter discovery (matching C++ AGC batch mode)");
eprintln!("Pass 1: Discovering splitters from all files...");
}
let (initial_splitters, ref_singletons_set, ref_duplicates) =
determine_splitters_streaming(&input_files[0], k, segment_size)?;
let mut ref_singletons: Vec<u64> = ref_singletons_set.into_iter().collect();
ref_singletons.sort_unstable();
if verbosity > 0 {
eprintln!(
" Reference: {} initial splitters, {} singletons, {} duplicates",
initial_splitters.len(),
ref_singletons.len(),
ref_duplicates.len()
);
}
let mut combined_splitters = initial_splitters;
let mut hard_contigs_found = 0;
let mut new_splitters_found = 0;
for (file_idx, input_file) in input_files.iter().enumerate() {
let is_reference = file_idx == 0;
let mut reader = GenomeIO::<Box<dyn Read>>::open(input_file)?;
while let Some((_full_header, _sample_name, contig_name, sequence)) =
reader.read_contig_with_sample()?
{
if sequence.is_empty() {
continue;
}
if sequence.len() >= segment_size && is_hard_contig(&sequence, k, &combined_splitters) {
hard_contigs_found += 1;
if verbosity > 1 {
eprintln!(
" Found hard contig: {} ({} bp, file {})",
contig_name,
sequence.len(),
input_file.display()
);
}
let new_splitters = find_new_splitters_for_contig(
&sequence,
k,
segment_size,
&ref_singletons,
&ref_duplicates,
);
for splitter in new_splitters {
if combined_splitters.insert(splitter) {
new_splitters_found += 1;
}
}
}
}
if verbosity > 0 && !is_reference {
eprintln!(
" File {}: {} hard contigs so far, {} new splitters",
file_idx, hard_contigs_found, new_splitters_found
);
}
}
if verbosity > 0 {
eprintln!("Pass 1 complete:");
eprintln!(" Total hard contigs: {}", hard_contigs_found);
eprintln!(" New splitters discovered: {}", new_splitters_found);
eprintln!(
" Final splitter count: {} (was {})",
combined_splitters.len(),
combined_splitters.len() - new_splitters_found
);
eprintln!();
}
Ok(combined_splitters)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_determine_splitters_simple() {
let contigs = vec![
vec![0, 0, 0, 1], vec![2, 2, 2, 3], ];
let (splitters, _singletons, _duplicates) = determine_splitters(&contigs, 3, 100);
assert!(!splitters.is_empty());
}
#[test]
fn test_determine_splitters_no_singletons() {
let contigs = vec![vec![0, 1, 2, 3], vec![0, 1, 2, 3]];
let (splitters, singletons, duplicates) = determine_splitters(&contigs, 3, 100);
assert_eq!(splitters.len(), 0);
assert_eq!(singletons.len(), 0);
assert!(!duplicates.is_empty());
}
#[test]
fn test_determine_splitters_all_unique() {
let contigs = vec![
vec![0, 0, 0, 0], vec![1, 1, 1, 1], ];
let (splitters, _singletons, duplicates) = determine_splitters(&contigs, 3, 100);
assert_eq!(splitters.len(), 0);
assert!(!duplicates.is_empty());
}
#[test]
fn test_determine_splitters_mixed() {
let contigs = vec![
vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3], vec![0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], ];
let (splitters, singletons, duplicates) = determine_splitters(&contigs, 3, 100);
let _ = (splitters, singletons, duplicates); }
#[test]
fn test_is_splitter() {
let mut splitters = AHashSet::new();
splitters.insert(123);
splitters.insert(456);
assert!(is_splitter(123, &splitters));
assert!(is_splitter(456, &splitters));
assert!(!is_splitter(789, &splitters));
}
#[test]
fn test_find_candidate_kmers_multi() {
let contigs = vec![
vec![0, 1, 2, 3], vec![3, 2, 1, 0], ];
let candidates = find_candidate_kmers_multi(&contigs, 3);
for i in 1..candidates.len() {
assert!(candidates[i] >= candidates[i - 1]);
}
}
}