1use crate::genome_io::GenomeIO;
5use crate::kmer_extract::{enumerate_kmers, remove_non_singletons};
6use ahash::AHashSet;
7use anyhow::Result;
8use ragc_common::Contig;
9use rayon::prelude::*;
10use rdst::RadixSort;
11use std::io::Read;
12use std::path::Path;
13
14pub fn determine_splitters(
34 contigs: &[Contig],
35 k: usize,
36 segment_size: usize,
37) -> (AHashSet<u64>, AHashSet<u64>, AHashSet<u64>) {
38 let total_bases: usize = contigs.iter().map(|c| c.len()).sum();
40 eprintln!(
41 "DEBUG: determine_splitters() received {} contigs with {} total bases",
42 contigs.len(),
43 total_bases
44 );
45
46 let all_kmers_vec: Vec<Vec<u64>> = contigs
49 .par_iter()
50 .map(|contig| enumerate_kmers(contig, k))
51 .collect();
52
53 let mut all_kmers: Vec<u64> = all_kmers_vec.into_iter().flatten().collect();
54 eprintln!(
55 "DEBUG: Extracted {} k-mers from reference contigs",
56 all_kmers.len()
57 );
58
59 all_kmers.radix_sort_unstable();
61
62 let mut duplicates = AHashSet::new();
65 let mut i = 0;
66 while i < all_kmers.len() {
67 let kmer = all_kmers[i];
68 let mut count = 1;
69 let mut j = i + 1;
70
71 while j < all_kmers.len() && all_kmers[j] == kmer {
73 count += 1;
74 j += 1;
75 }
76
77 if count > 1 {
79 duplicates.insert(kmer);
80 }
81
82 i = j;
83 }
84
85 remove_non_singletons(&mut all_kmers, 0);
87
88 let candidates: AHashSet<u64> = all_kmers.into_iter().collect();
89 eprintln!(
90 "DEBUG: Found {} candidate singleton k-mers from reference",
91 candidates.len()
92 );
93 eprintln!(
94 "DEBUG: Found {} duplicate k-mers from reference",
95 duplicates.len()
96 );
97
98 let splitter_vecs: Vec<Vec<u64>> = contigs
101 .par_iter()
102 .map(|contig| find_actual_splitters_in_contig(contig, &candidates, k, segment_size))
103 .collect();
104
105 let splitters: AHashSet<u64> = splitter_vecs.into_iter().flatten().collect();
106 eprintln!(
107 "DEBUG: {} actually-used splitters (after distance check)",
108 splitters.len()
109 );
110
111 (splitters, candidates, duplicates)
112}
113
114pub fn determine_splitters_streaming(
129 fasta_path: &Path,
130 k: usize,
131 segment_size: usize,
132) -> Result<(AHashSet<u64>, AHashSet<u64>, AHashSet<u64>)> {
133 #[cfg(feature = "verbose_debug")]
135 eprintln!("DEBUG: Pass 1 - Collecting k-mers from reference (streaming)...");
136 let mut all_kmers = Vec::new();
137 let mut reference_sample = String::new();
138
139 {
140 let mut reader = GenomeIO::<Box<dyn Read>>::open(fasta_path)?;
141 while let Some((_full_header, sample_name, _contig_name, sequence)) =
142 reader.read_contig_with_sample()?
143 {
144 if !sequence.is_empty() {
145 if reference_sample.is_empty() {
147 reference_sample = sample_name.clone();
148 #[cfg(feature = "verbose_debug")]
149 eprintln!(
150 "DEBUG: Collecting k-mers from ALL contigs (first sample: {})",
151 reference_sample
152 );
153 }
154
155 let contig_kmers = enumerate_kmers(&sequence, k);
158 all_kmers.extend(contig_kmers);
159 }
160 }
161 }
162
163 #[cfg(feature = "verbose_debug")]
164 eprintln!(
165 "DEBUG: Collected {} k-mers (with duplicates)",
166 all_kmers.len()
167 );
168 #[cfg(feature = "verbose_debug")]
169 eprintln!(
170 "DEBUG: Vec memory usage: ~{} MB",
171 all_kmers.len() * 8 / 1_000_000
172 );
173
174 #[cfg(feature = "verbose_debug")]
176 eprintln!("DEBUG: Sorting k-mers...");
177 all_kmers.radix_sort_unstable();
178
179 let mut duplicates = AHashSet::new();
181 let mut i = 0;
182 while i < all_kmers.len() {
183 let kmer = all_kmers[i];
184 let mut count = 1;
185 let mut j = i + 1;
186
187 while j < all_kmers.len() && all_kmers[j] == kmer {
188 count += 1;
189 j += 1;
190 }
191
192 if count > 1 {
193 duplicates.insert(kmer);
194 }
195
196 i = j;
197 }
198
199 #[cfg(feature = "verbose_debug")]
200 eprintln!("DEBUG: Found {} duplicate k-mers", duplicates.len());
201
202 remove_non_singletons(&mut all_kmers, 0);
204 let candidates: AHashSet<u64> = all_kmers.into_iter().collect();
205 #[cfg(feature = "verbose_debug")]
206 eprintln!(
207 "DEBUG: Found {} candidate singleton k-mers",
208 candidates.len()
209 );
210
211 #[cfg(feature = "verbose_debug")]
213 eprintln!("DEBUG: Pass 2 - Finding actually-used splitters (streaming)...");
214 let mut splitters = AHashSet::new();
215
216 {
217 let mut reader = GenomeIO::<Box<dyn Read>>::open(fasta_path)?;
218 while let Some((_full_header, _sample_name, _contig_name, sequence)) =
219 reader.read_contig_with_sample()?
220 {
221 if !sequence.is_empty() {
224 let used = find_actual_splitters_in_contig_named(
225 &sequence,
226 &_contig_name,
227 &candidates,
228 k,
229 segment_size,
230 );
231 splitters.extend(used);
232 }
233 }
234 }
235
236 #[cfg(feature = "verbose_debug")]
237 eprintln!("DEBUG: {} actually-used splitters", splitters.len());
238
239 Ok((splitters, candidates, duplicates))
240}
241
242pub fn determine_splitters_streaming_first_sample(
256 fasta_path: &Path,
257 k: usize,
258 segment_size: usize,
259) -> Result<(AHashSet<u64>, AHashSet<u64>, AHashSet<u64>)> {
260 eprintln!("DEBUG: Pass 1 - Collecting k-mers from first sample only (PanSN mode)...");
262 let mut all_kmers = Vec::new();
263 let mut first_sample: Option<String> = None;
264
265 {
266 let mut reader = GenomeIO::<Box<dyn Read>>::open(fasta_path)?;
267 while let Some((_full_header, sample_name, _contig_name, sequence)) =
268 reader.read_contig_with_sample()?
269 {
270 if first_sample.is_none() {
272 first_sample = Some(sample_name.clone());
273 eprintln!("DEBUG: First sample detected: {}", sample_name);
274 }
275
276 if first_sample.as_ref() != Some(&sample_name) {
278 eprintln!("DEBUG: Stopping at sample boundary (saw {})", sample_name);
279 break;
280 }
281
282 if !sequence.is_empty() {
283 let contig_kmers = enumerate_kmers(&sequence, k);
284 all_kmers.extend(contig_kmers);
285 }
286 }
287 }
288
289 eprintln!(
290 "DEBUG: Collected {} k-mers from first sample",
291 all_kmers.len()
292 );
293
294 all_kmers.radix_sort_unstable();
296
297 let mut duplicates = AHashSet::new();
299 let mut i = 0;
300 while i < all_kmers.len() {
301 let kmer = all_kmers[i];
302 let mut count = 1;
303 let mut j = i + 1;
304
305 while j < all_kmers.len() && all_kmers[j] == kmer {
306 count += 1;
307 j += 1;
308 }
309
310 if count > 1 {
311 duplicates.insert(kmer);
312 }
313
314 i = j;
315 }
316
317 eprintln!("DEBUG: Found {} duplicate k-mers", duplicates.len());
318
319 remove_non_singletons(&mut all_kmers, 0);
321 let candidates: AHashSet<u64> = all_kmers.into_iter().collect();
322 eprintln!(
323 "DEBUG: Found {} candidate singleton k-mers",
324 candidates.len()
325 );
326
327 eprintln!("DEBUG: Pass 2 - Finding actually-used splitters from first sample...");
329 let mut splitters = AHashSet::new();
330
331 {
332 let mut reader = GenomeIO::<Box<dyn Read>>::open(fasta_path)?;
333 let mut first_sample_pass2: Option<String> = None;
334
335 while let Some((_full_header, sample_name, _contig_name, sequence)) =
336 reader.read_contig_with_sample()?
337 {
338 if first_sample_pass2.is_none() {
340 first_sample_pass2 = Some(sample_name.clone());
341 }
342
343 if first_sample_pass2.as_ref() != Some(&sample_name) {
345 break;
346 }
347
348 if !sequence.is_empty() {
349 let used = find_actual_splitters_in_contig_named(
350 &sequence,
351 &_contig_name,
352 &candidates,
353 k,
354 segment_size,
355 );
356 splitters.extend(used);
357 }
358 }
359 }
360
361 eprintln!(
362 "DEBUG: {} actually-used splitters from first sample",
363 splitters.len()
364 );
365
366 Ok((splitters, candidates, duplicates))
367}
368
369fn find_actual_splitters_in_contig_named(
371 contig: &Contig,
372 contig_name: &str,
373 candidates: &AHashSet<u64>,
374 k: usize,
375 segment_size: usize,
376) -> Vec<u64> {
377 use crate::kmer::{Kmer, KmerMode};
378
379 let mut used_splitters = Vec::new();
380 let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
381 let mut current_len = segment_size; let mut recent_kmers = Vec::new();
383 let mut pos = 0usize;
384
385 for &base in contig {
386 let current_pos = pos;
387 if base > 3 {
388 kmer.reset();
389 recent_kmers.clear();
390 } else {
391 kmer.insert(base as u64);
392
393 if kmer.is_full() {
394 let kmer_value = kmer.data();
395 recent_kmers.push(kmer_value);
396
397 #[cfg(feature = "verbose_debug")]
399 if kmer_value == 4991190226639519744 || kmer_value == 1518275220618608640 {
400 eprintln!("DEBUG_RAGC_FOUND_KMER: contig={} pos={} kmer={} current_len={} is_candidate={} will_mark={}",
401 contig_name, current_pos, kmer_value, current_len,
402 candidates.contains(&kmer_value),
403 current_len >= segment_size && candidates.contains(&kmer_value));
404 }
405
406 if current_len >= segment_size && candidates.contains(&kmer_value) {
407 #[cfg(feature = "verbose_debug")]
409 eprintln!(
410 "DEBUG_RAGC_SPLITTER_USED: contig={} pos={} kmer={} current_len={}",
411 contig_name, current_pos, kmer_value, current_len
412 );
413 used_splitters.push(kmer_value);
414 current_len = 0;
415 kmer.reset();
416 recent_kmers.clear();
417 }
418 }
419 }
420
421 current_len += 1;
422 pos += 1;
423 }
424
425 for &kmer_value in recent_kmers.iter().rev() {
427 if candidates.contains(&kmer_value) {
428 eprintln!(
430 "RAGC_END_SPLITTER: contig={} kmer={} recent_kmers_len={} pos={}",
431 contig_name,
432 kmer_value,
433 recent_kmers.len(),
434 pos
435 );
436 used_splitters.push(kmer_value);
437 break;
438 }
439 }
440
441 used_splitters
442}
443
444fn find_actual_splitters_in_contig(
448 contig: &Contig,
449 candidates: &AHashSet<u64>,
450 k: usize,
451 segment_size: usize,
452) -> Vec<u64> {
453 use crate::kmer::{Kmer, KmerMode};
454
455 let mut used_splitters = Vec::new();
456 let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
457 let mut current_len = segment_size; let mut recent_kmers = Vec::new();
459 let mut pos = 0usize;
460
461 for &base in contig {
462 let current_pos = pos;
463 if base > 3 {
464 kmer.reset();
465 recent_kmers.clear();
466 } else {
467 kmer.insert(base as u64);
468
469 if kmer.is_full() {
470 let kmer_value = kmer.data();
471 recent_kmers.push(kmer_value);
472
473 if kmer_value == 4991190226639519744 {
475 eprintln!("DEBUG_RAGC_FOUND_KMER: pos={} kmer={} current_len={} is_candidate={} will_mark={}",
476 current_pos, kmer_value, current_len,
477 candidates.contains(&kmer_value),
478 current_len >= segment_size && candidates.contains(&kmer_value));
479 }
480
481 if current_len >= segment_size && candidates.contains(&kmer_value) {
482 #[cfg(feature = "verbose_debug")]
484 eprintln!(
485 "DEBUG_RAGC_SPLITTER_USED: pos={} kmer={} current_len={}",
486 current_pos, kmer_value, current_len
487 );
488 used_splitters.push(kmer_value);
489 current_len = 0;
490 kmer.reset();
491 recent_kmers.clear();
492 }
493 }
494 }
495
496 current_len += 1;
497 pos += 1;
498 }
499
500 for &kmer_value in recent_kmers.iter().rev() {
502 if candidates.contains(&kmer_value) {
503 #[cfg(feature = "verbose_debug")]
504 eprintln!("DEBUG_RAGC_SPLITTER_USED_RIGHTMOST: kmer={}", kmer_value);
505 used_splitters.push(kmer_value);
506 break;
507 }
508 }
509
510 used_splitters
511}
512
513pub fn find_candidate_kmers_multi(contigs: &[Contig], k: usize) -> Vec<u64> {
524 let all_kmers_vec: Vec<Vec<u64>> = contigs
526 .par_iter()
527 .map(|contig| enumerate_kmers(contig, k))
528 .collect();
529
530 let mut all_kmers: Vec<u64> = all_kmers_vec.into_iter().flatten().collect();
531
532 all_kmers.radix_sort_unstable();
534 remove_non_singletons(&mut all_kmers, 0);
535
536 all_kmers
537}
538
539#[inline]
548pub fn is_splitter(kmer: u64, splitters: &AHashSet<u64>) -> bool {
549 splitters.contains(&kmer)
550}
551
552pub fn is_hard_contig(contig: &Contig, k: usize, splitters: &AHashSet<u64>) -> bool {
565 use crate::kmer::{Kmer, KmerMode};
566
567 if contig.len() < k {
568 return true; }
570
571 let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
572
573 for &base in contig.iter() {
574 if base > 3 {
575 kmer.reset();
577 continue;
578 }
579
580 kmer.insert(base as u64);
581
582 if kmer.is_full() {
583 let kmer_value = kmer.data_canonical();
584 if splitters.contains(&kmer_value) {
585 return false;
587 }
588 }
589 }
590
591 true
593}
594
595pub fn find_new_splitters_for_contig(
618 contig: &Contig,
619 k: usize,
620 segment_size: usize,
621 ref_singletons: &[u64],
622 ref_duplicates: &AHashSet<u64>,
623) -> Vec<u64> {
624 use crate::kmer_extract::{enumerate_kmers, remove_non_singletons};
625
626 let mut contig_kmers = enumerate_kmers(contig, k);
628
629 contig_kmers.sort_unstable();
631 remove_non_singletons(&mut contig_kmers, 0);
632
633 let mut tmp: Vec<u64> = Vec::with_capacity(contig_kmers.len());
636 let mut i = 0;
637 let mut j = 0;
638 while i < contig_kmers.len() && j < ref_singletons.len() {
639 match contig_kmers[i].cmp(&ref_singletons[j]) {
640 std::cmp::Ordering::Less => {
641 tmp.push(contig_kmers[i]);
642 i += 1;
643 }
644 std::cmp::Ordering::Equal => {
645 i += 1;
646 j += 1;
647 }
648 std::cmp::Ordering::Greater => {
649 j += 1;
650 }
651 }
652 }
653 tmp.extend_from_slice(&contig_kmers[i..]);
655
656 tmp.retain(|kmer| !ref_duplicates.contains(kmer));
659
660 let candidates: AHashSet<u64> = tmp.into_iter().collect();
663
664 find_actual_splitters_in_contig(contig, &candidates, k, segment_size)
666}
667
668pub fn two_pass_splitter_discovery(
688 input_files: &[std::path::PathBuf],
689 k: usize,
690 segment_size: usize,
691 verbosity: usize,
692) -> Result<AHashSet<u64>> {
693 use crate::genome_io::GenomeIO;
694 use std::io::Read;
695
696 if input_files.is_empty() {
697 anyhow::bail!("No input files provided");
698 }
699
700 if verbosity > 0 {
701 eprintln!("Two-pass splitter discovery (matching C++ AGC batch mode)");
702 eprintln!("Pass 1: Discovering splitters from all files...");
703 }
704
705 let (initial_splitters, ref_singletons_set, ref_duplicates) =
707 determine_splitters_streaming(&input_files[0], k, segment_size)?;
708
709 let mut ref_singletons: Vec<u64> = ref_singletons_set.into_iter().collect();
711 ref_singletons.sort_unstable();
712
713 if verbosity > 0 {
714 eprintln!(
715 " Reference: {} initial splitters, {} singletons, {} duplicates",
716 initial_splitters.len(),
717 ref_singletons.len(),
718 ref_duplicates.len()
719 );
720 }
721
722 let mut combined_splitters = initial_splitters;
724 let mut hard_contigs_found = 0;
725 let mut new_splitters_found = 0;
726
727 for (file_idx, input_file) in input_files.iter().enumerate() {
730 let is_reference = file_idx == 0;
731 let mut reader = GenomeIO::<Box<dyn Read>>::open(input_file)?;
732
733 while let Some((_full_header, _sample_name, contig_name, sequence)) =
734 reader.read_contig_with_sample()?
735 {
736 if sequence.is_empty() {
737 continue;
738 }
739
740 if sequence.len() >= segment_size && is_hard_contig(&sequence, k, &combined_splitters) {
743 hard_contigs_found += 1;
744
745 if verbosity > 1 {
746 eprintln!(
747 " Found hard contig: {} ({} bp, file {})",
748 contig_name,
749 sequence.len(),
750 input_file.display()
751 );
752 }
753
754 let new_splitters = find_new_splitters_for_contig(
756 &sequence,
757 k,
758 segment_size,
759 &ref_singletons,
760 &ref_duplicates,
761 );
762
763 for splitter in new_splitters {
765 if combined_splitters.insert(splitter) {
766 new_splitters_found += 1;
767 }
768 }
769 }
770 }
771
772 if verbosity > 0 && !is_reference {
773 eprintln!(
774 " File {}: {} hard contigs so far, {} new splitters",
775 file_idx, hard_contigs_found, new_splitters_found
776 );
777 }
778 }
779
780 if verbosity > 0 {
781 eprintln!("Pass 1 complete:");
782 eprintln!(" Total hard contigs: {}", hard_contigs_found);
783 eprintln!(" New splitters discovered: {}", new_splitters_found);
784 eprintln!(
785 " Final splitter count: {} (was {})",
786 combined_splitters.len(),
787 combined_splitters.len() - new_splitters_found
788 );
789 eprintln!();
790 }
791
792 Ok(combined_splitters)
793}
794
795#[cfg(test)]
796mod tests {
797 use super::*;
798
799 #[test]
800 fn test_determine_splitters_simple() {
801 let contigs = vec![
803 vec![0, 0, 0, 1], vec![2, 2, 2, 3], ];
806
807 let (splitters, _singletons, _duplicates) = determine_splitters(&contigs, 3, 100);
808
809 assert!(!splitters.is_empty());
813 }
814
815 #[test]
816 fn test_determine_splitters_no_singletons() {
817 let contigs = vec![vec![0, 1, 2, 3], vec![0, 1, 2, 3]];
819
820 let (splitters, singletons, duplicates) = determine_splitters(&contigs, 3, 100);
821
822 assert_eq!(splitters.len(), 0);
824 assert_eq!(singletons.len(), 0);
825 assert!(!duplicates.is_empty());
827 }
828
829 #[test]
830 fn test_determine_splitters_all_unique() {
831 let contigs = vec![
833 vec![0, 0, 0, 0], vec![1, 1, 1, 1], ];
836
837 let (splitters, _singletons, duplicates) = determine_splitters(&contigs, 3, 100);
838
839 assert_eq!(splitters.len(), 0);
843 assert!(!duplicates.is_empty());
845 }
846
847 #[test]
848 fn test_determine_splitters_mixed() {
849 let contigs = vec![
852 vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3], vec![0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], ];
855
856 let (splitters, singletons, duplicates) = determine_splitters(&contigs, 3, 100);
857
858 let _ = (splitters, singletons, duplicates); }
864
865 #[test]
866 fn test_is_splitter() {
867 let mut splitters = AHashSet::new();
868 splitters.insert(123);
869 splitters.insert(456);
870
871 assert!(is_splitter(123, &splitters));
872 assert!(is_splitter(456, &splitters));
873 assert!(!is_splitter(789, &splitters));
874 }
875
876 #[test]
877 fn test_find_candidate_kmers_multi() {
878 let contigs = vec![
879 vec![0, 1, 2, 3], vec![3, 2, 1, 0], ];
882
883 let candidates = find_candidate_kmers_multi(&contigs, 3);
884
885 for i in 1..candidates.len() {
887 assert!(candidates[i] >= candidates[i - 1]);
888 }
889 }
890}