1use crate::genome_io::GenomeIO;
5use crate::kmer_extract::{enumerate_kmers, remove_non_singletons};
6use anyhow::Result;
7use ragc_common::Contig;
8use rayon::prelude::*;
9use rdst::RadixSort;
10use std::collections::HashSet;
11use std::io::Read;
12use std::path::Path;
13
14pub fn determine_splitters(
34 contigs: &[Contig],
35 k: usize,
36 segment_size: usize,
37) -> (HashSet<u64>, HashSet<u64>, HashSet<u64>) {
38 let total_bases: usize = contigs.iter().map(|c| c.len()).sum();
40 eprintln!(
41 "DEBUG: determine_splitters() received {} contigs with {} total bases",
42 contigs.len(),
43 total_bases
44 );
45
46 let all_kmers_vec: Vec<Vec<u64>> = contigs
49 .par_iter()
50 .map(|contig| enumerate_kmers(contig, k))
51 .collect();
52
53 let mut all_kmers: Vec<u64> = all_kmers_vec.into_iter().flatten().collect();
54 eprintln!(
55 "DEBUG: Extracted {} k-mers from reference contigs",
56 all_kmers.len()
57 );
58
59 all_kmers.radix_sort_unstable();
61
62 let mut duplicates = HashSet::new();
65 let mut i = 0;
66 while i < all_kmers.len() {
67 let kmer = all_kmers[i];
68 let mut count = 1;
69 let mut j = i + 1;
70
71 while j < all_kmers.len() && all_kmers[j] == kmer {
73 count += 1;
74 j += 1;
75 }
76
77 if count > 1 {
79 duplicates.insert(kmer);
80 }
81
82 i = j;
83 }
84
85 remove_non_singletons(&mut all_kmers, 0);
87
88 let candidates: HashSet<u64> = all_kmers.into_iter().collect();
89 eprintln!(
90 "DEBUG: Found {} candidate singleton k-mers from reference",
91 candidates.len()
92 );
93 eprintln!(
94 "DEBUG: Found {} duplicate k-mers from reference",
95 duplicates.len()
96 );
97
98 let splitter_vecs: Vec<Vec<u64>> = contigs
101 .par_iter()
102 .map(|contig| find_actual_splitters_in_contig(contig, &candidates, k, segment_size))
103 .collect();
104
105 let splitters: HashSet<u64> = splitter_vecs.into_iter().flatten().collect();
106 eprintln!(
107 "DEBUG: {} actually-used splitters (after distance check)",
108 splitters.len()
109 );
110
111 (splitters, candidates, duplicates)
112}
113
114pub fn determine_splitters_streaming(
129 fasta_path: &Path,
130 k: usize,
131 segment_size: usize,
132) -> Result<(HashSet<u64>, HashSet<u64>, HashSet<u64>)> {
133 eprintln!("DEBUG: Pass 1 - Collecting k-mers from reference (streaming)...");
135 let mut all_kmers = Vec::new();
136 let mut reference_sample = String::new();
137
138 {
139 let mut reader = GenomeIO::<Box<dyn Read>>::open(fasta_path)?;
140 while let Some((_full_header, sample_name, _contig_name, sequence)) =
141 reader.read_contig_with_sample()?
142 {
143 if !sequence.is_empty() {
144 if reference_sample.is_empty() {
146 reference_sample = sample_name.clone();
147 eprintln!(
148 "DEBUG: Collecting k-mers from FIRST sample only ({})",
149 reference_sample
150 );
151 }
152
153 if sample_name == reference_sample {
156 let contig_kmers = enumerate_kmers(&sequence, k);
157 all_kmers.extend(contig_kmers);
158 }
159 }
160 }
161 }
162
163 eprintln!(
164 "DEBUG: Collected {} k-mers (with duplicates)",
165 all_kmers.len()
166 );
167 eprintln!(
168 "DEBUG: Vec memory usage: ~{} MB",
169 all_kmers.len() * 8 / 1_000_000
170 );
171
172 eprintln!("DEBUG: Sorting k-mers...");
174 all_kmers.radix_sort_unstable();
175
176 let mut duplicates = HashSet::new();
178 let mut i = 0;
179 while i < all_kmers.len() {
180 let kmer = all_kmers[i];
181 let mut count = 1;
182 let mut j = i + 1;
183
184 while j < all_kmers.len() && all_kmers[j] == kmer {
185 count += 1;
186 j += 1;
187 }
188
189 if count > 1 {
190 duplicates.insert(kmer);
191 }
192
193 i = j;
194 }
195
196 eprintln!("DEBUG: Found {} duplicate k-mers", duplicates.len());
197
198 remove_non_singletons(&mut all_kmers, 0);
200 let candidates: HashSet<u64> = all_kmers.into_iter().collect();
201 eprintln!(
202 "DEBUG: Found {} candidate singleton k-mers",
203 candidates.len()
204 );
205
206 eprintln!("DEBUG: Pass 2 - Finding actually-used splitters (streaming)...");
208 let mut splitters = HashSet::new();
209
210 {
211 let mut reader = GenomeIO::<Box<dyn Read>>::open(fasta_path)?;
212 while let Some((_full_header, sample_name, _contig_name, sequence)) =
213 reader.read_contig_with_sample()?
214 {
215 if !sequence.is_empty() && sample_name == reference_sample {
218 let used = find_actual_splitters_in_contig(&sequence, &candidates, k, segment_size);
219 splitters.extend(used);
220 }
221 }
222 }
223
224 eprintln!("DEBUG: {} actually-used splitters", splitters.len());
225
226 Ok((splitters, candidates, duplicates))
227}
228
229fn find_actual_splitters_in_contig(
233 contig: &Contig,
234 candidates: &HashSet<u64>,
235 k: usize,
236 segment_size: usize,
237) -> Vec<u64> {
238 use crate::kmer::{Kmer, KmerMode};
239
240 let mut used_splitters = Vec::new();
241 let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
242 let mut current_len = segment_size; let mut recent_kmers = Vec::new();
244
245 for &base in contig {
246 if base > 3 {
247 kmer.reset();
248 recent_kmers.clear();
249 } else {
250 kmer.insert(base as u64);
251
252 if kmer.is_full() {
253 let kmer_value = kmer.data();
254 recent_kmers.push(kmer_value);
255
256 if current_len >= segment_size && candidates.contains(&kmer_value) {
257 used_splitters.push(kmer_value);
259 current_len = 0;
260 kmer.reset();
261 recent_kmers.clear();
262 }
263 }
264 }
265
266 current_len += 1;
267 }
268
269 for &kmer_value in recent_kmers.iter().rev() {
271 if candidates.contains(&kmer_value) {
272 used_splitters.push(kmer_value);
273 break;
274 }
275 }
276
277 used_splitters
278}
279
280pub fn find_candidate_kmers_multi(contigs: &[Contig], k: usize) -> Vec<u64> {
291 let all_kmers_vec: Vec<Vec<u64>> = contigs
293 .par_iter()
294 .map(|contig| enumerate_kmers(contig, k))
295 .collect();
296
297 let mut all_kmers: Vec<u64> = all_kmers_vec.into_iter().flatten().collect();
298
299 all_kmers.radix_sort_unstable();
301 remove_non_singletons(&mut all_kmers, 0);
302
303 all_kmers
304}
305
306#[inline]
315pub fn is_splitter(kmer: u64, splitters: &HashSet<u64>) -> bool {
316 splitters.contains(&kmer)
317}
318
319#[cfg(test)]
320mod tests {
321 use super::*;
322
323 #[test]
324 fn test_determine_splitters_simple() {
325 let contigs = vec![
327 vec![0, 0, 0, 1], vec![2, 2, 2, 3], ];
330
331 let (splitters, _singletons, _duplicates) = determine_splitters(&contigs, 3, 100);
332
333 assert!(!splitters.is_empty());
337 }
338
339 #[test]
340 fn test_determine_splitters_no_singletons() {
341 let contigs = vec![vec![0, 1, 2, 3], vec![0, 1, 2, 3]];
343
344 let (splitters, singletons, duplicates) = determine_splitters(&contigs, 3, 100);
345
346 assert_eq!(splitters.len(), 0);
348 assert_eq!(singletons.len(), 0);
349 assert!(!duplicates.is_empty());
351 }
352
353 #[test]
354 fn test_determine_splitters_all_unique() {
355 let contigs = vec![
357 vec![0, 0, 0, 0], vec![1, 1, 1, 1], ];
360
361 let (splitters, _singletons, duplicates) = determine_splitters(&contigs, 3, 100);
362
363 assert_eq!(splitters.len(), 0);
367 assert!(!duplicates.is_empty());
369 }
370
371 #[test]
372 fn test_determine_splitters_mixed() {
373 let contigs = vec![
376 vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3], vec![0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], ];
379
380 let (splitters, singletons, duplicates) = determine_splitters(&contigs, 3, 100);
381
382 let _ = (splitters, singletons, duplicates); }
388
389 #[test]
390 fn test_is_splitter() {
391 let mut splitters = HashSet::new();
392 splitters.insert(123);
393 splitters.insert(456);
394
395 assert!(is_splitter(123, &splitters));
396 assert!(is_splitter(456, &splitters));
397 assert!(!is_splitter(789, &splitters));
398 }
399
400 #[test]
401 fn test_find_candidate_kmers_multi() {
402 let contigs = vec![
403 vec![0, 1, 2, 3], vec![3, 2, 1, 0], ];
406
407 let candidates = find_candidate_kmers_multi(&contigs, 3);
408
409 for i in 1..candidates.len() {
411 assert!(candidates[i] >= candidates[i - 1]);
412 }
413 }
414}