1use crate::kmer::{Kmer, KmerMode};
5use ahash::AHashSet;
6use ragc_common::Contig;
7
8pub const MISSING_KMER: u64 = u64::MAX;
11
12#[derive(Debug, Clone, PartialEq, Eq)]
14pub struct Segment {
15 pub data: Contig,
17 pub front_kmer: u64,
19 pub back_kmer: u64,
21 pub front_kmer_is_dir: bool,
23 pub back_kmer_is_dir: bool,
25}
26
27impl Segment {
28 pub fn new(
30 data: Contig,
31 front_kmer: u64,
32 back_kmer: u64,
33 front_kmer_is_dir: bool,
34 back_kmer_is_dir: bool,
35 ) -> Self {
36 Segment {
37 data,
38 front_kmer,
39 back_kmer,
40 front_kmer_is_dir,
41 back_kmer_is_dir,
42 }
43 }
44
45 pub fn len(&self) -> usize {
47 self.data.len()
48 }
49
50 pub fn is_empty(&self) -> bool {
52 self.data.is_empty()
53 }
54}
55
56pub fn split_at_splitters_with_size(
83 contig: &Contig,
84 splitters: &AHashSet<u64>,
85 k: usize,
86 _min_segment_size: usize,
87) -> Vec<Segment> {
88 let debug = crate::env_cache::debug_segment_coverage();
89 if debug {
90 eprintln!(
91 "\n=== SEGMENTATION START: contig_len={} k={} ===",
92 contig.len(),
93 k
94 );
95 }
96
97 let mut segments = Vec::new();
98
99 if contig.len() < k {
100 if debug {
102 eprintln!("Contig too short for k-mers, returning as single segment");
103 }
104 return vec![Segment::new(
105 contig.clone(),
106 MISSING_KMER,
107 MISSING_KMER,
108 false,
109 false,
110 )];
111 }
112
113 let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
114 let mut segment_start = 0;
115 let mut front_kmer = MISSING_KMER;
116 let mut front_kmer_is_dir = false;
117
118 for (pos, &base) in contig.iter().enumerate() {
119 if base > 3 {
120 kmer.reset();
122 } else {
123 kmer.insert(base as u64);
124
125 if kmer.is_full() {
126 let kmer_value = kmer.data();
127 let is_dir = kmer.is_dir_oriented();
128
129 if crate::env_cache::debug_endpos() && pos + 50 >= contig.len() {
136 let is_splitter = splitters.contains(&kmer_value);
137 let bytes_left = contig.len() - (pos + 1);
138 eprintln!("ENDPOS_TRACE: pos={} kmer={:#x} is_splitter={} bytes_left={} k={} contig_len={}",
139 pos, kmer_value, is_splitter, bytes_left, k, contig.len());
140 }
141
142 if splitters.contains(&kmer_value) {
143 if crate::env_cache::trace_all_splits() {
145 let segment_len = (pos + 1) - segment_start;
146 eprintln!("RAGC_SPLIT: pos={} kmer={} segment_start={} segment_len={} contig_len={}",
147 pos, kmer_value, segment_start, segment_len, contig.len());
148 }
149 let segment_end = pos + 1;
151 let segment_data = contig[segment_start..segment_end].to_vec();
152
153 if !segment_data.is_empty() {
154 let (seg_front, seg_back, seg_front_is_dir, seg_back_is_dir) =
156 if front_kmer == MISSING_KMER {
157 (MISSING_KMER, kmer_value, false, is_dir)
160 } else {
161 (front_kmer, kmer_value, front_kmer_is_dir, is_dir)
163 };
164 if debug {
166 eprintln!(
167 " MAIN_LOOP_SPLIT: segment=[{}..{}) len={}",
168 segment_start,
169 segment_end,
170 segment_data.len()
171 );
172 }
173 #[cfg(feature = "verbose_debug")]
174 if crate::env_cache::debug_overlap() {
175 let first_5: Vec<u8> = segment_data.iter().take(5).copied().collect();
176 let last_5: Vec<u8> =
177 segment_data.iter().rev().take(5).rev().copied().collect();
178 eprintln!("RAGC_SEG_SPLIT: pos={} splitter={} segment=[{}..{}) len={} front={} back={} first_5={:?} last_5={:?}",
179 pos, kmer_value, segment_start, segment_end, segment_data.len(),
180 if seg_front == MISSING_KMER { "MISSING".to_string() } else { seg_front.to_string() },
181 if seg_back == MISSING_KMER { "MISSING".to_string() } else { seg_back.to_string() },
182 first_5, last_5);
183 } else {
184 #[cfg(feature = "verbose_debug")]
185 eprintln!("RAGC_SEG_SPLIT: pos={} splitter={} segment=[{}..{}) len={} front={} back={}",
186 pos, kmer_value, segment_start, segment_end, segment_data.len(),
187 if seg_front == MISSING_KMER { "MISSING".to_string() } else { seg_front.to_string() },
188 if seg_back == MISSING_KMER { "MISSING".to_string() } else { seg_back.to_string() });
189 }
190 segments.push(Segment::new(
191 segment_data,
192 seg_front,
193 seg_back,
194 seg_front_is_dir,
195 seg_back_is_dir,
196 ));
197 }
198
199 let new_start = (pos + 1).saturating_sub(k);
203 if debug {
204 eprintln!(
205 " Setting segment_start: {} -> {} (overlap of {} bytes)",
206 segment_start,
207 new_start,
208 (pos + 1) - new_start
209 );
210 }
211 segment_start = new_start;
212 front_kmer = kmer_value;
213 front_kmer_is_dir = is_dir;
214 kmer.reset();
215 }
216 }
217 }
218 }
219
220 if debug {
230 eprintln!("\n=== FINAL SEGMENT ===");
231 eprintln!(
232 " segment_start={}, contig.len()={}",
233 segment_start,
234 contig.len()
235 );
236 }
237 if segment_start < contig.len() {
238 let segment_data = contig[segment_start..].to_vec();
239 if !segment_data.is_empty() {
240 let (final_front, final_back, final_front_is_dir, final_back_is_dir) =
242 if front_kmer == MISSING_KMER {
243 (MISSING_KMER, MISSING_KMER, false, false)
248 } else {
249 (front_kmer, MISSING_KMER, front_kmer_is_dir, false)
253 };
254 if debug {
255 eprintln!(
256 " FINAL: segment=[{}..{}) len={}",
257 segment_start,
258 contig.len(),
259 segment_data.len()
260 );
261 }
262 #[cfg(feature = "verbose_debug")]
263 eprintln!(
264 "RAGC_SEG_FINAL: segment=[{}..{}) len={} front={} back={}",
265 segment_start,
266 contig.len(),
267 segment_data.len(),
268 if final_front == MISSING_KMER {
269 "MISSING".to_string()
270 } else {
271 final_front.to_string()
272 },
273 if final_back == MISSING_KMER {
274 "MISSING".to_string()
275 } else {
276 final_back.to_string()
277 }
278 );
279 if crate::env_cache::debug_is_dir()
281 && final_back == MISSING_KMER
282 && final_front != MISSING_KMER
283 {
284 eprintln!(
285 "RAGC_FINAL_SEG_IS_DIR: front_kmer={} front_kmer_is_dir={}",
286 final_front, final_front_is_dir
287 );
288 }
289 segments.push(Segment::new(
290 segment_data,
291 final_front,
292 final_back,
293 final_front_is_dir,
294 final_back_is_dir,
295 ));
296 }
297 }
298
299 if segments.is_empty() {
301 if debug {
302 eprintln!(" NO_SPLIT: Returning entire contig as single segment");
303 }
304 #[cfg(feature = "verbose_debug")]
305 eprintln!(
306 "RAGC_SEG_NOSPLIT: len={} front=MISSING back=MISSING",
307 contig.len()
308 );
309 segments.push(Segment::new(
310 contig.clone(),
311 MISSING_KMER,
312 MISSING_KMER,
313 false,
314 false,
315 ));
316 }
317
318 if debug {
320 eprintln!("\n=== SEGMENTATION SUMMARY ===");
321 eprintln!(" Original contig length: {}", contig.len());
322 eprintln!(" Number of segments: {}", segments.len());
323 let total_segment_bytes: usize = segments.iter().map(|s| s.len()).sum();
324 eprintln!(" Total segment bytes: {}", total_segment_bytes);
325
326 let expected_size = if segments.is_empty() {
328 0
329 } else if segments.len() == 1 {
330 segments[0].len()
331 } else {
332 segments[0].len()
334 + segments[1..]
335 .iter()
336 .map(|s| s.len().saturating_sub(k - 1))
337 .sum::<usize>()
338 };
339 eprintln!(
340 " Expected reconstructed size: {} (with {} overlaps of {} bytes)",
341 expected_size,
342 segments.len().saturating_sub(1),
343 k - 1
344 );
345
346 if expected_size != contig.len() {
347 eprintln!(
348 " ⚠️ SIZE MISMATCH: Expected {} but contig is {} (diff: {})",
349 expected_size,
350 contig.len(),
351 contig.len() as i64 - expected_size as i64
352 );
353 } else {
354 eprintln!(" ✓ Size matches!");
355 }
356
357 eprintln!("\n Segment coverage:");
359 for (i, seg) in segments.iter().enumerate() {
360 eprintln!(" Segment {}: len={}", i, seg.len());
361 }
362 }
363
364 segments
365}
366
367pub fn split_at_splitters(contig: &Contig, splitters: &AHashSet<u64>, k: usize) -> Vec<Segment> {
372 let mut segments = Vec::new();
373
374 if contig.len() < k {
375 return vec![Segment::new(
377 contig.clone(),
378 MISSING_KMER,
379 MISSING_KMER,
380 false,
381 false,
382 )];
383 }
384
385 let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
386 let mut segment_start = 0;
387 let mut front_kmer = MISSING_KMER;
388 let mut front_kmer_is_dir = false;
389
390 for (pos, &base) in contig.iter().enumerate() {
391 if base > 3 {
392 kmer.reset();
394 } else {
395 kmer.insert(base as u64);
396
397 if kmer.is_full() {
398 let kmer_value = kmer.data();
399 let is_dir = kmer.is_dir_oriented();
400
401 if splitters.contains(&kmer_value) {
403 let segment_end = pos + 1; let segment_data = contig[segment_start..segment_end].to_vec();
406
407 if !segment_data.is_empty() {
408 #[cfg(feature = "verbose_debug")]
409 eprintln!("RAGC_SEG_SPLIT: pos={} splitter={} segment=[{}..{}) len={} front={} back={}",
410 pos, kmer_value, segment_start, segment_end, segment_data.len(),
411 if front_kmer == MISSING_KMER { "MISSING".to_string() } else { front_kmer.to_string() },
412 kmer_value);
413 segments.push(Segment::new(
414 segment_data,
415 front_kmer,
416 kmer_value,
417 front_kmer_is_dir,
418 is_dir,
419 ));
420 }
421
422 segment_start = (pos + 1).saturating_sub(k);
424 front_kmer = kmer_value;
425 front_kmer_is_dir = is_dir;
426 }
427 }
428 }
429 }
430
431 if segment_start < contig.len() {
433 let segment_data = contig[segment_start..].to_vec();
434 if !segment_data.is_empty() {
435 #[cfg(feature = "verbose_debug")]
436 eprintln!(
437 "RAGC_SEG_FINAL: segment=[{}..{}) len={} front={} back=MISSING",
438 segment_start,
439 contig.len(),
440 segment_data.len(),
441 if front_kmer == MISSING_KMER {
442 "MISSING".to_string()
443 } else {
444 front_kmer.to_string()
445 }
446 );
447 segments.push(Segment::new(
448 segment_data,
449 front_kmer,
450 MISSING_KMER,
451 front_kmer_is_dir,
452 false,
453 ));
454 }
455 }
456
457 if segments.is_empty() {
459 #[cfg(feature = "verbose_debug")]
460 eprintln!(
461 "RAGC_SEG_NOSPLIT: len={} front=MISSING back=MISSING",
462 contig.len()
463 );
464 segments.push(Segment::new(
465 contig.clone(),
466 MISSING_KMER,
467 MISSING_KMER,
468 false,
469 false,
470 ));
471 }
472
473 segments
474}
475
476#[cfg(test)]
477mod tests {
478 use super::*;
479
480 #[test]
481 fn test_segment_new() {
482 let seg = Segment::new(vec![0, 1, 2, 3], 123, 456, true, false);
483 assert_eq!(seg.len(), 4);
484 assert_eq!(seg.front_kmer, 123);
485 assert_eq!(seg.back_kmer, 456);
486 assert!(!seg.is_empty());
487 }
488
489 #[test]
490 fn test_split_no_splitters() {
491 let contig = vec![0, 1, 2, 3, 0, 1, 2, 3];
492 let splitters = AHashSet::new();
493 let segments = split_at_splitters(&contig, &splitters, 3);
494
495 assert_eq!(segments.len(), 1);
497 assert_eq!(segments[0].data, contig);
498 assert_eq!(segments[0].front_kmer, MISSING_KMER);
499 assert_eq!(segments[0].back_kmer, MISSING_KMER);
500 }
501
502 #[test]
503 fn test_split_with_splitters() {
504 let contig = vec![0, 0, 0, 1, 1, 1, 2, 2, 2];
506
507 let mut kmer = Kmer::new(3, KmerMode::Canonical);
509 let mut kmers = Vec::new();
510 for &base in &contig {
511 kmer.insert(base as u64);
512 if kmer.is_full() {
513 kmers.push(kmer.data());
514 }
515 }
516
517 let mut splitters = AHashSet::new();
519 if !kmers.is_empty() {
520 splitters.insert(kmers[0]);
521 }
522
523 let segments = split_at_splitters(&contig, &splitters, 3);
524
525 assert!(!segments.is_empty());
527
528 let k = 3;
531 let reconstructed_len: usize = if segments.is_empty() {
532 0
533 } else {
534 segments[0].len()
535 + segments[1..]
536 .iter()
537 .map(|s| s.len().saturating_sub(k))
538 .sum::<usize>()
539 };
540 assert_eq!(reconstructed_len, contig.len());
541 }
542
543 #[test]
544 fn test_split_short_contig() {
545 let contig = vec![0, 1]; let splitters = AHashSet::new();
547 let segments = split_at_splitters(&contig, &splitters, 3);
548
549 assert_eq!(segments.len(), 1);
550 assert_eq!(segments[0].data, contig);
551 }
552
553 #[test]
554 fn test_split_consecutive_splitters() {
555 let contig = vec![0, 0, 0, 0, 0, 0]; let mut kmer = Kmer::new(3, KmerMode::Canonical);
559 kmer.insert(0);
560 kmer.insert(0);
561 kmer.insert(0);
562 let aaa_kmer = kmer.data();
563
564 let mut splitters = AHashSet::new();
565 splitters.insert(aaa_kmer);
566
567 let segments = split_at_splitters(&contig, &splitters, 3);
568
569 assert!(!segments.is_empty());
571 }
572
573 #[test]
574 fn test_split_with_n_bases() {
575 let contig = vec![0, 0, 0, 4, 1, 1, 1];
577
578 let mut splitters = AHashSet::new();
579 splitters.insert(12345); let segments = split_at_splitters(&contig, &splitters, 3);
582
583 assert!(!segments.is_empty());
585 }
586}