ragc_core/
segment.rs

1// Segmentation logic
2// Splits contigs at splitter k-mer positions
3
4use crate::kmer::{Kmer, KmerMode};
5use ahash::AHashSet;
6use ragc_common::Contig;
7
8/// Missing k-mer sentinel value (matches C++ AGC's kmer_t(-1) = u64::MAX)
9/// Used when a segment doesn't have a front or back k-mer (e.g., at contig boundaries)
10pub const MISSING_KMER: u64 = u64::MAX;
11
12/// A segment of a contig bounded by splitter k-mers
13#[derive(Debug, Clone, PartialEq, Eq)]
14pub struct Segment {
15    /// The sequence data of this segment
16    pub data: Contig,
17    /// K-mer value at the start of the segment (or MISSING_KMER if at contig start)
18    pub front_kmer: u64,
19    /// K-mer value at the end of the segment (or MISSING_KMER if at contig end)
20    pub back_kmer: u64,
21    /// Whether front k-mer is in direct orientation (for C++ AGC's is_dir_oriented())
22    pub front_kmer_is_dir: bool,
23    /// Whether back k-mer is in direct orientation (for C++ AGC's is_dir_oriented())
24    pub back_kmer_is_dir: bool,
25}
26
27impl Segment {
28    /// Create a new segment
29    pub fn new(
30        data: Contig,
31        front_kmer: u64,
32        back_kmer: u64,
33        front_kmer_is_dir: bool,
34        back_kmer_is_dir: bool,
35    ) -> Self {
36        Segment {
37            data,
38            front_kmer,
39            back_kmer,
40            front_kmer_is_dir,
41            back_kmer_is_dir,
42        }
43    }
44
45    /// Get the length of the segment
46    pub fn len(&self) -> usize {
47        self.data.len()
48    }
49
50    /// Check if the segment is empty
51    pub fn is_empty(&self) -> bool {
52        self.data.is_empty()
53    }
54}
55
56/// Split a contig at splitter k-mer positions
57///
58/// This implements the C++ AGC segment splitting algorithm.
59/// The splitters passed should be the ACTUALLY USED splitters from the reference
60/// (not all candidates), ensuring all genomes split at the same positions.
61///
62/// # Arguments
63/// * `contig` - The contig to split
64/// * `splitters` - Set of splitter k-mer values (actually used on reference)
65/// * `k` - K-mer length
66/// * `_min_segment_size` - Unused (kept for API compatibility, see note below)
67///
68/// # Returns
69/// Vector of segments
70///
71/// # Algorithm (matching C++ AGC)
72/// 1. Scan through contig
73/// 2. Split at EVERY occurrence of a splitter k-mer (no distance check!)
74/// 3. Keep track of recent k-mers for end-of-contig handling
75/// 4. At contig end, look backward through recent k-mers to find rightmost splitter
76///
77/// # Note on min_segment_size
78/// The distance check only happens during SPLITTER FINDING (in splitters.rs) to select
79/// which k-mers become splitters. During SEGMENTATION (this function), C++ AGC splits
80/// at every occurrence of those splitters WITHOUT checking distance. This is the key
81/// difference that was causing 2.3x worse compression!
82pub fn split_at_splitters_with_size(
83    contig: &Contig,
84    splitters: &AHashSet<u64>,
85    k: usize,
86    _min_segment_size: usize,
87) -> Vec<Segment> {
88    let debug = crate::env_cache::debug_segment_coverage();
89    if debug {
90        eprintln!(
91            "\n=== SEGMENTATION START: contig_len={} k={} ===",
92            contig.len(),
93            k
94        );
95    }
96
97    let mut segments = Vec::new();
98
99    if contig.len() < k {
100        // Contig too short for k-mers, return as single segment
101        if debug {
102            eprintln!("Contig too short for k-mers, returning as single segment");
103        }
104        return vec![Segment::new(
105            contig.clone(),
106            MISSING_KMER,
107            MISSING_KMER,
108            false,
109            false,
110        )];
111    }
112
113    let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
114    let mut segment_start = 0;
115    let mut front_kmer = MISSING_KMER;
116    let mut front_kmer_is_dir = false;
117
118    for (pos, &base) in contig.iter().enumerate() {
119        if base > 3 {
120            // Non-ACGT base, reset k-mer
121            kmer.reset();
122        } else {
123            kmer.insert(base as u64);
124
125            if kmer.is_full() {
126                let kmer_value = kmer.data();
127                let is_dir = kmer.is_dir_oriented();
128
129                // CRITICAL FIX: C++ AGC splits at EVERY splitter occurrence!
130                // The distance check (current_len >= min_segment_size) only happens
131                // during SPLITTER FINDING to select which k-mers become splitters.
132                // During SEGMENTATION, we split at every occurrence without distance check.
133
134                // DEBUG: Trace positions near end of contig
135                if crate::env_cache::debug_endpos() && pos + 50 >= contig.len() {
136                    let is_splitter = splitters.contains(&kmer_value);
137                    let bytes_left = contig.len() - (pos + 1);
138                    eprintln!("ENDPOS_TRACE: pos={} kmer={:#x} is_splitter={} bytes_left={} k={} contig_len={}",
139                              pos, kmer_value, is_splitter, bytes_left, k, contig.len());
140                }
141
142                if splitters.contains(&kmer_value) {
143                    // Comprehensive split logging for debugging
144                    if crate::env_cache::trace_all_splits() {
145                        let segment_len = (pos + 1) - segment_start;
146                        eprintln!("RAGC_SPLIT: pos={} kmer={} segment_start={} segment_len={} contig_len={}",
147                                  pos, kmer_value, segment_start, segment_len, contig.len());
148                    }
149                    // Use this as a splitter
150                    let segment_end = pos + 1;
151                    let segment_data = contig[segment_start..segment_end].to_vec();
152
153                    if !segment_data.is_empty() {
154                        // Determine front and back k-mers and their orientations
155                        let (seg_front, seg_back, seg_front_is_dir, seg_back_is_dir) =
156                            if front_kmer == MISSING_KMER {
157                                // First segment of contig - ALWAYS back k-mer only
158                                // The splitter is at the END of the segment, so it's the back k-mer
159                                (MISSING_KMER, kmer_value, false, is_dir)
160                            } else {
161                                // Normal case: both k-mers present
162                                (front_kmer, kmer_value, front_kmer_is_dir, is_dir)
163                            };
164                        // Note: Contig name is logged at call site, this is just position info
165                        if debug {
166                            eprintln!(
167                                "  MAIN_LOOP_SPLIT: segment=[{}..{}) len={}",
168                                segment_start,
169                                segment_end,
170                                segment_data.len()
171                            );
172                        }
173                        #[cfg(feature = "verbose_debug")]
174                        if crate::env_cache::debug_overlap() {
175                            let first_5: Vec<u8> = segment_data.iter().take(5).copied().collect();
176                            let last_5: Vec<u8> =
177                                segment_data.iter().rev().take(5).rev().copied().collect();
178                            eprintln!("RAGC_SEG_SPLIT: pos={} splitter={} segment=[{}..{}) len={} front={} back={} first_5={:?} last_5={:?}",
179                                pos, kmer_value, segment_start, segment_end, segment_data.len(),
180                                if seg_front == MISSING_KMER { "MISSING".to_string() } else { seg_front.to_string() },
181                                if seg_back == MISSING_KMER { "MISSING".to_string() } else { seg_back.to_string() },
182                                first_5, last_5);
183                        } else {
184                            #[cfg(feature = "verbose_debug")]
185                            eprintln!("RAGC_SEG_SPLIT: pos={} splitter={} segment=[{}..{}) len={} front={} back={}",
186                                pos, kmer_value, segment_start, segment_end, segment_data.len(),
187                                if seg_front == MISSING_KMER { "MISSING".to_string() } else { seg_front.to_string() },
188                                if seg_back == MISSING_KMER { "MISSING".to_string() } else { seg_back.to_string() });
189                        }
190                        segments.push(Segment::new(
191                            segment_data,
192                            seg_front,
193                            seg_back,
194                            seg_front_is_dir,
195                            seg_back_is_dir,
196                        ));
197                    }
198
199                    // Reset for next segment
200                    // CRITICAL: Create k-byte overlap to match C++ AGC behavior
201                    // Each segment must include the FULL k-mer at the start
202                    let new_start = (pos + 1).saturating_sub(k);
203                    if debug {
204                        eprintln!(
205                            "  Setting segment_start: {} -> {} (overlap of {} bytes)",
206                            segment_start,
207                            new_start,
208                            (pos + 1) - new_start
209                        );
210                    }
211                    segment_start = new_start;
212                    front_kmer = kmer_value;
213                    front_kmer_is_dir = is_dir;
214                    kmer.reset();
215                }
216            }
217        }
218    }
219
220    // End-of-contig handling: C++ AGC segmentation does not perform any
221    // backward search for a last-minute splitter. It simply splits at every
222    // occurrence encountered in the main loop and then emits the final segment.
223    // We therefore intentionally do nothing here to match C++ behavior exactly.
224
225    // Add any remaining data as final segment
226    // This will either be:
227    // - The entire remainder if no suitable splitter was found, OR
228    // - A segment >= k bytes if we split at a splitter that left enough room
229    if debug {
230        eprintln!("\n=== FINAL SEGMENT ===");
231        eprintln!(
232            "  segment_start={}, contig.len()={}",
233            segment_start,
234            contig.len()
235        );
236    }
237    if segment_start < contig.len() {
238        let segment_data = contig[segment_start..].to_vec();
239        if !segment_data.is_empty() {
240            // Final segment k-mer extraction
241            let (final_front, final_back, final_front_is_dir, final_back_is_dir) =
242                if front_kmer == MISSING_KMER {
243                    // No splitters found in entire contig - this is an orphan segment
244                    // C++ AGC uses (MISSING_KMER, MISSING_KMER) for orphan segments,
245                    // which routes them to raw group 0 (see distribute_segments in agc_compressor.h)
246                    // FIX 15: Match C++ AGC behavior - use MISSING for both k-mers
247                    (MISSING_KMER, MISSING_KMER, false, false)
248                } else {
249                    // Front k-mer present (from last splitter)
250                    // C++ AGC uses MISSING_KMER for back k-mer of final segments
251                    // (see agc_compressor.cpp line 2188)
252                    (front_kmer, MISSING_KMER, front_kmer_is_dir, false)
253                };
254            if debug {
255                eprintln!(
256                    "  FINAL: segment=[{}..{}) len={}",
257                    segment_start,
258                    contig.len(),
259                    segment_data.len()
260                );
261            }
262            #[cfg(feature = "verbose_debug")]
263            eprintln!(
264                "RAGC_SEG_FINAL: segment=[{}..{}) len={} front={} back={}",
265                segment_start,
266                contig.len(),
267                segment_data.len(),
268                if final_front == MISSING_KMER {
269                    "MISSING".to_string()
270                } else {
271                    final_front.to_string()
272                },
273                if final_back == MISSING_KMER {
274                    "MISSING".to_string()
275                } else {
276                    final_back.to_string()
277                }
278            );
279            // Debug logging for Case 3 is_dir investigation
280            if crate::env_cache::debug_is_dir()
281                && final_back == MISSING_KMER
282                && final_front != MISSING_KMER
283            {
284                eprintln!(
285                    "RAGC_FINAL_SEG_IS_DIR: front_kmer={} front_kmer_is_dir={}",
286                    final_front, final_front_is_dir
287                );
288            }
289            segments.push(Segment::new(
290                segment_data,
291                final_front,
292                final_back,
293                final_front_is_dir,
294                final_back_is_dir,
295            ));
296        }
297    }
298
299    // If no segments were created, return entire contig as one segment
300    if segments.is_empty() {
301        if debug {
302            eprintln!("  NO_SPLIT: Returning entire contig as single segment");
303        }
304        #[cfg(feature = "verbose_debug")]
305        eprintln!(
306            "RAGC_SEG_NOSPLIT: len={} front=MISSING back=MISSING",
307            contig.len()
308        );
309        segments.push(Segment::new(
310            contig.clone(),
311            MISSING_KMER,
312            MISSING_KMER,
313            false,
314            false,
315        ));
316    }
317
318    // Summary
319    if debug {
320        eprintln!("\n=== SEGMENTATION SUMMARY ===");
321        eprintln!("  Original contig length: {}", contig.len());
322        eprintln!("  Number of segments: {}", segments.len());
323        let total_segment_bytes: usize = segments.iter().map(|s| s.len()).sum();
324        eprintln!("  Total segment bytes: {}", total_segment_bytes);
325
326        // Calculate expected reconstructed size
327        let expected_size = if segments.is_empty() {
328            0
329        } else if segments.len() == 1 {
330            segments[0].len()
331        } else {
332            // First segment contributes all bytes, subsequent segments skip (k-1) overlap
333            segments[0].len()
334                + segments[1..]
335                    .iter()
336                    .map(|s| s.len().saturating_sub(k - 1))
337                    .sum::<usize>()
338        };
339        eprintln!(
340            "  Expected reconstructed size: {} (with {} overlaps of {} bytes)",
341            expected_size,
342            segments.len().saturating_sub(1),
343            k - 1
344        );
345
346        if expected_size != contig.len() {
347            eprintln!(
348                "  ⚠️  SIZE MISMATCH: Expected {} but contig is {} (diff: {})",
349                expected_size,
350                contig.len(),
351                contig.len() as i64 - expected_size as i64
352            );
353        } else {
354            eprintln!("  ✓ Size matches!");
355        }
356
357        // Show segment ranges to identify gaps
358        eprintln!("\n  Segment coverage:");
359        for (i, seg) in segments.iter().enumerate() {
360            eprintln!("    Segment {}: len={}", i, seg.len());
361        }
362    }
363
364    segments
365}
366
367/// Split a contig at splitter k-mer positions (no minimum size constraint)
368///
369/// This is the old behavior - splits at every splitter regardless of segment size.
370/// Kept for backwards compatibility but not recommended for use.
371pub fn split_at_splitters(contig: &Contig, splitters: &AHashSet<u64>, k: usize) -> Vec<Segment> {
372    let mut segments = Vec::new();
373
374    if contig.len() < k {
375        // Contig too short for k-mers, return as single segment
376        return vec![Segment::new(
377            contig.clone(),
378            MISSING_KMER,
379            MISSING_KMER,
380            false,
381            false,
382        )];
383    }
384
385    let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
386    let mut segment_start = 0;
387    let mut front_kmer = MISSING_KMER;
388    let mut front_kmer_is_dir = false;
389
390    for (pos, &base) in contig.iter().enumerate() {
391        if base > 3 {
392            // Non-ACGT base, reset k-mer
393            kmer.reset();
394        } else {
395            kmer.insert(base as u64);
396
397            if kmer.is_full() {
398                let kmer_value = kmer.data();
399                let is_dir = kmer.is_dir_oriented();
400
401                // Check if this is a splitter
402                if splitters.contains(&kmer_value) {
403                    // Create segment from segment_start to current position (inclusive of splitter)
404                    let segment_end = pos + 1; // Include the base that completes the splitter
405                    let segment_data = contig[segment_start..segment_end].to_vec();
406
407                    if !segment_data.is_empty() {
408                        #[cfg(feature = "verbose_debug")]
409                        eprintln!("RAGC_SEG_SPLIT: pos={} splitter={} segment=[{}..{}) len={} front={} back={}",
410                            pos, kmer_value, segment_start, segment_end, segment_data.len(),
411                            if front_kmer == MISSING_KMER { "MISSING".to_string() } else { front_kmer.to_string() },
412                            kmer_value);
413                        segments.push(Segment::new(
414                            segment_data,
415                            front_kmer,
416                            kmer_value,
417                            front_kmer_is_dir,
418                            is_dir,
419                        ));
420                    }
421
422                    // Start new segment with k-base overlap
423                    segment_start = (pos + 1).saturating_sub(k);
424                    front_kmer = kmer_value;
425                    front_kmer_is_dir = is_dir;
426                }
427            }
428        }
429    }
430
431    // Add final segment if there's data remaining
432    if segment_start < contig.len() {
433        let segment_data = contig[segment_start..].to_vec();
434        if !segment_data.is_empty() {
435            #[cfg(feature = "verbose_debug")]
436            eprintln!(
437                "RAGC_SEG_FINAL: segment=[{}..{}) len={} front={} back=MISSING",
438                segment_start,
439                contig.len(),
440                segment_data.len(),
441                if front_kmer == MISSING_KMER {
442                    "MISSING".to_string()
443                } else {
444                    front_kmer.to_string()
445                }
446            );
447            segments.push(Segment::new(
448                segment_data,
449                front_kmer,
450                MISSING_KMER,
451                front_kmer_is_dir,
452                false,
453            ));
454        }
455    }
456
457    // If no segments were created (no splitters), return entire contig as one segment
458    if segments.is_empty() {
459        #[cfg(feature = "verbose_debug")]
460        eprintln!(
461            "RAGC_SEG_NOSPLIT: len={} front=MISSING back=MISSING",
462            contig.len()
463        );
464        segments.push(Segment::new(
465            contig.clone(),
466            MISSING_KMER,
467            MISSING_KMER,
468            false,
469            false,
470        ));
471    }
472
473    segments
474}
475
476#[cfg(test)]
477mod tests {
478    use super::*;
479
480    #[test]
481    fn test_segment_new() {
482        let seg = Segment::new(vec![0, 1, 2, 3], 123, 456, true, false);
483        assert_eq!(seg.len(), 4);
484        assert_eq!(seg.front_kmer, 123);
485        assert_eq!(seg.back_kmer, 456);
486        assert!(!seg.is_empty());
487    }
488
489    #[test]
490    fn test_split_no_splitters() {
491        let contig = vec![0, 1, 2, 3, 0, 1, 2, 3];
492        let splitters = AHashSet::new();
493        let segments = split_at_splitters(&contig, &splitters, 3);
494
495        // Should return entire contig as one segment
496        assert_eq!(segments.len(), 1);
497        assert_eq!(segments[0].data, contig);
498        assert_eq!(segments[0].front_kmer, MISSING_KMER);
499        assert_eq!(segments[0].back_kmer, MISSING_KMER);
500    }
501
502    #[test]
503    fn test_split_with_splitters() {
504        // Create a test where we know the k-mer values
505        let contig = vec![0, 0, 0, 1, 1, 1, 2, 2, 2];
506
507        // We need to figure out what the k-mer values are
508        let mut kmer = Kmer::new(3, KmerMode::Canonical);
509        let mut kmers = Vec::new();
510        for &base in &contig {
511            kmer.insert(base as u64);
512            if kmer.is_full() {
513                kmers.push(kmer.data());
514            }
515        }
516
517        // Use the first k-mer as a splitter
518        let mut splitters = AHashSet::new();
519        if !kmers.is_empty() {
520            splitters.insert(kmers[0]);
521        }
522
523        let segments = split_at_splitters(&contig, &splitters, 3);
524
525        // Should split at the splitter position
526        assert!(!segments.is_empty());
527
528        // Verify reconstructed length (accounting for k-base overlaps)
529        // First segment contributes all its bytes, subsequent segments skip first k bytes
530        let k = 3;
531        let reconstructed_len: usize = if segments.is_empty() {
532            0
533        } else {
534            segments[0].len()
535                + segments[1..]
536                    .iter()
537                    .map(|s| s.len().saturating_sub(k))
538                    .sum::<usize>()
539        };
540        assert_eq!(reconstructed_len, contig.len());
541    }
542
543    #[test]
544    fn test_split_short_contig() {
545        let contig = vec![0, 1]; // Too short for k=3
546        let splitters = AHashSet::new();
547        let segments = split_at_splitters(&contig, &splitters, 3);
548
549        assert_eq!(segments.len(), 1);
550        assert_eq!(segments[0].data, contig);
551    }
552
553    #[test]
554    fn test_split_consecutive_splitters() {
555        let contig = vec![0, 0, 0, 0, 0, 0]; // AAAAAA
556
557        // All k-mers will be AAA (same value)
558        let mut kmer = Kmer::new(3, KmerMode::Canonical);
559        kmer.insert(0);
560        kmer.insert(0);
561        kmer.insert(0);
562        let aaa_kmer = kmer.data();
563
564        let mut splitters = AHashSet::new();
565        splitters.insert(aaa_kmer);
566
567        let segments = split_at_splitters(&contig, &splitters, 3);
568
569        // Should create multiple small segments
570        assert!(!segments.is_empty());
571    }
572
573    #[test]
574    fn test_split_with_n_bases() {
575        // Contig with N (represented as 4)
576        let contig = vec![0, 0, 0, 4, 1, 1, 1];
577
578        let mut splitters = AHashSet::new();
579        splitters.insert(12345); // Some random splitter that won't match
580
581        let segments = split_at_splitters(&contig, &splitters, 3);
582
583        // Should handle N bases without crashing
584        assert!(!segments.is_empty());
585    }
586}