ragc_core/
segment.rs

1// Segmentation logic
2// Splits contigs at splitter k-mer positions
3
4use crate::kmer::{Kmer, KmerMode};
5use ragc_common::Contig;
6use std::collections::HashSet;
7
8/// Missing k-mer sentinel value (matches C++ AGC's ~0ull)
9/// Used when a segment doesn't have a front or back k-mer (e.g., at contig boundaries)
10pub const MISSING_KMER: u64 = u64::MAX;
11
12/// A segment of a contig bounded by splitter k-mers
13#[derive(Debug, Clone, PartialEq, Eq)]
14pub struct Segment {
15    /// The sequence data of this segment
16    pub data: Contig,
17    /// K-mer value at the start of the segment (or MISSING_KMER if at contig start)
18    pub front_kmer: u64,
19    /// K-mer value at the end of the segment (or MISSING_KMER if at contig end)
20    pub back_kmer: u64,
21}
22
23impl Segment {
24    /// Create a new segment
25    pub fn new(data: Contig, front_kmer: u64, back_kmer: u64) -> Self {
26        Segment {
27            data,
28            front_kmer,
29            back_kmer,
30        }
31    }
32
33    /// Get the length of the segment
34    pub fn len(&self) -> usize {
35        self.data.len()
36    }
37
38    /// Check if the segment is empty
39    pub fn is_empty(&self) -> bool {
40        self.data.is_empty()
41    }
42}
43
44/// Split a contig at splitter k-mer positions
45///
46/// This implements the C++ AGC segment splitting algorithm.
47/// The splitters passed should be the ACTUALLY USED splitters from the reference
48/// (not all candidates), ensuring all genomes split at the same positions.
49///
50/// # Arguments
51/// * `contig` - The contig to split
52/// * `splitters` - Set of splitter k-mer values (actually used on reference)
53/// * `k` - K-mer length
54/// * `_min_segment_size` - Unused (kept for API compatibility, see note below)
55///
56/// # Returns
57/// Vector of segments
58///
59/// # Algorithm (matching C++ AGC)
60/// 1. Scan through contig
61/// 2. Split at EVERY occurrence of a splitter k-mer (no distance check!)
62/// 3. Keep track of recent k-mers for end-of-contig handling
63/// 4. At contig end, look backward through recent k-mers to find rightmost splitter
64///
65/// # Note on min_segment_size
66/// The distance check only happens during SPLITTER FINDING (in splitters.rs) to select
67/// which k-mers become splitters. During SEGMENTATION (this function), C++ AGC splits
68/// at every occurrence of those splitters WITHOUT checking distance. This is the key
69/// difference that was causing 2.3x worse compression!
70pub fn split_at_splitters_with_size(
71    contig: &Contig,
72    splitters: &HashSet<u64>,
73    k: usize,
74    _min_segment_size: usize,
75) -> Vec<Segment> {
76    let mut segments = Vec::new();
77
78    if contig.len() < k {
79        // Contig too short for k-mers, return as single segment
80        return vec![Segment::new(contig.clone(), MISSING_KMER, MISSING_KMER)];
81    }
82
83    let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
84    let mut segment_start = 0;
85    let mut front_kmer = MISSING_KMER;
86    let mut front_kmer_is_dir = false;
87
88    // Track recent k-mers for end-of-contig handling
89    // C++ AGC doesn't limit this - it accumulates all k-mers since last split
90    // Store (position, kmer_value, is_dir_oriented) to match C++ AGC's orientation logic
91    let mut recent_kmers: Vec<(usize, u64, bool)> = Vec::new();
92
93    for (pos, &base) in contig.iter().enumerate() {
94        if base > 3 {
95            // Non-ACGT base, reset k-mer
96            kmer.reset();
97            recent_kmers.clear();
98        } else {
99            kmer.insert(base as u64);
100
101            if kmer.is_full() {
102                let kmer_value = kmer.data();
103                let is_dir = kmer.is_dir_oriented();
104                recent_kmers.push((pos, kmer_value, is_dir));
105
106                // CRITICAL FIX: C++ AGC splits at EVERY splitter occurrence!
107                // The distance check (current_len >= min_segment_size) only happens
108                // during SPLITTER FINDING to select which k-mers become splitters.
109                // During SEGMENTATION, we split at every occurrence without distance check.
110                if splitters.contains(&kmer_value) {
111                    // Use this as a splitter
112                    let segment_end = pos + 1;
113                    let segment_data = contig[segment_start..segment_end].to_vec();
114
115                    if !segment_data.is_empty() {
116                        // Apply C++ AGC's orientation logic for segments with one MISSING k-mer
117                        let (seg_front, seg_back) = if front_kmer == MISSING_KMER {
118                            // First segment of contig - only back k-mer present
119                            // C++ AGC swaps the k-mer (swap_dir_rc) before checking orientation (lines 1341-1365)
120                            // So we invert the orientation check: if dir → (MISSING, kmer), else → (kmer, MISSING)
121                            if is_dir {
122                                (MISSING_KMER, kmer_value)
123                            } else {
124                                (kmer_value, MISSING_KMER)
125                            }
126                        } else {
127                            // Normal case: both k-mers present
128                            (front_kmer, kmer_value)
129                        };
130                        segments.push(Segment::new(segment_data, seg_front, seg_back));
131                    }
132
133                    // Reset for next segment
134                    // CRITICAL: Create k-base overlap so decompressor can skip first k bases
135                    // The k-mer ends at position pos, occupies [pos-k+1, pos]
136                    // Next segment should start at pos-k+1 to create k-base overlap
137                    segment_start = (pos + 1).saturating_sub(k);
138                    front_kmer = kmer_value;
139                    front_kmer_is_dir = is_dir;
140                    recent_kmers.clear();
141                    kmer.reset();
142                }
143            }
144        }
145    }
146
147    // At end of contig, look backward through recent k-mers to find rightmost splitter
148    // CRITICAL: Only split if the remaining data after the splitter will be >= k bytes
149    // This ensures C++ AGC can skip k overlap bytes without hitting corruption
150    for (pos, kmer_value, is_dir) in recent_kmers.iter().rev() {
151        if splitters.contains(kmer_value) {
152            let segment_end = pos + 1;
153            let remaining_after = contig.len() - segment_end;
154
155            // Only split here if remaining data is > k bytes
156            // (We need > k, not >= k, because after creating k-base overlap,
157            // the final segment must still have > k bytes for C++ AGC decompressor)
158            if remaining_after > k {
159                if segment_end > segment_start {
160                    let segment_data = contig[segment_start..segment_end].to_vec();
161                    if !segment_data.is_empty() {
162                        segments.push(Segment::new(segment_data, front_kmer, *kmer_value));
163                        // Create k-base overlap for next segment
164                        segment_start = (pos + 1).saturating_sub(k);
165                        front_kmer = *kmer_value;
166                        front_kmer_is_dir = *is_dir;
167                    }
168                }
169                break;
170            } else if remaining_after == 0 {
171                // Splitter is exactly at contig end - include it in current segment
172                // and don't update segment_start (no next segment to create)
173                if segment_end > segment_start {
174                    let segment_data = contig[segment_start..segment_end].to_vec();
175                    if !segment_data.is_empty() {
176                        segments.push(Segment::new(segment_data, front_kmer, *kmer_value));
177                        // Mark that we've consumed the entire contig
178                        segment_start = contig.len();
179                    }
180                }
181                break;
182            }
183            // Otherwise, continue looking for an earlier splitter that leaves enough room
184        }
185    }
186
187    // Add any remaining data as final segment
188    // This will either be:
189    // - The entire remainder if no suitable splitter was found, OR
190    // - A segment >= k bytes if we split at a splitter that left enough room
191    if segment_start < contig.len() {
192        let segment_data = contig[segment_start..].to_vec();
193        if !segment_data.is_empty() {
194            // Match C++ AGC's orientation logic for segments with one MISSING k-mer
195            // (agc_compressor.cpp lines 1649-1657)
196            let (final_front, final_back) = if front_kmer == MISSING_KMER {
197                // No front k-mer at all
198                (MISSING_KMER, MISSING_KMER)
199            } else if front_kmer_is_dir {
200                // K-mer is dir-oriented: keep as (kmer, MISSING)
201                (front_kmer, MISSING_KMER)
202            } else {
203                // K-mer is NOT dir-oriented: swap to (MISSING, kmer)
204                (MISSING_KMER, front_kmer)
205            };
206            segments.push(Segment::new(segment_data, final_front, final_back));
207        }
208    }
209
210    // If no segments were created, return entire contig as one segment
211    if segments.is_empty() {
212        segments.push(Segment::new(contig.clone(), MISSING_KMER, MISSING_KMER));
213    }
214
215    // CRITICAL FIX: Merge final segment with previous if it's too short for overlap
216    // Segments after the first must be >= k bytes to handle the k-base overlap
217    if segments.len() >= 2 {
218        let last_idx = segments.len() - 1;
219        if segments[last_idx].data.len() < k {
220            // Merge last two segments
221            let last_seg = segments.pop().unwrap();
222            let second_last = segments.last_mut().unwrap();
223
224            // Append last segment data to second-last
225            second_last.data.extend_from_slice(&last_seg.data);
226            // Keep the back_kmer from the merged segment (should be 0 for final segment)
227            second_last.back_kmer = last_seg.back_kmer;
228        }
229    }
230
231    segments
232}
233
234/// Split a contig at splitter k-mer positions (no minimum size constraint)
235///
236/// This is the old behavior - splits at every splitter regardless of segment size.
237/// Kept for backwards compatibility but not recommended for use.
238pub fn split_at_splitters(contig: &Contig, splitters: &HashSet<u64>, k: usize) -> Vec<Segment> {
239    let mut segments = Vec::new();
240
241    if contig.len() < k {
242        // Contig too short for k-mers, return as single segment
243        return vec![Segment::new(contig.clone(), MISSING_KMER, MISSING_KMER)];
244    }
245
246    let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
247    let mut segment_start = 0;
248    let mut front_kmer = MISSING_KMER;
249
250    for (pos, &base) in contig.iter().enumerate() {
251        if base > 3 {
252            // Non-ACGT base, reset k-mer
253            kmer.reset();
254        } else {
255            kmer.insert(base as u64);
256
257            if kmer.is_full() {
258                let kmer_value = kmer.data();
259
260                // Check if this is a splitter
261                if splitters.contains(&kmer_value) {
262                    // Create segment from segment_start to current position (inclusive of splitter)
263                    let segment_end = pos + 1; // Include the base that completes the splitter
264                    let segment_data = contig[segment_start..segment_end].to_vec();
265
266                    if !segment_data.is_empty() {
267                        segments.push(Segment::new(segment_data, front_kmer, kmer_value));
268                    }
269
270                    // Start new segment with k-base overlap
271                    segment_start = (pos + 1).saturating_sub(k);
272                    front_kmer = kmer_value;
273                }
274            }
275        }
276    }
277
278    // Add final segment if there's data remaining
279    if segment_start < contig.len() {
280        let segment_data = contig[segment_start..].to_vec();
281        if !segment_data.is_empty() {
282            segments.push(Segment::new(segment_data, front_kmer, MISSING_KMER));
283        }
284    }
285
286    // If no segments were created (no splitters), return entire contig as one segment
287    if segments.is_empty() {
288        segments.push(Segment::new(contig.clone(), MISSING_KMER, MISSING_KMER));
289    }
290
291    segments
292}
293
294#[cfg(test)]
295mod tests {
296    use super::*;
297
298    #[test]
299    fn test_segment_new() {
300        let seg = Segment::new(vec![0, 1, 2, 3], 123, 456);
301        assert_eq!(seg.len(), 4);
302        assert_eq!(seg.front_kmer, 123);
303        assert_eq!(seg.back_kmer, 456);
304        assert!(!seg.is_empty());
305    }
306
307    #[test]
308    fn test_split_no_splitters() {
309        let contig = vec![0, 1, 2, 3, 0, 1, 2, 3];
310        let splitters = HashSet::new();
311        let segments = split_at_splitters(&contig, &splitters, 3);
312
313        // Should return entire contig as one segment
314        assert_eq!(segments.len(), 1);
315        assert_eq!(segments[0].data, contig);
316        assert_eq!(segments[0].front_kmer, MISSING_KMER);
317        assert_eq!(segments[0].back_kmer, MISSING_KMER);
318    }
319
320    #[test]
321    fn test_split_with_splitters() {
322        // Create a test where we know the k-mer values
323        let contig = vec![0, 0, 0, 1, 1, 1, 2, 2, 2];
324
325        // We need to figure out what the k-mer values are
326        let mut kmer = Kmer::new(3, KmerMode::Canonical);
327        let mut kmers = Vec::new();
328        for &base in &contig {
329            kmer.insert(base as u64);
330            if kmer.is_full() {
331                kmers.push(kmer.data());
332            }
333        }
334
335        // Use the first k-mer as a splitter
336        let mut splitters = HashSet::new();
337        if !kmers.is_empty() {
338            splitters.insert(kmers[0]);
339        }
340
341        let segments = split_at_splitters(&contig, &splitters, 3);
342
343        // Should split at the splitter position
344        assert!(!segments.is_empty());
345
346        // Verify reconstructed length (accounting for k-base overlaps)
347        // First segment contributes all its bytes, subsequent segments skip first k bytes
348        let k = 3;
349        let reconstructed_len: usize = if segments.is_empty() {
350            0
351        } else {
352            segments[0].len()
353                + segments[1..]
354                    .iter()
355                    .map(|s| s.len().saturating_sub(k))
356                    .sum::<usize>()
357        };
358        assert_eq!(reconstructed_len, contig.len());
359    }
360
361    #[test]
362    fn test_split_short_contig() {
363        let contig = vec![0, 1]; // Too short for k=3
364        let splitters = HashSet::new();
365        let segments = split_at_splitters(&contig, &splitters, 3);
366
367        assert_eq!(segments.len(), 1);
368        assert_eq!(segments[0].data, contig);
369    }
370
371    #[test]
372    fn test_split_consecutive_splitters() {
373        let contig = vec![0, 0, 0, 0, 0, 0]; // AAAAAA
374
375        // All k-mers will be AAA (same value)
376        let mut kmer = Kmer::new(3, KmerMode::Canonical);
377        kmer.insert(0);
378        kmer.insert(0);
379        kmer.insert(0);
380        let aaa_kmer = kmer.data();
381
382        let mut splitters = HashSet::new();
383        splitters.insert(aaa_kmer);
384
385        let segments = split_at_splitters(&contig, &splitters, 3);
386
387        // Should create multiple small segments
388        assert!(!segments.is_empty());
389    }
390
391    #[test]
392    fn test_split_with_n_bases() {
393        // Contig with N (represented as 4)
394        let contig = vec![0, 0, 0, 4, 1, 1, 1];
395
396        let mut splitters = HashSet::new();
397        splitters.insert(12345); // Some random splitter that won't match
398
399        let segments = split_at_splitters(&contig, &splitters, 3);
400
401        // Should handle N bases without crashing
402        assert!(!segments.is_empty());
403    }
404}