ragc_core/segment.rs
1// Segmentation logic
2// Splits contigs at splitter k-mer positions
3
4use crate::kmer::{Kmer, KmerMode};
5use ragc_common::Contig;
6use std::collections::HashSet;
7
8/// Missing k-mer sentinel value (matches C++ AGC's ~0ull)
9/// Used when a segment doesn't have a front or back k-mer (e.g., at contig boundaries)
10pub const MISSING_KMER: u64 = u64::MAX;
11
12/// A segment of a contig bounded by splitter k-mers
13#[derive(Debug, Clone, PartialEq, Eq)]
14pub struct Segment {
15 /// The sequence data of this segment
16 pub data: Contig,
17 /// K-mer value at the start of the segment (or MISSING_KMER if at contig start)
18 pub front_kmer: u64,
19 /// K-mer value at the end of the segment (or MISSING_KMER if at contig end)
20 pub back_kmer: u64,
21}
22
23impl Segment {
24 /// Create a new segment
25 pub fn new(data: Contig, front_kmer: u64, back_kmer: u64) -> Self {
26 Segment {
27 data,
28 front_kmer,
29 back_kmer,
30 }
31 }
32
33 /// Get the length of the segment
34 pub fn len(&self) -> usize {
35 self.data.len()
36 }
37
38 /// Check if the segment is empty
39 pub fn is_empty(&self) -> bool {
40 self.data.is_empty()
41 }
42}
43
44/// Split a contig at splitter k-mer positions
45///
46/// This implements the C++ AGC segment splitting algorithm.
47/// The splitters passed should be the ACTUALLY USED splitters from the reference
48/// (not all candidates), ensuring all genomes split at the same positions.
49///
50/// # Arguments
51/// * `contig` - The contig to split
52/// * `splitters` - Set of splitter k-mer values (actually used on reference)
53/// * `k` - K-mer length
54/// * `_min_segment_size` - Unused (kept for API compatibility, see note below)
55///
56/// # Returns
57/// Vector of segments
58///
59/// # Algorithm (matching C++ AGC)
60/// 1. Scan through contig
61/// 2. Split at EVERY occurrence of a splitter k-mer (no distance check!)
62/// 3. Keep track of recent k-mers for end-of-contig handling
63/// 4. At contig end, look backward through recent k-mers to find rightmost splitter
64///
65/// # Note on min_segment_size
66/// The distance check only happens during SPLITTER FINDING (in splitters.rs) to select
67/// which k-mers become splitters. During SEGMENTATION (this function), C++ AGC splits
68/// at every occurrence of those splitters WITHOUT checking distance. This is the key
69/// difference that was causing 2.3x worse compression!
70pub fn split_at_splitters_with_size(
71 contig: &Contig,
72 splitters: &HashSet<u64>,
73 k: usize,
74 _min_segment_size: usize,
75) -> Vec<Segment> {
76 let mut segments = Vec::new();
77
78 if contig.len() < k {
79 // Contig too short for k-mers, return as single segment
80 return vec![Segment::new(contig.clone(), MISSING_KMER, MISSING_KMER)];
81 }
82
83 let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
84 let mut segment_start = 0;
85 let mut front_kmer = MISSING_KMER;
86 let mut front_kmer_is_dir = false;
87
88 // Track recent k-mers for end-of-contig handling
89 // C++ AGC doesn't limit this - it accumulates all k-mers since last split
90 // Store (position, kmer_value, is_dir_oriented) to match C++ AGC's orientation logic
91 let mut recent_kmers: Vec<(usize, u64, bool)> = Vec::new();
92
93 for (pos, &base) in contig.iter().enumerate() {
94 if base > 3 {
95 // Non-ACGT base, reset k-mer
96 kmer.reset();
97 recent_kmers.clear();
98 } else {
99 kmer.insert(base as u64);
100
101 if kmer.is_full() {
102 let kmer_value = kmer.data();
103 let is_dir = kmer.is_dir_oriented();
104 recent_kmers.push((pos, kmer_value, is_dir));
105
106 // CRITICAL FIX: C++ AGC splits at EVERY splitter occurrence!
107 // The distance check (current_len >= min_segment_size) only happens
108 // during SPLITTER FINDING to select which k-mers become splitters.
109 // During SEGMENTATION, we split at every occurrence without distance check.
110 if splitters.contains(&kmer_value) {
111 // Use this as a splitter
112 let segment_end = pos + 1;
113 let segment_data = contig[segment_start..segment_end].to_vec();
114
115 if !segment_data.is_empty() {
116 // Apply C++ AGC's orientation logic for segments with one MISSING k-mer
117 let (seg_front, seg_back) = if front_kmer == MISSING_KMER {
118 // First segment of contig - only back k-mer present
119 // C++ AGC swaps the k-mer (swap_dir_rc) before checking orientation (lines 1341-1365)
120 // So we invert the orientation check: if dir → (MISSING, kmer), else → (kmer, MISSING)
121 if is_dir {
122 (MISSING_KMER, kmer_value)
123 } else {
124 (kmer_value, MISSING_KMER)
125 }
126 } else {
127 // Normal case: both k-mers present
128 (front_kmer, kmer_value)
129 };
130 segments.push(Segment::new(segment_data, seg_front, seg_back));
131 }
132
133 // Reset for next segment
134 // CRITICAL: Create k-base overlap so decompressor can skip first k bases
135 // The k-mer ends at position pos, occupies [pos-k+1, pos]
136 // Next segment should start at pos-k+1 to create k-base overlap
137 segment_start = (pos + 1).saturating_sub(k);
138 front_kmer = kmer_value;
139 front_kmer_is_dir = is_dir;
140 recent_kmers.clear();
141 kmer.reset();
142 }
143 }
144 }
145 }
146
147 // At end of contig, look backward through recent k-mers to find rightmost splitter
148 // CRITICAL: Only split if the remaining data after the splitter will be >= k bytes
149 // This ensures C++ AGC can skip k overlap bytes without hitting corruption
150 for (pos, kmer_value, is_dir) in recent_kmers.iter().rev() {
151 if splitters.contains(kmer_value) {
152 let segment_end = pos + 1;
153 let remaining_after = contig.len() - segment_end;
154
155 // Only split here if remaining data is > k bytes
156 // (We need > k, not >= k, because after creating k-base overlap,
157 // the final segment must still have > k bytes for C++ AGC decompressor)
158 if remaining_after > k {
159 if segment_end > segment_start {
160 let segment_data = contig[segment_start..segment_end].to_vec();
161 if !segment_data.is_empty() {
162 segments.push(Segment::new(segment_data, front_kmer, *kmer_value));
163 // Create k-base overlap for next segment
164 segment_start = (pos + 1).saturating_sub(k);
165 front_kmer = *kmer_value;
166 front_kmer_is_dir = *is_dir;
167 }
168 }
169 break;
170 } else if remaining_after == 0 {
171 // Splitter is exactly at contig end - include it in current segment
172 // and don't update segment_start (no next segment to create)
173 if segment_end > segment_start {
174 let segment_data = contig[segment_start..segment_end].to_vec();
175 if !segment_data.is_empty() {
176 segments.push(Segment::new(segment_data, front_kmer, *kmer_value));
177 // Mark that we've consumed the entire contig
178 segment_start = contig.len();
179 }
180 }
181 break;
182 }
183 // Otherwise, continue looking for an earlier splitter that leaves enough room
184 }
185 }
186
187 // Add any remaining data as final segment
188 // This will either be:
189 // - The entire remainder if no suitable splitter was found, OR
190 // - A segment >= k bytes if we split at a splitter that left enough room
191 if segment_start < contig.len() {
192 let segment_data = contig[segment_start..].to_vec();
193 if !segment_data.is_empty() {
194 // Match C++ AGC's orientation logic for segments with one MISSING k-mer
195 // (agc_compressor.cpp lines 1649-1657)
196 let (final_front, final_back) = if front_kmer == MISSING_KMER {
197 // No front k-mer at all
198 (MISSING_KMER, MISSING_KMER)
199 } else if front_kmer_is_dir {
200 // K-mer is dir-oriented: keep as (kmer, MISSING)
201 (front_kmer, MISSING_KMER)
202 } else {
203 // K-mer is NOT dir-oriented: swap to (MISSING, kmer)
204 (MISSING_KMER, front_kmer)
205 };
206 segments.push(Segment::new(segment_data, final_front, final_back));
207 }
208 }
209
210 // If no segments were created, return entire contig as one segment
211 if segments.is_empty() {
212 segments.push(Segment::new(contig.clone(), MISSING_KMER, MISSING_KMER));
213 }
214
215 // CRITICAL FIX: Merge final segment with previous if it's too short for overlap
216 // Segments after the first must be >= k bytes to handle the k-base overlap
217 if segments.len() >= 2 {
218 let last_idx = segments.len() - 1;
219 if segments[last_idx].data.len() < k {
220 // Merge last two segments
221 let last_seg = segments.pop().unwrap();
222 let second_last = segments.last_mut().unwrap();
223
224 // Append last segment data to second-last
225 second_last.data.extend_from_slice(&last_seg.data);
226 // Keep the back_kmer from the merged segment (should be 0 for final segment)
227 second_last.back_kmer = last_seg.back_kmer;
228 }
229 }
230
231 segments
232}
233
234/// Split a contig at splitter k-mer positions (no minimum size constraint)
235///
236/// This is the old behavior - splits at every splitter regardless of segment size.
237/// Kept for backwards compatibility but not recommended for use.
238pub fn split_at_splitters(contig: &Contig, splitters: &HashSet<u64>, k: usize) -> Vec<Segment> {
239 let mut segments = Vec::new();
240
241 if contig.len() < k {
242 // Contig too short for k-mers, return as single segment
243 return vec![Segment::new(contig.clone(), MISSING_KMER, MISSING_KMER)];
244 }
245
246 let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
247 let mut segment_start = 0;
248 let mut front_kmer = MISSING_KMER;
249
250 for (pos, &base) in contig.iter().enumerate() {
251 if base > 3 {
252 // Non-ACGT base, reset k-mer
253 kmer.reset();
254 } else {
255 kmer.insert(base as u64);
256
257 if kmer.is_full() {
258 let kmer_value = kmer.data();
259
260 // Check if this is a splitter
261 if splitters.contains(&kmer_value) {
262 // Create segment from segment_start to current position (inclusive of splitter)
263 let segment_end = pos + 1; // Include the base that completes the splitter
264 let segment_data = contig[segment_start..segment_end].to_vec();
265
266 if !segment_data.is_empty() {
267 segments.push(Segment::new(segment_data, front_kmer, kmer_value));
268 }
269
270 // Start new segment with k-base overlap
271 segment_start = (pos + 1).saturating_sub(k);
272 front_kmer = kmer_value;
273 }
274 }
275 }
276 }
277
278 // Add final segment if there's data remaining
279 if segment_start < contig.len() {
280 let segment_data = contig[segment_start..].to_vec();
281 if !segment_data.is_empty() {
282 segments.push(Segment::new(segment_data, front_kmer, MISSING_KMER));
283 }
284 }
285
286 // If no segments were created (no splitters), return entire contig as one segment
287 if segments.is_empty() {
288 segments.push(Segment::new(contig.clone(), MISSING_KMER, MISSING_KMER));
289 }
290
291 segments
292}
293
294#[cfg(test)]
295mod tests {
296 use super::*;
297
298 #[test]
299 fn test_segment_new() {
300 let seg = Segment::new(vec![0, 1, 2, 3], 123, 456);
301 assert_eq!(seg.len(), 4);
302 assert_eq!(seg.front_kmer, 123);
303 assert_eq!(seg.back_kmer, 456);
304 assert!(!seg.is_empty());
305 }
306
307 #[test]
308 fn test_split_no_splitters() {
309 let contig = vec![0, 1, 2, 3, 0, 1, 2, 3];
310 let splitters = HashSet::new();
311 let segments = split_at_splitters(&contig, &splitters, 3);
312
313 // Should return entire contig as one segment
314 assert_eq!(segments.len(), 1);
315 assert_eq!(segments[0].data, contig);
316 assert_eq!(segments[0].front_kmer, MISSING_KMER);
317 assert_eq!(segments[0].back_kmer, MISSING_KMER);
318 }
319
320 #[test]
321 fn test_split_with_splitters() {
322 // Create a test where we know the k-mer values
323 let contig = vec![0, 0, 0, 1, 1, 1, 2, 2, 2];
324
325 // We need to figure out what the k-mer values are
326 let mut kmer = Kmer::new(3, KmerMode::Canonical);
327 let mut kmers = Vec::new();
328 for &base in &contig {
329 kmer.insert(base as u64);
330 if kmer.is_full() {
331 kmers.push(kmer.data());
332 }
333 }
334
335 // Use the first k-mer as a splitter
336 let mut splitters = HashSet::new();
337 if !kmers.is_empty() {
338 splitters.insert(kmers[0]);
339 }
340
341 let segments = split_at_splitters(&contig, &splitters, 3);
342
343 // Should split at the splitter position
344 assert!(!segments.is_empty());
345
346 // Verify reconstructed length (accounting for k-base overlaps)
347 // First segment contributes all its bytes, subsequent segments skip first k bytes
348 let k = 3;
349 let reconstructed_len: usize = if segments.is_empty() {
350 0
351 } else {
352 segments[0].len()
353 + segments[1..]
354 .iter()
355 .map(|s| s.len().saturating_sub(k))
356 .sum::<usize>()
357 };
358 assert_eq!(reconstructed_len, contig.len());
359 }
360
361 #[test]
362 fn test_split_short_contig() {
363 let contig = vec![0, 1]; // Too short for k=3
364 let splitters = HashSet::new();
365 let segments = split_at_splitters(&contig, &splitters, 3);
366
367 assert_eq!(segments.len(), 1);
368 assert_eq!(segments[0].data, contig);
369 }
370
371 #[test]
372 fn test_split_consecutive_splitters() {
373 let contig = vec![0, 0, 0, 0, 0, 0]; // AAAAAA
374
375 // All k-mers will be AAA (same value)
376 let mut kmer = Kmer::new(3, KmerMode::Canonical);
377 kmer.insert(0);
378 kmer.insert(0);
379 kmer.insert(0);
380 let aaa_kmer = kmer.data();
381
382 let mut splitters = HashSet::new();
383 splitters.insert(aaa_kmer);
384
385 let segments = split_at_splitters(&contig, &splitters, 3);
386
387 // Should create multiple small segments
388 assert!(!segments.is_empty());
389 }
390
391 #[test]
392 fn test_split_with_n_bases() {
393 // Contig with N (represented as 4)
394 let contig = vec![0, 0, 0, 4, 1, 1, 1];
395
396 let mut splitters = HashSet::new();
397 splitters.insert(12345); // Some random splitter that won't match
398
399 let segments = split_at_splitters(&contig, &splitters, 3);
400
401 // Should handle N bases without crashing
402 assert!(!segments.is_empty());
403 }
404}