ragc_core/contig_compression.rs
1//! Contig compression with inline segmentation
2//!
3//! This module implements C++ AGC's compress_contig and add_segment logic
4//! for inline segmentation during compression.
5//!
6//! See docs/INLINE_SEGMENTATION_PATTERN.md for detailed C++ AGC analysis.
7
8use crate::kmer::{Kmer, KmerMode};
9use crate::segment_buffer::BufferedSegments;
10use std::sync::{Arc, Mutex};
11
12/// Contig type (vector of nucleotides in numeric encoding)
13pub type Contig = Vec<u8>;
14
15/// Missing k-mer marker (both k1 and k2 = ~0 means no terminal splitters)
16pub const MISSING_KMER: u64 = !0u64;
17
18/// Segment part for inline buffering
19#[derive(Debug, Clone)]
20pub struct SegmentPart {
21 pub kmer1: u64,
22 pub kmer2: u64,
23 pub sample_name: String,
24 pub contig_name: String,
25 pub seg_data: Vec<u8>,
26 pub is_rev_comp: bool,
27 pub seg_part_no: u32,
28}
29
30/// Shared state for compression (passed from worker threads)
31pub struct CompressionContext {
32 pub splitters: Arc<Mutex<std::collections::HashSet<u64>>>,
33 pub bloom_splitters: Arc<Mutex<crate::bloom_filter::BloomFilter>>,
34 pub buffered_segments: Arc<Mutex<BufferedSegments>>,
35 pub kmer_length: usize,
36 pub adaptive_mode: bool,
37 pub map_segments: Arc<Mutex<std::collections::HashMap<(u64, u64), u32>>>,
38 pub map_segments_terminators: Arc<Mutex<std::collections::HashMap<u64, Vec<u64>>>>,
39 pub concatenated_genomes: bool,
40}
41
42/// Compress a contig with inline segmentation
43///
44/// Matches C++ AGC's compress_contig (agc_compressor.cpp:2000-2054)
45///
46/// # Arguments
47/// * `sample_name` - Sample identifier
48/// * `contig_name` - Contig identifier
49/// * `contig` - Nucleotide sequence (numeric encoding: A=0, C=1, G=2, T=3)
50/// * `ctx` - Compression context with splitters and buffering
51///
52/// # Returns
53/// * `true` - Contig segmented successfully
54/// * `false` - Contig failed to segment (adaptive mode: needs new splitters)
55///
56/// # Algorithm
57/// 1. Scan contig with k-mer sliding window
58/// 2. Check each k-mer against splitters (bloom filter + hash set)
59/// 3. When splitter found: extract segment, call add_segment()
60/// 4. If no splitters found and adaptive mode: return false
61/// 5. Handle final segment (after last splitter to end)
62pub fn compress_contig(
63 sample_name: &str,
64 contig_name: &str,
65 contig: &Contig,
66 ctx: &CompressionContext,
67) -> bool {
68 let kmer_length = ctx.kmer_length as u32;
69
70 // Create k-mer scanner for canonical k-mers
71 let mut kmer = Kmer::new(kmer_length, KmerMode::Canonical);
72
73 // Track segment boundaries
74 let mut split_pos: usize = 0; // Start of current segment
75 let mut split_kmer = Kmer::new(kmer_length, KmerMode::Canonical); // K-mer at split position
76 let mut seg_part_no: u32 = 0;
77
78 // Scan contig for splitters
79 for (pos, &base) in contig.iter().enumerate() {
80 kmer.insert(base as u64);
81
82 if !kmer.is_full() {
83 continue;
84 }
85
86 // Get canonical k-mer value
87 let kmer_val = kmer.data_canonical();
88
89 // Check if this k-mer is a splitter
90 // C++ AGC: bloom_splitters.check(d) && hs_splitters.check(d)
91 if is_splitter(kmer_val, ctx) {
92 // Found splitter - extract segment from split_pos to current position
93 let seg_start = split_pos;
94 let seg_end = pos + 1; // Inclusive of current position
95 let segment = contig[seg_start..seg_end].to_vec();
96
97 // Add segment with terminal k-mers (split_kmer, current kmer)
98 add_segment(
99 sample_name,
100 contig_name,
101 seg_part_no,
102 segment,
103 &split_kmer,
104 &kmer,
105 ctx,
106 );
107
108 // Update for next segment
109 seg_part_no += 1;
110 split_pos = pos + 1 - kmer_length as usize; // Start of next segment overlaps by kmer_length
111 split_kmer = kmer.clone();
112 }
113 }
114
115 // Check if contig failed to segment (adaptive mode)
116 if ctx.adaptive_mode && seg_part_no == 0 {
117 // No splitters found - this is a "hard" contig
118 // split_kmer is still empty (never set), which C++ AGC checks as:
119 // if (adaptive_compression && split_kmer == CKmer(...))
120 return false; // Signal failure - needs new splitters
121 }
122
123 // Add final segment (from last splitter to end of contig)
124 if split_pos < contig.len() {
125 let segment = contig[split_pos..].to_vec();
126
127 add_segment(
128 sample_name,
129 contig_name,
130 seg_part_no,
131 segment,
132 &split_kmer,
133 &Kmer::new(kmer_length, KmerMode::Canonical), // Empty k-mer for end
134 ctx,
135 );
136 }
137
138 true // Success
139}
140
141/// Add segment to buffered storage with terminal k-mer detection
142///
143/// Matches C++ AGC's add_segment (agc_compressor.cpp:1275-1507)
144///
145/// # Terminal K-mer Cases
146/// 1. **No terminators** (k1 = k2 = ~0): Whole-contig segment
147/// 2. **Both terminators** (k1, k2 both valid): Normal segment
148/// 3. **Front-only** (k1 valid, k2 = ~0): First segment
149/// 4. **Back-only** (k1 = ~0, k2 valid): Last segment
150///
151/// # Split Detection
152/// If segment is NEW and both k1 and k2 are in map_segments_terminators:
153/// - Try to find shared middle splitter k_mid
154/// - Split segment into two overlapping parts
155///
156/// See INLINE_SEGMENTATION_PATTERN.md Section 2 for detailed algorithm.
157fn add_segment(
158 sample_name: &str,
159 contig_name: &str,
160 seg_part_no: u32,
161 mut segment: Vec<u8>,
162 kmer_front: &Kmer,
163 kmer_back: &Kmer,
164 ctx: &CompressionContext,
165) {
166 // Determine terminal k-mers (k1, k2) based on which are full
167 let k1 = if kmer_front.is_full() {
168 kmer_front.data_canonical()
169 } else {
170 MISSING_KMER
171 };
172
173 let k2 = if kmer_back.is_full() {
174 kmer_back.data_canonical()
175 } else {
176 MISSING_KMER
177 };
178
179 // Normalize key to canonical order (C++ AGC uses minmax)
180 let pk = if k1 <= k2 { (k1, k2) } else { (k2, k1) };
181
182 // Check if this segment key is already registered
183 let map_segments = ctx.map_segments.lock().unwrap();
184 let group_id = map_segments.get(&pk).copied();
185 drop(map_segments);
186
187 // Try split detection if NEW segment
188 let mut segment2: Option<Vec<u8>> = None;
189 let mut pk2: Option<(u64, u64)> = None;
190
191 if group_id.is_none() && !ctx.concatenated_genomes {
192 // NEW segment - check for split eligibility
193 if k1 != MISSING_KMER && k2 != MISSING_KMER {
194 // Both terminators exist - check if they're in terminators map
195 let terminators = ctx.map_segments_terminators.lock().unwrap();
196 let k1_in_map = terminators.contains_key(&k1);
197 let k2_in_map = terminators.contains_key(&k2);
198 drop(terminators);
199
200 if k1_in_map && k2_in_map {
201 // Try to find shared middle splitter
202 if let Some((k_middle, left_size, right_size)) =
203 find_cand_segment_with_missing_middle_splitter(kmer_front, kmer_back, ctx)
204 {
205 // Determine split outcome
206 if left_size == 0 || right_size == 0 {
207 // No actual split - entire segment goes to one group
208 // (This shouldn't happen but C++ AGC handles it)
209 } else {
210 // Split into 2 segments with overlap
211 let kmer_length = ctx.kmer_length;
212 let seg2_start_pos = left_size - kmer_length / 2;
213
214 // segment2: [seg2_start_pos, end)
215 segment2 = Some(segment[seg2_start_pos..].to_vec());
216
217 // segment: [0, seg2_start_pos + kmer_length)
218 segment.resize(seg2_start_pos + kmer_length, 0);
219
220 // Segment 1 key: (k1, k_middle)
221 pk2 = Some(if k_middle <= k2 {
222 (k_middle, k2)
223 } else {
224 (k2, k_middle)
225 });
226
227 // Note: pk remains (k1, k_middle) or (k_middle, k1) depending on order
228 }
229 }
230 }
231 }
232 }
233
234 // TODO: ZSTD compress segments
235 // For now, just store uncompressed
236 let compressed_seg = segment.clone();
237
238 // Add first segment (or whole segment if no split)
239 if group_id.is_some() {
240 // KNOWN segment
241 let buffered = ctx.buffered_segments.lock().unwrap();
242 buffered.add_known(
243 group_id.unwrap(),
244 MISSING_KMER,
245 MISSING_KMER,
246 sample_name.to_string(),
247 contig_name.to_string(),
248 compressed_seg,
249 false, // TODO: Determine is_rev_comp from k-mer comparison
250 seg_part_no,
251 );
252 } else {
253 // NEW segment
254 let buffered = ctx.buffered_segments.lock().unwrap();
255 buffered.add_new(
256 pk.0,
257 pk.1,
258 sample_name.to_string(),
259 contig_name.to_string(),
260 compressed_seg,
261 false, // TODO: Determine is_rev_comp
262 seg_part_no,
263 );
264 }
265
266 // Add second segment if split occurred
267 if let (Some(seg2), Some(pk2_key)) = (segment2, pk2) {
268 // TODO: ZSTD compress segment2
269 let compressed_seg2 = seg2.clone();
270
271 // Check if second segment key is known
272 let map_segments = ctx.map_segments.lock().unwrap();
273 let group_id2 = map_segments.get(&pk2_key).copied();
274 drop(map_segments);
275
276 let buffered = ctx.buffered_segments.lock().unwrap();
277 if let Some(gid2) = group_id2 {
278 // KNOWN segment
279 buffered.add_known(
280 gid2,
281 MISSING_KMER,
282 MISSING_KMER,
283 sample_name.to_string(),
284 contig_name.to_string(),
285 compressed_seg2,
286 false, // TODO: is_rev_comp
287 seg_part_no + 1,
288 );
289 } else {
290 // NEW segment
291 buffered.add_new(
292 pk2_key.0,
293 pk2_key.1,
294 sample_name.to_string(),
295 contig_name.to_string(),
296 compressed_seg2,
297 false, // TODO: is_rev_comp
298 seg_part_no + 1,
299 );
300 }
301 }
302}
303
304/// Check if k-mer is a splitter
305///
306/// Matches C++ AGC's two-stage check:
307/// 1. Bloom filter (fast, probabilistic)
308/// 2. Hash set (exact)
309fn is_splitter(kmer: u64, ctx: &CompressionContext) -> bool {
310 // Fast check: bloom filter (may have false positives)
311 let bloom = ctx.bloom_splitters.lock().unwrap();
312 if !bloom.check(kmer) {
313 return false; // Definitely not a splitter
314 }
315 drop(bloom); // Release lock before next check
316
317 // Exact check: hash set (no false positives)
318 let splitters = ctx.splitters.lock().unwrap();
319 splitters.contains(&kmer)
320}
321
322/// Find candidate segment with missing middle splitter (for split detection)
323///
324/// Matches C++ AGC's find_cand_segment_with_missing_middle_splitter
325/// (agc_compressor.cpp:1510-1597)
326///
327/// # Algorithm
328/// 1. Find intersection of terminators for k1 and k2
329/// 2. For each shared k_mid in intersection:
330/// - Check if (k1, k_mid) and (k_mid, k2) both exist in map_segments
331/// - Find k_mid position in segment (via find_middle_splitter)
332/// 3. Return first valid split: (middle_kmer, left_size, right_size)
333///
334/// # Returns
335/// - Some((k_middle, left_size, right_size)) if split found
336/// - None if no valid split
337fn find_cand_segment_with_missing_middle_splitter(
338 kmer_front: &Kmer,
339 kmer_back: &Kmer,
340 ctx: &CompressionContext,
341) -> Option<(u64, usize, usize)> {
342 let k1 = kmer_front.data_canonical();
343 let k2 = kmer_back.data_canonical();
344
345 // Get terminator lists for k1 and k2
346 let terminators = ctx.map_segments_terminators.lock().unwrap();
347 let k1_terminators = terminators.get(&k1)?;
348 let k2_terminators = terminators.get(&k2)?;
349
350 // Find shared terminators (intersection)
351 // Both lists are sorted, so we can use two-pointer technique
352 let mut shared_splitters = Vec::new();
353 let mut i = 0;
354 let mut j = 0;
355
356 while i < k1_terminators.len() && j < k2_terminators.len() {
357 if k1_terminators[i] == k2_terminators[j] {
358 shared_splitters.push(k1_terminators[i]);
359 i += 1;
360 j += 1;
361 } else if k1_terminators[i] < k2_terminators[j] {
362 i += 1;
363 } else {
364 j += 1;
365 }
366 }
367
368 if shared_splitters.is_empty() {
369 return None;
370 }
371
372 drop(terminators);
373
374 // Try each shared splitter
375 let map_segments = ctx.map_segments.lock().unwrap();
376
377 for &k_middle in &shared_splitters {
378 // Create keys (k1, k_middle) and (k_middle, k2) in canonical order
379 let pk_left = if k1 <= k_middle {
380 (k1, k_middle)
381 } else {
382 (k_middle, k1)
383 };
384
385 let pk_right = if k_middle <= k2 {
386 (k_middle, k2)
387 } else {
388 (k2, k_middle)
389 };
390
391 // Check if both groups exist
392 if map_segments.contains_key(&pk_left) && map_segments.contains_key(&pk_right) {
393 drop(map_segments);
394
395 // TODO: Find actual position of k_middle in segment using find_middle_splitter
396 // This requires:
397 // - Decompressing both segments from v_segments[pk_left] and v_segments[pk_right]
398 // - Finding the position where k_middle occurs
399 // - Computing optimal split position via cost analysis
400 //
401 // For now, return a simplified result with approximate position
402 // This is a placeholder - real implementation needed for correctness
403
404 // Placeholder: assume k_middle is in the middle
405 // TODO: Implement find_middle_splitter properly
406 let left_size = ctx.kmer_length; // Placeholder
407 let right_size = ctx.kmer_length; // Placeholder
408
409 return Some((k_middle, left_size, right_size));
410 }
411 }
412
413 None
414}
415
416#[cfg(test)]
417mod tests {
418 use super::*;
419 use std::collections::HashMap;
420
421 #[test]
422 fn test_missing_kmer_constant() {
423 assert_eq!(MISSING_KMER, !0u64);
424 }
425
426 #[test]
427 fn test_segment_part_creation() {
428 let part = SegmentPart {
429 kmer1: 12345,
430 kmer2: 67890,
431 sample_name: "sample1".to_string(),
432 contig_name: "chr1".to_string(),
433 seg_data: vec![0, 1, 2, 3],
434 is_rev_comp: false,
435 seg_part_no: 0,
436 };
437
438 assert_eq!(part.kmer1, 12345);
439 assert_eq!(part.kmer2, 67890);
440 assert_eq!(part.sample_name, "sample1");
441 assert_eq!(part.seg_part_no, 0);
442 }
443
444 #[test]
445 fn test_compress_contig_no_splitters() {
446 use std::collections::HashSet;
447
448 let ctx = CompressionContext {
449 splitters: Arc::new(Mutex::new(HashSet::new())),
450 bloom_splitters: Arc::new(Mutex::new(crate::bloom_filter::BloomFilter::new(1024))),
451 buffered_segments: Arc::new(Mutex::new(BufferedSegments::new(0))),
452 kmer_length: 21,
453 adaptive_mode: false,
454 map_segments: Arc::new(Mutex::new(HashMap::new())),
455 map_segments_terminators: Arc::new(Mutex::new(HashMap::new())),
456 concatenated_genomes: false,
457 };
458
459 // Simple contig with no splitters
460 let contig = vec![0, 1, 2, 3, 0, 1, 2, 3]; // ACGTACGT
461
462 let result = compress_contig("sample1", "chr1", &contig, &ctx);
463
464 // Should succeed (one whole-contig segment)
465 assert!(result);
466
467 // Should have 1 NEW segment
468 let buffered = ctx.buffered_segments.lock().unwrap();
469 assert_eq!(buffered.get_num_new(), 1);
470 }
471
472 #[test]
473 fn test_compress_contig_adaptive_failure() {
474 use std::collections::HashSet;
475
476 let ctx = CompressionContext {
477 splitters: Arc::new(Mutex::new(HashSet::new())),
478 bloom_splitters: Arc::new(Mutex::new(crate::bloom_filter::BloomFilter::new(1024))),
479 buffered_segments: Arc::new(Mutex::new(BufferedSegments::new(0))),
480 kmer_length: 21,
481 adaptive_mode: true, // Adaptive mode enabled
482 map_segments: Arc::new(Mutex::new(HashMap::new())),
483 map_segments_terminators: Arc::new(Mutex::new(HashMap::new())),
484 concatenated_genomes: false,
485 };
486
487 // Short contig with no splitters
488 let contig = vec![0, 1, 2, 3, 0, 1, 2, 3];
489
490 let result = compress_contig("sample1", "chr1", &contig, &ctx);
491
492 // Should FAIL in adaptive mode (no splitters found)
493 assert!(!result);
494 }
495}