ragc_core/contig_compression.rs
1//! Contig compression with inline segmentation
2//!
3//! This module implements C++ AGC's compress_contig and add_segment logic
4//! for inline segmentation during compression.
5//!
6//! See docs/INLINE_SEGMENTATION_PATTERN.md for detailed C++ AGC analysis.
7
8use crate::kmer::{Kmer, KmerMode};
9use crate::segment_buffer::BufferedSegments;
10use ahash::{AHashMap, AHashSet};
11use std::sync::{Arc, Mutex};
12
13/// Contig type (vector of nucleotides in numeric encoding)
14pub type Contig = Vec<u8>;
15
16/// Missing k-mer marker (matches C++ AGC's kmer_t(-1) = u64::MAX)
17pub const MISSING_KMER: u64 = u64::MAX;
18
19/// Segment part for inline buffering
20#[derive(Debug, Clone)]
21pub struct SegmentPart {
22 pub kmer1: u64,
23 pub kmer2: u64,
24 pub sample_name: String,
25 pub contig_name: String,
26 pub seg_data: Vec<u8>,
27 pub is_rev_comp: bool,
28 pub seg_part_no: u32,
29}
30
31/// Shared state for compression (passed from worker threads)
32pub struct CompressionContext {
33 pub splitters: Arc<Mutex<AHashSet<u64>>>,
34 pub bloom_splitters: Arc<Mutex<crate::bloom_filter::BloomFilter>>,
35 pub buffered_segments: Arc<Mutex<BufferedSegments>>,
36 pub kmer_length: usize,
37 pub adaptive_mode: bool,
38 pub map_segments: Arc<Mutex<AHashMap<(u64, u64), u32>>>,
39 pub map_segments_terminators: Arc<Mutex<AHashMap<u64, Vec<u64>>>>,
40 pub concatenated_genomes: bool,
41}
42
43/// Compress a contig with inline segmentation
44///
45/// Matches C++ AGC's compress_contig (agc_compressor.cpp:2000-2054)
46///
47/// # Arguments
48/// * `sample_name` - Sample identifier
49/// * `contig_name` - Contig identifier
50/// * `contig` - Nucleotide sequence (numeric encoding: A=0, C=1, G=2, T=3)
51/// * `ctx` - Compression context with splitters and buffering
52///
53/// # Returns
54/// * `true` - Contig segmented successfully
55/// * `false` - Contig failed to segment (adaptive mode: needs new splitters)
56///
57/// # Algorithm
58/// 1. Scan contig with k-mer sliding window
59/// 2. Check each k-mer against splitters (bloom filter + hash set)
60/// 3. When splitter found: extract segment, call add_segment()
61/// 4. If no splitters found and adaptive mode: return false
62/// 5. Handle final segment (after last splitter to end)
63pub fn compress_contig(
64 sample_name: &str,
65 contig_name: &str,
66 contig: &Contig,
67 ctx: &CompressionContext,
68) -> bool {
69 let kmer_length = ctx.kmer_length as u32;
70
71 // Create k-mer scanner for canonical k-mers
72 let mut kmer = Kmer::new(kmer_length, KmerMode::Canonical);
73
74 // Track segment boundaries
75 let mut split_pos: usize = 0; // Start of current segment
76 let mut split_kmer = Kmer::new(kmer_length, KmerMode::Canonical); // K-mer at split position
77 let mut seg_part_no: u32 = 0;
78
79 // Scan contig for splitters
80 for (pos, &base) in contig.iter().enumerate() {
81 kmer.insert(base as u64);
82
83 if !kmer.is_full() {
84 continue;
85 }
86
87 // Get canonical k-mer value
88 let kmer_val = kmer.data_canonical();
89
90 // Check if this k-mer is a splitter
91 // C++ AGC: bloom_splitters.check(d) && hs_splitters.check(d)
92 if is_splitter(kmer_val, ctx) {
93 // Calculate where next segment would start (with k-base overlap)
94 let new_split_pos = pos + 1 - kmer_length as usize;
95
96 // Only create a segment if it's longer than just the overlapping k-mer
97 // If new_split_pos <= split_pos, the segment would be at most k bases long
98 if new_split_pos > split_pos {
99 // Found splitter - extract segment from split_pos to current position
100 let seg_start = split_pos;
101 let seg_end = pos + 1; // Inclusive of current position
102 let segment = contig[seg_start..seg_end].to_vec();
103
104 // Add segment with terminal k-mers (split_kmer, current kmer)
105 add_segment(
106 sample_name,
107 contig_name,
108 seg_part_no,
109 segment,
110 &split_kmer,
111 &kmer,
112 ctx,
113 );
114
115 seg_part_no += 1;
116 }
117
118 // Always update split position and k-mer for next segment
119 split_pos = new_split_pos;
120 split_kmer = kmer.clone();
121 }
122 }
123
124 // Check if contig failed to segment (adaptive mode)
125 if ctx.adaptive_mode && seg_part_no == 0 {
126 // No splitters found - this is a "hard" contig
127 // split_kmer is still empty (never set), which C++ AGC checks as:
128 // if (adaptive_compression && split_kmer == CKmer(...))
129 return false; // Signal failure - needs new splitters
130 }
131
132 // Add final segment (from last splitter to end of contig)
133 // Skip if the final segment would be k bases or shorter (just the overlap)
134 if split_pos < contig.len() && contig.len() - split_pos > ctx.kmer_length {
135 let segment = contig[split_pos..].to_vec();
136
137 add_segment(
138 sample_name,
139 contig_name,
140 seg_part_no,
141 segment,
142 &split_kmer,
143 &Kmer::new(kmer_length, KmerMode::Canonical), // Empty k-mer for end
144 ctx,
145 );
146 }
147
148 true // Success
149}
150
151/// Add segment to buffered storage with terminal k-mer detection
152///
153/// Matches C++ AGC's add_segment (agc_compressor.cpp:1275-1507)
154///
155/// # Terminal K-mer Cases
156/// 1. **No terminators** (k1 = k2 = ~0): Whole-contig segment
157/// 2. **Both terminators** (k1, k2 both valid): Normal segment
158/// 3. **Front-only** (k1 valid, k2 = ~0): First segment
159/// 4. **Back-only** (k1 = ~0, k2 valid): Last segment
160///
161/// # Split Detection
162/// If segment is NEW and both k1 and k2 are in map_segments_terminators:
163/// - Try to find shared middle splitter k_mid
164/// - Split segment into two overlapping parts
165///
166/// See INLINE_SEGMENTATION_PATTERN.md Section 2 for detailed algorithm.
167fn add_segment(
168 sample_name: &str,
169 contig_name: &str,
170 seg_part_no: u32,
171 mut segment: Vec<u8>,
172 kmer_front: &Kmer,
173 kmer_back: &Kmer,
174 ctx: &CompressionContext,
175) {
176 // Determine terminal k-mers (k1, k2) based on which are full
177 let k1 = if kmer_front.is_full() {
178 kmer_front.data_canonical()
179 } else {
180 MISSING_KMER
181 };
182
183 let k2 = if kmer_back.is_full() {
184 kmer_back.data_canonical()
185 } else {
186 MISSING_KMER
187 };
188
189 // Normalize key to canonical order (C++ AGC uses minmax)
190 let pk = if k1 <= k2 { (k1, k2) } else { (k2, k1) };
191
192 // Check if this segment key is already registered
193 let map_segments = ctx.map_segments.lock().unwrap();
194 let group_id = map_segments.get(&pk).copied();
195 drop(map_segments);
196
197 // Try split detection if NEW segment
198 let mut segment2: Option<Vec<u8>> = None;
199 let mut pk2: Option<(u64, u64)> = None;
200
201 if group_id.is_none() && !ctx.concatenated_genomes {
202 // NEW segment - check for split eligibility
203 if k1 != MISSING_KMER && k2 != MISSING_KMER {
204 // Both terminators exist - check if they're in terminators map
205 let terminators = ctx.map_segments_terminators.lock().unwrap();
206 let k1_in_map = terminators.contains_key(&k1);
207 let k2_in_map = terminators.contains_key(&k2);
208 drop(terminators);
209
210 if k1_in_map && k2_in_map {
211 // Try to find shared middle splitter
212 if let Some((k_middle, left_size, right_size)) =
213 find_cand_segment_with_missing_middle_splitter(kmer_front, kmer_back, ctx)
214 {
215 // Determine split outcome
216 if left_size == 0 || right_size == 0 {
217 // No actual split - entire segment goes to one group
218 // (This shouldn't happen but C++ AGC handles it)
219 } else {
220 // Split into 2 segments with overlap
221 let kmer_length = ctx.kmer_length;
222 let seg2_start_pos = left_size - kmer_length / 2;
223
224 // segment2: [seg2_start_pos, end)
225 segment2 = Some(segment[seg2_start_pos..].to_vec());
226
227 // segment: [0, seg2_start_pos + kmer_length)
228 segment.resize(seg2_start_pos + kmer_length, 0);
229
230 // Segment 1 key: (k1, k_middle)
231 pk2 = Some(if k_middle <= k2 {
232 (k_middle, k2)
233 } else {
234 (k2, k_middle)
235 });
236
237 // Note: pk remains (k1, k_middle) or (k_middle, k1) depending on order
238 }
239 }
240 }
241 }
242 }
243
244 // TODO: ZSTD compress segments
245 // For now, just store uncompressed
246 let compressed_seg = segment.clone();
247
248 // Add first segment (or whole segment if no split)
249 if group_id.is_some() {
250 // KNOWN segment
251 let buffered = ctx.buffered_segments.lock().unwrap();
252 buffered.add_known(
253 group_id.unwrap(),
254 MISSING_KMER,
255 MISSING_KMER,
256 sample_name.to_string(),
257 contig_name.to_string(),
258 compressed_seg,
259 false, // TODO: Determine is_rev_comp from k-mer comparison
260 seg_part_no,
261 );
262 } else {
263 // NEW segment
264 let buffered = ctx.buffered_segments.lock().unwrap();
265 buffered.add_new(
266 pk.0,
267 pk.1,
268 sample_name.to_string(),
269 contig_name.to_string(),
270 compressed_seg,
271 false, // TODO: Determine is_rev_comp
272 seg_part_no,
273 );
274 }
275
276 // Add second segment if split occurred
277 if let (Some(seg2), Some(pk2_key)) = (segment2, pk2) {
278 // TODO: ZSTD compress segment2
279 let compressed_seg2 = seg2.clone();
280
281 // Check if second segment key is known
282 let map_segments = ctx.map_segments.lock().unwrap();
283 let group_id2 = map_segments.get(&pk2_key).copied();
284 drop(map_segments);
285
286 let buffered = ctx.buffered_segments.lock().unwrap();
287 if let Some(gid2) = group_id2 {
288 // KNOWN segment
289 buffered.add_known(
290 gid2,
291 MISSING_KMER,
292 MISSING_KMER,
293 sample_name.to_string(),
294 contig_name.to_string(),
295 compressed_seg2,
296 false, // TODO: is_rev_comp
297 seg_part_no + 1,
298 );
299 } else {
300 // NEW segment
301 buffered.add_new(
302 pk2_key.0,
303 pk2_key.1,
304 sample_name.to_string(),
305 contig_name.to_string(),
306 compressed_seg2,
307 false, // TODO: is_rev_comp
308 seg_part_no + 1,
309 );
310 }
311 }
312}
313
314/// Check if k-mer is a splitter
315///
316/// Matches C++ AGC's two-stage check:
317/// 1. Bloom filter (fast, probabilistic)
318/// 2. Hash set (exact)
319fn is_splitter(kmer: u64, ctx: &CompressionContext) -> bool {
320 // Fast check: bloom filter (may have false positives)
321 let bloom = ctx.bloom_splitters.lock().unwrap();
322 if !bloom.check(kmer) {
323 return false; // Definitely not a splitter
324 }
325 drop(bloom); // Release lock before next check
326
327 // Exact check: hash set (no false positives)
328 let splitters = ctx.splitters.lock().unwrap();
329 splitters.contains(&kmer)
330}
331
332/// Find candidate segment with missing middle splitter (for split detection)
333///
334/// Matches C++ AGC's find_cand_segment_with_missing_middle_splitter
335/// (agc_compressor.cpp:1510-1597)
336///
337/// # Algorithm
338/// 1. Find intersection of terminators for k1 and k2
339/// 2. For each shared k_mid in intersection:
340/// - Check if (k1, k_mid) and (k_mid, k2) both exist in map_segments
341/// - Find k_mid position in segment (via find_middle_splitter)
342/// 3. Return first valid split: (middle_kmer, left_size, right_size)
343///
344/// # Returns
345/// - Some((k_middle, left_size, right_size)) if split found
346/// - None if no valid split
347fn find_cand_segment_with_missing_middle_splitter(
348 kmer_front: &Kmer,
349 kmer_back: &Kmer,
350 ctx: &CompressionContext,
351) -> Option<(u64, usize, usize)> {
352 let k1 = kmer_front.data_canonical();
353 let k2 = kmer_back.data_canonical();
354
355 // Get terminator lists for k1 and k2
356 let terminators = ctx.map_segments_terminators.lock().unwrap();
357 let k1_terminators = terminators.get(&k1)?;
358 let k2_terminators = terminators.get(&k2)?;
359
360 // Find shared terminators (intersection)
361 // Both lists are sorted, so we can use two-pointer technique
362 let mut shared_splitters = Vec::new();
363 let mut i = 0;
364 let mut j = 0;
365
366 while i < k1_terminators.len() && j < k2_terminators.len() {
367 if k1_terminators[i] == k2_terminators[j] {
368 shared_splitters.push(k1_terminators[i]);
369 i += 1;
370 j += 1;
371 } else if k1_terminators[i] < k2_terminators[j] {
372 i += 1;
373 } else {
374 j += 1;
375 }
376 }
377
378 if shared_splitters.is_empty() {
379 return None;
380 }
381
382 drop(terminators);
383
384 // Try each shared splitter
385 let map_segments = ctx.map_segments.lock().unwrap();
386
387 for &k_middle in &shared_splitters {
388 // Create keys (k1, k_middle) and (k_middle, k2) in canonical order
389 let pk_left = if k1 <= k_middle {
390 (k1, k_middle)
391 } else {
392 (k_middle, k1)
393 };
394
395 let pk_right = if k_middle <= k2 {
396 (k_middle, k2)
397 } else {
398 (k2, k_middle)
399 };
400
401 // Check if both groups exist
402 if map_segments.contains_key(&pk_left) && map_segments.contains_key(&pk_right) {
403 drop(map_segments);
404
405 // TODO: Find actual position of k_middle in segment using find_middle_splitter
406 // This requires:
407 // - Decompressing both segments from v_segments[pk_left] and v_segments[pk_right]
408 // - Finding the position where k_middle occurs
409 // - Computing optimal split position via cost analysis
410 //
411 // For now, return a simplified result with approximate position
412 // This is a placeholder - real implementation needed for correctness
413
414 // Placeholder: assume k_middle is in the middle
415 // TODO: Implement find_middle_splitter properly
416 let left_size = ctx.kmer_length; // Placeholder
417 let right_size = ctx.kmer_length; // Placeholder
418
419 return Some((k_middle, left_size, right_size));
420 }
421 }
422
423 None
424}
425
426#[cfg(test)]
427mod tests {
428 use super::*;
429 use std::collections::HashMap;
430
431 #[test]
432 fn test_missing_kmer_constant() {
433 // MISSING_KMER must be u64::MAX to match C++ AGC's kmer_t(-1)
434 assert_eq!(MISSING_KMER, u64::MAX);
435 }
436
437 #[test]
438 fn test_segment_part_creation() {
439 let part = SegmentPart {
440 kmer1: 12345,
441 kmer2: 67890,
442 sample_name: "sample1".to_string(),
443 contig_name: "chr1".to_string(),
444 seg_data: vec![0, 1, 2, 3],
445 is_rev_comp: false,
446 seg_part_no: 0,
447 };
448
449 assert_eq!(part.kmer1, 12345);
450 assert_eq!(part.kmer2, 67890);
451 assert_eq!(part.sample_name, "sample1");
452 assert_eq!(part.seg_part_no, 0);
453 }
454
455 #[test]
456 fn test_compress_contig_no_splitters() {
457 let ctx = CompressionContext {
458 splitters: Arc::new(Mutex::new(AHashSet::new())),
459 bloom_splitters: Arc::new(Mutex::new(crate::bloom_filter::BloomFilter::new(1024))),
460 buffered_segments: Arc::new(Mutex::new(BufferedSegments::new(0))),
461 kmer_length: 3, // Use short k-mer to work with small test contig
462 adaptive_mode: false,
463 map_segments: Arc::new(Mutex::new(AHashMap::new())),
464 map_segments_terminators: Arc::new(Mutex::new(AHashMap::new())),
465 concatenated_genomes: false,
466 };
467
468 // Simple contig with no splitters (must be longer than kmer_length)
469 let contig = vec![0, 1, 2, 3, 0, 1, 2, 3]; // ACGTACGT
470
471 let result = compress_contig("sample1", "chr1", &contig, &ctx);
472
473 // Should succeed (one whole-contig segment)
474 assert!(result);
475
476 // Should have 1 NEW segment
477 let buffered = ctx.buffered_segments.lock().unwrap();
478 assert_eq!(buffered.get_num_new(), 1);
479 }
480
481 #[test]
482 fn test_compress_contig_adaptive_failure() {
483 let ctx = CompressionContext {
484 splitters: Arc::new(Mutex::new(AHashSet::new())),
485 bloom_splitters: Arc::new(Mutex::new(crate::bloom_filter::BloomFilter::new(1024))),
486 buffered_segments: Arc::new(Mutex::new(BufferedSegments::new(0))),
487 kmer_length: 3, // Use short k-mer to work with small test contig
488 adaptive_mode: true, // Adaptive mode enabled
489 map_segments: Arc::new(Mutex::new(AHashMap::new())),
490 map_segments_terminators: Arc::new(Mutex::new(AHashMap::new())),
491 concatenated_genomes: false,
492 };
493
494 // Short contig with no splitters
495 let contig = vec![0, 1, 2, 3, 0, 1, 2, 3];
496
497 let result = compress_contig("sample1", "chr1", &contig, &ctx);
498
499 // Should FAIL in adaptive mode (no splitters found)
500 assert!(!result);
501 }
502}