ragc_core/
contig_compression.rs

1//! Contig compression with inline segmentation
2//!
3//! This module implements C++ AGC's compress_contig and add_segment logic
4//! for inline segmentation during compression.
5//!
6//! See docs/INLINE_SEGMENTATION_PATTERN.md for detailed C++ AGC analysis.
7
8use crate::kmer::{Kmer, KmerMode};
9use crate::segment_buffer::BufferedSegments;
10use ahash::{AHashMap, AHashSet};
11use std::sync::{Arc, Mutex};
12
13/// Contig type (vector of nucleotides in numeric encoding)
14pub type Contig = Vec<u8>;
15
16/// Missing k-mer marker (matches C++ AGC's kmer_t(-1) = u64::MAX)
17pub const MISSING_KMER: u64 = u64::MAX;
18
19/// Segment part for inline buffering
20#[derive(Debug, Clone)]
21pub struct SegmentPart {
22    pub kmer1: u64,
23    pub kmer2: u64,
24    pub sample_name: String,
25    pub contig_name: String,
26    pub seg_data: Vec<u8>,
27    pub is_rev_comp: bool,
28    pub seg_part_no: u32,
29}
30
31/// Shared state for compression (passed from worker threads)
32pub struct CompressionContext {
33    pub splitters: Arc<Mutex<AHashSet<u64>>>,
34    pub bloom_splitters: Arc<Mutex<crate::bloom_filter::BloomFilter>>,
35    pub buffered_segments: Arc<Mutex<BufferedSegments>>,
36    pub kmer_length: usize,
37    pub adaptive_mode: bool,
38    pub map_segments: Arc<Mutex<AHashMap<(u64, u64), u32>>>,
39    pub map_segments_terminators: Arc<Mutex<AHashMap<u64, Vec<u64>>>>,
40    pub concatenated_genomes: bool,
41}
42
43/// Compress a contig with inline segmentation
44///
45/// Matches C++ AGC's compress_contig (agc_compressor.cpp:2000-2054)
46///
47/// # Arguments
48/// * `sample_name` - Sample identifier
49/// * `contig_name` - Contig identifier
50/// * `contig` - Nucleotide sequence (numeric encoding: A=0, C=1, G=2, T=3)
51/// * `ctx` - Compression context with splitters and buffering
52///
53/// # Returns
54/// * `true` - Contig segmented successfully
55/// * `false` - Contig failed to segment (adaptive mode: needs new splitters)
56///
57/// # Algorithm
58/// 1. Scan contig with k-mer sliding window
59/// 2. Check each k-mer against splitters (bloom filter + hash set)
60/// 3. When splitter found: extract segment, call add_segment()
61/// 4. If no splitters found and adaptive mode: return false
62/// 5. Handle final segment (after last splitter to end)
63pub fn compress_contig(
64    sample_name: &str,
65    contig_name: &str,
66    contig: &Contig,
67    ctx: &CompressionContext,
68) -> bool {
69    let kmer_length = ctx.kmer_length as u32;
70
71    // Create k-mer scanner for canonical k-mers
72    let mut kmer = Kmer::new(kmer_length, KmerMode::Canonical);
73
74    // Track segment boundaries
75    let mut split_pos: usize = 0; // Start of current segment
76    let mut split_kmer = Kmer::new(kmer_length, KmerMode::Canonical); // K-mer at split position
77    let mut seg_part_no: u32 = 0;
78
79    // Scan contig for splitters
80    for (pos, &base) in contig.iter().enumerate() {
81        kmer.insert(base as u64);
82
83        if !kmer.is_full() {
84            continue;
85        }
86
87        // Get canonical k-mer value
88        let kmer_val = kmer.data_canonical();
89
90        // Check if this k-mer is a splitter
91        // C++ AGC: bloom_splitters.check(d) && hs_splitters.check(d)
92        if is_splitter(kmer_val, ctx) {
93            // Calculate where next segment would start (with k-base overlap)
94            let new_split_pos = pos + 1 - kmer_length as usize;
95
96            // Only create a segment if it's longer than just the overlapping k-mer
97            // If new_split_pos <= split_pos, the segment would be at most k bases long
98            if new_split_pos > split_pos {
99                // Found splitter - extract segment from split_pos to current position
100                let seg_start = split_pos;
101                let seg_end = pos + 1; // Inclusive of current position
102                let segment = contig[seg_start..seg_end].to_vec();
103
104                // Add segment with terminal k-mers (split_kmer, current kmer)
105                add_segment(
106                    sample_name,
107                    contig_name,
108                    seg_part_no,
109                    segment,
110                    &split_kmer,
111                    &kmer,
112                    ctx,
113                );
114
115                seg_part_no += 1;
116            }
117
118            // Always update split position and k-mer for next segment
119            split_pos = new_split_pos;
120            split_kmer = kmer.clone();
121        }
122    }
123
124    // Check if contig failed to segment (adaptive mode)
125    if ctx.adaptive_mode && seg_part_no == 0 {
126        // No splitters found - this is a "hard" contig
127        // split_kmer is still empty (never set), which C++ AGC checks as:
128        // if (adaptive_compression && split_kmer == CKmer(...))
129        return false; // Signal failure - needs new splitters
130    }
131
132    // Add final segment (from last splitter to end of contig)
133    // Skip if the final segment would be k bases or shorter (just the overlap)
134    if split_pos < contig.len() && contig.len() - split_pos > ctx.kmer_length {
135        let segment = contig[split_pos..].to_vec();
136
137        add_segment(
138            sample_name,
139            contig_name,
140            seg_part_no,
141            segment,
142            &split_kmer,
143            &Kmer::new(kmer_length, KmerMode::Canonical), // Empty k-mer for end
144            ctx,
145        );
146    }
147
148    true // Success
149}
150
151/// Add segment to buffered storage with terminal k-mer detection
152///
153/// Matches C++ AGC's add_segment (agc_compressor.cpp:1275-1507)
154///
155/// # Terminal K-mer Cases
156/// 1. **No terminators** (k1 = k2 = ~0): Whole-contig segment
157/// 2. **Both terminators** (k1, k2 both valid): Normal segment
158/// 3. **Front-only** (k1 valid, k2 = ~0): First segment
159/// 4. **Back-only** (k1 = ~0, k2 valid): Last segment
160///
161/// # Split Detection
162/// If segment is NEW and both k1 and k2 are in map_segments_terminators:
163/// - Try to find shared middle splitter k_mid
164/// - Split segment into two overlapping parts
165///
166/// See INLINE_SEGMENTATION_PATTERN.md Section 2 for detailed algorithm.
167fn add_segment(
168    sample_name: &str,
169    contig_name: &str,
170    seg_part_no: u32,
171    mut segment: Vec<u8>,
172    kmer_front: &Kmer,
173    kmer_back: &Kmer,
174    ctx: &CompressionContext,
175) {
176    // Determine terminal k-mers (k1, k2) based on which are full
177    let k1 = if kmer_front.is_full() {
178        kmer_front.data_canonical()
179    } else {
180        MISSING_KMER
181    };
182
183    let k2 = if kmer_back.is_full() {
184        kmer_back.data_canonical()
185    } else {
186        MISSING_KMER
187    };
188
189    // Normalize key to canonical order (C++ AGC uses minmax)
190    let pk = if k1 <= k2 { (k1, k2) } else { (k2, k1) };
191
192    // Check if this segment key is already registered
193    let map_segments = ctx.map_segments.lock().unwrap();
194    let group_id = map_segments.get(&pk).copied();
195    drop(map_segments);
196
197    // Try split detection if NEW segment
198    let mut segment2: Option<Vec<u8>> = None;
199    let mut pk2: Option<(u64, u64)> = None;
200
201    if group_id.is_none() && !ctx.concatenated_genomes {
202        // NEW segment - check for split eligibility
203        if k1 != MISSING_KMER && k2 != MISSING_KMER {
204            // Both terminators exist - check if they're in terminators map
205            let terminators = ctx.map_segments_terminators.lock().unwrap();
206            let k1_in_map = terminators.contains_key(&k1);
207            let k2_in_map = terminators.contains_key(&k2);
208            drop(terminators);
209
210            if k1_in_map && k2_in_map {
211                // Try to find shared middle splitter
212                if let Some((k_middle, left_size, right_size)) =
213                    find_cand_segment_with_missing_middle_splitter(kmer_front, kmer_back, ctx)
214                {
215                    // Determine split outcome
216                    if left_size == 0 || right_size == 0 {
217                        // No actual split - entire segment goes to one group
218                        // (This shouldn't happen but C++ AGC handles it)
219                    } else {
220                        // Split into 2 segments with overlap
221                        let kmer_length = ctx.kmer_length;
222                        let seg2_start_pos = left_size - kmer_length / 2;
223
224                        // segment2: [seg2_start_pos, end)
225                        segment2 = Some(segment[seg2_start_pos..].to_vec());
226
227                        // segment: [0, seg2_start_pos + kmer_length)
228                        segment.resize(seg2_start_pos + kmer_length, 0);
229
230                        // Segment 1 key: (k1, k_middle)
231                        pk2 = Some(if k_middle <= k2 {
232                            (k_middle, k2)
233                        } else {
234                            (k2, k_middle)
235                        });
236
237                        // Note: pk remains (k1, k_middle) or (k_middle, k1) depending on order
238                    }
239                }
240            }
241        }
242    }
243
244    // TODO: ZSTD compress segments
245    // For now, just store uncompressed
246    let compressed_seg = segment.clone();
247
248    // Add first segment (or whole segment if no split)
249    if group_id.is_some() {
250        // KNOWN segment
251        let buffered = ctx.buffered_segments.lock().unwrap();
252        buffered.add_known(
253            group_id.unwrap(),
254            MISSING_KMER,
255            MISSING_KMER,
256            sample_name.to_string(),
257            contig_name.to_string(),
258            compressed_seg,
259            false, // TODO: Determine is_rev_comp from k-mer comparison
260            seg_part_no,
261        );
262    } else {
263        // NEW segment
264        let buffered = ctx.buffered_segments.lock().unwrap();
265        buffered.add_new(
266            pk.0,
267            pk.1,
268            sample_name.to_string(),
269            contig_name.to_string(),
270            compressed_seg,
271            false, // TODO: Determine is_rev_comp
272            seg_part_no,
273        );
274    }
275
276    // Add second segment if split occurred
277    if let (Some(seg2), Some(pk2_key)) = (segment2, pk2) {
278        // TODO: ZSTD compress segment2
279        let compressed_seg2 = seg2.clone();
280
281        // Check if second segment key is known
282        let map_segments = ctx.map_segments.lock().unwrap();
283        let group_id2 = map_segments.get(&pk2_key).copied();
284        drop(map_segments);
285
286        let buffered = ctx.buffered_segments.lock().unwrap();
287        if let Some(gid2) = group_id2 {
288            // KNOWN segment
289            buffered.add_known(
290                gid2,
291                MISSING_KMER,
292                MISSING_KMER,
293                sample_name.to_string(),
294                contig_name.to_string(),
295                compressed_seg2,
296                false, // TODO: is_rev_comp
297                seg_part_no + 1,
298            );
299        } else {
300            // NEW segment
301            buffered.add_new(
302                pk2_key.0,
303                pk2_key.1,
304                sample_name.to_string(),
305                contig_name.to_string(),
306                compressed_seg2,
307                false, // TODO: is_rev_comp
308                seg_part_no + 1,
309            );
310        }
311    }
312}
313
314/// Check if k-mer is a splitter
315///
316/// Matches C++ AGC's two-stage check:
317/// 1. Bloom filter (fast, probabilistic)
318/// 2. Hash set (exact)
319fn is_splitter(kmer: u64, ctx: &CompressionContext) -> bool {
320    // Fast check: bloom filter (may have false positives)
321    let bloom = ctx.bloom_splitters.lock().unwrap();
322    if !bloom.check(kmer) {
323        return false; // Definitely not a splitter
324    }
325    drop(bloom); // Release lock before next check
326
327    // Exact check: hash set (no false positives)
328    let splitters = ctx.splitters.lock().unwrap();
329    splitters.contains(&kmer)
330}
331
332/// Find candidate segment with missing middle splitter (for split detection)
333///
334/// Matches C++ AGC's find_cand_segment_with_missing_middle_splitter
335/// (agc_compressor.cpp:1510-1597)
336///
337/// # Algorithm
338/// 1. Find intersection of terminators for k1 and k2
339/// 2. For each shared k_mid in intersection:
340///    - Check if (k1, k_mid) and (k_mid, k2) both exist in map_segments
341///    - Find k_mid position in segment (via find_middle_splitter)
342/// 3. Return first valid split: (middle_kmer, left_size, right_size)
343///
344/// # Returns
345/// - Some((k_middle, left_size, right_size)) if split found
346/// - None if no valid split
347fn find_cand_segment_with_missing_middle_splitter(
348    kmer_front: &Kmer,
349    kmer_back: &Kmer,
350    ctx: &CompressionContext,
351) -> Option<(u64, usize, usize)> {
352    let k1 = kmer_front.data_canonical();
353    let k2 = kmer_back.data_canonical();
354
355    // Get terminator lists for k1 and k2
356    let terminators = ctx.map_segments_terminators.lock().unwrap();
357    let k1_terminators = terminators.get(&k1)?;
358    let k2_terminators = terminators.get(&k2)?;
359
360    // Find shared terminators (intersection)
361    // Both lists are sorted, so we can use two-pointer technique
362    let mut shared_splitters = Vec::new();
363    let mut i = 0;
364    let mut j = 0;
365
366    while i < k1_terminators.len() && j < k2_terminators.len() {
367        if k1_terminators[i] == k2_terminators[j] {
368            shared_splitters.push(k1_terminators[i]);
369            i += 1;
370            j += 1;
371        } else if k1_terminators[i] < k2_terminators[j] {
372            i += 1;
373        } else {
374            j += 1;
375        }
376    }
377
378    if shared_splitters.is_empty() {
379        return None;
380    }
381
382    drop(terminators);
383
384    // Try each shared splitter
385    let map_segments = ctx.map_segments.lock().unwrap();
386
387    for &k_middle in &shared_splitters {
388        // Create keys (k1, k_middle) and (k_middle, k2) in canonical order
389        let pk_left = if k1 <= k_middle {
390            (k1, k_middle)
391        } else {
392            (k_middle, k1)
393        };
394
395        let pk_right = if k_middle <= k2 {
396            (k_middle, k2)
397        } else {
398            (k2, k_middle)
399        };
400
401        // Check if both groups exist
402        if map_segments.contains_key(&pk_left) && map_segments.contains_key(&pk_right) {
403            drop(map_segments);
404
405            // TODO: Find actual position of k_middle in segment using find_middle_splitter
406            // This requires:
407            // - Decompressing both segments from v_segments[pk_left] and v_segments[pk_right]
408            // - Finding the position where k_middle occurs
409            // - Computing optimal split position via cost analysis
410            //
411            // For now, return a simplified result with approximate position
412            // This is a placeholder - real implementation needed for correctness
413
414            // Placeholder: assume k_middle is in the middle
415            // TODO: Implement find_middle_splitter properly
416            let left_size = ctx.kmer_length; // Placeholder
417            let right_size = ctx.kmer_length; // Placeholder
418
419            return Some((k_middle, left_size, right_size));
420        }
421    }
422
423    None
424}
425
426#[cfg(test)]
427mod tests {
428    use super::*;
429    use std::collections::HashMap;
430
431    #[test]
432    fn test_missing_kmer_constant() {
433        // MISSING_KMER must be u64::MAX to match C++ AGC's kmer_t(-1)
434        assert_eq!(MISSING_KMER, u64::MAX);
435    }
436
437    #[test]
438    fn test_segment_part_creation() {
439        let part = SegmentPart {
440            kmer1: 12345,
441            kmer2: 67890,
442            sample_name: "sample1".to_string(),
443            contig_name: "chr1".to_string(),
444            seg_data: vec![0, 1, 2, 3],
445            is_rev_comp: false,
446            seg_part_no: 0,
447        };
448
449        assert_eq!(part.kmer1, 12345);
450        assert_eq!(part.kmer2, 67890);
451        assert_eq!(part.sample_name, "sample1");
452        assert_eq!(part.seg_part_no, 0);
453    }
454
455    #[test]
456    fn test_compress_contig_no_splitters() {
457        let ctx = CompressionContext {
458            splitters: Arc::new(Mutex::new(AHashSet::new())),
459            bloom_splitters: Arc::new(Mutex::new(crate::bloom_filter::BloomFilter::new(1024))),
460            buffered_segments: Arc::new(Mutex::new(BufferedSegments::new(0))),
461            kmer_length: 3, // Use short k-mer to work with small test contig
462            adaptive_mode: false,
463            map_segments: Arc::new(Mutex::new(AHashMap::new())),
464            map_segments_terminators: Arc::new(Mutex::new(AHashMap::new())),
465            concatenated_genomes: false,
466        };
467
468        // Simple contig with no splitters (must be longer than kmer_length)
469        let contig = vec![0, 1, 2, 3, 0, 1, 2, 3]; // ACGTACGT
470
471        let result = compress_contig("sample1", "chr1", &contig, &ctx);
472
473        // Should succeed (one whole-contig segment)
474        assert!(result);
475
476        // Should have 1 NEW segment
477        let buffered = ctx.buffered_segments.lock().unwrap();
478        assert_eq!(buffered.get_num_new(), 1);
479    }
480
481    #[test]
482    fn test_compress_contig_adaptive_failure() {
483        let ctx = CompressionContext {
484            splitters: Arc::new(Mutex::new(AHashSet::new())),
485            bloom_splitters: Arc::new(Mutex::new(crate::bloom_filter::BloomFilter::new(1024))),
486            buffered_segments: Arc::new(Mutex::new(BufferedSegments::new(0))),
487            kmer_length: 3,      // Use short k-mer to work with small test contig
488            adaptive_mode: true, // Adaptive mode enabled
489            map_segments: Arc::new(Mutex::new(AHashMap::new())),
490            map_segments_terminators: Arc::new(Mutex::new(AHashMap::new())),
491            concatenated_genomes: false,
492        };
493
494        // Short contig with no splitters
495        let contig = vec![0, 1, 2, 3, 0, 1, 2, 3];
496
497        let result = compress_contig("sample1", "chr1", &contig, &ctx);
498
499        // Should FAIL in adaptive mode (no splitters found)
500        assert!(!result);
501    }
502}