ragc_core/
contig_compression.rs

1//! Contig compression with inline segmentation
2//!
3//! This module implements C++ AGC's compress_contig and add_segment logic
4//! for inline segmentation during compression.
5//!
6//! See docs/INLINE_SEGMENTATION_PATTERN.md for detailed C++ AGC analysis.
7
8use crate::kmer::{Kmer, KmerMode};
9use crate::segment_buffer::BufferedSegments;
10use std::sync::{Arc, Mutex};
11
12/// Contig type (vector of nucleotides in numeric encoding)
13pub type Contig = Vec<u8>;
14
15/// Missing k-mer marker (both k1 and k2 = ~0 means no terminal splitters)
16pub const MISSING_KMER: u64 = !0u64;
17
18/// Segment part for inline buffering
19#[derive(Debug, Clone)]
20pub struct SegmentPart {
21    pub kmer1: u64,
22    pub kmer2: u64,
23    pub sample_name: String,
24    pub contig_name: String,
25    pub seg_data: Vec<u8>,
26    pub is_rev_comp: bool,
27    pub seg_part_no: u32,
28}
29
30/// Shared state for compression (passed from worker threads)
31pub struct CompressionContext {
32    pub splitters: Arc<Mutex<std::collections::HashSet<u64>>>,
33    pub bloom_splitters: Arc<Mutex<crate::bloom_filter::BloomFilter>>,
34    pub buffered_segments: Arc<Mutex<BufferedSegments>>,
35    pub kmer_length: usize,
36    pub adaptive_mode: bool,
37    pub map_segments: Arc<Mutex<std::collections::HashMap<(u64, u64), u32>>>,
38    pub map_segments_terminators: Arc<Mutex<std::collections::HashMap<u64, Vec<u64>>>>,
39    pub concatenated_genomes: bool,
40}
41
42/// Compress a contig with inline segmentation
43///
44/// Matches C++ AGC's compress_contig (agc_compressor.cpp:2000-2054)
45///
46/// # Arguments
47/// * `sample_name` - Sample identifier
48/// * `contig_name` - Contig identifier
49/// * `contig` - Nucleotide sequence (numeric encoding: A=0, C=1, G=2, T=3)
50/// * `ctx` - Compression context with splitters and buffering
51///
52/// # Returns
53/// * `true` - Contig segmented successfully
54/// * `false` - Contig failed to segment (adaptive mode: needs new splitters)
55///
56/// # Algorithm
57/// 1. Scan contig with k-mer sliding window
58/// 2. Check each k-mer against splitters (bloom filter + hash set)
59/// 3. When splitter found: extract segment, call add_segment()
60/// 4. If no splitters found and adaptive mode: return false
61/// 5. Handle final segment (after last splitter to end)
62pub fn compress_contig(
63    sample_name: &str,
64    contig_name: &str,
65    contig: &Contig,
66    ctx: &CompressionContext,
67) -> bool {
68    let kmer_length = ctx.kmer_length as u32;
69
70    // Create k-mer scanner for canonical k-mers
71    let mut kmer = Kmer::new(kmer_length, KmerMode::Canonical);
72
73    // Track segment boundaries
74    let mut split_pos: usize = 0; // Start of current segment
75    let mut split_kmer = Kmer::new(kmer_length, KmerMode::Canonical); // K-mer at split position
76    let mut seg_part_no: u32 = 0;
77
78    // Scan contig for splitters
79    for (pos, &base) in contig.iter().enumerate() {
80        kmer.insert(base as u64);
81
82        if !kmer.is_full() {
83            continue;
84        }
85
86        // Get canonical k-mer value
87        let kmer_val = kmer.data_canonical();
88
89        // Check if this k-mer is a splitter
90        // C++ AGC: bloom_splitters.check(d) && hs_splitters.check(d)
91        if is_splitter(kmer_val, ctx) {
92            // Found splitter - extract segment from split_pos to current position
93            let seg_start = split_pos;
94            let seg_end = pos + 1; // Inclusive of current position
95            let segment = contig[seg_start..seg_end].to_vec();
96
97            // Add segment with terminal k-mers (split_kmer, current kmer)
98            add_segment(
99                sample_name,
100                contig_name,
101                seg_part_no,
102                segment,
103                &split_kmer,
104                &kmer,
105                ctx,
106            );
107
108            // Update for next segment
109            seg_part_no += 1;
110            split_pos = pos + 1 - kmer_length as usize; // Start of next segment overlaps by kmer_length
111            split_kmer = kmer.clone();
112        }
113    }
114
115    // Check if contig failed to segment (adaptive mode)
116    if ctx.adaptive_mode && seg_part_no == 0 {
117        // No splitters found - this is a "hard" contig
118        // split_kmer is still empty (never set), which C++ AGC checks as:
119        // if (adaptive_compression && split_kmer == CKmer(...))
120        return false; // Signal failure - needs new splitters
121    }
122
123    // Add final segment (from last splitter to end of contig)
124    if split_pos < contig.len() {
125        let segment = contig[split_pos..].to_vec();
126
127        add_segment(
128            sample_name,
129            contig_name,
130            seg_part_no,
131            segment,
132            &split_kmer,
133            &Kmer::new(kmer_length, KmerMode::Canonical), // Empty k-mer for end
134            ctx,
135        );
136    }
137
138    true // Success
139}
140
141/// Add segment to buffered storage with terminal k-mer detection
142///
143/// Matches C++ AGC's add_segment (agc_compressor.cpp:1275-1507)
144///
145/// # Terminal K-mer Cases
146/// 1. **No terminators** (k1 = k2 = ~0): Whole-contig segment
147/// 2. **Both terminators** (k1, k2 both valid): Normal segment
148/// 3. **Front-only** (k1 valid, k2 = ~0): First segment
149/// 4. **Back-only** (k1 = ~0, k2 valid): Last segment
150///
151/// # Split Detection
152/// If segment is NEW and both k1 and k2 are in map_segments_terminators:
153/// - Try to find shared middle splitter k_mid
154/// - Split segment into two overlapping parts
155///
156/// See INLINE_SEGMENTATION_PATTERN.md Section 2 for detailed algorithm.
157fn add_segment(
158    sample_name: &str,
159    contig_name: &str,
160    seg_part_no: u32,
161    mut segment: Vec<u8>,
162    kmer_front: &Kmer,
163    kmer_back: &Kmer,
164    ctx: &CompressionContext,
165) {
166    // Determine terminal k-mers (k1, k2) based on which are full
167    let k1 = if kmer_front.is_full() {
168        kmer_front.data_canonical()
169    } else {
170        MISSING_KMER
171    };
172
173    let k2 = if kmer_back.is_full() {
174        kmer_back.data_canonical()
175    } else {
176        MISSING_KMER
177    };
178
179    // Normalize key to canonical order (C++ AGC uses minmax)
180    let pk = if k1 <= k2 { (k1, k2) } else { (k2, k1) };
181
182    // Check if this segment key is already registered
183    let map_segments = ctx.map_segments.lock().unwrap();
184    let group_id = map_segments.get(&pk).copied();
185    drop(map_segments);
186
187    // Try split detection if NEW segment
188    let mut segment2: Option<Vec<u8>> = None;
189    let mut pk2: Option<(u64, u64)> = None;
190
191    if group_id.is_none() && !ctx.concatenated_genomes {
192        // NEW segment - check for split eligibility
193        if k1 != MISSING_KMER && k2 != MISSING_KMER {
194            // Both terminators exist - check if they're in terminators map
195            let terminators = ctx.map_segments_terminators.lock().unwrap();
196            let k1_in_map = terminators.contains_key(&k1);
197            let k2_in_map = terminators.contains_key(&k2);
198            drop(terminators);
199
200            if k1_in_map && k2_in_map {
201                // Try to find shared middle splitter
202                if let Some((k_middle, left_size, right_size)) =
203                    find_cand_segment_with_missing_middle_splitter(kmer_front, kmer_back, ctx)
204                {
205                    // Determine split outcome
206                    if left_size == 0 || right_size == 0 {
207                        // No actual split - entire segment goes to one group
208                        // (This shouldn't happen but C++ AGC handles it)
209                    } else {
210                        // Split into 2 segments with overlap
211                        let kmer_length = ctx.kmer_length;
212                        let seg2_start_pos = left_size - kmer_length / 2;
213
214                        // segment2: [seg2_start_pos, end)
215                        segment2 = Some(segment[seg2_start_pos..].to_vec());
216
217                        // segment: [0, seg2_start_pos + kmer_length)
218                        segment.resize(seg2_start_pos + kmer_length, 0);
219
220                        // Segment 1 key: (k1, k_middle)
221                        pk2 = Some(if k_middle <= k2 {
222                            (k_middle, k2)
223                        } else {
224                            (k2, k_middle)
225                        });
226
227                        // Note: pk remains (k1, k_middle) or (k_middle, k1) depending on order
228                    }
229                }
230            }
231        }
232    }
233
234    // TODO: ZSTD compress segments
235    // For now, just store uncompressed
236    let compressed_seg = segment.clone();
237
238    // Add first segment (or whole segment if no split)
239    if group_id.is_some() {
240        // KNOWN segment
241        let buffered = ctx.buffered_segments.lock().unwrap();
242        buffered.add_known(
243            group_id.unwrap(),
244            MISSING_KMER,
245            MISSING_KMER,
246            sample_name.to_string(),
247            contig_name.to_string(),
248            compressed_seg,
249            false, // TODO: Determine is_rev_comp from k-mer comparison
250            seg_part_no,
251        );
252    } else {
253        // NEW segment
254        let buffered = ctx.buffered_segments.lock().unwrap();
255        buffered.add_new(
256            pk.0,
257            pk.1,
258            sample_name.to_string(),
259            contig_name.to_string(),
260            compressed_seg,
261            false, // TODO: Determine is_rev_comp
262            seg_part_no,
263        );
264    }
265
266    // Add second segment if split occurred
267    if let (Some(seg2), Some(pk2_key)) = (segment2, pk2) {
268        // TODO: ZSTD compress segment2
269        let compressed_seg2 = seg2.clone();
270
271        // Check if second segment key is known
272        let map_segments = ctx.map_segments.lock().unwrap();
273        let group_id2 = map_segments.get(&pk2_key).copied();
274        drop(map_segments);
275
276        let buffered = ctx.buffered_segments.lock().unwrap();
277        if let Some(gid2) = group_id2 {
278            // KNOWN segment
279            buffered.add_known(
280                gid2,
281                MISSING_KMER,
282                MISSING_KMER,
283                sample_name.to_string(),
284                contig_name.to_string(),
285                compressed_seg2,
286                false, // TODO: is_rev_comp
287                seg_part_no + 1,
288            );
289        } else {
290            // NEW segment
291            buffered.add_new(
292                pk2_key.0,
293                pk2_key.1,
294                sample_name.to_string(),
295                contig_name.to_string(),
296                compressed_seg2,
297                false, // TODO: is_rev_comp
298                seg_part_no + 1,
299            );
300        }
301    }
302}
303
304/// Check if k-mer is a splitter
305///
306/// Matches C++ AGC's two-stage check:
307/// 1. Bloom filter (fast, probabilistic)
308/// 2. Hash set (exact)
309fn is_splitter(kmer: u64, ctx: &CompressionContext) -> bool {
310    // Fast check: bloom filter (may have false positives)
311    let bloom = ctx.bloom_splitters.lock().unwrap();
312    if !bloom.check(kmer) {
313        return false; // Definitely not a splitter
314    }
315    drop(bloom); // Release lock before next check
316
317    // Exact check: hash set (no false positives)
318    let splitters = ctx.splitters.lock().unwrap();
319    splitters.contains(&kmer)
320}
321
322/// Find candidate segment with missing middle splitter (for split detection)
323///
324/// Matches C++ AGC's find_cand_segment_with_missing_middle_splitter
325/// (agc_compressor.cpp:1510-1597)
326///
327/// # Algorithm
328/// 1. Find intersection of terminators for k1 and k2
329/// 2. For each shared k_mid in intersection:
330///    - Check if (k1, k_mid) and (k_mid, k2) both exist in map_segments
331///    - Find k_mid position in segment (via find_middle_splitter)
332/// 3. Return first valid split: (middle_kmer, left_size, right_size)
333///
334/// # Returns
335/// - Some((k_middle, left_size, right_size)) if split found
336/// - None if no valid split
337fn find_cand_segment_with_missing_middle_splitter(
338    kmer_front: &Kmer,
339    kmer_back: &Kmer,
340    ctx: &CompressionContext,
341) -> Option<(u64, usize, usize)> {
342    let k1 = kmer_front.data_canonical();
343    let k2 = kmer_back.data_canonical();
344
345    // Get terminator lists for k1 and k2
346    let terminators = ctx.map_segments_terminators.lock().unwrap();
347    let k1_terminators = terminators.get(&k1)?;
348    let k2_terminators = terminators.get(&k2)?;
349
350    // Find shared terminators (intersection)
351    // Both lists are sorted, so we can use two-pointer technique
352    let mut shared_splitters = Vec::new();
353    let mut i = 0;
354    let mut j = 0;
355
356    while i < k1_terminators.len() && j < k2_terminators.len() {
357        if k1_terminators[i] == k2_terminators[j] {
358            shared_splitters.push(k1_terminators[i]);
359            i += 1;
360            j += 1;
361        } else if k1_terminators[i] < k2_terminators[j] {
362            i += 1;
363        } else {
364            j += 1;
365        }
366    }
367
368    if shared_splitters.is_empty() {
369        return None;
370    }
371
372    drop(terminators);
373
374    // Try each shared splitter
375    let map_segments = ctx.map_segments.lock().unwrap();
376
377    for &k_middle in &shared_splitters {
378        // Create keys (k1, k_middle) and (k_middle, k2) in canonical order
379        let pk_left = if k1 <= k_middle {
380            (k1, k_middle)
381        } else {
382            (k_middle, k1)
383        };
384
385        let pk_right = if k_middle <= k2 {
386            (k_middle, k2)
387        } else {
388            (k2, k_middle)
389        };
390
391        // Check if both groups exist
392        if map_segments.contains_key(&pk_left) && map_segments.contains_key(&pk_right) {
393            drop(map_segments);
394
395            // TODO: Find actual position of k_middle in segment using find_middle_splitter
396            // This requires:
397            // - Decompressing both segments from v_segments[pk_left] and v_segments[pk_right]
398            // - Finding the position where k_middle occurs
399            // - Computing optimal split position via cost analysis
400            //
401            // For now, return a simplified result with approximate position
402            // This is a placeholder - real implementation needed for correctness
403
404            // Placeholder: assume k_middle is in the middle
405            // TODO: Implement find_middle_splitter properly
406            let left_size = ctx.kmer_length; // Placeholder
407            let right_size = ctx.kmer_length; // Placeholder
408
409            return Some((k_middle, left_size, right_size));
410        }
411    }
412
413    None
414}
415
416#[cfg(test)]
417mod tests {
418    use super::*;
419    use std::collections::HashMap;
420
421    #[test]
422    fn test_missing_kmer_constant() {
423        assert_eq!(MISSING_KMER, !0u64);
424    }
425
426    #[test]
427    fn test_segment_part_creation() {
428        let part = SegmentPart {
429            kmer1: 12345,
430            kmer2: 67890,
431            sample_name: "sample1".to_string(),
432            contig_name: "chr1".to_string(),
433            seg_data: vec![0, 1, 2, 3],
434            is_rev_comp: false,
435            seg_part_no: 0,
436        };
437
438        assert_eq!(part.kmer1, 12345);
439        assert_eq!(part.kmer2, 67890);
440        assert_eq!(part.sample_name, "sample1");
441        assert_eq!(part.seg_part_no, 0);
442    }
443
444    #[test]
445    fn test_compress_contig_no_splitters() {
446        use std::collections::HashSet;
447
448        let ctx = CompressionContext {
449            splitters: Arc::new(Mutex::new(HashSet::new())),
450            bloom_splitters: Arc::new(Mutex::new(crate::bloom_filter::BloomFilter::new(1024))),
451            buffered_segments: Arc::new(Mutex::new(BufferedSegments::new(0))),
452            kmer_length: 21,
453            adaptive_mode: false,
454            map_segments: Arc::new(Mutex::new(HashMap::new())),
455            map_segments_terminators: Arc::new(Mutex::new(HashMap::new())),
456            concatenated_genomes: false,
457        };
458
459        // Simple contig with no splitters
460        let contig = vec![0, 1, 2, 3, 0, 1, 2, 3]; // ACGTACGT
461
462        let result = compress_contig("sample1", "chr1", &contig, &ctx);
463
464        // Should succeed (one whole-contig segment)
465        assert!(result);
466
467        // Should have 1 NEW segment
468        let buffered = ctx.buffered_segments.lock().unwrap();
469        assert_eq!(buffered.get_num_new(), 1);
470    }
471
472    #[test]
473    fn test_compress_contig_adaptive_failure() {
474        use std::collections::HashSet;
475
476        let ctx = CompressionContext {
477            splitters: Arc::new(Mutex::new(HashSet::new())),
478            bloom_splitters: Arc::new(Mutex::new(crate::bloom_filter::BloomFilter::new(1024))),
479            buffered_segments: Arc::new(Mutex::new(BufferedSegments::new(0))),
480            kmer_length: 21,
481            adaptive_mode: true, // Adaptive mode enabled
482            map_segments: Arc::new(Mutex::new(HashMap::new())),
483            map_segments_terminators: Arc::new(Mutex::new(HashMap::new())),
484            concatenated_genomes: false,
485        };
486
487        // Short contig with no splitters
488        let contig = vec![0, 1, 2, 3, 0, 1, 2, 3];
489
490        let result = compress_contig("sample1", "chr1", &contig, &ctx);
491
492        // Should FAIL in adaptive mode (no splitters found)
493        assert!(!result);
494    }
495}