// ragc_core/agc_compressor.rs
1// Queue-based streaming compressor API
2// Provides simple push() interface with automatic backpressure and constant memory usage
3
4use crate::kmer_extract::{enumerate_kmers, remove_non_singletons};
5use crate::lz_diff::LZDiff;
6use crate::memory_bounded_queue::MemoryBoundedQueue;
7use crate::segment::{split_at_splitters_with_size, MISSING_KMER};
8use crate::splitters::{determine_splitters, find_new_splitters_for_contig};
9use ahash::AHashSet;
10use anyhow::{Context, Result};
11use ragc_common::{Archive, CollectionV3, Contig, CONTIG_SEPARATOR};
12use std::collections::{BTreeMap, HashMap, HashSet};
13use std::path::Path;
14use std::sync::atomic::{AtomicI32, AtomicU32, AtomicUsize, Ordering};
15use std::sync::{Arc, Mutex, RwLock};
16use std::thread::{self, JoinHandle};
17
/// MurmurHash64A-style mix of a single 64-bit key, matching the C++ AGC
/// `MurMur64Hash` used for fallback filtering.
///
/// The key block is mixed with the classic multiply/xor-shift/multiply
/// sequence, folded into a seed-derived state, then avalanched. All steps
/// use wrapping arithmetic, so the function is total and deterministic.
fn murmur_hash_64a(key: u64) -> u64 {
    const M: u64 = 0xc6a4a7935bd1e995;
    const R: u32 = 47;

    // Mix the single 8-byte block.
    let mut block = key.wrapping_mul(M);
    block ^= block >> R;
    block = block.wrapping_mul(M);

    // Fold the block into the seed-derived state (seed constant matches C++ AGC).
    let mut state = 0xc70f6907u64.wrapping_mul(M);
    state ^= block;
    state = state.wrapping_mul(M);

    // Final avalanche.
    state ^= state >> R;
    state = state.wrapping_mul(M);
    state ^ (state >> R)
}

/// Fallback k-mer filter matching C++ AGC's `kmer_filter_t`.
/// Deterministically selects roughly a given fraction of k-mers for
/// fallback grouping by comparing a mixed hash against a threshold.
#[derive(Debug, Clone)]
struct FallbackFilter {
    /// Hash threshold: 0 means the filter is disabled (nothing passes);
    /// values near `u64::MAX` let almost every k-mer through.
    threshold: u64,
    /// Fixed mixing constant (identical to C++ AGC's `rnd`).
    rnd: u64,
}

impl FallbackFilter {
    /// Build a filter that passes approximately `fraction` of all k-mers.
    /// A fraction of exactly 0.0 disables the filter (matches the C++ AGC
    /// constructor semantics).
    fn new(fraction: f64) -> Self {
        let rnd = 0xD73F8BF11046C40E; // same constant as C++ AGC
        if fraction == 0.0 {
            return Self { threshold: 0, rnd };
        }
        Self {
            threshold: (u64::MAX as f64 * fraction) as u64,
            rnd,
        }
    }

    /// True when a non-zero fraction was configured.
    fn is_enabled(&self) -> bool {
        self.threshold > 0
    }

    /// Deterministic membership test: mix the k-mer's hash with `rnd` and
    /// compare against the threshold (C++ AGC `kmer_filter_t::operator()`).
    fn passes(&self, kmer: u64) -> bool {
        let mixed = murmur_hash_64a(kmer) ^ self.rnd;
        mixed < self.threshold
    }
}
76
/// Configuration for the streaming queue-based compressor.
/// All defaults (see the `Default` impl) are chosen to match C++ AGC.
#[derive(Debug, Clone)]
pub struct StreamingQueueConfig {
    /// K-mer length for splitters
    pub k: usize,

    /// Segment size for splitting contigs (in bases)
    pub segment_size: usize,

    /// Minimum match length for LZ encoding
    pub min_match_len: usize,

    /// ZSTD compression level (1-22)
    pub compression_level: i32,

    /// Number of worker threads
    pub num_threads: usize,

    /// Queue capacity in bytes (default: 2 GB, like C++ AGC)
    pub queue_capacity: usize,

    /// Verbosity level (0 = silent; higher = more output)
    pub verbosity: usize,

    /// Adaptive mode: find new splitters for samples that can't be segmented well
    /// (matches C++ AGC -a flag)
    pub adaptive_mode: bool,

    /// Fallback fraction: fraction of minimizers to use for fallback grouping
    /// (matches C++ AGC --fallback-frac parameter, default 0.0 = disabled)
    pub fallback_frac: f64,

    /// Batch size: number of samples to accumulate before sorting and distributing
    /// (matches C++ AGC pack_cardinality parameter, default 50)
    /// Segments from batch_size samples are sorted by (sample, contig, seg_part_no)
    /// before distribution to groups, ensuring consistent pack boundaries with C++ AGC.
    pub batch_size: usize,

    /// Pack size: number of segments per pack (matches C++ AGC contigs_in_pack)
    /// When a group reaches this many segments, write a pack immediately
    /// (default: 50, matching PACK_CARDINALITY)
    pub pack_size: usize,

    /// Concatenated genomes mode: if true, send sync tokens every pack_size contigs
    /// If false (multiple input files), only send sync tokens at sample boundaries
    /// Matches C++ AGC's concatenated_genomes behavior
    pub concatenated_genomes: bool,
}
125
126impl Default for StreamingQueueConfig {
127 fn default() -> Self {
128 Self {
129 k: 31,
130 segment_size: 60_000,
131 min_match_len: 20,
132 compression_level: 17,
133 num_threads: rayon::current_num_threads().max(4),
134 queue_capacity: 2 * 1024 * 1024 * 1024, // 2 GB like C++ AGC
135 verbosity: 1,
136 adaptive_mode: false, // Default matches C++ AGC (adaptive mode off)
137 fallback_frac: 0.0, // Default matches C++ AGC (fallback disabled)
138 batch_size: 50, // Default matches C++ AGC pack_cardinality
139 pack_size: 50, // Default matches C++ AGC contigs_in_pack / PACK_CARDINALITY
140 concatenated_genomes: false, // Default: multiple input files (non-concatenated)
141 }
142 }
143}
144
/// Task to be processed by workers
/// Note: Contig is type alias for Vec<u8>, so we store the name separately
///
/// Priority ordering matches C++ AGC:
/// - Higher sample_priority first (sample1 > sample2 > sample3...)
/// - Within same sample, lexicographic order on contig_name (ascending)
///
/// NOTE: C++ AGC uses a multimap<pair<priority, cost>, T> where cost=contig.size().
/// Since multimap iterates in ascending key order, smaller names come first.
/// This results in lexicographic ordering: chrI, chrII, chrIII, chrIV, chrIX, chrMT, chrV...
/// RAGC must match this ordering for byte-identical archives.
#[derive(Clone)]
struct ContigTask {
    sample_name: String,
    contig_name: String,
    // Raw contig bases (Contig = Vec<u8>).
    data: Contig, // Vec<u8>
    sample_priority: i32, // Higher = process first (decreases for each sample)
    cost: usize, // Contig size in bytes (matches C++ AGC cost calculation)
    sequence: u64, // Insertion order within sample - lower = processed first (FASTA order)
    is_sync_token: bool, // True if this is a synchronization token (matches C++ AGC registration tokens)
}
166
167// Implement priority ordering for BinaryHeap (max-heap)
168// BinaryHeap pops the "greatest" element, so we want:
169// - Higher sample_priority = greater (first sample processed first)
170// - Lexicographically SMALLER contig_name = greater (to be popped first)
171//
172// C++ AGC uses multimap which iterates in ascending order, so "chrI" < "chrIX" < "chrMT" < "chrV"
173// To match this with a max-heap, we reverse the contig_name comparison.
174impl PartialEq for ContigTask {
175 fn eq(&self, other: &Self) -> bool {
176 self.sample_priority == other.sample_priority
177 && self.cost == other.cost
178 && self.contig_name == other.contig_name
179 }
180}
181
182impl Eq for ContigTask {}
183
184impl PartialOrd for ContigTask {
185 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
186 Some(self.cmp(other))
187 }
188}
189
190impl Ord for ContigTask {
191 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
192 // C++ AGC uses (priority, cost) as the multimap key with PopLarge (rbegin).
193 // multimap is sorted by (priority, cost) in ASCENDING order.
194 // rbegin() returns LARGEST element, so within same priority, LARGEST cost is popped first.
195 //
196 // Example: AAA#0 contigs are processed by SIZE (largest first):
197 // chrIV (1.5MB) → chrXV (1.1MB) → chrVII (1.1MB) → ... → chrMT (86KB)
198 //
199 // This is NOT file order! Instrumentation shows C++ AGC pops by (priority, cost).
200
201 // First compare by sample_priority (higher priority first)
202 match self.sample_priority.cmp(&other.sample_priority) {
203 std::cmp::Ordering::Equal => {
204 // Then by cost (LARGER cost = higher priority, processed first)
205 // Match C++ AGC's PopLarge behavior
206 match self.cost.cmp(&other.cost) {
207 std::cmp::Ordering::Equal => {
208 // CRITICAL TIE-BREAKER: When sizes are equal, use FASTA order (sequence field)
209 // to ensure deterministic ordering. Without this, the BinaryHeap order is
210 // non-deterministic, causing different segment splitting and 19% size difference.
211 // LOWER sequence = earlier in FASTA = processed first (reverse comparison for max-heap)
212 other.sequence.cmp(&self.sequence)
213 }
214 cost_ord => cost_ord,
215 }
216 }
217 priority_ord => priority_ord,
218 }
219 }
220}
221
/// Segment group identified by flanking k-mers (matching batch mode).
/// NOTE(review): callers appear to store the pair normalized (front <= back),
/// as documented on `NewSegment` — confirm before relying on it here.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Ord, PartialOrd)]
struct SegmentGroupKey {
    // Splitter k-mer at the segment's front boundary.
    kmer_front: u64,
    // Splitter k-mer at the segment's back boundary.
    kmer_back: u64,
}
228
/// Pending segment for batch-local processing (before group assignment)
/// Segments are sorted by (sample_name, contig_name, place) to match C++ AGC order
#[derive(Debug, Clone, PartialEq, Eq)]
struct PendingSegment {
    // Flanking k-mer pair identifying the target group.
    key: SegmentGroupKey,
    // Raw segment bases.
    segment_data: Vec<u8>,
    // Whether the segment must be reverse-complemented to match group orientation.
    should_reverse: bool,
    sample_name: String,
    contig_name: String,
    // Segment index within the contig (C++ AGC seg_part_no).
    place: usize,
    sample_priority: i32, // Sample processing order (higher = earlier); NOT part of the sort key
}
241
242// Match C++ AGC sorting order (agc_compressor.h lines 112-119)
243// Sort by: sample_name, then contig_name, then place (seg_part_no)
244impl PartialOrd for PendingSegment {
245 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
246 Some(self.cmp(other))
247 }
248}
249
250impl Ord for PendingSegment {
251 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
252 // Match C++ AGC: pure lexicographic ordering (no sample_priority)
253 // Sort by: sample_name, then contig_name, then place (seg_part_no)
254 match self.sample_name.cmp(&other.sample_name) {
255 std::cmp::Ordering::Equal => {
256 // Then by contig_name
257 match self.contig_name.cmp(&other.contig_name) {
258 std::cmp::Ordering::Equal => {
259 // Finally by place (seg_part_no)
260 self.place.cmp(&other.place)
261 }
262 other => other,
263 }
264 }
265 other => other,
266 }
267 }
268}
269
/// Buffered segment waiting to be packed
#[derive(Debug, Clone, PartialEq, Eq)]
struct BufferedSegment {
    sample_name: String,
    contig_name: String,
    // Segment index within the contig (C++ AGC seg_part_no).
    seg_part_no: usize,
    // Segment bases (Contig = Vec<u8>).
    data: Contig,
    // True if `data` was reverse-complemented to match group orientation.
    is_rev_comp: bool,
    sample_priority: i32, // Sample processing order (higher = earlier); NOT part of the sort key
}
280
281// Match C++ AGC sorting order: pure lexicographic (no sample_priority)
282// Sort by: sample_name, contig_name, seg_part_no
283impl PartialOrd for BufferedSegment {
284 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
285 Some(self.cmp(other))
286 }
287}
288
289impl Ord for BufferedSegment {
290 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
291 // Match C++ AGC: pure lexicographic ordering (no sample_priority)
292 // Sort by: sample_name, then contig_name, then seg_part_no
293 match self.sample_name.cmp(&other.sample_name) {
294 std::cmp::Ordering::Equal => {
295 // Then by contig_name
296 match self.contig_name.cmp(&other.contig_name) {
297 std::cmp::Ordering::Equal => {
298 // Finally by seg_part_no
299 self.seg_part_no.cmp(&other.seg_part_no)
300 }
301 other => other,
302 }
303 }
304 other => other,
305 }
306 }
307}
308
309// =============================================================================
310// RAW segment buffering for parallel Phase 1 (BEFORE classification)
311// =============================================================================
312
/// Raw segment data buffered BEFORE k-mer classification.
/// This allows parallel buffering without lock contention from find_group_with_one_kmer.
/// Classification is deferred to Thread 0 at the barrier.
#[derive(Clone)]
struct RawBufferedSegment {
    /// Raw segment data (numeric encoding: 0=A, 1=C, 2=G, 3=T)
    data: Vec<u8>,
    /// Precomputed reverse complement of data
    data_rc: Vec<u8>,
    /// Front k-mer from segment detection
    front_kmer: u64,
    /// Back k-mer from segment detection
    back_kmer: u64,
    /// Is front k-mer in canonical direction?
    front_kmer_is_dir: bool,
    /// Is back k-mer in canonical direction?
    back_kmer_is_dir: bool,
    /// Sample name for sorting and registration
    sample_name: String,
    /// Contig name for sorting and registration
    contig_name: String,
    /// Segment index within contig (before split adjustment)
    original_place: usize,
    /// Sample processing priority (higher = earlier)
    sample_priority: i32,
}
339
340// Implement Ord for deterministic sorting at barrier
341impl PartialEq for RawBufferedSegment {
342 fn eq(&self, other: &Self) -> bool {
343 self.sample_name == other.sample_name
344 && self.contig_name == other.contig_name
345 && self.original_place == other.original_place
346 }
347}
348impl Eq for RawBufferedSegment {}
349
350impl PartialOrd for RawBufferedSegment {
351 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
352 Some(self.cmp(other))
353 }
354}
355
356impl Ord for RawBufferedSegment {
357 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
358 // Sort by: sample_name, contig_name, original_place (matches C++ AGC order)
359 match self.sample_name.cmp(&other.sample_name) {
360 std::cmp::Ordering::Equal => match self.contig_name.cmp(&other.contig_name) {
361 std::cmp::Ordering::Equal => self.original_place.cmp(&other.original_place),
362 other => other,
363 },
364 other => other,
365 }
366 }
367}
368
369// =============================================================================
370// C++ AGC-style segment buffering for parallel compression (4-phase pattern)
371// =============================================================================
372
/// Per-group segment buffer with its own mutex (C++ AGC: list_seg_part_t)
/// Each group has independent locking to allow parallel writes during Phase 1
struct PerGroupSegments {
    // Segments buffered for this group; drained/popped during the store phase.
    segments: Vec<BufferedSegment>,
}
378
/// Segment waiting to be assigned a group ID (C++ AGC: kk_seg_part_t)
/// Used during Phase 1 when segment's k-mer pair doesn't exist in map_segments yet
#[derive(Clone)]
struct NewSegment {
    /// K-mer pair (normalized: front <= back)
    kmer_front: u64,
    kmer_back: u64,
    /// Sort key for deterministic processing: (sample_priority, sample_name, contig_name, seg_part_no)
    sample_priority: i32,
    sample_name: String,
    contig_name: String,
    seg_part_no: usize,
    /// Segment data (Contig = Vec<u8>)
    data: Contig,
    // True if the data must be flagged as reverse-complemented in the group.
    should_reverse: bool,
}
395
396// Implement Ord for NewSegment to match C++ AGC BTreeSet ordering
397impl PartialEq for NewSegment {
398 fn eq(&self, other: &Self) -> bool {
399 self.sample_priority == other.sample_priority
400 && self.sample_name == other.sample_name
401 && self.contig_name == other.contig_name
402 && self.seg_part_no == other.seg_part_no
403 }
404}
405impl Eq for NewSegment {}
406
407impl PartialOrd for NewSegment {
408 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
409 Some(self.cmp(other))
410 }
411}
412
413impl Ord for NewSegment {
414 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
415 // Higher sample_priority processes first (descending)
416 match other.sample_priority.cmp(&self.sample_priority) {
417 std::cmp::Ordering::Equal => {
418 // Then by sample_name, contig_name, seg_part_no (ascending)
419 match self.sample_name.cmp(&other.sample_name) {
420 std::cmp::Ordering::Equal => match self.contig_name.cmp(&other.contig_name) {
421 std::cmp::Ordering::Equal => self.seg_part_no.cmp(&other.seg_part_no),
422 other => other,
423 },
424 other => other,
425 }
426 }
427 other => other,
428 }
429 }
430}
431
/// Two-tier segment buffering for C++ AGC 4-phase pattern (C++ AGC: CBufferedSegPart)
///
/// Phase 1 (PARALLEL): Workers add segments using add_known() or add_new()
/// Phase 2 (SINGLE): Thread 0 calls process_new() to assign group IDs
/// Phase 3 (PARALLEL): Workers call get_vec_id() + get_part() for atomic work-stealing
/// Phase 4: Thread 0 calls clear() for cleanup
struct BufferedSegPart {
    /// KNOWN segments: indexed by group_id, each has own mutex
    /// RwLock on Vec allows process_new() to resize while add_known() reads
    /// C++ AGC: vector<list_seg_part_t> vl_seg_part
    vl_seg_part: RwLock<Vec<Mutex<PerGroupSegments>>>,

    /// NEW segments: BTreeSet for deterministic iteration
    /// C++ AGC: set<kk_seg_part_t> s_seg_part
    s_seg_part: Mutex<std::collections::BTreeSet<NewSegment>>,

    /// Atomic counter for work distribution (descending from num_groups-1 to -1)
    /// C++ AGC: atomic<int32_t> a_v_part_id
    a_v_part_id: AtomicI32,
}
452
impl BufferedSegPart {
    /// Create buffers for `initial_groups` pre-existing groups.
    /// The work-stealing counter starts at -1, i.e. "nothing to read".
    fn new(initial_groups: usize) -> Self {
        Self {
            vl_seg_part: RwLock::new(
                (0..initial_groups)
                    .map(|_| {
                        Mutex::new(PerGroupSegments {
                            segments: Vec::new(),
                        })
                    })
                    .collect(),
            ),
            s_seg_part: Mutex::new(std::collections::BTreeSet::new()),
            a_v_part_id: AtomicI32::new(-1),
        }
    }

    /// Add segment to KNOWN group (has group_id)
    /// C++ AGC: add_known() - read lock on Vec, per-group lock on Mutex.
    /// Out-of-range group_id drops the segment with a warning (should never
    /// happen; indicates a bug in group management upstream).
    fn add_known(&self, group_id: u32, segment: BufferedSegment) {
        let groups = self.vl_seg_part.read().unwrap();
        if (group_id as usize) < groups.len() {
            groups[group_id as usize]
                .lock()
                .unwrap()
                .segments
                .push(segment);
        } else {
            // This should NOT happen - indicates a bug in group management
            eprintln!("WARNING: add_known dropping segment! group_id={} >= groups.len()={} sample={} contig={}",
                group_id, groups.len(), segment.sample_name, segment.contig_name);
        }
    }

    /// Ensure capacity for group_id (grow vl_seg_part if needed)
    /// Called when immediately registering groups during barrier classification.
    /// The read-then-write pattern is racy between the check and the write
    /// lock, but the `while` loop re-checks the length under the write lock,
    /// so concurrent callers can never shrink or double-grow the Vec.
    fn ensure_capacity(&self, min_group_id: u32) {
        let current_len = self.vl_seg_part.read().unwrap().len();
        if (min_group_id as usize) >= current_len {
            let mut groups = self.vl_seg_part.write().unwrap();
            while groups.len() <= min_group_id as usize {
                groups.push(Mutex::new(PerGroupSegments {
                    segments: Vec::new(),
                }));
            }
        }
    }

    /// Add segment with UNKNOWN group (new k-mer pair)
    /// C++ AGC: add_new() - global s_seg_part lock (but brief)
    fn add_new(&self, segment: NewSegment) {
        self.s_seg_part.lock().unwrap().insert(segment);
    }

    /// Process NEW segments, assign group IDs deterministically.
    /// C++ AGC: process_new() - ONLY called by thread 0 after barrier.
    /// DETERMINISM FIX: This is the ONLY place where group IDs are assigned.
    /// The parallel classification phase only determines k-mer pairs.
    /// Returns the number of newly created groups.
    fn process_new(
        &self,
        map_segments: &mut BTreeMap<SegmentGroupKey, u32>,
        next_group_id: &mut u32,
        reference_segments: &mut BTreeMap<u32, Vec<u8>>,
        terminators: &mut BTreeMap<u64, Vec<u64>>,
    ) -> u32 {
        use crate::segment::MISSING_KMER;

        let mut s = self.s_seg_part.lock().unwrap();
        // Batch-local k-mer-pair -> newly assigned group id (C++ AGC m_kmers).
        let mut m_kmers: BTreeMap<(u64, u64), u32> = BTreeMap::new();
        let mut new_count = 0u32;

        // First pass: assign group IDs (deterministic - BTreeSet order).
        // A pair gets an ID only if it is new to BOTH this batch and the
        // global registry.
        for seg in s.iter() {
            let key = (seg.kmer_front, seg.kmer_back);
            if !m_kmers.contains_key(&key)
                && !map_segments.contains_key(&SegmentGroupKey {
                    kmer_front: seg.kmer_front,
                    kmer_back: seg.kmer_back,
                })
            {
                m_kmers.insert(key, *next_group_id);
                *next_group_id += 1;
                new_count += 1;
            }
        }

        // Resize vl_seg_part for new groups (requires write lock)
        {
            let mut groups = self.vl_seg_part.write().unwrap();
            while groups.len() < *next_group_id as usize {
                groups.push(Mutex::new(PerGroupSegments {
                    segments: Vec::new(),
                }));
            }
        }

        // Second pass: move segments to vl_seg_part and update map_segments.
        // Also update reference_segments and terminators for new groups.
        // The set is drained here; s_seg_part is empty afterwards.
        let segments: Vec<NewSegment> = s.iter().cloned().collect();
        s.clear();
        drop(s);

        // Track which groups have had their reference set (first segment wins)
        let mut refs_set: std::collections::HashSet<u32> = std::collections::HashSet::new();

        for seg in segments {
            let key = SegmentGroupKey {
                kmer_front: seg.kmer_front,
                kmer_back: seg.kmer_back,
            };

            // Get group_id from either existing map or newly assigned.
            // is_new_group is true only for the FIRST segment of a pair in
            // this batch (later ones find the pair already in map_segments).
            let (group_id, is_new_group) = if let Some(&id) = map_segments.get(&key) {
                (id, false)
            } else if let Some(&id) = m_kmers.get(&(seg.kmer_front, seg.kmer_back)) {
                // Insert into map_segments
                map_segments.insert(key.clone(), id);
                (id, true)
            } else {
                continue; // Should not happen
            };

            // Store reference data for new groups (first segment in sorted order wins)
            if is_new_group && !refs_set.contains(&group_id) {
                reference_segments.insert(group_id, seg.data.clone());
                refs_set.insert(group_id);

                // Update terminators for new groups (C++ AGC lines 1015-1025)
                if key.kmer_front != MISSING_KMER && key.kmer_back != MISSING_KMER {
                    // Add kmer_front -> kmer_back, keeping each vec sorted and unique
                    let front_vec = terminators.entry(key.kmer_front).or_insert_with(Vec::new);
                    if !front_vec.contains(&key.kmer_back) {
                        front_vec.push(key.kmer_back);
                        front_vec.sort();
                    }
                    // Add kmer_back -> kmer_front (if different)
                    if key.kmer_front != key.kmer_back {
                        let back_vec = terminators.entry(key.kmer_back).or_insert_with(Vec::new);
                        if !back_vec.contains(&key.kmer_front) {
                            back_vec.push(key.kmer_front);
                            back_vec.sort();
                        }
                    }
                }
            }

            // Add to per-group buffer (uses read lock internally)
            let buffered = BufferedSegment {
                sample_name: seg.sample_name,
                contig_name: seg.contig_name,
                seg_part_no: seg.seg_part_no,
                data: seg.data,
                is_rev_comp: seg.should_reverse,
                sample_priority: seg.sample_priority,
            };
            self.add_known(group_id, buffered);
        }

        new_count
    }

    /// Sort known segments within each group for deterministic output
    /// (uses BufferedSegment's (sample, contig, seg_part_no) ordering).
    fn sort_known(&self) {
        let groups = self.vl_seg_part.read().unwrap();
        for group in groups.iter() {
            group.lock().unwrap().segments.sort();
        }
    }

    /// Reset atomic counter for work distribution
    /// C++ AGC: restart_read_vec()
    fn restart_read_vec(&self) {
        let groups = self.vl_seg_part.read().unwrap();
        self.a_v_part_id
            .store(groups.len() as i32 - 1, Ordering::SeqCst);
    }

    /// Get next group_id to process (atomic decrement for work-stealing)
    /// C++ AGC: get_vec_id() - returns -1 when all groups claimed.
    /// The counter may go arbitrarily negative under contention; callers
    /// must treat any value < 0 as "no work".
    fn get_vec_id(&self) -> i32 {
        self.a_v_part_id.fetch_sub(1, Ordering::Relaxed)
    }

    /// Get and remove one segment from group (for store phase)
    /// C++ AGC: get_part(). Pops from the BACK of the per-group Vec.
    fn get_part(&self, group_id: u32) -> Option<BufferedSegment> {
        let groups = self.vl_seg_part.read().unwrap();
        if (group_id as usize) < groups.len() {
            groups[group_id as usize].lock().unwrap().segments.pop()
        } else {
            None
        }
    }

    /// Get all segments from a group (for batch processing).
    /// Leaves the group's buffer empty; out-of-range ids yield an empty Vec.
    fn drain_group(&self, group_id: u32) -> Vec<BufferedSegment> {
        let groups = self.vl_seg_part.read().unwrap();
        if (group_id as usize) < groups.len() {
            std::mem::take(&mut groups[group_id as usize].lock().unwrap().segments)
        } else {
            Vec::new()
        }
    }

    /// Clear all buffers after batch (known AND new tiers).
    /// C++ AGC: clear()
    fn clear(&self) {
        let groups = self.vl_seg_part.read().unwrap();
        for group in groups.iter() {
            group.lock().unwrap().segments.clear();
        }
        self.s_seg_part.lock().unwrap().clear();
    }

    /// Check if any segments are buffered (in either tier)
    fn has_segments(&self) -> bool {
        let groups = self.vl_seg_part.read().unwrap();
        for group in groups.iter() {
            if !group.lock().unwrap().segments.is_empty() {
                return true;
            }
        }
        !self.s_seg_part.lock().unwrap().is_empty()
    }

    /// Total number of groups
    fn num_groups(&self) -> usize {
        self.vl_seg_part.read().unwrap().len()
    }
}
683
684// =============================================================================
685// Parallel flush coordination for Phase 3 (atomic work-stealing)
686// =============================================================================
687
/// State for coordinating parallel Phase 3 segment storage
/// Workers atomically claim buffers via next_idx, then process independently
struct ParallelFlushState {
    /// Extracted buffers to flush. Each slot has its own Mutex for independent access.
    /// RwLock allows parallel read access to the Vec during Phase 3, avoiding serialization.
    /// Workers only need read access to the Vec to reach their claimed slot's inner Mutex.
    buffers: RwLock<Vec<Mutex<Option<(SegmentGroupKey, SegmentGroupBuffer)>>>>,
    /// Compression results from each buffer (stored by workers, written by Thread 0).
    /// NOTE(review): FlushPackResult is declared elsewhere in this file; it is
    /// assumed to carry at least a `group_id` (used for sorted draining).
    results: RwLock<Vec<Mutex<Option<FlushPackResult>>>>,
    /// Atomic index for work-stealing (starts at len-1, decrements to -1)
    next_idx: AtomicI32,
}
700
impl ParallelFlushState {
    /// Create empty state; next_idx = -1 means "no work available".
    fn new() -> Self {
        Self {
            buffers: RwLock::new(Vec::new()),
            results: RwLock::new(Vec::new()),
            next_idx: AtomicI32::new(-1),
        }
    }

    /// Set up buffers to flush and reset atomic counter (called by Thread 0 in Phase 2).
    /// Replaces any previous buffers/results wholesale; one result slot is
    /// allocated per buffer.
    fn prepare(&self, extracted: Vec<(SegmentGroupKey, SegmentGroupBuffer)>) {
        let len = extracted.len();
        let mut buffers = self.buffers.write().unwrap();
        *buffers = extracted
            .into_iter()
            .map(|(k, b)| Mutex::new(Some((k, b))))
            .collect();
        // Initialize results slots (one per buffer)
        let mut results = self.results.write().unwrap();
        *results = (0..len).map(|_| Mutex::new(None)).collect();
        self.next_idx.store(len as i32 - 1, Ordering::SeqCst);
    }

    /// Claim next buffer index (returns None when all claimed)
    /// Called by ALL workers in Phase 3. The counter may keep decrementing
    /// below -1 under contention; all negative values mean "done".
    fn claim_next_idx(&self) -> Option<usize> {
        let idx = self.next_idx.fetch_sub(1, Ordering::Relaxed);
        if idx < 0 {
            None
        } else {
            Some(idx as usize)
        }
    }

    /// Get buffer at claimed index (READ lock on Vec, exclusive on slot).
    /// Takes the slot's contents, leaving None behind; returns None for
    /// out-of-range indices or already-taken slots.
    fn get_buffer_at(&self, idx: usize) -> Option<(SegmentGroupKey, SegmentGroupBuffer)> {
        let buffers = self.buffers.read().unwrap();
        if idx < buffers.len() {
            buffers[idx].lock().unwrap().take()
        } else {
            None
        }
    }

    /// Put buffer back after processing (READ lock on Vec, exclusive on slot)
    fn return_buffer(&self, idx: usize, key: SegmentGroupKey, buffer: SegmentGroupBuffer) {
        let buffers = self.buffers.read().unwrap();
        if idx < buffers.len() {
            *buffers[idx].lock().unwrap() = Some((key, buffer));
        }
    }

    /// Drain all buffers back (called by Thread 0 in Phase 4 - needs WRITE lock).
    /// Also resets next_idx to -1 so stale workers see no work.
    fn drain_buffers(&self) -> Vec<(SegmentGroupKey, SegmentGroupBuffer)> {
        let mut buffers = self.buffers.write().unwrap();
        let result: Vec<_> = buffers
            .iter_mut()
            .filter_map(|slot| slot.lock().unwrap().take())
            .collect();
        buffers.clear();
        self.next_idx.store(-1, Ordering::SeqCst);
        result
    }

    /// Store compression result at given index (READ lock on Vec, exclusive on slot)
    fn store_result(&self, idx: usize, result: FlushPackResult) {
        let results = self.results.read().unwrap();
        if idx < results.len() {
            *results[idx].lock().unwrap() = Some(result);
        }
    }

    /// Drain all results sorted by group_id (called by Thread 0 for deterministic writes)
    fn drain_results_sorted(&self) -> Vec<FlushPackResult> {
        let mut results_lock = self.results.write().unwrap();
        let mut all_results: Vec<FlushPackResult> = results_lock
            .iter_mut()
            .filter_map(|slot| slot.lock().unwrap().take())
            .collect();
        results_lock.clear();
        // Sort by group_id for deterministic write order
        all_results.sort_by_key(|r| r.group_id);
        all_results
    }
}
787
/// Parallel write buffer with per-stream mutexes (C++ AGC pattern: per-segment mutex)
/// Workers operating on different streams don't contend at all.
/// BTreeMap ensures flush writes in sorted stream_id order for determinism.
struct ParallelWriteBuffer {
    /// Per-stream buffers: BTreeMap for sorted iteration, each stream has its own Mutex
    /// RwLock allows concurrent reader access to find the right stream's Mutex.
    /// Each entry is a list of (payload bytes, metadata) pairs; the metadata's
    /// meaning is defined by `Archive::add_part_buffered`.
    streams: RwLock<BTreeMap<usize, Mutex<Vec<(Vec<u8>, u64)>>>>,
}
796
impl ParallelWriteBuffer {
    /// Create an empty buffer with no streams.
    fn new() -> Self {
        Self {
            streams: RwLock::new(BTreeMap::new()),
        }
    }

    /// Buffer a write for a specific stream (only locks that stream's mutex)
    /// Workers on different streams can call this concurrently without contention
    fn buffer_write(&self, stream_id: usize, data: Vec<u8>, metadata: u64) {
        // First try with read lock - most common case (stream already exists)
        {
            let streams = self.streams.read().unwrap();
            if let Some(stream_mutex) = streams.get(&stream_id) {
                stream_mutex.lock().unwrap().push((data, metadata));
                return;
            }
        }
        // Stream doesn't exist - need write lock to create it
        // (read lock is dropped above; classic double-checked creation)
        {
            let mut streams = self.streams.write().unwrap();
            // Double-check (another thread may have created it)
            streams
                .entry(stream_id)
                .or_insert_with(|| Mutex::new(Vec::new()))
                .lock()
                .unwrap()
                .push((data, metadata));
        }
    }

    /// Flush all buffered writes to archive in sorted stream_id order
    /// Called by Thread 0 after barrier - ensures deterministic output.
    /// Does NOT drain the buffers; pair with `clear()` afterwards, otherwise
    /// a second flush would re-emit the same parts.
    fn flush_to_archive(&self, archive: &mut Archive) -> Result<()> {
        let streams = self.streams.read().unwrap();
        // BTreeMap iterates in sorted key order (stream_id)
        for (stream_id, stream_mutex) in streams.iter() {
            let parts = stream_mutex.lock().unwrap();
            for (data, metadata) in parts.iter() {
                // Use buffered writes to reduce syscalls
                // (data is cloned because we only hold a shared borrow here)
                archive.add_part_buffered(*stream_id, data.clone(), *metadata);
            }
        }
        Ok(())
    }

    /// Clear all buffers (called after flush).
    /// Stream entries are kept (capacity reuse); only their contents go.
    fn clear(&self) {
        let mut streams = self.streams.write().unwrap();
        for (_, stream_mutex) in streams.iter_mut() {
            stream_mutex.lock().unwrap().clear();
        }
    }
}
851
/// Buffer for a segment group (packs 50 segments together)
struct SegmentGroupBuffer {
    group_id: u32,
    stream_id: usize, // Delta stream for packed segments
    ref_stream_id: usize, // Reference stream for first segment
    reference_segment: Option<BufferedSegment>, // First segment (reference for LZ encoding)
    segments: Vec<BufferedSegment>, // Up to PACK_CARDINALITY segments (EXCLUDING reference)
    ref_written: bool, // Whether reference has been written
    segments_written: u32, // Counter for delta segments written (NOT including reference)
    lz_diff: Option<LZDiff>, // LZ encoder prepared once with reference, reused for all segments (matches C++ AGC CSegment::lz_diff)
    // CRITICAL: Partial pack persistence to ensure pack alignment with decompression expectations
    // Pack N must contain entries for in_group_ids (N*50)+1 to (N+1)*50
    // These fields persist unique deltas until we have exactly 50 for a complete pack
    pending_deltas: Vec<Vec<u8>>, // Unique deltas waiting to be written (< 50)
    pending_delta_ids: Vec<u32>, // in_group_ids for pending deltas (for deduplication)
    raw_placeholder_written: bool, // Whether raw group placeholder has been written
}
869
870impl SegmentGroupBuffer {
871 fn new(group_id: u32, stream_id: usize, ref_stream_id: usize) -> Self {
872 Self {
873 group_id,
874 stream_id,
875 ref_stream_id,
876 reference_segment: None,
877 segments: Vec::new(),
878 ref_written: false,
879 segments_written: 0,
880 lz_diff: None, // Prepared when reference is written (matches C++ AGC segment.cpp line 43)
881 pending_deltas: Vec::new(),
882 pending_delta_ids: Vec::new(),
883 raw_placeholder_written: false,
884 }
885 }
886
887 /// Check if this group should write a pack (has >= pack_size segments)
888 /// Matches C++ AGC's logic for writing packs when full
889 fn should_flush_pack(&self, pack_size: usize) -> bool {
890 // Count buffered segments (excluding reference which is handled separately)
891 self.segments.len() >= pack_size
892 }
893
894 /// Get current segment count (for pack-full detection)
895 fn segment_count(&self) -> usize {
896 self.segments.len()
897 }
898}
899
/// Batch-local state for processing new segments
/// Equivalent to C++ AGC's `m_kmers` local variable in process_new()
/// This is RESET at each sample boundary to match C++ AGC behavior
struct BatchState {
    /// New segments discovered in THIS batch (not found in global registry)
    /// Key: (front_kmer, back_kmer)
    /// Value: Vec of segments with that k-mer pair
    /// BTreeMap keeps iteration order deterministic across runs
    new_segments: BTreeMap<(u64, u64), Vec<PendingSegment>>,

    /// Starting group ID for this batch (continues from global count)
    /// Monotonically increasing; never reset between batches
    next_group_id: u32,
}
912
913impl BatchState {
914 fn new(starting_group_id: u32) -> Self {
915 BatchState {
916 new_segments: BTreeMap::new(),
917 next_group_id: starting_group_id,
918 }
919 }
920
921 /// Clear batch state for next sample (resets new_segments map)
922 /// next_group_id continues incrementing
923 fn clear(&mut self) {
924 self.new_segments.clear();
925 // next_group_id NOT reset - it continues from where it left off
926 }
927
928 /// Add a new segment to this batch
929 fn add_segment(&mut self, key: (u64, u64), segment: PendingSegment) {
930 self.new_segments
931 .entry(key)
932 .or_insert_with(Vec::new)
933 .push(segment);
934 }
935}
936
/// Pack size (C++ AGC default): number of delta segments packed into one
/// archive part.
const PACK_CARDINALITY: usize = 50;
/// First 16 groups are raw-only (no LZ encoding).
/// NOTE: despite the `NO_` prefix, this is the *number of* raw groups
/// (matches C++ AGC's `no_raw_groups` naming), not a negation.
const NO_RAW_GROUPS: u32 = 16;
941
/// Streaming compressor with queue-based API
///
/// # Example
/// ```no_run
/// use ragc_core::{StreamingQueueCompressor, StreamingQueueConfig};
/// use ahash::AHashSet;
///
/// # fn main() -> anyhow::Result<()> {
/// let config = StreamingQueueConfig::default();
/// let splitters = AHashSet::new(); // Normally from reference
/// let mut compressor = StreamingQueueCompressor::with_splitters(
///     "output.agc",
///     config,
///     splitters
/// )?;
///
/// // Push sequences (blocks when queue is full - automatic backpressure!)
/// # let sequences = vec![("sample1".to_string(), "chr1".to_string(), vec![0u8; 1000])];
/// for (sample, contig_name, data) in sequences {
///     compressor.push(sample, contig_name, data)?;
/// }
///
/// // Finalize - waits for all compression to complete
/// compressor.finalize()?;
/// # Ok(())
/// # }
/// ```
pub struct StreamingQueueCompressor {
    // Memory-bounded task queue; push() blocks when full (backpressure)
    queue: Arc<MemoryBoundedQueue<ContigTask>>,
    // Handles for the spawned worker threads
    workers: Vec<JoinHandle<Result<()>>>,
    barrier: Arc<std::sync::Barrier>, // Synchronization barrier for batch boundaries (matches C++ AGC bar.arrive_and_wait())
    // Collection metadata (samples/contigs), shared with workers
    collection: Arc<Mutex<CollectionV3>>,
    // Splitter k-mers used for contig segmentation
    splitters: Arc<AHashSet<u64>>,
    config: StreamingQueueConfig,
    // Output archive, shared with workers
    archive: Arc<Mutex<Archive>>,
    // Per-group segment buffers keyed by (front, back) k-mer pair
    segment_groups: Arc<Mutex<BTreeMap<SegmentGroupKey, SegmentGroupBuffer>>>,
    group_counter: Arc<AtomicU32>, // Starts at 16 for LZ groups
    raw_group_counter: Arc<AtomicU32>, // Round-robin counter for raw groups (0-15)
    reference_sample_name: Arc<Mutex<Option<String>>>, // First sample becomes reference
    // Segment splitting support (Phase 1)
    map_segments: Arc<RwLock<BTreeMap<SegmentGroupKey, u32>>>, // (front, back) -> group_id (BTreeMap for deterministic iteration)
    map_segments_terminators: Arc<RwLock<BTreeMap<u64, Vec<u64>>>>, // kmer -> [connected kmers] (BTreeMap for determinism)

    // FFI Grouping Engine - C++ AGC-compatible group assignment
    #[cfg(feature = "cpp_agc")]
    grouping_engine: Arc<Mutex<crate::ragc_ffi::GroupingEngine>>,

    // Persistent reference segment storage (matches C++ AGC v_segments)
    // Stores reference segment data even after groups are flushed, enabling LZ cost estimation
    // for subsequent samples (fixes multi-sample group fragmentation bug)
    reference_segments: Arc<RwLock<BTreeMap<u32, Vec<u8>>>>, // group_id -> reference segment data (BTreeMap for determinism)

    // Reference orientation tracking - stores is_rev_comp for each group's reference segment
    // When a delta segment joins an existing group, it MUST use the same orientation as the reference
    // to ensure LZ encoding works correctly (fixes ZERO_MATCH bug in Case 3 terminator segments)
    reference_orientations: Arc<RwLock<BTreeMap<u32, bool>>>, // group_id -> reference is_rev_comp (BTreeMap for determinism)

    // Track segment splits for renumbering subsequent segments
    // Maps (sample_name, contig_name, original_place) -> number of splits inserted before this position
    split_offsets: Arc<Mutex<BTreeMap<(String, String, usize), usize>>>, // BTreeMap for determinism

    // Priority assignment for interleaved processing (matches C++ AGC)
    // Higher priority = processed first (sample1 > sample2 > sample3...)
    sample_priorities: Arc<RwLock<BTreeMap<String, i32>>>, // sample_name -> priority (BTreeMap for determinism)

    // Track last sample to detect sample boundaries for sync token insertion
    last_sample_name: Arc<Mutex<Option<String>>>, // Last sample that was pushed

    // Batch-local group assignment (matches C++ AGC m_kmers per-batch behavior)
    // When batch_samples reaches batch_size, we flush pending segments and clear batch-local state
    batch_samples: Arc<Mutex<HashSet<String>>>, // Samples in current batch (matches C++ AGC pack_cardinality batch)
    batch_local_groups: Arc<Mutex<BTreeMap<SegmentGroupKey, u32>>>, // Batch-local m_kmers equivalent (BTreeMap for deterministic iteration)
    batch_local_terminators: Arc<Mutex<BTreeMap<u64, Vec<u64>>>>, // Batch-local terminators (BTreeMap for determinism)
    pending_batch_segments: Arc<Mutex<Vec<PendingSegment>>>, // Buffer segments until batch boundary
    // Two-tier segment buffering for C++ AGC 4-phase parallel pattern
    buffered_seg_part: Arc<BufferedSegPart>, // Per-group buffers for parallel Phase 1
    // Fallback minimizers map for segments with no terminator match (matches C++ AGC map_fallback_minimizers)
    map_fallback_minimizers: Arc<Mutex<BTreeMap<u64, Vec<(u64, u64)>>>>, // kmer -> [(front, back)] candidate group keys (BTreeMap for determinism)
    next_priority: Arc<Mutex<i32>>, // Decreases for each new sample (starts at i32::MAX)
    next_sequence: Arc<std::sync::atomic::AtomicU64>, // Increases for each contig (FASTA order)
    global_contig_count: Arc<AtomicUsize>, // GLOBAL contig counter for synchronization (C++ AGC: cnt_contigs_in_sample)

    // Deferred metadata streams - written AFTER segment data (C++ AGC compatibility)
    // C++ AGC writes segment data first, then metadata streams at the end
    deferred_file_type_info: (usize, Vec<u8>), // (stream_id, data)
    deferred_params: (usize, Vec<u8>), // (stream_id, data)
    deferred_splitters: (usize, Vec<u8>), // (stream_id, data)
    deferred_segment_splitters: (usize, Vec<u8>), // (stream_id, data)

    // Dynamic splitter discovery for adaptive mode (matches C++ AGC find_new_splitters)
    // Stores reference k-mers to exclude when finding new splitters for non-reference contigs
    ref_singletons: Arc<Vec<u64>>, // Sorted for binary search - reference singleton k-mers (v_candidate_kmers)
    ref_duplicates: Arc<AHashSet<u64>>, // Reference duplicate k-mers (v_duplicated_kmers)

    // Parallel Phase 3 state for atomic work-stealing (matches C++ AGC architecture)
    parallel_state: Arc<ParallelFlushState>,

    // Per-stream write buffer for parallel Phase 3 (C++ AGC pattern: per-segment mutex)
    // Workers on different streams can buffer writes concurrently without contention
    write_buffer: Arc<ParallelWriteBuffer>,

    // RAW segment buffers for deferred classification (parallel Phase 1 optimization)
    // PER-WORKER buffers eliminate contention: each worker pushes to its own buffer
    // Thread 0 drains all buffers at barrier for classification
    raw_segment_buffers: Arc<Vec<Mutex<Vec<RawBufferedSegment>>>>,
}
1048
1049impl StreamingQueueCompressor {
1050 /// Create a new streaming compressor with pre-computed splitters
1051 ///
1052 /// Use this when you already have splitters (e.g., from a reference genome)
1053 ///
1054 /// # Arguments
1055 /// * `output_path` - Path to output AGC archive
1056 /// * `config` - Compression configuration
1057 /// * `splitters` - Pre-computed splitter k-mers
1058 pub fn with_splitters(
1059 output_path: impl AsRef<Path>,
1060 config: StreamingQueueConfig,
1061 splitters: AHashSet<u64>,
1062 ) -> Result<Self> {
1063 // Call internal with empty ref data (no dynamic splitter discovery)
1064 Self::with_splitters_internal(
1065 output_path,
1066 config,
1067 splitters,
1068 Arc::new(Vec::new()),
1069 Arc::new(AHashSet::new()),
1070 )
1071 }
1072
1073 /// Internal constructor that accepts all splitter data
1074 fn with_splitters_internal(
1075 output_path: impl AsRef<Path>,
1076 config: StreamingQueueConfig,
1077 splitters: AHashSet<u64>,
1078 ref_singletons: Arc<Vec<u64>>,
1079 ref_duplicates: Arc<AHashSet<u64>>,
1080 ) -> Result<Self> {
1081 let output_path = output_path.as_ref();
1082 let archive_path = output_path.to_string_lossy().to_string();
1083
1084 if config.verbosity > 0 {
1085 eprintln!("Initializing streaming compressor...");
1086 eprintln!(
1087 " Queue capacity: {} GB",
1088 config.queue_capacity / (1024 * 1024 * 1024)
1089 );
1090 eprintln!(" Worker threads: {}", config.num_threads);
1091 eprintln!(" Splitters: {}", splitters.len());
1092 }
1093
1094 // Create archive
1095 let mut archive = Archive::new_writer();
1096 archive.open(output_path)?;
1097
1098 // Create collection
1099 let mut collection = CollectionV3::new();
1100 collection.set_config(config.segment_size as u32, config.k as u32, None);
1101
1102 // CRITICAL: Register collection streams FIRST (C++ AGC compatibility)
1103 // C++ AGC expects collection-samples at stream 0, collection-contigs at 1, collection-details at 2
1104 collection.prepare_for_compression(&mut archive)?;
1105
1106 // DEFERRED METADATA STREAMS (C++ AGC compatibility)
1107 // C++ AGC writes segment data FIRST, then metadata streams at the END.
1108 // We register streams now but defer writing data until finalize().
1109
1110 // Prepare file_type_info data (defer write)
1111 let deferred_file_type_info = {
1112 let mut data = Vec::new();
1113 let append_str = |data: &mut Vec<u8>, s: &str| {
1114 data.extend_from_slice(s.as_bytes());
1115 data.push(0);
1116 };
1117
1118 append_str(&mut data, "producer");
1119 append_str(&mut data, "ragc");
1120 append_str(&mut data, "producer_version_major");
1121 append_str(&mut data, &ragc_common::AGC_FILE_MAJOR.to_string());
1122 append_str(&mut data, "producer_version_minor");
1123 append_str(&mut data, &ragc_common::AGC_FILE_MINOR.to_string());
1124 append_str(&mut data, "producer_version_build");
1125 append_str(&mut data, "0");
1126 append_str(&mut data, "file_version_major");
1127 append_str(&mut data, &ragc_common::AGC_FILE_MAJOR.to_string());
1128 append_str(&mut data, "file_version_minor");
1129 append_str(&mut data, &ragc_common::AGC_FILE_MINOR.to_string());
1130 append_str(&mut data, "comment");
1131 append_str(
1132 &mut data,
1133 &format!(
1134 "RAGC v.{}.{}",
1135 ragc_common::AGC_FILE_MAJOR,
1136 ragc_common::AGC_FILE_MINOR
1137 ),
1138 );
1139
1140 let stream_id = archive.register_stream("file_type_info");
1141 // DEFERRED: archive.add_part(stream_id, &data, 7) will be called in finalize()
1142 (stream_id, data)
1143 };
1144
1145 // Prepare params data (defer write)
1146 let deferred_params = {
1147 let stream_id = archive.register_stream("params");
1148 let mut data = Vec::new();
1149 data.extend_from_slice(&(config.k as u32).to_le_bytes());
1150 data.extend_from_slice(&(config.min_match_len as u32).to_le_bytes());
1151 data.extend_from_slice(&50u32.to_le_bytes()); // pack_cardinality (default)
1152 data.extend_from_slice(&(config.segment_size as u32).to_le_bytes());
1153 // DEFERRED: archive.add_part(stream_id, &data, 0) will be called in finalize()
1154 (stream_id, data)
1155 };
1156
1157 // Prepare empty splitters stream (defer write)
1158 let deferred_splitters = {
1159 let stream_id = archive.register_stream("splitters");
1160 let data = Vec::new();
1161 // DEFERRED: archive.add_part(stream_id, &data, 0) will be called in finalize()
1162 (stream_id, data)
1163 };
1164
1165 // Prepare empty segment-splitters stream (defer write)
1166 let deferred_segment_splitters = {
1167 let stream_id = archive.register_stream("segment-splitters");
1168 let data = Vec::new();
1169 // DEFERRED: archive.add_part(stream_id, &data, 0) will be called in finalize()
1170 (stream_id, data)
1171 };
1172
1173 let collection = Arc::new(Mutex::new(collection));
1174 let archive = Arc::new(Mutex::new(archive));
1175
1176 // Create memory-bounded queue
1177 let queue = Arc::new(MemoryBoundedQueue::new(config.queue_capacity));
1178
1179 let splitters = Arc::new(splitters);
1180 // ref_singletons and ref_duplicates are passed as parameters to ensure workers
1181 // get the same Arc as stored in self (critical for dynamic splitter discovery)
1182
1183 // Segment grouping for LZ packing (using BTreeMap for better memory efficiency)
1184 let segment_groups = Arc::new(Mutex::new(BTreeMap::new()));
1185 let group_counter = Arc::new(AtomicU32::new(NO_RAW_GROUPS)); // Start at 16 (LZ groups), group 0 reserved for orphan segments
1186 let raw_group_counter = Arc::new(AtomicU32::new(0)); // Round-robin counter for raw groups (0-15)
1187 let reference_sample_name = Arc::new(Mutex::new(None)); // Shared across all workers
1188
1189 // Segment splitting support (Phase 1)
1190 // Initialize map_segments with (MISSING_KMER, MISSING_KMER) → 0
1191 // This matches C++ AGC line 2396: map_segments[make_pair(~0ull, ~0ull)] = 0
1192 // All raw segments (both k-mers missing) will map to group 0
1193 let mut initial_map_segments = BTreeMap::new();
1194 initial_map_segments.insert(
1195 SegmentGroupKey {
1196 kmer_front: MISSING_KMER,
1197 kmer_back: MISSING_KMER,
1198 },
1199 0,
1200 );
1201 let map_segments: Arc<RwLock<BTreeMap<SegmentGroupKey, u32>>> =
1202 Arc::new(RwLock::new(initial_map_segments));
1203 let map_segments_terminators: Arc<RwLock<BTreeMap<u64, Vec<u64>>>> =
1204 Arc::new(RwLock::new(BTreeMap::new()));
1205 let split_offsets: Arc<Mutex<BTreeMap<(String, String, usize), usize>>> =
1206 Arc::new(Mutex::new(BTreeMap::new()));
1207
1208 // Persistent reference segment storage (matches C++ AGC v_segments)
1209 let reference_segments: Arc<RwLock<BTreeMap<u32, Vec<u8>>>> =
1210 Arc::new(RwLock::new(BTreeMap::new()));
1211
1212 // Reference orientation tracking (fixes ZERO_MATCH bug in Case 3 terminator segments)
1213 let reference_orientations: Arc<RwLock<BTreeMap<u32, bool>>> =
1214 Arc::new(RwLock::new(BTreeMap::new()));
1215
1216 // FFI Grouping Engine - C++ AGC-compatible group assignment
1217 #[cfg(feature = "cpp_agc")]
1218 let grouping_engine = Arc::new(Mutex::new(crate::ragc_ffi::GroupingEngine::new(
1219 config.k as u32,
1220 NO_RAW_GROUPS, // Start group IDs at 16 (group 0 reserved for orphan segments)
1221 )));
1222
1223 // Priority tracking for interleaved processing (matches C++ AGC)
1224 let sample_priorities: Arc<RwLock<BTreeMap<String, i32>>> =
1225 Arc::new(RwLock::new(BTreeMap::new()));
1226 let last_sample_name: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None)); // Track last sample for boundary detection
1227 let next_priority = Arc::new(Mutex::new(i32::MAX)); // Start high, decrease for each sample
1228 let next_sequence = Arc::new(std::sync::atomic::AtomicU64::new(0)); // Increases for each contig (FASTA order)
1229 let global_contig_count = Arc::new(AtomicUsize::new(0)); // GLOBAL counter across all samples (C++ AGC: cnt_contigs_in_sample)
1230
1231 // Batch-local group assignment (matches C++ AGC m_kmers per-batch behavior)
1232 let batch_samples: Arc<Mutex<HashSet<String>>> = Arc::new(Mutex::new(HashSet::new()));
1233 let batch_local_groups: Arc<Mutex<BTreeMap<SegmentGroupKey, u32>>> =
1234 Arc::new(Mutex::new(BTreeMap::new()));
1235 let batch_local_terminators: Arc<Mutex<BTreeMap<u64, Vec<u64>>>> =
1236 Arc::new(Mutex::new(BTreeMap::new()));
1237 let pending_batch_segments: Arc<Mutex<Vec<PendingSegment>>> =
1238 Arc::new(Mutex::new(Vec::new()));
1239 // Two-tier segment buffering for C++ AGC 4-phase parallel pattern
1240 let buffered_seg_part: Arc<BufferedSegPart> =
1241 Arc::new(BufferedSegPart::new(NO_RAW_GROUPS as usize));
1242 let map_fallback_minimizers: Arc<Mutex<BTreeMap<u64, Vec<(u64, u64)>>>> =
1243 Arc::new(Mutex::new(BTreeMap::new()));
1244
1245 // Initialize barrier for sample boundary synchronization (matches C++ AGC barrier)
1246 // All workers must synchronize at sample boundaries to ensure batch flush completes before processing new samples
1247 let barrier = Arc::new(std::sync::Barrier::new(config.num_threads));
1248
1249 // Parallel Phase 3 state for atomic work-stealing (matches C++ AGC architecture)
1250 let parallel_state = Arc::new(ParallelFlushState::new());
1251
1252 // Per-stream write buffer for parallel Phase 3 (C++ AGC pattern: per-segment mutex)
1253 // Workers on different streams can buffer writes concurrently without contention
1254 let write_buffer = Arc::new(ParallelWriteBuffer::new());
1255
1256 // RAW segment buffers for deferred classification (parallel Phase 1 optimization)
1257 // PER-WORKER buffers eliminate contention: each worker pushes to its own buffer
1258 let raw_segment_buffers: Arc<Vec<Mutex<Vec<RawBufferedSegment>>>> = Arc::new(
1259 (0..config.num_threads)
1260 .map(|_| Mutex::new(Vec::new()))
1261 .collect(),
1262 );
1263
1264 // Spawn worker threads
1265 let mut workers = Vec::new();
1266 for worker_id in 0..config.num_threads {
1267 let queue = Arc::clone(&queue);
1268 let collection = Arc::clone(&collection);
1269 let splitters = Arc::clone(&splitters);
1270 let ref_singletons = Arc::clone(&ref_singletons);
1271 let ref_duplicates = Arc::clone(&ref_duplicates);
1272 let archive = Arc::clone(&archive);
1273 let segment_groups = Arc::clone(&segment_groups);
1274 let group_counter = Arc::clone(&group_counter);
1275 let raw_group_counter = Arc::clone(&raw_group_counter);
1276 let reference_sample_name = Arc::clone(&reference_sample_name);
1277 let map_segments = Arc::clone(&map_segments);
1278 let map_segments_terminators = Arc::clone(&map_segments_terminators);
1279 let reference_segments = Arc::clone(&reference_segments);
1280 let reference_orientations = Arc::clone(&reference_orientations);
1281 let split_offsets = Arc::clone(&split_offsets);
1282 #[cfg(feature = "cpp_agc")]
1283 let grouping_engine = Arc::clone(&grouping_engine);
1284 let batch_samples = Arc::clone(&batch_samples);
1285 let batch_local_groups = Arc::clone(&batch_local_groups);
1286 let batch_local_terminators = Arc::clone(&batch_local_terminators);
1287 let pending_batch_segments = Arc::clone(&pending_batch_segments);
1288 let buffered_seg_part = Arc::clone(&buffered_seg_part);
1289 let map_fallback_minimizers = Arc::clone(&map_fallback_minimizers);
1290 let barrier = Arc::clone(&barrier);
1291 let parallel_state = Arc::clone(¶llel_state);
1292 let write_buffer = Arc::clone(&write_buffer);
1293 let raw_segment_buffers = Arc::clone(&raw_segment_buffers);
1294 let config = config.clone();
1295
1296 let handle = thread::spawn(move || {
1297 worker_thread(
1298 worker_id,
1299 queue,
1300 collection,
1301 splitters,
1302 ref_singletons,
1303 ref_duplicates,
1304 archive,
1305 segment_groups,
1306 group_counter,
1307 raw_group_counter,
1308 reference_sample_name,
1309 map_segments,
1310 map_segments_terminators,
1311 reference_segments,
1312 reference_orientations,
1313 split_offsets,
1314 #[cfg(feature = "cpp_agc")]
1315 grouping_engine,
1316 batch_samples,
1317 batch_local_groups,
1318 batch_local_terminators,
1319 pending_batch_segments,
1320 buffered_seg_part,
1321 map_fallback_minimizers,
1322 raw_segment_buffers,
1323 barrier,
1324 parallel_state,
1325 write_buffer,
1326 config,
1327 )
1328 });
1329
1330 workers.push(handle);
1331 }
1332
1333 if config.verbosity > 0 {
1334 eprintln!("Ready to receive sequences!");
1335 }
1336
1337 Ok(Self {
1338 queue,
1339 workers,
1340 barrier,
1341 collection,
1342 splitters,
1343 config,
1344 archive,
1345 segment_groups,
1346 group_counter,
1347 raw_group_counter,
1348 reference_sample_name,
1349 map_segments,
1350 map_segments_terminators,
1351 #[cfg(feature = "cpp_agc")]
1352 grouping_engine,
1353 reference_segments,
1354 reference_orientations,
1355 split_offsets,
1356 sample_priorities,
1357 last_sample_name,
1358 next_priority,
1359 batch_samples,
1360 batch_local_groups,
1361 batch_local_terminators,
1362 pending_batch_segments,
1363 buffered_seg_part,
1364 map_fallback_minimizers,
1365 next_sequence,
1366 global_contig_count,
1367 // Deferred metadata streams (written at end for C++ AGC compatibility)
1368 deferred_file_type_info,
1369 deferred_params,
1370 deferred_splitters,
1371 deferred_segment_splitters,
1372 // Dynamic splitter discovery - MUST use the SAME Arcs passed to workers!
1373 // (empty by default - populated with_full_splitter_data)
1374 ref_singletons,
1375 ref_duplicates,
1376 // Parallel Phase 3 state
1377 parallel_state,
1378 // Per-stream write buffer
1379 write_buffer,
1380 // Raw segment buffers for deferred classification (per-worker)
1381 raw_segment_buffers,
1382 })
1383 }
1384
1385 /// Create a new streaming compressor with full splitter data for dynamic discovery
1386 ///
1387 /// This is the preferred constructor when using adaptive mode. It accepts:
1388 /// - `splitters`: Pre-computed splitter k-mers from reference (for initial segmentation)
1389 /// - `singletons`: All singleton k-mers from reference (for exclusion in find_new_splitters)
1390 /// - `duplicates`: All duplicate k-mers from reference (for exclusion in find_new_splitters)
1391 ///
1392 /// # Arguments
1393 /// * `output_path` - Path to output AGC archive
1394 /// * `config` - Compression configuration
1395 /// * `splitters` - Pre-computed splitter k-mers
1396 /// * `singletons` - Reference singleton k-mers (sorted Vec for binary search)
1397 /// * `duplicates` - Reference duplicate k-mers
1398 pub fn with_full_splitter_data(
1399 output_path: impl AsRef<Path>,
1400 config: StreamingQueueConfig,
1401 splitters: AHashSet<u64>,
1402 singletons: Vec<u64>,
1403 duplicates: AHashSet<u64>,
1404 ) -> Result<Self> {
1405 // Sort singletons for binary search before creating compressor
1406 let mut sorted_singletons = singletons;
1407 sorted_singletons.sort_unstable();
1408
1409 let verbosity = config.verbosity;
1410 let ref_singletons = Arc::new(sorted_singletons);
1411 let ref_duplicates = Arc::new(duplicates);
1412
1413 if verbosity > 0 {
1414 eprintln!(
1415 " Dynamic splitter discovery enabled: {} ref singletons, {} ref duplicates",
1416 ref_singletons.len(),
1417 ref_duplicates.len()
1418 );
1419 }
1420
1421 // Call internal constructor with ref data so workers get the correct Arcs
1422 Self::with_splitters_internal(
1423 output_path,
1424 config,
1425 splitters,
1426 ref_singletons,
1427 ref_duplicates,
1428 )
1429 }
1430
1431 /// Create compressor and determine splitters from first contig
1432 ///
1433 /// **Note**: This requires at least one contig to be pushed before workers start.
1434 /// Consider using `with_splitters()` instead if you have a reference genome.
1435 pub fn new(output_path: impl AsRef<Path>, config: StreamingQueueConfig) -> Result<Self> {
1436 // Start with empty splitters - will be determined from first push
1437 Self::with_splitters(output_path, config, AHashSet::new())
1438 }
1439
1440 /// Push a contig to the compression queue
1441 ///
1442 /// **BLOCKS** if the queue is full (automatic backpressure!)
1443 ///
1444 /// # Arguments
1445 /// * `sample_name` - Name of the sample
1446 /// * `contig_name` - Name of the contig
1447 /// * `data` - Contig sequence data (Vec<u8>)
1448 ///
1449 /// # Example
1450 /// ```no_run
1451 /// # use ragc_core::{StreamingQueueCompressor, StreamingQueueConfig};
1452 /// # use ahash::AHashSet;
1453 /// # let mut compressor = StreamingQueueCompressor::with_splitters("out.agc", StreamingQueueConfig::default(), AHashSet::new())?;
1454 /// compressor.push("sample1".to_string(), "chr1".to_string(), vec![b'A', b'T', b'G', b'C'])?;
1455 /// # Ok::<(), anyhow::Error>(())
1456 /// ```
1457 pub fn push(&mut self, sample_name: String, contig_name: String, data: Contig) -> Result<()> {
1458 // If no splitters yet, determine from this contig
1459 if self.splitters.is_empty() && self.workers.is_empty() {
1460 if self.config.verbosity > 0 {
1461 eprintln!("Determining splitters from first contig...");
1462 }
1463
1464 let (splitters, _, _) =
1465 determine_splitters(&[data.clone()], self.config.k, self.config.segment_size);
1466
1467 if self.config.verbosity > 0 {
1468 eprintln!("Found {} splitters", splitters.len());
1469 }
1470
1471 // Update splitters and spawn workers
1472 self.splitters = Arc::new(splitters);
1473
1474 // Spawn workers now that we have splitters
1475 for worker_id in 0..self.config.num_threads {
1476 let queue = Arc::clone(&self.queue);
1477 let collection = Arc::clone(&self.collection);
1478 let splitters = Arc::clone(&self.splitters);
1479 let ref_singletons = Arc::clone(&self.ref_singletons);
1480 let ref_duplicates = Arc::clone(&self.ref_duplicates);
1481 let archive = Arc::clone(&self.archive);
1482 let segment_groups = Arc::clone(&self.segment_groups);
1483 let group_counter = Arc::clone(&self.group_counter);
1484 let raw_group_counter = Arc::clone(&self.raw_group_counter);
1485 let reference_sample_name = Arc::clone(&self.reference_sample_name);
1486 let map_segments = Arc::clone(&self.map_segments);
1487 let map_segments_terminators = Arc::clone(&self.map_segments_terminators);
1488 let reference_segments = Arc::clone(&self.reference_segments);
1489 let reference_orientations = Arc::clone(&self.reference_orientations);
1490 let split_offsets = Arc::clone(&self.split_offsets);
1491 #[cfg(feature = "cpp_agc")]
1492 let grouping_engine = Arc::clone(&self.grouping_engine);
1493 let batch_samples = Arc::clone(&self.batch_samples);
1494 let batch_local_groups = Arc::clone(&self.batch_local_groups);
1495 let batch_local_terminators = Arc::clone(&self.batch_local_terminators);
1496 let pending_batch_segments = Arc::clone(&self.pending_batch_segments);
1497 let buffered_seg_part = Arc::clone(&self.buffered_seg_part);
1498 let map_fallback_minimizers = Arc::clone(&self.map_fallback_minimizers);
1499 let raw_segment_buffers = Arc::clone(&self.raw_segment_buffers);
1500 let barrier = Arc::clone(&self.barrier);
1501 let parallel_state = Arc::clone(&self.parallel_state);
1502 let write_buffer = Arc::clone(&self.write_buffer);
1503 let config = self.config.clone();
1504
1505 let handle = thread::spawn(move || {
1506 worker_thread(
1507 worker_id,
1508 queue,
1509 collection,
1510 splitters,
1511 ref_singletons,
1512 ref_duplicates,
1513 archive,
1514 segment_groups,
1515 group_counter,
1516 raw_group_counter,
1517 reference_sample_name,
1518 map_segments,
1519 map_segments_terminators,
1520 reference_segments,
1521 reference_orientations,
1522 split_offsets,
1523 #[cfg(feature = "cpp_agc")]
1524 grouping_engine,
1525 batch_samples,
1526 batch_local_groups,
1527 batch_local_terminators,
1528 pending_batch_segments,
1529 buffered_seg_part,
1530 map_fallback_minimizers,
1531 raw_segment_buffers,
1532 barrier,
1533 parallel_state,
1534 write_buffer,
1535 config,
1536 )
1537 });
1538
1539 self.workers.push(handle);
1540 }
1541
1542 if self.config.verbosity > 0 {
1543 eprintln!("Workers spawned and ready!");
1544 }
1545 }
1546
1547 // Register contig in collection
1548 {
1549 let mut collection = self.collection.lock().unwrap();
1550 collection
1551 .register_sample_contig(&sample_name, &contig_name)
1552 .context("Failed to register contig")?;
1553 }
1554
1555 // Set first sample as reference (multi-file mode)
1556 {
1557 let mut ref_sample = self.reference_sample_name.lock().unwrap();
1558 if ref_sample.is_none() {
1559 if self.config.verbosity > 0 {
1560 eprintln!("Using first sample ({}) as reference", sample_name);
1561 }
1562 *ref_sample = Some(sample_name.clone());
1563 }
1564 }
1565
1566 // Calculate task size
1567 let task_size = data.len();
1568
1569 // Get sequence number for FASTA ordering (lower = earlier = higher priority)
1570 let sequence = self
1571 .next_sequence
1572 .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1573
1574 // Get or assign priority for this sample (matches C++ AGC priority queue)
1575 // Higher priority = processed first (decreases for each new sample)
1576 // C++ AGC also decrements priority every 50 contigs WITHIN a sample (max_no_contigs_before_synchronization)
1577 let sample_priority = {
1578 let mut priorities = self.sample_priorities.write().unwrap();
1579 let current_priority = *priorities.entry(sample_name.clone()).or_insert_with(|| {
1580 // First time seeing this sample - assign new priority
1581 let mut next_p = self.next_priority.lock().unwrap();
1582 let priority = *next_p;
1583 *next_p -= 1; // Decrement for next sample (C++ AGC uses --sample_priority)
1584 priority
1585 });
1586
1587 // Track GLOBAL contig count and insert sync tokens every 50 contigs (pack_cardinality)
1588 // C++ AGC: if (++cnt_contigs_in_sample >= max_no_contigs_before_synchronization)
1589 // NOTE: Despite the name, C++ AGC's cnt_contigs_in_sample is GLOBAL, not per-sample!
1590 // FIX 5: Only send PACK_BOUNDARY sync tokens in concatenated mode (single file)
1591 // In non-concatenated mode (multiple files), only SAMPLE_BOUNDARY sync tokens are sent
1592 let count = self
1593 .global_contig_count
1594 .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1595 let need_sync =
1596 self.config.concatenated_genomes && (count + 1) % self.config.pack_size == 0;
1597
1598 if need_sync {
1599 // Reached synchronization point (every 50 contigs GLOBALLY)
1600 // C++ AGC does: cnt_contigs_in_sample = 0; --sample_priority;
1601 if let Some(priority) = priorities.get_mut(&sample_name) {
1602 *priority -= 1;
1603 }
1604
1605 // Get the NEW priority (after decrement) for sync tokens
1606 let new_priority = *priorities.get(&sample_name).unwrap();
1607
1608 // Drop locks before inserting sync tokens to avoid deadlock
1609 drop(priorities);
1610
1611 // Insert sync tokens (matches C++ AGC EmplaceManyNoCost)
1612 // CRITICAL: Sync tokens must have HIGHER priority than subsequent contigs
1613 // to ensure they're processed before any contigs with the new_priority.
1614 if self.config.verbosity > 0 {
1615 eprintln!(
1616 "PACK_BOUNDARY: Inserting {} sync tokens after {} contigs (global count)",
1617 self.config.num_threads,
1618 count + 1
1619 );
1620 }
1621
1622 for _ in 0..self.config.num_threads {
1623 let sync_token = ContigTask {
1624 sample_name: sample_name.clone(),
1625 contig_name: String::from("<SYNC>"),
1626 data: Vec::new(),
1627 // Use large priority boost to ensure sync tokens are processed BEFORE any contigs
1628 // With +1, contigs with same priority but higher cost were being popped first
1629 // This caused barrier deadlock when some workers exited before others got sync tokens
1630 sample_priority: new_priority + 1_000_000,
1631 cost: 0,
1632 sequence,
1633 is_sync_token: true,
1634 };
1635 self.queue.push(sync_token, 0)?;
1636 }
1637
1638 // Return NEW priority for subsequent contigs
1639 new_priority
1640 } else {
1641 current_priority // Use priority BEFORE potential decrement (this contig uses current priority)
1642 }
1643 };
1644
1645 // Insert sync tokens at sample boundaries (matches C++ AGC registration tokens)
1646 // OPTIMIZATION: In multi-file mode, SKIP per-sample sync tokens for better parallelism
1647 // This batches all samples together - sync only happens at finalization
1648 // Set RAGC_SYNC_PER_SAMPLE=1 to force per-sample sync (matches old behavior)
1649 {
1650 let mut last_sample = self.last_sample_name.lock().unwrap();
1651 if let Some(ref last) = *last_sample {
1652 if last != &sample_name {
1653 // Sample boundary detected
1654 // Only insert sync tokens if forced by env var (for debugging/compatibility)
1655 let force_sync = std::env::var("RAGC_SYNC_PER_SAMPLE")
1656 .map(|v| v == "1")
1657 .unwrap_or(false);
1658
1659 if force_sync {
1660 if self.config.verbosity > 0 {
1661 eprintln!(
1662 "SAMPLE_BOUNDARY: Inserting {} sync tokens (transitioning from {} to {})",
1663 self.config.num_threads, last, sample_name
1664 );
1665 }
1666
1667 // Insert num_threads sync tokens (matches C++ AGC EmplaceManyNoCost)
1668 // All workers must pop a token and synchronize before processing new sample
1669 // CRITICAL: Sync tokens must have MUCH HIGHER priority than any contigs
1670 // to ensure they're pulled and processed BEFORE any contigs.
1671 // Use large priority boost (+1_000_000) to overcome cost-based tie-breaking
1672 // which was causing contigs to be popped before sync tokens at same priority.
1673 for _ in 0..self.config.num_threads {
1674 let sync_token = ContigTask {
1675 sample_name: sample_name.clone(),
1676 contig_name: String::from("<SYNC>"),
1677 data: Vec::new(), // Empty data for sync token
1678 sample_priority: sample_priority + 1_000_000, // Much higher priority than any contigs
1679 cost: 0, // No cost for sync tokens
1680 sequence,
1681 is_sync_token: true,
1682 };
1683 self.queue.push(sync_token, 0)?; // 0 size for sync tokens
1684 }
1685 } else if self.config.verbosity > 1 {
1686 eprintln!(
1687 "SAMPLE_BOUNDARY: SKIPPING sync tokens (multi-file batching: {} -> {})",
1688 last, sample_name
1689 );
1690 }
1691 }
1692 }
1693 // Update last sample name
1694 *last_sample = Some(sample_name.clone());
1695 }
1696
1697 // Create task with priority information
1698 // NOTE: sequence is used for FASTA ordering (lower = processed first)
1699 let cost = data.len(); // C++ AGC: auto cost = contig.size()
1700 let task = ContigTask {
1701 sample_name: sample_name.clone(),
1702 contig_name,
1703 data,
1704 sample_priority,
1705 cost,
1706 sequence,
1707 is_sync_token: false, // Normal contig task, not a sync token
1708 };
1709
1710 // Push to queue (BLOCKS if queue is full!)
1711 // Queue is now a priority queue - highest priority processed first
1712 // eprintln!("[RAGC PUSH] sample={} contig={} priority={} cost={} sequence={}",
1713 // &task.sample_name, &task.contig_name, task.sample_priority, task.cost, task.sequence);
1714 self.queue
1715 .push(task, task_size)
1716 .context("Failed to push to queue")?;
1717
1718 Ok(())
1719 }
1720
    /// Wait for the queue to drain without shutting anything down.
    ///
    /// Blocks until every queued item has been pulled off the queue by the
    /// worker threads, polling the queue length every 100 ms. Unlike
    /// `finalize()`, this does not close the queue or join the workers, so
    /// more sequences may be pushed afterwards.
    ///
    /// NOTE(review): an empty queue only means all items were *dequeued*;
    /// a worker may still be processing the last item it popped — confirm
    /// whether callers rely on full completion here.
    pub fn drain(&self) -> Result<()> {
        if self.config.verbosity > 0 {
            eprintln!(
                "Draining queue (waiting for {} items to be processed)...",
                self.queue.len()
            );
        }

        // Wait for queue to empty
        // Poll every 100ms until queue is empty
        while self.queue.len() > 0 {
            std::thread::sleep(std::time::Duration::from_millis(100));
        }

        if self.config.verbosity > 0 {
            eprintln!("Queue drained - all queued contigs processed");
        }

        Ok(())
    }
1758
1759 /// Insert sync tokens to trigger incremental compression of buffered segments.
1760 /// Call this after pushing a batch of samples to process them incrementally
1761 /// instead of waiting for finalize().
1762 pub fn sync_and_flush(&self, sample_name: &str) -> Result<()> {
1763 // Insert sync tokens for each worker
1764 let sequence = self
1765 .next_sequence
1766 .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1767
1768 for _ in 0..self.config.num_threads {
1769 let sync_token = ContigTask {
1770 sample_name: format!("<SYNC:{}>", sample_name),
1771 contig_name: String::from("<SYNC>"),
1772 data: Vec::new(),
1773 sample_priority: 1_000_000_i32, // High priority = processed after pending contigs
1774 cost: 0,
1775 sequence,
1776 is_sync_token: true,
1777 };
1778 self.queue.push(sync_token, 0)?;
1779 }
1780
1781 // Wait for sync tokens to be processed (queue empty)
1782 while self.queue.len() > 0 {
1783 std::thread::sleep(std::time::Duration::from_millis(10));
1784 }
1785
1786 Ok(())
1787 }
1788
1789 pub fn finalize(self) -> Result<()> {
1790 if self.config.verbosity > 0 {
1791 eprintln!("Finalizing compression...");
1792 }
1793
1794 // CRITICAL: Insert FINAL sync tokens before closing queue
1795 // This ensures buffered_seg_part data is processed and flushed
1796 // (matches C++ AGC line 2236-2244: final sync at end of input)
1797 if self.config.verbosity > 0 {
1798 eprintln!(
1799 " Inserting {} final sync tokens...",
1800 self.config.num_threads
1801 );
1802 }
1803
1804 // Use sequence 0 and high priority to ensure sync tokens are processed last
1805 let sequence = 0;
1806
1807 for _ in 0..self.config.num_threads {
1808 let sync_token = ContigTask {
1809 sample_name: String::from("<FINAL>"),
1810 contig_name: String::from("<SYNC>"),
1811 data: Vec::new(),
1812 sample_priority: 1_000_000_i32, // Very high priority = processed after all real contigs
1813 cost: 0,
1814 sequence,
1815 is_sync_token: true,
1816 };
1817 self.queue.push(sync_token, 0)?;
1818 }
1819
1820 if self.config.verbosity > 0 {
1821 eprintln!(" Closing queue...");
1822 }
1823
1824 // Close queue - no more pushes allowed
1825 self.queue.close();
1826
1827 if self.config.verbosity > 0 {
1828 eprintln!(" Waiting for {} workers to finish...", self.workers.len());
1829 }
1830
1831 let wait_start = std::time::Instant::now();
1832 // Wait for all workers to finish
1833 for (i, handle) in self.workers.into_iter().enumerate() {
1834 handle
1835 .join()
1836 .expect("Worker thread panicked")
1837 .with_context(|| format!("Worker {} failed", i))?;
1838 }
1839
1840 if self.config.verbosity > 0 {
1841 eprintln!(
1842 "FINALIZE_TIMING: Wait for workers took {:?}",
1843 wait_start.elapsed()
1844 );
1845 eprintln!("All workers finished!");
1846 eprintln!("Flushing remaining segment packs...");
1847 }
1848
1849 // Flush all remaining partial packs using PARALLEL compression
1850 let flush_start = std::time::Instant::now();
1851 {
1852 use crate::segment_compression::compress_segment_configured;
1853 use rayon::prelude::*;
1854
1855 let mut groups = self.segment_groups.lock().unwrap();
1856 let num_groups = groups.len();
1857
1858 // Phase 1: Flush any groups with pending segments (rare, usually 0-1)
1859 let phase1_start = std::time::Instant::now();
1860 let mut phase1_count = 0;
1861 for (key, buffer) in groups.iter_mut() {
1862 if !buffer.segments.is_empty() || !buffer.ref_written {
1863 phase1_count += 1;
1864 if self.config.verbosity > 1 {
1865 eprintln!(
1866 "Flushing group {} with {} segments (k-mers: {:#x}, {:#x})",
1867 buffer.group_id,
1868 buffer.segments.len(),
1869 key.kmer_front,
1870 key.kmer_back
1871 );
1872 }
1873 flush_pack(
1874 buffer,
1875 &self.collection,
1876 &self.archive,
1877 &self.config,
1878 &self.reference_segments,
1879 )
1880 .context("Failed to flush remaining pack")?;
1881 }
1882 }
1883 if self.config.verbosity > 0 {
1884 eprintln!(
1885 "FLUSH_PHASE1: {} groups with pending segments, took {:?}",
1886 phase1_count,
1887 phase1_start.elapsed()
1888 );
1889 }
1890
1891 // Phase 2: Collect and PARALLEL compress pending_deltas
1892 // Each entry: (stream_id, raw_data, compressed_data, raw_size)
1893 struct PartialPackData {
1894 stream_id: usize,
1895 raw_data: Vec<u8>,
1896 compressed: Vec<u8>,
1897 raw_size: usize,
1898 use_compressed: bool,
1899 }
1900
1901 let compression_level = self.config.compression_level;
1902 let verbosity = self.config.verbosity;
1903
1904 // Extract work items from groups
1905 let phase2_start = std::time::Instant::now();
1906 let work_items: Vec<_> = groups
1907 .iter_mut()
1908 .filter(|(_, buffer)| !buffer.pending_deltas.is_empty())
1909 .map(|(_, buffer)| {
1910 // FIX: Raw groups are 0-15 (group_id < 16), not just group 0
1911 let use_lz_encoding = buffer.group_id >= NO_RAW_GROUPS;
1912 let mut packed_data = Vec::new();
1913
1914 if !use_lz_encoding && !buffer.raw_placeholder_written {
1915 packed_data.push(0x7f);
1916 packed_data.push(CONTIG_SEPARATOR);
1917 }
1918
1919 for delta in buffer.pending_deltas.iter() {
1920 packed_data.extend_from_slice(delta);
1921 packed_data.push(CONTIG_SEPARATOR);
1922 }
1923
1924 let stream_id = buffer.stream_id as usize;
1925 let group_id = buffer.group_id;
1926 let delta_count = buffer.pending_deltas.len();
1927
1928 buffer.pending_deltas.clear();
1929 buffer.pending_delta_ids.clear();
1930
1931 (stream_id, packed_data, group_id, delta_count)
1932 })
1933 .collect();
1934
1935 let work_items_count = work_items.len();
1936 if self.config.verbosity > 0 {
1937 eprintln!(
1938 "FLUSH_PHASE2a: Collected {} work items, took {:?}",
1939 work_items_count,
1940 phase2_start.elapsed()
1941 );
1942 }
1943
1944 // Parallel compression using rayon
1945 // Use full compression level for final partial packs to match C++ AGC output.
1946 // Previously capped at level 9 for speed, but this caused 15% larger archives
1947 // when most compression happens in finalize (e.g., with per-sample sync).
1948 let partial_compression_level = compression_level;
1949 let compress_start = std::time::Instant::now();
1950 let compressed_packs: Vec<PartialPackData> = work_items
1951 .into_par_iter()
1952 .filter_map(|(stream_id, packed_data, group_id, delta_count)| {
1953 if packed_data.is_empty() {
1954 return None;
1955 }
1956
1957 let raw_size = packed_data.len();
1958 let mut compressed = match compress_segment_configured(
1959 &packed_data,
1960 partial_compression_level,
1961 ) {
1962 Ok(c) => c,
1963 Err(e) => {
1964 eprintln!(
1965 "Error compressing final partial pack for group {}: {}",
1966 group_id, e
1967 );
1968 return None;
1969 }
1970 };
1971 compressed.push(0); // Marker 0 = plain ZSTD
1972
1973 let use_compressed = compressed.len() < raw_size;
1974
1975 if verbosity > 1 {
1976 eprintln!(
1977 " Compressed final partial pack for group {} with {} deltas",
1978 group_id, delta_count
1979 );
1980 }
1981
1982 Some(PartialPackData {
1983 stream_id,
1984 raw_data: packed_data,
1985 compressed,
1986 raw_size,
1987 use_compressed,
1988 })
1989 })
1990 .collect();
1991
1992 if self.config.verbosity > 0 {
1993 eprintln!(
1994 "FLUSH_PHASE2b: Parallel compression of {} packs, took {:?}",
1995 compressed_packs.len(),
1996 compress_start.elapsed()
1997 );
1998 }
1999
2000 // Phase 3: Sequential writes to archive (sorted by stream_id for determinism)
2001 let phase3_start = std::time::Instant::now();
2002 let mut sorted_packs = compressed_packs;
2003 sorted_packs.sort_by_key(|p| p.stream_id);
2004
2005 let mut arch = self.archive.lock().unwrap();
2006 for pack in sorted_packs {
2007 if pack.use_compressed {
2008 // Use buffered writes to reduce syscalls
2009 arch.add_part_buffered(
2010 pack.stream_id,
2011 pack.compressed.clone(),
2012 pack.raw_size as u64,
2013 );
2014 } else {
2015 arch.add_part_buffered(pack.stream_id, pack.raw_data.clone(), 0);
2016 }
2017 }
2018 drop(arch);
2019
2020 if self.config.verbosity > 0 {
2021 eprintln!(
2022 "FLUSH_PHASE3: Sequential writes, took {:?}",
2023 phase3_start.elapsed()
2024 );
2025 eprintln!("Flushed {} segment groups", num_groups);
2026 eprintln!("FINALIZE_TIMING: Flush took {:?}", flush_start.elapsed());
2027 }
2028 }
2029
2030 if self.config.verbosity > 0 {
2031 eprintln!("Writing metadata...");
2032 }
2033
2034 // Get total sample count for metadata writing
2035 let num_samples = {
2036 let coll = self.collection.lock().unwrap();
2037 coll.get_no_samples()
2038 };
2039
2040 // Write collection metadata to archive
2041 {
2042 let mut archive = self.archive.lock().unwrap();
2043 let mut collection = self.collection.lock().unwrap();
2044
2045 // DEFERRED METADATA WRITES (C++ AGC compatibility)
2046 // C++ AGC writes metadata streams AFTER segment data, in this order:
2047 // 1. params
2048 // 2. splitters
2049 // 3. segment-splitters
2050 // 4. collection metadata (samples, contigs, details)
2051 // 5. file_type_info
2052 let (params_stream_id, params_data) = &self.deferred_params;
2053 archive.add_part_buffered(*params_stream_id, params_data.clone(), 0);
2054
2055 let (splitters_stream_id, splitters_data) = &self.deferred_splitters;
2056 archive.add_part_buffered(*splitters_stream_id, splitters_data.clone(), 0);
2057
2058 let (seg_splitters_stream_id, seg_splitters_data) = &self.deferred_segment_splitters;
2059 archive.add_part_buffered(*seg_splitters_stream_id, seg_splitters_data.clone(), 0);
2060
2061 // Write sample names
2062 collection
2063 .store_batch_sample_names(&mut archive)
2064 .context("Failed to write sample names")?;
2065
2066 // Write contig names and segment details in batches of 50
2067 // (matches C++ AGC pack_cardinality default)
2068 const PACK_CARDINALITY: usize = 50;
2069 let mut i = 0;
2070 while i < num_samples {
2071 let batch_end = (i + PACK_CARDINALITY).min(num_samples);
2072 collection
2073 .store_contig_batch(&mut archive, i, batch_end)
2074 .context("Failed to write contig batch")?;
2075 i = batch_end;
2076 }
2077
2078 // Write file_type_info LAST (matches C++ AGC store_file_type_info order)
2079 let (file_type_info_stream_id, file_type_info_data) = &self.deferred_file_type_info;
2080 archive.add_part_buffered(*file_type_info_stream_id, file_type_info_data.clone(), 7);
2081
2082 // Flush all buffered writes to disk in one batch (reduces syscalls from ~200 to ~1)
2083 archive
2084 .flush_buffers()
2085 .context("Failed to flush archive buffers")?;
2086
2087 if self.config.verbosity > 0 {
2088 eprintln!("Collection metadata written successfully");
2089 }
2090
2091 // Close archive (writes footer)
2092 archive.close().context("Failed to close archive")?;
2093 }
2094
2095 if self.config.verbosity > 0 {
2096 eprintln!("Compression complete!");
2097 }
2098
2099 Ok(())
2100 }
2101
2102 /// Get current queue statistics
2103 pub fn queue_stats(&self) -> QueueStats {
2104 QueueStats {
2105 current_size_bytes: self.queue.current_size(),
2106 current_items: self.queue.len(),
2107 capacity_bytes: self.queue.capacity(),
2108 is_closed: self.queue.is_closed(),
2109 }
2110 }
2111}
2112
/// Queue statistics snapshot returned by `queue_stats()`.
#[derive(Debug, Clone)]
pub struct QueueStats {
    /// Total bytes currently held by queued items.
    pub current_size_bytes: usize,
    /// Number of items currently in the queue.
    pub current_items: usize,
    /// Queue capacity in bytes.
    pub capacity_bytes: usize,
    /// Whether the queue has been closed to further pushes.
    pub is_closed: bool,
}
2121
/// Pre-compressed data ready for archive write (compression is done without
/// holding any locks; the archive lock is taken only for the final write).
struct PreCompressedPart {
    /// Archive stream this part belongs to.
    stream_id: usize,
    /// Payload bytes: compressed data, or the raw bytes if compression did not help.
    data: Vec<u8>,
    /// Uncompressed size when `data` is compressed; 0 when `data` is raw.
    metadata: u64,
}
2129
/// Segment registration data for collection (batched for single lock acquisition)
struct SegmentRegistration {
    /// Sample the segment belongs to.
    sample_name: String,
    /// Contig the segment belongs to.
    contig_name: String,
    /// Position of this segment within its contig.
    seg_part_no: usize,
    /// Segment group the data was placed in.
    group_id: u32,
    /// Index of the segment within its group (0 = the group's reference).
    in_group_id: u32,
    /// Reverse-complement flag passed through to the collection.
    is_rev_comp: bool,
    /// Uncompressed segment length in bytes.
    raw_length: u32,
}
2140
/// Result of parallel compression phase (for deterministic sequential writes)
/// Workers produce these in parallel, then Thread 0 writes them in sorted order
struct FlushPackResult {
    /// Group the pack belongs to — presumably the deterministic sort key; confirm at use site.
    group_id: u32,
    /// Pre-compressed parts to append to the archive.
    archive_writes: Vec<PreCompressedPart>,
    /// Segment registrations to apply to the collection.
    registrations: Vec<SegmentRegistration>,
    /// Reference segment data `(group_id, bytes)` to publish to the global reference map, if any.
    ref_to_store: Option<(u32, Vec<u8>)>,
}
2149
2150fn flush_pack(
2151 buffer: &mut SegmentGroupBuffer,
2152 collection: &Arc<Mutex<CollectionV3>>,
2153 archive: &Arc<Mutex<Archive>>,
2154 config: &StreamingQueueConfig,
2155 reference_segments: &Arc<RwLock<BTreeMap<u32, Vec<u8>>>>,
2156) -> Result<()> {
2157 use crate::segment_compression::{compress_reference_segment, compress_segment_configured};
2158
2159 // Skip if no segments to write (but still write reference if present)
2160 if buffer.segments.is_empty() && buffer.ref_written {
2161 return Ok(());
2162 }
2163
2164 let use_lz_encoding = buffer.group_id >= NO_RAW_GROUPS;
2165
2166 // CRITICAL FIX: Sort ALL segments FIRST by (sample_name, contig_name, seg_part_no)
2167 // BEFORE picking the reference. This ensures the lexicographically first segment
2168 // becomes the reference, matching C++ AGC's behavior.
2169 // (Previous code sorted AFTER picking reference, causing wrong reference selection)
2170 buffer.segments.sort();
2171
2172 // ============================================================
2173 // PHASE 1: Compress everything WITHOUT holding any locks
2174 // ============================================================
2175
2176 // Collect all pre-compressed writes for batched archive write
2177 let mut archive_writes: Vec<PreCompressedPart> = Vec::new();
2178 // Collect all segment registrations for batched collection update
2179 let mut registrations: Vec<SegmentRegistration> = Vec::new();
2180 // Reference data to store in global map (if any)
2181 let mut ref_to_store: Option<(u32, Vec<u8>)> = None;
2182
2183 // Write reference segment if not already written (first pack for this group)
2184 // Extract reference from sorted segments (matching C++ AGC: first segment after sort becomes reference)
2185 // NOTE: Raw groups (0-15) do NOT have a reference - all segments stored raw
2186 if use_lz_encoding && !buffer.ref_written && !buffer.segments.is_empty() {
2187 // Remove first segment (alphabetically first after sorting) to use as reference
2188 let ref_seg = buffer.segments.remove(0);
2189
2190 if crate::env_cache::debug_ref_write() {
2191 eprintln!(
2192 "DEBUG_REF_WRITE: group={} sample={} contig={} seg={} data_len={} segments_remaining={}",
2193 buffer.group_id, ref_seg.sample_name, ref_seg.contig_name,
2194 ref_seg.seg_part_no, ref_seg.data.len(), buffer.segments.len()
2195 );
2196 }
2197
2198 if config.verbosity > 1 {
2199 eprintln!(
2200 " Flushing group {}: reference from {} (chosen from {} sorted segments)",
2201 buffer.group_id,
2202 ref_seg.sample_name,
2203 buffer.segments.len() + 1
2204 );
2205 }
2206
2207 // Compress reference using adaptive compression (NO LOCK)
2208 let (mut compressed, marker) =
2209 compress_reference_segment(&ref_seg.data).context("Failed to compress reference")?;
2210 compressed.push(marker);
2211
2212 // Metadata stores the uncompressed size
2213 let ref_size = ref_seg.data.len() as u64;
2214
2215 // CRITICAL: Check if compression helped (matching C++ AGC segment.h lines 179, 204)
2216 // C++ AGC: if(packed_size + 1u < (uint32_t) data.size())
2217 // If compression didn't help, write UNCOMPRESSED raw data with metadata=0
2218 if compressed.len() < ref_seg.data.len() {
2219 // Compression helped - write compressed data with metadata=original_size
2220 archive_writes.push(PreCompressedPart {
2221 stream_id: buffer.ref_stream_id,
2222 data: compressed,
2223 metadata: ref_size,
2224 });
2225 } else {
2226 // Compression didn't help - write UNCOMPRESSED data with metadata=0
2227 archive_writes.push(PreCompressedPart {
2228 stream_id: buffer.ref_stream_id,
2229 data: ref_seg.data.clone(),
2230 metadata: 0,
2231 });
2232 }
2233
2234 // Queue reference registration
2235 registrations.push(SegmentRegistration {
2236 sample_name: ref_seg.sample_name.clone(),
2237 contig_name: ref_seg.contig_name.clone(),
2238 seg_part_no: ref_seg.seg_part_no,
2239 group_id: buffer.group_id,
2240 in_group_id: 0, // Reference is always at position 0
2241 is_rev_comp: ref_seg.is_rev_comp,
2242 raw_length: ref_seg.data.len() as u32,
2243 });
2244
2245 buffer.ref_written = true;
2246
2247 // Queue reference for global map storage
2248 ref_to_store = Some((buffer.group_id, ref_seg.data.clone()));
2249
2250 buffer.reference_segment = Some(ref_seg.clone()); // Store for LZ encoding
2251
2252 // Prepare LZ encoder with reference (matching C++ AGC segment.cpp line 43: lz_diff->Prepare(s))
2253 // This is done ONCE when the reference is written, then reused for all subsequent segments
2254 if use_lz_encoding {
2255 let mut lz = LZDiff::new(config.min_match_len as u32);
2256 lz.prepare(&ref_seg.data);
2257 buffer.lz_diff = Some(lz);
2258 }
2259 }
2260
2261 // NOTE: Segments are already sorted at the start of flush_pack (line ~1003)
2262 // This sort was moved earlier to ensure correct reference selection.
2263
2264 // Pack segments together with delta deduplication (matching C++ AGC segment.cpp lines 66-74)
2265 // Note: segments do NOT include the reference - it's stored separately
2266 //
2267 // CRITICAL FIX: Partial packs must persist across flush_pack calls to ensure pack boundaries
2268 // align with decompression expectations. Pack N must contain entries for in_group_ids
2269 // (N*50)+1 to (N+1)*50. Only write a pack when it has exactly 50 entries (or at finalization).
2270 // Use buffer.pending_deltas and buffer.pending_delta_ids to persist partial packs.
2271
2272 let mut segment_in_group_ids: Vec<(usize, u32)> = Vec::new(); // (segment_index, in_group_id) for each segment
2273
2274 // Helper function to compress a complete pack (exactly 50 entries) - NO LOCK
2275 let compress_pack = |deltas: &[Vec<u8>],
2276 needs_raw_placeholder: bool,
2277 stream_id: usize,
2278 compression_level: i32|
2279 -> Result<PreCompressedPart> {
2280 let mut packed_data = Vec::new();
2281
2282 // CRITICAL: Raw groups need a placeholder segment at position 0
2283 if needs_raw_placeholder {
2284 packed_data.push(0x7f);
2285 packed_data.push(CONTIG_SEPARATOR);
2286 }
2287
2288 for delta in deltas.iter() {
2289 packed_data.extend_from_slice(delta);
2290 packed_data.push(CONTIG_SEPARATOR);
2291 }
2292
2293 let total_raw_size = packed_data.len();
2294 let mut compressed = compress_segment_configured(&packed_data, compression_level)
2295 .context("Failed to compress pack")?;
2296 compressed.push(0); // Marker 0 = plain ZSTD
2297
2298 if compressed.len() < total_raw_size {
2299 Ok(PreCompressedPart {
2300 stream_id,
2301 data: compressed,
2302 metadata: total_raw_size as u64,
2303 })
2304 } else {
2305 Ok(PreCompressedPart {
2306 stream_id,
2307 data: packed_data,
2308 metadata: 0,
2309 })
2310 }
2311 };
2312
2313 for (seg_idx, seg) in buffer.segments.iter().enumerate() {
2314 let contig_data = if !use_lz_encoding || buffer.reference_segment.is_none() {
2315 // Raw segment: groups 0-15 OR groups without reference
2316 seg.data.clone()
2317 } else {
2318 // LZ-encoded segment (groups >= 16 with reference)
2319 // DEBUG: Log sizes before encoding
2320 if let Some(ref_seg) = &buffer.reference_segment {
2321 if config.verbosity > 1 {
2322 eprintln!(" LZ encoding: group={} ref_len={} target_len={} sample={} contig={} part={}",
2323 buffer.group_id, ref_seg.data.len(), seg.data.len(),
2324 seg.sample_name, seg.contig_name, seg.seg_part_no);
2325 }
2326 }
2327 // Reuse prepared lz_diff (matching C++ AGC segment.cpp line 59: lz_diff->Encode(s, delta))
2328 let ragc_encoded = buffer
2329 .lz_diff
2330 .as_mut()
2331 .expect("lz_diff should be prepared when reference is written")
2332 .encode(&seg.data);
2333
2334 // Compare with C++ AGC encode (TEST HARNESS)
2335 #[cfg(feature = "cpp_agc")]
2336 if crate::env_cache::test_lz_encoding() {
2337 if let Some(ref_seg) = &buffer.reference_segment {
2338 if let Some(cpp_encoded) = crate::ragc_ffi::lzdiff_v2_encode(
2339 &ref_seg.data,
2340 &seg.data,
2341 config.min_match_len as u32,
2342 ) {
2343 if ragc_encoded != cpp_encoded {
2344 eprintln!("\n========================================");
2345 eprintln!("🔥 LZ ENCODING MISMATCH DETECTED!");
2346 eprintln!("========================================");
2347 eprintln!("Group: {}", buffer.group_id);
2348 eprintln!("Sample: {}", seg.sample_name);
2349 eprintln!("Contig: {}", seg.contig_name);
2350 eprintln!("Segment: {}", seg.seg_part_no);
2351 eprintln!("Reference len: {}", ref_seg.data.len());
2352 eprintln!("Target len: {}", seg.data.len());
2353 eprintln!("RAGC encoded: {} bytes", ragc_encoded.len());
2354 eprintln!("C++ AGC encoded: {} bytes", cpp_encoded.len());
2355 eprintln!(
2356 "Difference: {} bytes",
2357 (ragc_encoded.len() as i64 - cpp_encoded.len() as i64).abs()
2358 );
2359 eprintln!();
2360
2361 // Find first difference
2362 let mut first_diff_byte = None;
2363 for (i, (r, c)) in
2364 ragc_encoded.iter().zip(cpp_encoded.iter()).enumerate()
2365 {
2366 if r != c {
2367 first_diff_byte = Some(i);
2368 break;
2369 }
2370 }
2371
2372 if let Some(i) = first_diff_byte {
2373 eprintln!("First difference at byte {}", i);
2374 let start = if i > 20 { i - 20 } else { 0 };
2375 let end = (i + 30).min(ragc_encoded.len()).min(cpp_encoded.len());
2376
2377 eprintln!("\nRAGC output around difference:");
2378 let ragc_hex: Vec<_> = ragc_encoded[start..end]
2379 .iter()
2380 .map(|b| format!("{:02x}", b))
2381 .collect();
2382 let ragc_ascii: String = ragc_encoded[start..end]
2383 .iter()
2384 .map(|&b| if b >= 32 && b < 127 { b as char } else { '.' })
2385 .collect();
2386 eprintln!(" Hex: {}", ragc_hex.join(" "));
2387 eprintln!(" ASCII: {}", ragc_ascii);
2388
2389 eprintln!("\nC++ AGC output around difference:");
2390 let cpp_hex: Vec<_> = cpp_encoded[start..end]
2391 .iter()
2392 .map(|b| format!("{:02x}", b))
2393 .collect();
2394 let cpp_ascii: String = cpp_encoded[start..end]
2395 .iter()
2396 .map(|&b| if b >= 32 && b < 127 { b as char } else { '.' })
2397 .collect();
2398 eprintln!(" Hex: {}", cpp_hex.join(" "));
2399 eprintln!(" ASCII: {}", cpp_ascii);
2400
2401 eprintln!("\nByte at position {}:", i);
2402 eprintln!(
2403 " RAGC: 0x{:02x} ('{}')",
2404 ragc_encoded[i],
2405 if ragc_encoded[i] >= 32 && ragc_encoded[i] < 127 {
2406 ragc_encoded[i] as char
2407 } else {
2408 '?'
2409 }
2410 );
2411 eprintln!(
2412 " C++ AGC: 0x{:02x} ('{}')",
2413 cpp_encoded[i],
2414 if cpp_encoded[i] >= 32 && cpp_encoded[i] < 127 {
2415 cpp_encoded[i] as char
2416 } else {
2417 '?'
2418 }
2419 );
2420 } else if ragc_encoded.len() != cpp_encoded.len() {
2421 eprintln!(
2422 "Encodings match for first {} bytes, but lengths differ",
2423 ragc_encoded.len().min(cpp_encoded.len())
2424 );
2425 if ragc_encoded.len() > cpp_encoded.len() {
2426 let extra_start = cpp_encoded.len();
2427 let extra_hex: Vec<_> = ragc_encoded[extra_start..]
2428 .iter()
2429 .take(40)
2430 .map(|b| format!("{:02x}", b))
2431 .collect();
2432 let extra_ascii: String = ragc_encoded[extra_start..]
2433 .iter()
2434 .take(40)
2435 .map(|&b| if b >= 32 && b < 127 { b as char } else { '.' })
2436 .collect();
2437 eprintln!(
2438 "RAGC has {} extra bytes:",
2439 ragc_encoded.len() - cpp_encoded.len()
2440 );
2441 eprintln!(" Hex: {}", extra_hex.join(" "));
2442 eprintln!(" ASCII: {}", extra_ascii);
2443 } else {
2444 let extra_start = ragc_encoded.len();
2445 let extra_hex: Vec<_> = cpp_encoded[extra_start..]
2446 .iter()
2447 .take(40)
2448 .map(|b| format!("{:02x}", b))
2449 .collect();
2450 let extra_ascii: String = cpp_encoded[extra_start..]
2451 .iter()
2452 .take(40)
2453 .map(|&b| if b >= 32 && b < 127 { b as char } else { '.' })
2454 .collect();
2455 eprintln!(
2456 "C++ AGC has {} extra bytes:",
2457 cpp_encoded.len() - ragc_encoded.len()
2458 );
2459 eprintln!(" Hex: {}", extra_hex.join(" "));
2460 eprintln!(" ASCII: {}", extra_ascii);
2461 }
2462 }
2463
2464 // Show last 10 bytes of each
2465 eprintln!("\nLast 10 bytes of each encoding:");
2466 let ragc_tail_start = if ragc_encoded.len() > 10 {
2467 ragc_encoded.len() - 10
2468 } else {
2469 0
2470 };
2471 let ragc_tail_hex: Vec<_> = ragc_encoded[ragc_tail_start..]
2472 .iter()
2473 .map(|b| format!("{:02x}", b))
2474 .collect();
2475 let ragc_tail_ascii: String = ragc_encoded[ragc_tail_start..]
2476 .iter()
2477 .map(|&b| if b >= 32 && b < 127 { b as char } else { '.' })
2478 .collect();
2479 eprintln!(
2480 "RAGC (bytes {}-{}):",
2481 ragc_tail_start,
2482 ragc_encoded.len() - 1
2483 );
2484 eprintln!(" Hex: {}", ragc_tail_hex.join(" "));
2485 eprintln!(" ASCII: {}", ragc_tail_ascii);
2486
2487 let cpp_tail_start = if cpp_encoded.len() > 10 {
2488 cpp_encoded.len() - 10
2489 } else {
2490 0
2491 };
2492 let cpp_tail_hex: Vec<_> = cpp_encoded[cpp_tail_start..]
2493 .iter()
2494 .map(|b| format!("{:02x}", b))
2495 .collect();
2496 let cpp_tail_ascii: String = cpp_encoded[cpp_tail_start..]
2497 .iter()
2498 .map(|&b| if b >= 32 && b < 127 { b as char } else { '.' })
2499 .collect();
2500 eprintln!(
2501 "C++ AGC (bytes {}-{}):",
2502 cpp_tail_start,
2503 cpp_encoded.len() - 1
2504 );
2505 eprintln!(" Hex: {}", cpp_tail_hex.join(" "));
2506 eprintln!(" ASCII: {}", cpp_tail_ascii);
2507
2508 eprintln!("\n========================================");
2509 eprintln!("Aborting on first LZ encoding mismatch!");
2510 eprintln!("========================================\n");
2511
2512 panic!("LZ encoding mismatch detected - see details above");
2513 }
2514 }
2515 }
2516 }
2517
2518 ragc_encoded
2519 };
2520
2521 // Handle LZ groups with IMPROVED_LZ_ENCODING: empty delta means same as reference
2522 // (matching C++ AGC segment.cpp lines 62-63)
2523 if use_lz_encoding && contig_data.is_empty() {
2524 // Same as reference - use in_group_id = 0
2525 segment_in_group_ids.push((seg_idx, 0));
2526 continue;
2527 }
2528
2529 // Check if this delta already exists in pending pack (matching C++ AGC segment.cpp line 66)
2530 // Note: deduplication is per-pack, not global
2531 if let Some(existing_idx) = buffer.pending_deltas.iter().position(|d| d == &contig_data) {
2532 // Reuse existing delta's in_group_id (matching C++ AGC segment.cpp line 69)
2533 let reused_id = buffer.pending_delta_ids[existing_idx];
2534 segment_in_group_ids.push((seg_idx, reused_id));
2535 } else {
2536 // New unique delta - assign next in_group_id (matching C++ AGC segment.cpp lines 74, 77)
2537 // FIX: Apply .max(1) BEFORE using segments_written to ensure unique IDs when no reference
2538 // Bug was: max(0,1)=1, increment to 1 → max(1,1)=1 (COLLISION!)
2539 // Fixed: max(0,1)=1, id=1, increment to 2 → id=2 (UNIQUE!)
2540 buffer.segments_written = buffer.segments_written.max(1);
2541 let in_group_id = buffer.segments_written;
2542 buffer.segments_written += 1;
2543 buffer.pending_delta_ids.push(in_group_id);
2544 segment_in_group_ids.push((seg_idx, in_group_id));
2545 buffer.pending_deltas.push(contig_data);
2546
2547 // CRITICAL: Flush pack when it reaches capacity
2548 // For raw groups (group_id < 16):
2549 // - Pack 0: placeholder (position 0) + 49 segments (positions 1-49) = 50 positions
2550 // - Pack 1+: 50 segments (positions 0-49)
2551 // This ensures extraction formula (pack_id = in_group_id / 50, position = in_group_id % 50) works correctly
2552 let flush_threshold = if !use_lz_encoding && !buffer.raw_placeholder_written {
2553 // Raw group pack 0: flush at 49 to leave room for placeholder
2554 PACK_CARDINALITY - 1
2555 } else {
2556 // All other packs (raw pack 1+ or LZ packs): flush at 50
2557 PACK_CARDINALITY
2558 };
2559
2560 if buffer.pending_deltas.len() == flush_threshold {
2561 // Compress pack WITHOUT holding any lock
2562 let needs_placeholder = !use_lz_encoding && !buffer.raw_placeholder_written;
2563 let pack = compress_pack(
2564 &buffer.pending_deltas,
2565 needs_placeholder,
2566 buffer.stream_id,
2567 config.compression_level,
2568 )?;
2569 archive_writes.push(pack);
2570 buffer.raw_placeholder_written = true;
2571
2572 // Clear for next pack - deduplication starts fresh
2573 buffer.pending_deltas.clear();
2574 buffer.pending_delta_ids.clear();
2575 }
2576 }
2577 }
2578
2579 // DO NOT write partial pack here - leave it in buffer.pending_deltas for next flush_pack call
2580 // Partial packs are only written in finalize() to ensure pack boundaries align with decompression
2581
2582 // Queue segment registrations (batched for single lock acquisition)
2583 for &(seg_idx, in_group_id) in segment_in_group_ids.iter() {
2584 let seg = &buffer.segments[seg_idx];
2585 registrations.push(SegmentRegistration {
2586 sample_name: seg.sample_name.clone(),
2587 contig_name: seg.contig_name.clone(),
2588 seg_part_no: seg.seg_part_no,
2589 group_id: buffer.group_id,
2590 in_group_id,
2591 is_rev_comp: seg.is_rev_comp,
2592 raw_length: seg.data.len() as u32,
2593 });
2594 }
2595
2596 // ============================================================
2597 // PHASE 2: Batched writes with minimal lock duration
2598 // ============================================================
2599
2600 // Buffer all pre-compressed data for archive (SINGLE lock acquisition)
2601 // Actual writes happen via flush_buffers() at end for fewer syscalls
2602 if !archive_writes.is_empty() {
2603 let mut arch = archive.lock().unwrap();
2604 for part in archive_writes {
2605 arch.add_part_buffered(part.stream_id, part.data, part.metadata);
2606 }
2607 }
2608
2609 // Store reference in global map (if any)
2610 if let Some((group_id, ref_data)) = ref_to_store {
2611 let mut ref_segs = reference_segments.write().unwrap();
2612 ref_segs.insert(group_id, ref_data);
2613 }
2614
2615 // Register all segments in collection (SINGLE lock acquisition)
2616 if !registrations.is_empty() {
2617 let mut coll = collection.lock().unwrap();
2618 for reg in registrations {
2619 coll.add_segment_placed(
2620 ®.sample_name,
2621 ®.contig_name,
2622 reg.seg_part_no,
2623 reg.group_id,
2624 reg.in_group_id,
2625 reg.is_rev_comp,
2626 reg.raw_length,
2627 )
2628 .context("Failed to register segment")?;
2629 }
2630 }
2631
2632 // Clear segments for next batch (but keep pending_deltas!)
2633 buffer.segments.clear();
2634
2635 Ok(())
2636}
2637
/// Compress-only version of flush_pack for deterministic parallel compression.
/// Workers call this in parallel to produce FlushPackResult, then Thread 0
/// writes all results in sorted group_id order for deterministic archives.
///
/// Unlike `flush_pack`, this function takes NO locks: it only mutates the
/// per-group `buffer` and returns the compressed parts / registrations for
/// the caller to commit.
///
/// # Arguments
/// * `buffer` - Per-group segment buffer; pack state (pending deltas, IDs,
///   reference, LZ encoder) persists in it across calls.
/// * `config` - Streaming configuration (compression level, min match len).
///
/// # Returns
/// A `FlushPackResult` carrying the pre-compressed archive parts, the
/// segment registrations, and (optionally) the reference bytes to store.
///
/// # Errors
/// Propagates compression failures from the reference or pack compressors.
fn flush_pack_compress_only(
    buffer: &mut SegmentGroupBuffer,
    config: &StreamingQueueConfig,
) -> Result<FlushPackResult> {
    use crate::segment_compression::{compress_reference_segment, compress_segment_configured};

    let mut archive_writes: Vec<PreCompressedPart> = Vec::new();
    let mut registrations: Vec<SegmentRegistration> = Vec::new();
    let mut ref_to_store: Option<(u32, Vec<u8>)> = None;

    // Skip if no segments to write (but still write reference if present)
    if buffer.segments.is_empty() && buffer.ref_written {
        return Ok(FlushPackResult {
            group_id: buffer.group_id,
            archive_writes,
            registrations,
            ref_to_store,
        });
    }

    // Low-numbered groups (< NO_RAW_GROUPS) are stored raw; all others are
    // LZ-encoded against a per-group reference segment.
    let use_lz_encoding = buffer.group_id >= NO_RAW_GROUPS;

    // Sort segments for deterministic reference selection
    buffer.segments.sort();

    // Write reference segment if not already written
    if use_lz_encoding && !buffer.ref_written && !buffer.segments.is_empty() {
        // The (sorted) first segment becomes the group reference.
        let ref_seg = buffer.segments.remove(0);

        // Compress reference
        let (mut compressed, marker) =
            compress_reference_segment(&ref_seg.data).context("Failed to compress reference")?;
        compressed.push(marker);

        let ref_size = ref_seg.data.len() as u64;

        // metadata encodes the raw size when compressed; 0 means the part is
        // stored uncompressed (compression did not shrink the data).
        if compressed.len() < ref_seg.data.len() {
            archive_writes.push(PreCompressedPart {
                stream_id: buffer.ref_stream_id,
                data: compressed,
                metadata: ref_size,
            });
        } else {
            archive_writes.push(PreCompressedPart {
                stream_id: buffer.ref_stream_id,
                data: ref_seg.data.clone(),
                metadata: 0,
            });
        }

        // Reference always occupies in_group_id 0.
        registrations.push(SegmentRegistration {
            sample_name: ref_seg.sample_name.clone(),
            contig_name: ref_seg.contig_name.clone(),
            seg_part_no: ref_seg.seg_part_no,
            group_id: buffer.group_id,
            in_group_id: 0,
            is_rev_comp: ref_seg.is_rev_comp,
            raw_length: ref_seg.data.len() as u32,
        });

        buffer.ref_written = true;
        ref_to_store = Some((buffer.group_id, ref_seg.data.clone()));
        buffer.reference_segment = Some(ref_seg.clone());

        // Prime the LZ encoder once; reused for every later segment in the
        // group. (Always true here since the enclosing branch requires it.)
        if use_lz_encoding {
            let mut lz = LZDiff::new(config.min_match_len as u32);
            lz.prepare(&ref_seg.data);
            buffer.lz_diff = Some(lz);
        }
    }

    // Compress pack helper (same as flush_pack): concatenates deltas with
    // CONTIG_SEPARATOR terminators, optionally prepends the raw-group
    // placeholder, and stores uncompressed (metadata=0) if zstd loses.
    let compress_pack = |deltas: &[Vec<u8>],
                         needs_raw_placeholder: bool,
                         stream_id: usize,
                         compression_level: i32|
     -> Result<PreCompressedPart> {
        let mut packed_data = Vec::new();

        if needs_raw_placeholder {
            packed_data.push(0x7f);
            packed_data.push(CONTIG_SEPARATOR);
        }

        for delta in deltas.iter() {
            packed_data.extend_from_slice(delta);
            packed_data.push(CONTIG_SEPARATOR);
        }

        let total_raw_size = packed_data.len();
        let mut compressed = compress_segment_configured(&packed_data, compression_level)
            .context("Failed to compress pack")?;
        compressed.push(0);

        if compressed.len() < total_raw_size {
            Ok(PreCompressedPart {
                stream_id,
                data: compressed,
                metadata: total_raw_size as u64,
            })
        } else {
            Ok(PreCompressedPart {
                stream_id,
                data: packed_data,
                metadata: 0,
            })
        }
    };

    // (segment index, assigned in_group_id) pairs for registration below.
    let mut segment_in_group_ids: Vec<(usize, u32)> = Vec::new();

    for (seg_idx, seg) in buffer.segments.iter().enumerate() {
        // Raw groups (or groups without a reference yet) store the segment
        // verbatim; otherwise LZ-encode it against the group reference.
        let contig_data = if !use_lz_encoding || buffer.reference_segment.is_none() {
            seg.data.clone()
        } else {
            buffer
                .lz_diff
                .as_mut()
                .expect("lz_diff should be prepared")
                .encode(&seg.data)
        };

        // An empty LZ delta means the segment is identical to the reference;
        // point it at in_group_id 0 without storing anything.
        if use_lz_encoding && contig_data.is_empty() {
            segment_in_group_ids.push((seg_idx, 0));
            continue;
        }

        // Deduplicate identical deltas within the current pack: reuse the
        // previously assigned in_group_id instead of storing a copy.
        if let Some(existing_idx) = buffer.pending_deltas.iter().position(|d| d == &contig_data) {
            let reused_id = buffer.pending_delta_ids[existing_idx];
            segment_in_group_ids.push((seg_idx, reused_id));
        } else {
            // New unique delta: id 0 is reserved for the reference, so the
            // counter is clamped to start at 1 before assignment.
            buffer.segments_written = buffer.segments_written.max(1);
            let in_group_id = buffer.segments_written;
            buffer.segments_written += 1;
            buffer.pending_delta_ids.push(in_group_id);
            segment_in_group_ids.push((seg_idx, in_group_id));
            buffer.pending_deltas.push(contig_data);

            // CRITICAL: Flush pack when it reaches capacity
            // For raw groups pack 0: flush at 49 (placeholder takes position 0)
            // For all other packs: flush at 50
            let flush_threshold = if !use_lz_encoding && !buffer.raw_placeholder_written {
                PACK_CARDINALITY - 1
            } else {
                PACK_CARDINALITY
            };

            if buffer.pending_deltas.len() == flush_threshold {
                let needs_placeholder = !use_lz_encoding && !buffer.raw_placeholder_written;
                let pack = compress_pack(
                    &buffer.pending_deltas,
                    needs_placeholder,
                    buffer.stream_id,
                    config.compression_level,
                )?;
                archive_writes.push(pack);
                buffer.raw_placeholder_written = true;
                // Dedup state resets per pack: IDs are only reused within one pack.
                buffer.pending_deltas.clear();
                buffer.pending_delta_ids.clear();
            }
        }
    }

    // Partial packs stay in buffer.pending_deltas for a later flush; only
    // the registrations for all processed segments are emitted now.
    for &(seg_idx, in_group_id) in segment_in_group_ids.iter() {
        let seg = &buffer.segments[seg_idx];
        registrations.push(SegmentRegistration {
            sample_name: seg.sample_name.clone(),
            contig_name: seg.contig_name.clone(),
            seg_part_no: seg.seg_part_no,
            group_id: buffer.group_id,
            in_group_id,
            is_rev_comp: seg.is_rev_comp,
            raw_length: seg.data.len() as u32,
        });
    }

    // Clear segments for the next batch (pending_deltas intentionally kept).
    buffer.segments.clear();

    Ok(FlushPackResult {
        group_id: buffer.group_id,
        archive_writes,
        registrations,
        ref_to_store,
    })
}
2826
2827/// Write reference segment immediately when first segment arrives in group
2828/// (Matches C++ AGC segment.cpp lines 41-48: if (no_seqs == 0) writes reference right away)
2829/// This ensures LZ encoding works correctly for subsequent segments
2830fn write_reference_immediately(
2831 segment: &BufferedSegment,
2832 buffer: &mut SegmentGroupBuffer,
2833 collection: &Arc<Mutex<CollectionV3>>,
2834 archive: &Arc<Mutex<Archive>>,
2835 reference_segments: &Arc<RwLock<BTreeMap<u32, Vec<u8>>>>,
2836 reference_orientations: &Arc<RwLock<BTreeMap<u32, bool>>>,
2837 config: &StreamingQueueConfig,
2838) -> Result<()> {
2839 use crate::segment_compression::compress_reference_segment;
2840
2841 if crate::env_cache::debug_ref_write() {
2842 eprintln!(
2843 "DEBUG_REF_IMMEDIATE: group={} sample={} contig={} seg={} data_len={}",
2844 buffer.group_id,
2845 segment.sample_name,
2846 segment.contig_name,
2847 segment.seg_part_no,
2848 segment.data.len()
2849 );
2850 }
2851
2852 if config.verbosity > 1 {
2853 eprintln!(
2854 " Writing immediate reference for group {}: {} {}:{} (part {})",
2855 buffer.group_id,
2856 segment.sample_name,
2857 segment.contig_name,
2858 segment.seg_part_no,
2859 segment.seg_part_no
2860 );
2861 }
2862
2863 // 1. Compress reference using adaptive compression (matching flush_pack lines 635-637)
2864 let (mut compressed, marker) =
2865 compress_reference_segment(&segment.data).context("Failed to compress reference")?;
2866 compressed.push(marker);
2867
2868 let ref_size = segment.data.len();
2869
2870 // 2. Write to archive immediately (matching C++ AGC segment.cpp line 43: store_in_archive)
2871 // CRITICAL: Check if compression helped (matching C++ AGC segment.h line 179)
2872 {
2873 let mut arch = archive.lock().unwrap();
2874 if compressed.len() < ref_size {
2875 // Compression helped - buffer compressed data with metadata=original_size
2876 arch.add_part_buffered(buffer.ref_stream_id, compressed, ref_size as u64);
2877 } else {
2878 // Compression didn't help - buffer UNCOMPRESSED data with metadata=0
2879 arch.add_part_buffered(buffer.ref_stream_id, segment.data.clone(), 0);
2880 }
2881 }
2882
2883 // 3. Register reference in collection with in_group_id = 0 (matching flush_pack lines 650-661)
2884 {
2885 let mut coll = collection.lock().unwrap();
2886 coll.add_segment_placed(
2887 &segment.sample_name,
2888 &segment.contig_name,
2889 segment.seg_part_no,
2890 buffer.group_id,
2891 0, // Reference is always at position 0
2892 segment.is_rev_comp,
2893 segment.data.len() as u32,
2894 )
2895 .context("Failed to register immediate reference")?;
2896 }
2897
2898 // 4. Mark reference as written and store for LZ encoding (matching flush_pack lines 663-664)
2899 buffer.ref_written = true;
2900 buffer.reference_segment = Some(segment.clone());
2901 // CRITICAL: Mark that in_group_id=0 is taken, so subsequent segments start from 1
2902 buffer.segments_written = 1;
2903
2904 // 4b. Store reference data persistently (matching C++ AGC v_segments)
2905 // This enables LZ cost estimation for subsequent samples even after flush
2906 {
2907 let mut ref_segs = reference_segments.write().unwrap();
2908 ref_segs.insert(buffer.group_id, segment.data.clone());
2909 }
2910
2911 // 4c. Store reference orientation for ZERO_MATCH bug fix
2912 // When a delta segment joins this group later, it MUST use the same orientation
2913 // as the reference to ensure LZ encoding works correctly
2914 {
2915 let mut ref_orients = reference_orientations.write().unwrap();
2916 ref_orients.insert(buffer.group_id, segment.is_rev_comp);
2917 }
2918
2919 // 5. Prepare LZ encoder with reference (matching C++ AGC segment.cpp line 43: lz_diff->Prepare(s))
2920 // This is done ONCE when the reference is written, then reused for all subsequent segments
2921 let use_lz_encoding = buffer.group_id >= NO_RAW_GROUPS;
2922 if use_lz_encoding {
2923 let mut lz = LZDiff::new(config.min_match_len as u32);
2924 lz.prepare(&segment.data);
2925 buffer.lz_diff = Some(lz);
2926 }
2927
2928 Ok(())
2929}
2930
2931/// Compute reverse complement of a sequence
2932fn reverse_complement_sequence(seq: &[u8]) -> Vec<u8> {
2933 use crate::kmer::reverse_complement;
2934 seq.iter()
2935 .rev()
2936 .map(|&base| reverse_complement(base as u64) as u8)
2937 .collect()
2938}
2939
/// Find best existing group for a segment with only one k-mer present
/// (Implements C++ AGC's find_cand_segment_with_one_splitter logic from lines 1659-1745)
///
/// # Arguments
/// * `kmer` - The single splitter k-mer present in the segment.
/// * `kmer_is_dir` - Whether the k-mer is direction-oriented; decides which
///   MISSING ordering is used on the fallback paths.
/// * `segment_data` / `segment_data_rc` - Segment bytes in forward and
///   reverse-complement orientation (the RC copy is used when a candidate
///   requires reversal).
/// * `map_segments_terminators` - k-mer -> connected k-mers adjacency map.
/// * `map_segments` - Global registry: group key -> group id.
/// * `segment_groups` - Batch-local buffers keyed by group key.
/// * `reference_segments` - Persistent reference bytes keyed by group id.
/// * `config` - Streaming configuration (verbosity, min match length).
///
/// # Returns
/// `(key_front, key_back, needs_rc)` — the chosen group key and whether the
/// segment must be reverse-complemented to join it. Falls back to a key
/// containing MISSING_KMER when no suitable existing group is found.
fn find_group_with_one_kmer(
    kmer: u64,
    kmer_is_dir: bool,
    segment_data: &[u8],    // Segment data in forward orientation
    segment_data_rc: &[u8], // Segment data in reverse complement
    map_segments_terminators: &Arc<RwLock<BTreeMap<u64, Vec<u64>>>>,
    map_segments: &Arc<RwLock<BTreeMap<SegmentGroupKey, u32>>>,
    segment_groups: &Arc<Mutex<BTreeMap<SegmentGroupKey, SegmentGroupBuffer>>>,
    reference_segments: &Arc<RwLock<BTreeMap<u32, Vec<u8>>>>,
    config: &StreamingQueueConfig,
) -> (u64, u64, bool) {
    let segment_len = segment_data.len();
    use crate::segment::MISSING_KMER;

    // Look up kmer in terminators map to find connected k-mers
    let connected_kmers = {
        let terminators = map_segments_terminators.read().unwrap();
        match terminators.get(&kmer) {
            Some(vec) => vec.clone(),
            None => {
                // No connections found - create new group with MISSING
                // Match C++ AGC lines 1671-1679: check is_dir_oriented()
                // Debug: log entry to no-connection path
                if crate::env_cache::debug_is_dir() {
                    eprintln!(
                        "RAGC_FIND_GROUP_NO_CONN: kmer={} kmer_is_dir={}",
                        kmer, kmer_is_dir
                    );
                }
                if kmer_is_dir {
                    // Dir-oriented: (kmer, MISSING) with rc=false
                    if config.verbosity > 1 {
                        #[cfg(feature = "verbose_debug")]
                        eprintln!("RAGC_CASE3_NO_CONNECTION: kmer={} is_dir=true -> ({}, MISSING) rc=false", kmer, kmer);
                    }
                    return (kmer, MISSING_KMER, false);
                } else {
                    // NOT dir-oriented: (MISSING, kmer) with rc=true
                    if config.verbosity > 1 {
                        #[cfg(feature = "verbose_debug")]
                        eprintln!("RAGC_CASE3_NO_CONNECTION: kmer={} is_dir=false -> (MISSING, {}) rc=true", kmer, kmer);
                    }
                    return (MISSING_KMER, kmer, true);
                }
            }
        }
    };

    if config.verbosity > 1 {
        #[cfg(feature = "verbose_debug")]
        eprintln!(
            "RAGC_CASE3_FOUND_CONNECTIONS: kmer={} connections={}",
            kmer,
            connected_kmers.len()
        );
    }
    // Debug: log connections found
    if crate::env_cache::debug_is_dir() {
        eprintln!(
            "RAGC_FIND_GROUP_FOUND_CONN: kmer={} kmer_is_dir={} connections={:?}",
            kmer, kmer_is_dir, connected_kmers
        );
    }

    // Build list of candidate groups
    // Each candidate: (key_front, key_back, needs_rc, ref_segment_size)
    let mut candidates: Vec<(u64, u64, bool, usize)> = Vec::new();

    // OPTIMIZATION: Reduce lock scope - first collect candidate keys, then look up ref sizes
    // This minimizes the time segment_groups.lock() is held

    // Phase 1: Build candidate orderings (no locks needed)
    let mut candidate_keys: Vec<(u64, u64, bool, SegmentGroupKey)> = Vec::new();
    for &cand_kmer in &connected_kmers {
        // Create candidate group key normalized (smaller, larger)
        // C++ AGC lines 1691-1704
        //
        // IMPORTANT: When cand_kmer is MISSING, we need to try BOTH orderings!
        // Groups with MISSING k-mers can be stored as either (MISSING, kmer) or (kmer, MISSING)
        // depending on kmer_is_dir when they were created. We must match the actual stored key.
        let orderings: Vec<(u64, u64, bool)> = if cand_kmer == MISSING_KMER {
            // MISSING is involved - try both orderings to find the group
            vec![
                (MISSING_KMER, kmer, true),  // (MISSING, kmer) with RC
                (kmer, MISSING_KMER, false), // (kmer, MISSING) without RC
            ]
        } else if cand_kmer < kmer {
            // cand_kmer is smaller - it goes first
            // This means we need to RC (C++ AGC line 1696: get<2>(ck) = true)
            vec![(cand_kmer, kmer, true)]
        } else {
            // kmer is smaller - it goes first
            // No RC needed (C++ AGC line 1703: get<2>(ck) = false)
            vec![(kmer, cand_kmer, false)]
        };

        for (key_front, key_back, needs_rc) in orderings {
            let cand_key = SegmentGroupKey {
                kmer_front: key_front,
                kmer_back: key_back,
            };
            candidate_keys.push((key_front, key_back, needs_rc, cand_key));
        }
    }

    // Phase 2: Quick check which candidates exist (brief locks)
    // First pass: check global registry (RwLock - can be concurrent)
    let mut existing_candidates: Vec<(u64, u64, bool, Option<u32>)> = Vec::new();
    {
        let seg_map = map_segments.read().unwrap();
        for (key_front, key_back, needs_rc, cand_key) in &candidate_keys {
            if let Some(&group_id) = seg_map.get(cand_key) {
                existing_candidates.push((*key_front, *key_back, *needs_rc, Some(group_id)));
            }
        }
    } // seg_map lock released

    // Second pass: check batch-local buffer for remaining candidates (Mutex - exclusive)
    // Only if we didn't find candidates in global registry
    if existing_candidates.is_empty() {
        let groups = segment_groups.lock().unwrap();
        let mut already_found = std::collections::HashSet::new();
        for (key_front, key_back, needs_rc, cand_key) in &candidate_keys {
            if groups.contains_key(cand_key) {
                // Get ref size from buffer; falls back to segment_len when the
                // buffered group has no reference yet.
                let ref_size = if let Some(group_buffer) = groups.get(cand_key) {
                    if let Some(ref_seg) = &group_buffer.reference_segment {
                        ref_seg.data.len()
                    } else {
                        segment_len
                    }
                } else {
                    segment_len
                };

                // Debug trace
                if crate::env_cache::debug_is_dir() {
                    eprintln!("RAGC_FIND_GROUP_CAND_CHECK: cand_key=({},{}) exists_in_groups=true ref_size={}",
                        key_front, key_back, ref_size);
                }

                // Use cand_kmer as key to deduplicate (only one match per connected_kmer)
                let connected = if *key_front == kmer {
                    *key_back
                } else {
                    *key_front
                };
                if !already_found.contains(&connected) {
                    candidates.push((*key_front, *key_back, *needs_rc, ref_size));
                    already_found.insert(connected);
                }
            }
        }
    } // groups lock released

    // Phase 3: Get ref sizes for global candidates (brief RwLock)
    if !existing_candidates.is_empty() {
        let ref_segs = reference_segments.read().unwrap();
        let mut already_found = std::collections::HashSet::new();
        for (key_front, key_back, needs_rc, group_id_opt) in existing_candidates {
            let ref_size = if let Some(group_id) = group_id_opt {
                if let Some(ref_data) = ref_segs.get(&group_id) {
                    ref_data.len()
                } else {
                    segment_len
                }
            } else {
                segment_len
            };

            // Debug trace
            if crate::env_cache::debug_is_dir() {
                eprintln!("RAGC_FIND_GROUP_CAND_CHECK: cand_key=({},{}) exists_in_seg_map=true ref_size={}",
                    key_front, key_back, ref_size);
            }

            // Use cand_kmer as key to deduplicate (only one match per connected_kmer)
            let connected = if key_front == kmer {
                key_back
            } else {
                key_front
            };
            if !already_found.contains(&connected) {
                candidates.push((key_front, key_back, needs_rc, ref_size));
                already_found.insert(connected);
            }
        }
    } // ref_segs lock released

    if candidates.is_empty() {
        // No existing groups found - create new with MISSING
        // Must match C++ AGC is_dir_oriented logic (same as no-connections case above)
        if crate::env_cache::debug_is_dir() {
            if kmer_is_dir {
                eprintln!("RAGC_FIND_GROUP_NO_CAND: kmer={} kmer_is_dir={} -> returning ({},MISSING,false)",
                    kmer, kmer_is_dir, kmer);
            } else {
                eprintln!("RAGC_FIND_GROUP_NO_CAND: kmer={} kmer_is_dir={} -> returning (MISSING,{},true)",
                    kmer, kmer_is_dir, kmer);
            }
        }
        if kmer_is_dir {
            // Dir-oriented: (kmer, MISSING) with rc=false
            if config.verbosity > 1 {
                #[cfg(feature = "verbose_debug")]
                eprintln!(
                    "RAGC_CASE3_NO_CANDIDATES: kmer={} is_dir=true -> ({}, MISSING) rc=false",
                    kmer, kmer
                );
            }
            return (kmer, MISSING_KMER, false);
        } else {
            // NOT dir-oriented: (MISSING, kmer) with rc=true
            if config.verbosity > 1 {
                #[cfg(feature = "verbose_debug")]
                eprintln!(
                    "RAGC_CASE3_NO_CANDIDATES: kmer={} is_dir=false -> (MISSING, {}) rc=true",
                    kmer, kmer
                );
            }
            return (MISSING_KMER, kmer, true);
        }
    }

    // Sort candidates by reference segment size (C++ AGC lines 1710-1719)
    // Prefer candidates with ref size closest to our segment size
    candidates.sort_by(|a, b| {
        let a_diff = (a.3 as i64 - segment_len as i64).abs();
        let b_diff = (b.3 as i64 - segment_len as i64).abs();

        if a_diff != b_diff {
            a_diff.cmp(&b_diff)
        } else {
            a.3.cmp(&b.3) // If equal distance, prefer smaller ref size
        }
    });

    // Debug: Print sorted candidates before evaluation
    if config.verbosity > 2 {
        eprintln!(
            "RAGC_CASE3_SORTED_CANDIDATES: kmer={} segment_len={} n_candidates={}",
            kmer,
            segment_len,
            candidates.len()
        );
        for (i, &(kf, kb, rc, rs)) in candidates.iter().enumerate() {
            let size_diff = (rs as i64 - segment_len as i64).abs();
            eprintln!(
                "  CAND[{}]: ({},{}) rc={} ref_size={} size_diff={}",
                i, kf, kb, rc, rs, size_diff
            );
        }
    }

    // Test compression for each candidate (C++ AGC lines 1726-1788)
    // Match C++ AGC's TWO-PASS approach:
    // Pass 1: Compute all estimates, track minimum (lines 1726-1732)
    // Pass 2: Pick candidate with minimum estimate (lines 1775-1787)
    //
    // CRITICAL: Initialize best_pk to (~0ull, ~0ull) like C++ AGC (line 1628)
    let mut best_key_front = u64::MAX; // ~0ull in C++
    let mut best_key_back = u64::MAX; // ~0ull in C++
    let mut best_needs_rc = false;
    // Initial threshold: a candidate must encode into fewer than
    // segment_len - 16 bytes (or segment_len for tiny segments) to win.
    let mut best_estim_size = if segment_len < 16 {
        segment_len
    } else {
        segment_len - 16
    };

    // Pass 1: Compute estimates and find minimum
    // Store estimates alongside candidates: Vec<(front, back, needs_rc, ref_size, estim_size)>
    let mut candidate_estimates: Vec<(u64, u64, bool, usize, usize)> = Vec::new();

    {
        let groups = segment_groups.lock().unwrap();
        let seg_map = map_segments.read().unwrap();
        let ref_segs = reference_segments.read().unwrap();

        for &(key_front, key_back, needs_rc, ref_size) in &candidates {
            let cand_key = SegmentGroupKey {
                kmer_front: key_front,
                kmer_back: key_back,
            };

            // Get the reference segment for this candidate from buffer OR persistent storage
            // NOTE(review): ref_source ("buffer"/"persistent"/"none") is bound
            // but never read below — candidate for removal or debug use.
            let (ref_data_opt, ref_source): (Option<&[u8]>, &str) = if let Some(group_buffer) =
                groups.get(&cand_key)
            {
                // Hard-coded key 1244212049458757632 traces one known
                // degenerate group (front == back) for debugging only.
                if config.verbosity > 2
                    && key_front == 1244212049458757632
                    && key_back == 1244212049458757632
                {
                    let ref_seg = group_buffer.reference_segment.as_ref();
                    let ref_len = ref_seg.map(|s| s.data.len()).unwrap_or(0);
                    let ref_first5: Vec<u8> = ref_seg
                        .map(|s| s.data.iter().take(5).cloned().collect())
                        .unwrap_or_default();
                    eprintln!("RAGC_REF_LOOKUP_BUFFER: degenerate key ({},{}) buffer ref_len={} ref[0..5]={:?}",
                        key_front, key_back, ref_len, ref_first5);
                }
                (
                    group_buffer
                        .reference_segment
                        .as_ref()
                        .map(|seg| seg.data.as_slice()),
                    "buffer",
                )
            } else if let Some(&group_id) = seg_map.get(&cand_key) {
                if config.verbosity > 2
                    && key_front == 1244212049458757632
                    && key_back == 1244212049458757632
                {
                    let ref_data = ref_segs.get(&group_id);
                    let ref_len = ref_data.map(|d| d.len()).unwrap_or(0);
                    let ref_first5: Vec<u8> = ref_data
                        .map(|d| d.iter().take(5).cloned().collect())
                        .unwrap_or_default();
                    eprintln!("RAGC_REF_LOOKUP_PERSISTENT: degenerate key ({},{}) -> group_id={} ref_len={} ref[0..5]={:?}",
                        key_front, key_back, group_id, ref_len, ref_first5);
                }
                (
                    ref_segs.get(&group_id).map(|data| data.as_slice()),
                    "persistent",
                )
            } else {
                (None, "none")
            };
            let ref_data_opt = ref_data_opt;

            if let Some(ref_data) = ref_data_opt {
                // Test LZ encoding against this reference (C++ AGC line 1728: estimate())
                let target_data = if needs_rc {
                    segment_data_rc
                } else {
                    segment_data
                };

                // Compute estimate - compare both RAGC native and C++ FFI when verbose
                let estim_size = {
                    let mut lz = LZDiff::new(config.min_match_len as u32);
                    lz.prepare(&ref_data.to_vec());
                    // Use estimate() which matches C++ CLZDiff_V2::Estimate exactly
                    lz.estimate(&target_data.to_vec(), best_estim_size as u32) as usize
                };

                // Also compute with C++ FFI and compare
                #[cfg(feature = "cpp_agc")]
                let cpp_estim_size = crate::ragc_ffi::lzdiff_v2_estimate(
                    ref_data,
                    target_data,
                    config.min_match_len as u32,
                    best_estim_size as u32,
                ) as usize;

                #[cfg(feature = "cpp_agc")]
                if estim_size != cpp_estim_size && config.verbosity > 0 {
                    eprintln!(
                        "ESTIMATE_MISMATCH: ragc={} cpp={} ref_len={} tgt_len={} bound={}",
                        estim_size,
                        cpp_estim_size,
                        ref_data.len(),
                        target_data.len(),
                        best_estim_size
                    );
                }

                // DEBUG: Also compute estimate with initial threshold to check if tie would occur
                #[cfg(not(feature = "cpp_agc"))]
                let estim_no_bound = if config.verbosity > 2 {
                    let mut lz2 = LZDiff::new(config.min_match_len as u32);
                    lz2.prepare(&ref_data.to_vec());
                    lz2.estimate(&target_data.to_vec(), (segment_len - 16) as u32) as usize
                } else {
                    0
                };

                if config.verbosity > 2 {
                    // Print detailed debug info including bound and first/last bytes
                    let ref_first: Vec<u8> = ref_data.iter().take(5).cloned().collect();
                    let ref_last: Vec<u8> = ref_data.iter().rev().take(5).cloned().collect();
                    let tgt_first: Vec<u8> = target_data.iter().take(5).cloned().collect();
                    let tgt_last: Vec<u8> = target_data.iter().rev().take(5).cloned().collect();
                    #[cfg(not(feature = "cpp_agc"))]
                    eprintln!(
                        "RAGC_CASE3_ESTIMATE: kmer={} cand=({},{}) rc={} ref_len={} target_len={} bound={} estim={} estim_nobound={} ref[0..5]={:?} ref[-5..]={:?} tgt[0..5]={:?} tgt[-5..]={:?}",
                        kmer, key_front, key_back, needs_rc, ref_data.len(), target_data.len(), best_estim_size, estim_size, estim_no_bound, ref_first, ref_last, tgt_first, tgt_last
                    );
                    #[cfg(feature = "cpp_agc")]
                    eprintln!(
                        "RAGC_CASE3_ESTIMATE: kmer={} cand=({},{}) rc={} ref_len={} target_len={} bound={} estim={} ref[0..5]={:?} ref[-5..]={:?} tgt[0..5]={:?} tgt[-5..]={:?}",
                        kmer, key_front, key_back, needs_rc, ref_data.len(), target_data.len(), best_estim_size, estim_size, ref_first, ref_last, tgt_first, tgt_last
                    );
                }

                // Track minimum estim_size (C++ AGC lines 1730-1732)
                if estim_size < best_estim_size {
                    best_estim_size = estim_size;
                }

                candidate_estimates.push((key_front, key_back, needs_rc, ref_size, estim_size));
            }
        }
    }

    // Pass 2: Pick candidate with minimum estimate among ALL candidates, using tie-breakers
    // (C++ AGC lines 1775-1788)
    //
    // CRITICAL FIX: C++ AGC only picks candidates that BEAT the initial threshold (segment_size - 16).
    // If no candidate beats the threshold, best_pk stays at (~0ull, ~0ull) and fallback MISSING is used.
    //
    // The previous bug was unconditionally picking the first candidate (first_candidate = true).
    // This caused RAGC to always pick the first candidate even when its estimate was worse than threshold,
    // preventing fallback to existing MISSING groups.
    //
    // C++ AGC's selection logic (lines 1780-1787):
    //   if (v_estim_size[i] < best_estim_size || ...)
    // This only updates best_pk if estimate is BETTER than current best (initially threshold).
    for &(key_front, key_back, needs_rc, _ref_size, estim_size) in &candidate_estimates {
        let cand_pk = (key_front, key_back);
        let best_pk = (best_key_front, best_key_back);

        // Match C++ AGC's selection logic exactly (lines 1780-1787):
        // Only pick candidate if:
        // - Smaller estimate than current best (initially threshold), OR
        // - Same estimate with lexicographically smaller pk, OR
        // - Same estimate+pk with better RC (prefers forward orientation)
        if estim_size < best_estim_size
            || (estim_size == best_estim_size && cand_pk < best_pk)
            || (estim_size == best_estim_size && cand_pk == best_pk && !needs_rc)
        {
            best_estim_size = estim_size;
            best_key_front = key_front;
            best_key_back = key_back;
            best_needs_rc = needs_rc;
        }
    }

    // Debug: Print Pass 2 results
    if config.verbosity > 2 && !candidate_estimates.is_empty() {
        let threshold = if segment_len < 16 {
            segment_len
        } else {
            segment_len - 16
        };
        eprintln!(
            "RAGC_CASE3_PASS2_RESULTS: threshold={} best=({},{}) best_estim={}",
            threshold, best_key_front, best_key_back, best_estim_size
        );
        for (i, &(kf, kb, rc, rs, es)) in candidate_estimates.iter().enumerate() {
            let is_winner = kf == best_key_front && kb == best_key_back;
            let marker = if is_winner { "*WINNER*" } else { "" };
            eprintln!(
                "  RESULT[{}]: ({},{}) rc={} ref_size={} estimate={} {}",
                i, kf, kb, rc, rs, es, marker
            );
        }
    }

    // If no candidate was selected (best_pk is still (~0ull, ~0ull)), create MISSING key
    // This matches C++ AGC lines 1791-1799: fallback to (kmer, MISSING) or (MISSING, kmer)
    if best_key_front == u64::MAX && best_key_back == u64::MAX {
        if kmer_is_dir {
            // Dir-oriented: (kmer, MISSING) with rc=false
            if config.verbosity > 1 {
                #[cfg(feature = "verbose_debug")]
                eprintln!(
                    "RAGC_CASE3_NO_WINNER: kmer={} is_dir=true -> ({}, MISSING) rc=false",
                    kmer, kmer
                );
            }
            return (kmer, MISSING_KMER, false);
        } else {
            // NOT dir-oriented: (MISSING, kmer) with rc=true
            if config.verbosity > 1 {
                #[cfg(feature = "verbose_debug")]
                eprintln!(
                    "RAGC_CASE3_NO_WINNER: kmer={} is_dir=false -> (MISSING, {}) rc=true",
                    kmer, kmer
                );
            }
            return (MISSING_KMER, kmer, true);
        }
    }

    if config.verbosity > 1 {
        #[cfg(feature = "verbose_debug")]
        eprintln!(
            "RAGC_CASE3_PICKED: kmer={} best=({},{}) rc={} estim_size={} segment_size={}",
            kmer, best_key_front, best_key_back, best_needs_rc, best_estim_size, segment_len
        );
    }

    (best_key_front, best_key_back, best_needs_rc)
}
3436
3437/// Find candidate segment using fallback minimizers
3438/// Matches C++ AGC's find_cand_segment_using_fallback_minimizers (lines 1807-1958)
3439///
3440/// This function is called when Case 3 (one k-mer present) fails to find a good match.
3441/// It scans the segment for k-mers that pass the fallback filter, looks them up in
3442/// the fallback minimizers map, and finds candidate groups with shared k-mers.
3443///
3444/// # Arguments
3445/// * `segment_data` - The segment data to search
3446/// * `k` - K-mer length
3447/// * `min_shared_kmers` - Minimum number of shared k-mers to consider a candidate
3448/// * `fallback_filter` - Filter to select which k-mers to check
3449/// * `map_fallback_minimizers` - Map from k-mer to candidate group keys
3450/// * `map_segments` - Map from group key to group ID
3451/// * `segment_groups` - Buffer of segment groups
3452/// * `reference_segments` - Stored reference segments
3453/// * `config` - Compression configuration
3454///
3455/// # Returns
3456/// (key_front, key_back, should_reverse) if a candidate is found, or (MISSING, MISSING, false) if none
3457#[allow(clippy::too_many_arguments)]
3458fn find_cand_segment_using_fallback_minimizers(
3459 segment_data: &[u8],
3460 segment_data_rc: &[u8],
3461 k: usize,
3462 min_shared_kmers: u64,
3463 fallback_filter: &FallbackFilter,
3464 map_fallback_minimizers: &Arc<Mutex<BTreeMap<u64, Vec<(u64, u64)>>>>,
3465 map_segments: &Arc<RwLock<BTreeMap<SegmentGroupKey, u32>>>,
3466 segment_groups: &Arc<Mutex<BTreeMap<SegmentGroupKey, SegmentGroupBuffer>>>,
3467 reference_segments: &Arc<RwLock<BTreeMap<u32, Vec<u8>>>>,
3468 config: &StreamingQueueConfig,
3469) -> (u64, u64, bool) {
3470 use crate::segment::MISSING_KMER;
3471
3472 const MAX_NUM_TO_ESTIMATE: usize = 10;
3473 let short_segments = config.segment_size <= 10000;
3474 let segment_len = segment_data.len();
3475
3476 if !fallback_filter.is_enabled() {
3477 return (MISSING_KMER, MISSING_KMER, false);
3478 }
3479
3480 // Scan segment for k-mers and count candidates
3481 // Map from candidate group key to list of shared k-mers
3482 let mut cand_seg_counts: BTreeMap<(u64, u64), Vec<u64>> = BTreeMap::new(); // BTreeMap for determinism
3483
3484 // K-mer scanning state (matches C++ AGC CKmer behavior)
3485 let mut kmer_data: u64 = 0;
3486 let mut kmer_rc: u64 = 0;
3487 let mut kmer_len: usize = 0;
3488 let mask: u64 = (1u64 << (2 * k)) - 1;
3489
3490 // Scan segment for k-mers
3491 for &base in segment_data {
3492 if base > 3 {
3493 // Non-ACGT character - reset k-mer
3494 kmer_data = 0;
3495 kmer_rc = 0;
3496 kmer_len = 0;
3497 continue;
3498 }
3499
3500 // Add base to forward k-mer (shift left, add at LSB)
3501 kmer_data = ((kmer_data << 2) | (base as u64)) & mask;
3502
3503 // Add complement to reverse k-mer (shift right, add at MSB)
3504 let comp = 3 - base; // A<->T, C<->G
3505 kmer_rc = (kmer_rc >> 2) | ((comp as u64) << (2 * (k - 1)));
3506
3507 kmer_len += 1;
3508
3509 if kmer_len >= k {
3510 // Use canonical k-mer (smaller of forward and reverse)
3511 let canonical = kmer_data.min(kmer_rc);
3512 let is_dir_oriented = kmer_data <= kmer_rc;
3513
3514 // Check if k-mer passes fallback filter and is not symmetric
3515 if fallback_filter.passes(canonical) && kmer_data != kmer_rc {
3516 // Look up in fallback minimizers map
3517 let fb_map = map_fallback_minimizers.lock().unwrap();
3518 if let Some(candidates) = fb_map.get(&canonical) {
3519 for &(key1, key2) in candidates {
3520 // Skip MISSING keys
3521 if key1 == MISSING_KMER || key2 == MISSING_KMER {
3522 continue;
3523 }
3524
3525 // Normalize based on orientation
3526 let cand_key = if !is_dir_oriented {
3527 (key2, key1)
3528 } else {
3529 (key1, key2)
3530 };
3531
3532 cand_seg_counts
3533 .entry(cand_key)
3534 .or_insert_with(Vec::new)
3535 .push(canonical);
3536 }
3537 }
3538 }
3539 }
3540 }
3541
3542 // Prune candidates to those with >= min_shared_kmers unique k-mers
3543 let mut pruned_candidates: Vec<(u64, (u64, u64))> = Vec::new();
3544 for (key, mut kmers) in cand_seg_counts {
3545 kmers.sort_unstable();
3546 kmers.dedup();
3547 let unique_count = kmers.len() as u64;
3548 if unique_count >= min_shared_kmers {
3549 pruned_candidates.push((unique_count, key));
3550 }
3551 }
3552
3553 if pruned_candidates.is_empty() {
3554 if config.verbosity > 1 {
3555 #[cfg(feature = "verbose_debug")]
3556 eprintln!(
3557 "RAGC_FALLBACK_NO_CANDIDATES: min_shared={}",
3558 min_shared_kmers
3559 );
3560 }
3561 return (MISSING_KMER, MISSING_KMER, false);
3562 }
3563
3564 // Sort by count (descending) and take top MAX_NUM_TO_ESTIMATE
3565 pruned_candidates.sort_by(|a, b| b.0.cmp(&a.0));
3566 if pruned_candidates.len() > MAX_NUM_TO_ESTIMATE {
3567 pruned_candidates.truncate(MAX_NUM_TO_ESTIMATE);
3568 }
3569
3570 // Avoid trying poor candidates (less than half the best count)
3571 let best_count = pruned_candidates[0].0;
3572 pruned_candidates.retain(|c| c.0 * 2 >= best_count);
3573
3574 if config.verbosity > 1 {
3575 #[cfg(feature = "verbose_debug")]
3576 eprintln!(
3577 "RAGC_FALLBACK_CANDIDATES: count={} best_shared={} min_shared={}",
3578 pruned_candidates.len(),
3579 best_count,
3580 min_shared_kmers
3581 );
3582 }
3583
3584 // For short segments, use fast decision based on shared k-mer count
3585 if short_segments {
3586 let (count, (key_front, key_back)) = pruned_candidates[0];
3587 if config.verbosity > 1 {
3588 #[cfg(feature = "verbose_debug")]
3589 eprintln!(
3590 "RAGC_FALLBACK_SHORT_SEGMENT: key=({},{}) shared_kmers={}",
3591 key_front, key_back, count
3592 );
3593 }
3594 // Normalize: ensure front <= back
3595 if key_front <= key_back {
3596 return (key_front, key_back, false);
3597 } else {
3598 return (key_back, key_front, true);
3599 }
3600 }
3601
3602 // For longer segments, estimate compression cost for each candidate
3603 let mut best_key: Option<(u64, u64)> = None;
3604 let mut best_estimate: usize = segment_len;
3605 let mut _best_is_rc = false;
3606
3607 {
3608 let groups = segment_groups.lock().unwrap();
3609 let seg_map = map_segments.read().unwrap();
3610 let ref_segs = reference_segments.read().unwrap();
3611
3612 for &(_count, (key_front, key_back)) in &pruned_candidates {
3613 // Normalize key
3614 let (norm_front, norm_back, is_seg_rc) = if key_front <= key_back {
3615 (key_front, key_back, false)
3616 } else {
3617 (key_back, key_front, true)
3618 };
3619
3620 let cand_key = SegmentGroupKey {
3621 kmer_front: norm_front,
3622 kmer_back: norm_back,
3623 };
3624
3625 // Get reference segment for this candidate
3626 let ref_data_opt: Option<&[u8]> = if let Some(group_buffer) = groups.get(&cand_key) {
3627 group_buffer
3628 .reference_segment
3629 .as_ref()
3630 .map(|seg| seg.data.as_slice())
3631 } else if let Some(&group_id) = seg_map.get(&cand_key) {
3632 ref_segs.get(&group_id).map(|data| data.as_slice())
3633 } else {
3634 None
3635 };
3636
3637 if let Some(ref_data) = ref_data_opt {
3638 let target_data = if is_seg_rc {
3639 segment_data_rc
3640 } else {
3641 segment_data
3642 };
3643
3644 // Estimate compression cost
3645 #[cfg(feature = "cpp_agc")]
3646 let estimate = crate::ragc_ffi::lzdiff_v2_estimate(
3647 ref_data,
3648 target_data,
3649 config.min_match_len as u32,
3650 best_estimate as u32,
3651 ) as usize;
3652
3653 #[cfg(not(feature = "cpp_agc"))]
3654 let estimate = {
3655 let mut lz = LZDiff::new(config.min_match_len as u32);
3656 lz.prepare(&ref_data.to_vec());
3657 // Use estimate() which matches C++ CLZDiff_V2::Estimate exactly
3658 lz.estimate(&target_data.to_vec(), best_estimate as u32) as usize
3659 };
3660
3661 if config.verbosity > 2 {
3662 #[cfg(feature = "verbose_debug")]
3663 eprintln!(
3664 "RAGC_FALLBACK_ESTIMATE: key=({},{}) rc={} estimate={}",
3665 norm_front, norm_back, is_seg_rc, estimate
3666 );
3667 }
3668
3669 // Track best (lowest estimate)
3670 if estimate > 0 && estimate < best_estimate {
3671 best_estimate = estimate;
3672 best_key = Some((norm_front, norm_back));
3673 _best_is_rc = is_seg_rc;
3674 }
3675 }
3676 }
3677 }
3678
3679 // In adaptive mode, check if result is worth using
3680 if config.adaptive_mode {
3681 let threshold = if short_segments {
3682 (segment_len as f64 * 0.9) as usize
3683 } else {
3684 (segment_len as f64 * 0.2) as usize
3685 };
3686
3687 if best_estimate >= threshold {
3688 if config.verbosity > 1 {
3689 #[cfg(feature = "verbose_debug")]
3690 eprintln!(
3691 "RAGC_FALLBACK_ADAPTIVE_REJECT: estimate={} threshold={}",
3692 best_estimate, threshold
3693 );
3694 }
3695 return (MISSING_KMER, MISSING_KMER, false);
3696 }
3697 }
3698
3699 match best_key {
3700 Some((front, back)) => {
3701 // Normalize: ensure front <= back
3702 if front <= back {
3703 if config.verbosity > 1 {
3704 #[cfg(feature = "verbose_debug")]
3705 eprintln!(
3706 "RAGC_FALLBACK_PICKED: key=({},{}) rc=false estimate={}",
3707 front, back, best_estimate
3708 );
3709 }
3710 (front, back, false)
3711 } else {
3712 if config.verbosity > 1 {
3713 #[cfg(feature = "verbose_debug")]
3714 eprintln!(
3715 "RAGC_FALLBACK_PICKED: key=({},{}) rc=true estimate={}",
3716 back, front, best_estimate
3717 );
3718 }
3719 (back, front, true)
3720 }
3721 }
3722 None => {
3723 if config.verbosity > 1 {
3724 #[cfg(feature = "verbose_debug")]
3725 eprintln!("RAGC_FALLBACK_NO_WINNER: no candidate beat threshold");
3726 }
3727 (MISSING_KMER, MISSING_KMER, false)
3728 }
3729 }
3730}
3731
3732/// Add fallback mapping for a segment's k-mers
3733/// Matches C++ AGC's add_fallback_mapping (lines 1961-1989)
3734///
3735/// Called when a segment is assigned to a group to populate the fallback minimizers map.
3736fn add_fallback_mapping(
3737 segment_data: &[u8],
3738 k: usize,
3739 splitter1: u64,
3740 splitter2: u64,
3741 fallback_filter: &FallbackFilter,
3742 map_fallback_minimizers: &Arc<Mutex<BTreeMap<u64, Vec<(u64, u64)>>>>,
3743) {
3744 use crate::segment::MISSING_KMER;
3745
3746 if !fallback_filter.is_enabled() {
3747 return;
3748 }
3749
3750 // Skip if splitters are MISSING
3751 if splitter1 == MISSING_KMER || splitter2 == MISSING_KMER {
3752 return;
3753 }
3754
3755 let splitter_dir = (splitter1, splitter2);
3756 let splitter_rev = (splitter2, splitter1);
3757 let mask: u64 = (1u64 << (2 * k)) - 1;
3758
3759 // K-mer scanning state
3760 let mut kmer_data: u64 = 0;
3761 let mut kmer_rc: u64 = 0;
3762 let mut kmer_len: usize = 0;
3763
3764 let mut fb_map = map_fallback_minimizers.lock().unwrap();
3765
3766 for &base in segment_data {
3767 if base > 3 {
3768 kmer_data = 0;
3769 kmer_rc = 0;
3770 kmer_len = 0;
3771 continue;
3772 }
3773
3774 kmer_data = ((kmer_data << 2) | (base as u64)) & mask;
3775 let comp = 3 - base;
3776 kmer_rc = (kmer_rc >> 2) | ((comp as u64) << (2 * (k - 1)));
3777 kmer_len += 1;
3778
3779 if kmer_len >= k {
3780 let canonical = kmer_data.min(kmer_rc);
3781 let is_dir_oriented = kmer_data <= kmer_rc;
3782
3783 // Check filter and skip symmetric k-mers
3784 if fallback_filter.passes(canonical) && kmer_data != kmer_rc {
3785 let to_add = if is_dir_oriented {
3786 splitter_dir
3787 } else {
3788 splitter_rev
3789 };
3790 let entry = fb_map.entry(canonical).or_insert_with(Vec::new);
3791
3792 // Only add if not already present
3793 if !entry.contains(&to_add) {
3794 entry.push(to_add);
3795 }
3796 }
3797 }
3798 }
3799}
3800
3801// =============================================================================
3802// Parallel batch processing functions (C++ AGC 4-phase pattern)
3803// =============================================================================
3804
/// Phase 2: Prepare batch for parallel processing
/// - Processes NEW segments from buffered_seg_part (assigns group_ids)
/// - Drains segments from buffered_seg_part into SegmentGroupBuffer entries
/// - Extracts buffers that need flushing into ParallelFlushState
/// Returns true if there are buffers to flush, false otherwise
///
/// Lock-ordering note: this function acquires several locks in a fixed
/// sequence (batch_local_groups -> batch_local_terminators -> map_segments /
/// reference_segments / map_segments_terminators (write) -> segment_groups ->
/// grouping_engine -> archive). Callers and siblings must not take these
/// locks in a conflicting order.
#[allow(clippy::too_many_arguments)]
fn prepare_batch_parallel(
    segment_groups: &Arc<Mutex<BTreeMap<SegmentGroupKey, SegmentGroupBuffer>>>,
    buffered_seg_part: &Arc<BufferedSegPart>,
    batch_local_groups: &Arc<Mutex<BTreeMap<SegmentGroupKey, u32>>>,
    batch_local_terminators: &Arc<Mutex<BTreeMap<u64, Vec<u64>>>>,
    map_segments: &Arc<RwLock<BTreeMap<SegmentGroupKey, u32>>>,
    map_segments_terminators: &Arc<RwLock<BTreeMap<u64, Vec<u64>>>>,
    group_counter: &Arc<AtomicU32>,
    // NOTE(review): raw_group_counter appears unused in this body — confirm
    // whether it is kept for signature parity with sibling phases or is dead.
    raw_group_counter: &Arc<AtomicU32>,
    archive: &Arc<Mutex<Archive>>,
    // NOTE(review): collection also appears unused here — verify before removing.
    collection: &Arc<Mutex<CollectionV3>>,
    reference_segments: &Arc<RwLock<BTreeMap<u32, Vec<u8>>>>,
    reference_orientations: &Arc<RwLock<BTreeMap<u32, bool>>>,
    #[cfg(feature = "cpp_agc")] grouping_engine: &Arc<Mutex<crate::ragc_ffi::GroupingEngine>>,
    parallel_state: &ParallelFlushState,
    config: &StreamingQueueConfig,
) -> Result<bool> {
    use crate::segment::MISSING_KMER;

    // Check if there's anything to process
    let batch_map_len = batch_local_groups.lock().unwrap().len();
    let batch_terms_len = batch_local_terminators.lock().unwrap().len();
    let has_buffered_segments = buffered_seg_part.has_segments();

    if batch_map_len == 0 && batch_terms_len == 0 && !has_buffered_segments {
        return Ok(false);
    }

    if config.verbosity > 0 {
        eprintln!("PREPARE_BATCH_PARALLEL: Processing {} batch-local groups, buffered segments: {}, {} terminator keys",
            batch_map_len, has_buffered_segments, batch_terms_len);
    }

    // Phase 2a: Process NEW segments - assign group_ids deterministically
    // Get current group counter and update after process_new
    // (three write locks are held together only inside this scope)
    let mut next_group_id = group_counter.load(Ordering::SeqCst);
    {
        let mut global_map = map_segments.write().unwrap();
        let mut ref_seg = reference_segments.write().unwrap();
        let mut term_map = map_segments_terminators.write().unwrap();
        let new_count = buffered_seg_part.process_new(
            &mut global_map,
            &mut next_group_id,
            &mut ref_seg,
            &mut term_map,
        );
        if config.verbosity > 0 && new_count > 0 {
            eprintln!(
                "PREPARE_BATCH_PARALLEL: Assigned {} new group IDs",
                new_count
            );
        }
    }
    // Update the shared counter (single writer at the barrier, so load/store is safe)
    group_counter.store(next_group_id, Ordering::SeqCst);

    // Phase 2b: Sort segments within each group for determinism
    buffered_seg_part.sort_known();

    // Phase 2c: Drain segments from buffered_seg_part into SegmentGroupBuffer entries
    // (groups_map lock is held from here until the buffers are extracted below)
    let mut groups_map = segment_groups.lock().unwrap();

    // Build reverse lookup map (group_id -> key) ONCE to avoid O(n²) lookups
    let group_id_to_key: std::collections::HashMap<u32, SegmentGroupKey> = {
        let global_map = map_segments.read().unwrap();
        global_map
            .iter()
            .map(|(k, &gid)| (gid, k.clone()))
            .collect()
    };

    // Phase 2c-1: Collect all segments with their keys (no locks needed)
    // FIX 18: For raw groups (0-15), use unique keys (raw_group_id, MISSING) instead of (MISSING, MISSING)
    // This ensures each raw group has its own buffer, matching C++ AGC's distribute_segments() behavior
    let num_groups = buffered_seg_part.num_groups();
    let mut collected_segments: Vec<(u32, SegmentGroupKey, BufferedSegment)> = Vec::new();
    for group_id in 0..num_groups as u32 {
        while let Some(seg) = buffered_seg_part.get_part(group_id) {
            // FIX 18: Raw groups (0-15) need unique keys to have separate buffers
            // The global map has (MISSING, MISSING) -> 0, but we need each raw group
            // to have its own buffer for proper distribution (matching C++ AGC)
            let key = if group_id < NO_RAW_GROUPS {
                // Raw group: use unique key (group_id, MISSING) to distinguish buffers
                SegmentGroupKey {
                    kmer_front: group_id as u64,
                    kmer_back: MISSING_KMER,
                }
            } else {
                // LZ group: use the actual key from the map
                group_id_to_key.get(&group_id).cloned().unwrap_or_else(|| {
                    // Fallback - shouldn't happen for LZ groups
                    SegmentGroupKey {
                        kmer_front: MISSING_KMER,
                        kmer_back: MISSING_KMER,
                    }
                })
            };
            collected_segments.push((group_id, key, seg));
        }
    }

    // Phase 2c-2: Batch update batch_local_groups (ONE lock acquisition)
    {
        let mut batch_map = batch_local_groups.lock().unwrap();
        for (group_id, key, _) in &collected_segments {
            batch_map.insert(key.clone(), *group_id);
        }
    }

    // Phase 2c-3: Register with FFI engine (ONE lock acquisition)
    #[cfg(feature = "cpp_agc")]
    {
        let mut eng = grouping_engine.lock().unwrap();
        for (group_id, key, _) in &collected_segments {
            if key.kmer_front != MISSING_KMER && key.kmer_back != MISSING_KMER {
                eng.register_group(key.kmer_front, key.kmer_back, *group_id);
            }
        }
    }

    // Phase 2c-4: Batch update terminators (ONE lock acquisition)
    // Each LZ key records front<->back adjacency (both directions unless equal)
    {
        let mut term_map = batch_local_terminators.lock().unwrap();
        for (_, key, _) in &collected_segments {
            if key.kmer_front != MISSING_KMER && key.kmer_back != MISSING_KMER {
                term_map
                    .entry(key.kmer_front)
                    .or_insert_with(Vec::new)
                    .push(key.kmer_back);
                if key.kmer_front != key.kmer_back {
                    term_map
                        .entry(key.kmer_back)
                        .or_insert_with(Vec::new)
                        .push(key.kmer_front);
                }
            }
        }
    }

    // Phase 2c-5: Pre-register all streams for new groups (ONE lock acquisition)
    // Build a set of existing group_ids first for O(1) lookup
    let existing_group_ids: std::collections::HashSet<u32> =
        groups_map.values().map(|b| b.group_id).collect();

    // Collect unique new group_ids (O(n) instead of O(n×m))
    // DETERMINISM FIX: Use BTreeSet instead of HashSet to ensure deterministic iteration order
    let new_group_ids: std::collections::BTreeSet<u32> = collected_segments
        .iter()
        .map(|(gid, _, _)| *gid)
        .filter(|gid| !existing_group_ids.contains(gid))
        .collect();

    // Pre-register all streams in one lock acquisition
    // BTreeSet iteration is sorted, so stream registration order is deterministic
    let stream_registrations: std::collections::HashMap<u32, (usize, usize)> = if !new_group_ids
        .is_empty()
    {
        let archive_version = ragc_common::AGC_FILE_MAJOR * 1000 + ragc_common::AGC_FILE_MINOR;
        let mut arch = archive.lock().unwrap();
        new_group_ids
            .iter()
            .map(|&group_id| {
                let delta_stream_name = ragc_common::stream_delta_name(archive_version, group_id);
                let ref_stream_name = ragc_common::stream_ref_name(archive_version, group_id);
                let stream_id = arch.register_stream(&delta_stream_name);
                let ref_stream_id = arch.register_stream(&ref_stream_name);
                (group_id, (stream_id, ref_stream_id))
            })
            .collect()
    } else {
        std::collections::HashMap::new()
    };

    // Phase 2c-6: Add segments to buffers (groups_map already locked)
    for (group_id, key, seg) in collected_segments {
        let buffer = groups_map.entry(key.clone()).or_insert_with(|| {
            let (stream_id, ref_stream_id) = stream_registrations
                .get(&group_id)
                .copied()
                .unwrap_or_else(|| {
                    // Fallback: register now (shouldn't happen if logic is correct)
                    let archive_version =
                        ragc_common::AGC_FILE_MAJOR * 1000 + ragc_common::AGC_FILE_MINOR;
                    let delta_stream_name =
                        ragc_common::stream_delta_name(archive_version, group_id);
                    let ref_stream_name = ragc_common::stream_ref_name(archive_version, group_id);
                    let mut arch = archive.lock().unwrap();
                    let sid = arch.register_stream(&delta_stream_name);
                    let rsid = arch.register_stream(&ref_stream_name);
                    (sid, rsid)
                });
            SegmentGroupBuffer::new(group_id, stream_id, ref_stream_id)
        });
        buffer.segments.push(seg);
    }

    // Clear the buffered_seg_part after draining
    buffered_seg_part.clear();

    // Extract buffers that need flushing
    // (a buffer needs flushing if it has pending segments or its reference
    // was never written)
    let mut extracted: Vec<(SegmentGroupKey, SegmentGroupBuffer)> = Vec::new();
    let keys_to_remove: Vec<SegmentGroupKey> = groups_map
        .iter()
        .filter(|(_, buffer)| !buffer.segments.is_empty() || !buffer.ref_written)
        .map(|(k, _)| k.clone())
        .collect();

    // Sort keys for deterministic processing order
    let mut sorted_keys = keys_to_remove;
    sorted_keys.sort();

    for key in sorted_keys {
        if let Some(buffer) = groups_map.remove(&key) {
            extracted.push((key, buffer));
        }
    }

    let has_work = !extracted.is_empty();

    if config.verbosity > 0 {
        eprintln!(
            "PREPARE_BATCH_PARALLEL: Extracted {} buffers for parallel flush",
            extracted.len()
        );
    }

    // Populate ParallelFlushState
    parallel_state.prepare(extracted);

    Ok(has_work)
}
4042
4043/// Phase 4: Cleanup after parallel processing
4044/// - Re-inserts processed buffers
4045/// - Updates global maps
4046/// - Clears batch-local state
4047fn cleanup_batch_parallel(
4048 segment_groups: &Arc<Mutex<BTreeMap<SegmentGroupKey, SegmentGroupBuffer>>>,
4049 batch_local_groups: &Arc<Mutex<BTreeMap<SegmentGroupKey, u32>>>,
4050 batch_local_terminators: &Arc<Mutex<BTreeMap<u64, Vec<u64>>>>,
4051 map_segments: &Arc<RwLock<BTreeMap<SegmentGroupKey, u32>>>,
4052 map_segments_terminators: &Arc<RwLock<BTreeMap<u64, Vec<u64>>>>,
4053 parallel_state: &ParallelFlushState,
4054 config: &StreamingQueueConfig,
4055) {
4056 // Re-insert processed buffers
4057 let processed = parallel_state.drain_buffers();
4058 {
4059 let mut groups_map = segment_groups.lock().unwrap();
4060 for (key, buffer) in processed {
4061 groups_map.insert(key, buffer);
4062 }
4063 }
4064
4065 // Update global registry with batch-local groups
4066 {
4067 let batch_map = batch_local_groups.lock().unwrap();
4068 let mut global_map = map_segments.write().unwrap();
4069 for (key, group_id) in batch_map.iter() {
4070 global_map.entry(key.clone()).or_insert(*group_id);
4071 }
4072 }
4073
4074 // Merge batch-local terminators into global terminators
4075 {
4076 let batch_terms = batch_local_terminators.lock().unwrap();
4077 let mut global_terms = map_segments_terminators.write().unwrap();
4078 for (kmer, connections) in batch_terms.iter() {
4079 let entry = global_terms.entry(*kmer).or_insert_with(Vec::new);
4080 entry.extend(connections.iter().cloned());
4081 entry.sort_unstable();
4082 entry.dedup();
4083 }
4084 }
4085
4086 // Clear batch-local state
4087 batch_local_groups.lock().unwrap().clear();
4088 batch_local_terminators.lock().unwrap().clear();
4089
4090 if config.verbosity > 0 {
4091 eprintln!("CLEANUP_BATCH_PARALLEL: Batch cleanup complete");
4092 }
4093}
4094
4095/// Classify raw segments at barrier (Thread 0 only)
4096/// This eliminates lock contention by doing all classification single-threaded.
4097/// Raw segments are sorted for determinism, then classified using the same
4098/// Case 2/3a/3b logic as before, just without contention.
4099/// Includes fallback minimizer support to match C++ AGC grouping quality.
4100fn classify_raw_segments_at_barrier(
4101 raw_segment_buffers: &Arc<Vec<Mutex<Vec<RawBufferedSegment>>>>,
4102 buffered_seg_part: &Arc<BufferedSegPart>,
4103 map_segments: &Arc<RwLock<BTreeMap<SegmentGroupKey, u32>>>,
4104 map_segments_terminators: &Arc<RwLock<BTreeMap<u64, Vec<u64>>>>,
4105 segment_groups: &Arc<Mutex<BTreeMap<SegmentGroupKey, SegmentGroupBuffer>>>,
4106 reference_segments: &Arc<RwLock<BTreeMap<u32, Vec<u8>>>>,
4107 fallback_filter: &FallbackFilter,
4108 map_fallback_minimizers: &Arc<Mutex<BTreeMap<u64, Vec<(u64, u64)>>>>,
4109 group_counter: &Arc<AtomicU32>,
4110 raw_group_counter: &Arc<AtomicU32>, // FIX 18: For round-robin distribution of orphan segments
4111 config: &StreamingQueueConfig,
4112) {
4113 use crate::segment::MISSING_KMER;
4114
4115 // Drain all raw segments from ALL per-worker buffers into one Vec
4116 let mut raw_segs: Vec<RawBufferedSegment> = Vec::new();
4117 for buffer in raw_segment_buffers.iter() {
4118 let mut worker_segs = buffer.lock().unwrap();
4119 raw_segs.append(&mut *worker_segs);
4120 }
4121
4122 if raw_segs.is_empty() {
4123 return;
4124 }
4125
4126 // Sort for determinism: by sample_name, contig_name, original_place
4127 raw_segs.sort();
4128
4129 // Group segments by (sample, contig) for parallel processing
4130 // Each contig's segments will be processed sequentially for determinism,
4131 // but different contigs can be processed in parallel
4132 use std::collections::BTreeMap;
4133 let mut contig_groups: BTreeMap<(String, String), Vec<RawBufferedSegment>> = BTreeMap::new();
4134 for raw_seg in raw_segs.drain(..) {
4135 let key = (raw_seg.sample_name.clone(), raw_seg.contig_name.clone());
4136 contig_groups.entry(key).or_default().push(raw_seg);
4137 }
4138
4139 // Sort segments within each contig by original_place (should already be sorted, but ensure)
4140 for segs in contig_groups.values_mut() {
4141 segs.sort_by_key(|s| s.original_place);
4142 }
4143
4144 let num_contigs = contig_groups.len();
4145 let total_segments: usize = contig_groups.values().map(|v| v.len()).sum();
4146
4147 if config.verbosity > 0 {
4148 eprintln!(
4149 "CLASSIFY_RAW_BARRIER: Processing {} raw segments across {} contigs (parallel)",
4150 total_segments, num_contigs
4151 );
4152 }
4153
4154 // DETERMINISM FIX: Process contigs SEQUENTIALLY to ensure deterministic group creation order.
4155 // Parallelism caused non-deterministic group IDs because different threads created groups
4156 // in unpredictable order, affecting which segments could split into which groups.
4157 // The compression phase is still parallel - only classification needs to be sequential.
4158 let contig_vec: Vec<_> = contig_groups.into_iter().collect();
4159
4160 for ((sample_name, contig_name), contig_segs) in contig_vec.into_iter() {
4161 // Track seg_part_no for this contig (local to this parallel task)
4162 let mut seg_part_no: usize = 0;
4163
4164 for raw_seg in contig_segs {
4165 // Use local seg_part_no for this contig
4166 let output_seg_part_no = seg_part_no;
4167 // Case 2/3a/3b classification (same logic as before)
4168 let (key_front, key_back, should_reverse) =
4169 if raw_seg.front_kmer != MISSING_KMER && raw_seg.back_kmer != MISSING_KMER {
4170 // Case 2: Both k-mers present
4171 if raw_seg.front_kmer < raw_seg.back_kmer {
4172 (raw_seg.front_kmer, raw_seg.back_kmer, false)
4173 } else {
4174 (raw_seg.back_kmer, raw_seg.front_kmer, true)
4175 }
4176 } else if raw_seg.front_kmer != MISSING_KMER {
4177 // Case 3a: Only front k-mer present
4178 let (mut kf, mut kb, mut sr) = find_group_with_one_kmer(
4179 raw_seg.front_kmer,
4180 raw_seg.front_kmer_is_dir,
4181 &raw_seg.data,
4182 &raw_seg.data_rc,
4183 map_segments_terminators,
4184 map_segments,
4185 segment_groups,
4186 reference_segments,
4187 config,
4188 );
4189 // Fallback: If Case 3a returned MISSING, try fallback minimizers
4190 if (kf == MISSING_KMER || kb == MISSING_KMER) && fallback_filter.is_enabled() {
4191 let (fb_kf, fb_kb, fb_sr) = find_cand_segment_using_fallback_minimizers(
4192 &raw_seg.data,
4193 &raw_seg.data_rc,
4194 config.k,
4195 5, // min_shared_kmers = 5 for Case 3
4196 fallback_filter,
4197 map_fallback_minimizers,
4198 map_segments,
4199 segment_groups,
4200 reference_segments,
4201 config,
4202 );
4203 if fb_kf != MISSING_KMER && fb_kb != MISSING_KMER {
4204 kf = fb_kf;
4205 kb = fb_kb;
4206 sr = fb_sr;
4207 }
4208 }
4209 (kf, kb, sr)
4210 } else if raw_seg.back_kmer != MISSING_KMER {
4211 // Case 3b: Only back k-mer present
4212 let kmer_is_dir_after_swap = !raw_seg.back_kmer_is_dir;
4213 let (mut kf, mut kb, mut sr) = find_group_with_one_kmer(
4214 raw_seg.back_kmer,
4215 kmer_is_dir_after_swap,
4216 &raw_seg.data_rc,
4217 &raw_seg.data,
4218 map_segments_terminators,
4219 map_segments,
4220 segment_groups,
4221 reference_segments,
4222 config,
4223 );
4224 sr = !sr;
4225 // Fallback: If Case 3b returned MISSING, try fallback minimizers
4226 // Note: C++ AGC uses segment_rc for fallback in Case 3b
4227 if (kf == MISSING_KMER || kb == MISSING_KMER) && fallback_filter.is_enabled() {
4228 let (fb_kf, fb_kb, fb_sr) = find_cand_segment_using_fallback_minimizers(
4229 &raw_seg.data_rc, // Use RC for Case 3b (matches C++ AGC)
4230 &raw_seg.data,
4231 config.k,
4232 5, // min_shared_kmers = 5 for Case 3
4233 fallback_filter,
4234 map_fallback_minimizers,
4235 map_segments,
4236 segment_groups,
4237 reference_segments,
4238 config,
4239 );
4240 if fb_kf != MISSING_KMER && fb_kb != MISSING_KMER {
4241 kf = fb_kf;
4242 kb = fb_kb;
4243 sr = !fb_sr; // C++ AGC: store_rc = !store_dir_alt
4244 }
4245 }
4246 (kf, kb, sr)
4247 } else {
4248 // Case 1: Both MISSING - try fallback minimizers
4249 let mut kf = MISSING_KMER;
4250 let mut kb = MISSING_KMER;
4251 let mut sr = false;
4252
4253 if fallback_filter.is_enabled() {
4254 let (fb_kf, fb_kb, fb_sr) = find_cand_segment_using_fallback_minimizers(
4255 &raw_seg.data,
4256 &raw_seg.data_rc,
4257 config.k,
4258 1, // min_shared_kmers = 1 for Case 1 (matches C++ AGC)
4259 fallback_filter,
4260 map_fallback_minimizers,
4261 map_segments,
4262 segment_groups,
4263 reference_segments,
4264 config,
4265 );
4266 if fb_kf != MISSING_KMER && fb_kb != MISSING_KMER {
4267 kf = fb_kf;
4268 kb = fb_kb;
4269 sr = fb_sr;
4270 }
4271 }
4272 (kf, kb, sr)
4273 };
4274
4275 let key = SegmentGroupKey {
4276 kmer_front: key_front,
4277 kmer_back: key_back,
4278 };
4279
4280 // Prepare segment data (reverse complement if needed)
4281 let segment_data = if should_reverse {
4282 raw_seg.data_rc.clone()
4283 } else {
4284 raw_seg.data.clone()
4285 };
4286
4287 // Add fallback mapping for this segment (matches C++ AGC add_fallback_mapping)
4288 // This populates the fallback minimizers map for use by later segments
4289 add_fallback_mapping(
4290 &segment_data,
4291 config.k,
4292 key.kmer_front,
4293 key.kmer_back,
4294 fallback_filter,
4295 map_fallback_minimizers,
4296 );
4297
4298 // Check if group exists (NO CONTENTION - we're single-threaded)
4299 let group_id_opt = {
4300 let seg_map = map_segments.read().unwrap();
4301 seg_map.get(&key).copied()
4302 };
4303
4304 if let Some(group_id) = group_id_opt {
4305 // KNOWN: add to per-group buffer
4306 // FIX 18: For orphan segments (key = MISSING, MISSING), use round-robin across groups 0-15
4307 // instead of always using group 0. This matches C++ AGC's distribute_segments() behavior.
4308 let actual_group_id =
4309 if key.kmer_front == MISSING_KMER && key.kmer_back == MISSING_KMER {
4310 // Orphan segment - distribute across raw groups 0-15 via round-robin
4311 raw_group_counter.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
4312 % NO_RAW_GROUPS
4313 } else {
4314 group_id
4315 };
4316 buffered_seg_part.add_known(
4317 actual_group_id,
4318 BufferedSegment {
4319 sample_name: raw_seg.sample_name.clone(),
4320 contig_name: raw_seg.contig_name.clone(),
4321 seg_part_no: output_seg_part_no,
4322 data: segment_data,
4323 is_rev_comp: should_reverse,
4324 sample_priority: raw_seg.sample_priority,
4325 },
4326 );
4327 // Increment counter by 1 for non-split segment
4328 seg_part_no += 1;
4329 } else {
4330 // NEW: Try segment splitting before adding as new group
4331 // C++ AGC only attempts splits when key doesn't exist and both k-mers valid
4332 if config.verbosity > 0 {
4333 eprintln!(
4334 "BARRIER_NEW_SEGMENT: key=({},{}) sample={}",
4335 key_front, key_back, raw_seg.sample_name
4336 );
4337 }
4338 let mut was_split = false;
4339
4340 // Skip barrier splitting if disabled (for C++ AGC parity testing)
4341 let try_split = !crate::env_cache::disable_barrier_split()
4342 && key_front != MISSING_KMER
4343 && key_back != MISSING_KMER
4344 && key_front != key_back;
4345
4346 if try_split {
4347 // Try to find a middle splitter
4348 let middle_kmer_opt = {
4349 let terminators = map_segments_terminators.read().unwrap();
4350 let front_conn = terminators.get(&key_front).map(|v| v.len()).unwrap_or(0);
4351 let back_conn = terminators.get(&key_back).map(|v| v.len()).unwrap_or(0);
4352 if config.verbosity > 0 {
4353 // Always print the first few, then only when connections exist
4354 eprintln!("BARRIER_SPLIT_TRY: key=({},{}) term_size={} front_conn={} back_conn={} sample={}",
4355 key_front, key_back, terminators.len(), front_conn, back_conn, raw_seg.sample_name);
4356 }
4357 find_middle_splitter(key_front, key_back, &terminators)
4358 };
4359
4360 if let Some(middle_kmer) = middle_kmer_opt {
4361 if config.verbosity > 0 {
4362 eprintln!(
4363 "BARRIER_SPLIT_FOUND_MIDDLE: key=({},{}) middle={}",
4364 key_front, key_back, middle_kmer
4365 );
4366 }
4367 // Found potential middle k-mer - check if both target groups exist
4368 let left_key = if key_front <= middle_kmer {
4369 SegmentGroupKey {
4370 kmer_front: key_front,
4371 kmer_back: middle_kmer,
4372 }
4373 } else {
4374 SegmentGroupKey {
4375 kmer_front: middle_kmer,
4376 kmer_back: key_front,
4377 }
4378 };
4379 let right_key = if middle_kmer <= key_back {
4380 SegmentGroupKey {
4381 kmer_front: middle_kmer,
4382 kmer_back: key_back,
4383 }
4384 } else {
4385 SegmentGroupKey {
4386 kmer_front: key_back,
4387 kmer_back: middle_kmer,
4388 }
4389 };
4390
4391 let (left_group_id, right_group_id) = {
4392 let seg_map = map_segments.read().unwrap();
4393 (
4394 seg_map.get(&left_key).copied(),
4395 seg_map.get(&right_key).copied(),
4396 )
4397 };
4398
4399 // Only split if BOTH target groups already exist
4400 if left_group_id.is_none() || right_group_id.is_none() {
4401 if config.verbosity > 0 {
4402 eprintln!("BARRIER_SPLIT_MISSING_GROUP: left={:?} right={:?} left_key=({},{}) right_key=({},{})",
4403 left_group_id, right_group_id, left_key.kmer_front, left_key.kmer_back, right_key.kmer_front, right_key.kmer_back);
4404 }
4405 }
4406 if let (Some(left_gid), Some(right_gid)) = (left_group_id, right_group_id) {
4407 // Get reference segment data for BOTH left and right
4408 let (left_ref_data, right_ref_data) = {
4409 let refs = reference_segments.read().unwrap();
4410 let left = refs.get(&left_gid).cloned().unwrap_or_default();
4411 let right = refs.get(&right_gid).cloned().unwrap_or_default();
4412 (left, right)
4413 };
4414
4415 if config.verbosity > 1 {
4416 eprintln!("BARRIER_SPLIT_GROUPS_EXIST: left_gid={} right_gid={} left_ref_len={} right_ref_len={} seg_len={}",
4417 left_gid, right_gid, left_ref_data.len(), right_ref_data.len(), raw_seg.data.len());
4418 }
4419
4420 // Find split decision using cost-based optimization
4421 // This matches C++ AGC's find_cand_segment_with_missing_middle_splitter
4422 // which computes LZ encoding cost at every position and decides:
4423 // - AssignToLeft: entire segment goes to (front, middle) group
4424 // - AssignToRight: entire segment goes to (middle, back) group
4425 // - SplitAt(pos): actually split the segment
4426 //
4427 // C++ AGC line 1393: passes (kmer1, kmer2) after normalization swap,
4428 // and swaps segment_dir/segment_rc based on use_rc flag.
4429 // When should_reverse=true (i.e., original front > back), we need to
4430 // swap the segments to match C++ AGC's behavior.
4431 let (seg_dir, seg_rc) = if should_reverse {
4432 (&raw_seg.data_rc, &raw_seg.data) // Swap when use_rc=true
4433 } else {
4434 (&raw_seg.data, &raw_seg.data_rc)
4435 };
4436 let split_decision = find_split_by_cost(
4437 seg_dir,
4438 seg_rc,
4439 &left_ref_data,
4440 &right_ref_data,
4441 key_front, // normalized k-mers (key_front < key_back)
4442 key_back,
4443 middle_kmer,
4444 config.k,
4445 config.min_match_len as u32,
4446 );
4447
4448 match split_decision {
4449 SplitDecision::SplitAt(split_pos) => {
4450 // Actually split the segment into two parts
4451 let (left_data, right_data) = split_segment_at_position(
4452 &segment_data,
4453 split_pos,
4454 config.k,
4455 );
4456 let left_len = left_data.len();
4457 let right_len = right_data.len();
4458
4459 // FIX 27 v4: Compute orientations using ORIGINAL k-mers and should_reverse
4460 let (left_should_reverse, right_should_reverse) =
4461 if should_reverse {
4462 let left_rc = middle_kmer >= raw_seg.back_kmer;
4463 let right_rc = raw_seg.front_kmer >= middle_kmer;
4464 (left_rc, right_rc)
4465 } else {
4466 let left_rc = raw_seg.front_kmer >= middle_kmer;
4467 let right_rc = middle_kmer >= raw_seg.back_kmer;
4468 (left_rc, right_rc)
4469 };
4470
4471 let left_final = if left_should_reverse != should_reverse {
4472 reverse_complement_sequence(&left_data)
4473 } else {
4474 left_data
4475 };
4476 let right_final = if right_should_reverse != should_reverse {
4477 reverse_complement_sequence(&right_data)
4478 } else {
4479 right_data
4480 };
4481
4482 // FIX: When should_reverse=true, left_data is the RIGHT part of
4483 // the original and right_data is the LEFT part. C++ AGC swaps
4484 // left_size/right_size when use_rc=true (line 1419), so the
4485 // first segment stored is always the LEFT part of the original.
4486 // We need to swap seg_part_no assignments to match.
4487 let (left_seg_part, right_seg_part) = if should_reverse {
4488 (output_seg_part_no + 1, output_seg_part_no)
4489 } else {
4490 (output_seg_part_no, output_seg_part_no + 1)
4491 };
4492
4493 buffered_seg_part.add_known(
4494 left_gid,
4495 BufferedSegment {
4496 sample_name: raw_seg.sample_name.clone(),
4497 contig_name: raw_seg.contig_name.clone(),
4498 seg_part_no: left_seg_part,
4499 data: left_final,
4500 is_rev_comp: left_should_reverse,
4501 sample_priority: raw_seg.sample_priority,
4502 },
4503 );
4504
4505 buffered_seg_part.add_known(
4506 right_gid,
4507 BufferedSegment {
4508 sample_name: raw_seg.sample_name.clone(),
4509 contig_name: raw_seg.contig_name.clone(),
4510 seg_part_no: right_seg_part,
4511 data: right_final,
4512 is_rev_comp: right_should_reverse,
4513 sample_priority: raw_seg.sample_priority,
4514 },
4515 );
4516
4517 seg_part_no += 2;
4518 was_split = true;
4519 if config.verbosity > 0 {
4520 eprintln!("BARRIER_SPLIT_SUCCESS: sample={} contig={} place={} split_pos={} left_len={} right_len={}",
4521 raw_seg.sample_name, raw_seg.contig_name, raw_seg.original_place, split_pos, left_len, right_len);
4522 }
4523 }
4524 SplitDecision::AssignToLeft => {
4525 // Assign entire segment to left group (front -> middle)
4526 // C++ AGC lines 1408-1414: right_size == 0 case
4527 let assign_rc = if should_reverse {
4528 middle_kmer >= raw_seg.back_kmer
4529 } else {
4530 raw_seg.front_kmer >= middle_kmer
4531 };
4532 let assign_data = if assign_rc != should_reverse {
4533 reverse_complement_sequence(&segment_data)
4534 } else {
4535 segment_data.clone()
4536 };
4537
4538 buffered_seg_part.add_known(
4539 left_gid,
4540 BufferedSegment {
4541 sample_name: raw_seg.sample_name.clone(),
4542 contig_name: raw_seg.contig_name.clone(),
4543 seg_part_no: output_seg_part_no,
4544 data: assign_data,
4545 is_rev_comp: assign_rc,
4546 sample_priority: raw_seg.sample_priority,
4547 },
4548 );
4549 seg_part_no += 1;
4550 was_split = true;
4551 if config.verbosity > 0 {
4552 eprintln!("BARRIER_ASSIGN_LEFT: sample={} contig={} place={} group={}",
4553 raw_seg.sample_name, raw_seg.contig_name, raw_seg.original_place, left_gid);
4554 }
4555 }
4556 SplitDecision::AssignToRight => {
4557 // Assign entire segment to right group (middle -> back)
4558 // C++ AGC lines 1400-1406: left_size == 0 case
4559 let assign_rc = if should_reverse {
4560 raw_seg.front_kmer >= middle_kmer
4561 } else {
4562 middle_kmer >= raw_seg.back_kmer
4563 };
4564 let assign_data = if assign_rc != should_reverse {
4565 reverse_complement_sequence(&segment_data)
4566 } else {
4567 segment_data.clone()
4568 };
4569
4570 buffered_seg_part.add_known(
4571 right_gid,
4572 BufferedSegment {
4573 sample_name: raw_seg.sample_name.clone(),
4574 contig_name: raw_seg.contig_name.clone(),
4575 seg_part_no: output_seg_part_no,
4576 data: assign_data,
4577 is_rev_comp: assign_rc,
4578 sample_priority: raw_seg.sample_priority,
4579 },
4580 );
4581 seg_part_no += 1;
4582 was_split = true;
4583 if config.verbosity > 0 {
4584 eprintln!("BARRIER_ASSIGN_RIGHT: sample={} contig={} place={} group={}",
4585 raw_seg.sample_name, raw_seg.contig_name, raw_seg.original_place, right_gid);
4586 }
4587 }
4588 SplitDecision::NoDecision => {
4589 if config.verbosity > 0 {
4590 eprintln!("BARRIER_SPLIT_SKIPPED: sample={} contig={} place={} left_ref={} right_ref={}",
4591 raw_seg.sample_name, raw_seg.contig_name, raw_seg.original_place, left_ref_data.len(), right_ref_data.len());
4592 }
4593 }
4594 }
4595 }
4596 }
4597 }
4598
4599 if !was_split {
4600 // Register group IMMEDIATELY so later segments can split into it.
4601 // With SEQUENTIAL processing (not parallel), this is now DETERMINISTIC.
4602 // C++ AGC store_segments() updates map_segments at barrier, and segments
4603 // within the same barrier batch CAN reference groups from earlier segments.
4604 let new_group_id = {
4605 let mut seg_map = map_segments.write().unwrap();
4606 if let Some(&existing_gid) = seg_map.get(&key) {
4607 existing_gid
4608 } else {
4609 // Allocate new group ID (matches C++ AGC no_segments++)
4610 let gid =
4611 group_counter.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
4612 seg_map.insert(key.clone(), gid);
4613 gid
4614 }
4615 };
4616
4617 // Ensure buffered_seg_part has capacity for this group ID
4618 buffered_seg_part.ensure_capacity(new_group_id);
4619
4620 // Store reference data IMMEDIATELY for new groups
4621 // C++ AGC: first segment becomes reference for LZ encoding
4622 {
4623 let mut ref_segs = reference_segments.write().unwrap();
4624 ref_segs
4625 .entry(new_group_id)
4626 .or_insert_with(|| segment_data.clone());
4627 }
4628
4629 // Update terminators IMMEDIATELY (C++ AGC lines 1015-1025)
4630 // This allows find_middle_splitter to find shared k-mers for splitting
4631 if key.kmer_front != MISSING_KMER && key.kmer_back != MISSING_KMER {
4632 let mut term_map = map_segments_terminators.write().unwrap();
4633 let front_vec = term_map.entry(key.kmer_front).or_insert_with(Vec::new);
4634 if !front_vec.contains(&key.kmer_back) {
4635 front_vec.push(key.kmer_back);
4636 front_vec.sort();
4637 }
4638 if key.kmer_front != key.kmer_back {
4639 let back_vec = term_map.entry(key.kmer_back).or_insert_with(Vec::new);
4640 if !back_vec.contains(&key.kmer_front) {
4641 back_vec.push(key.kmer_front);
4642 back_vec.sort();
4643 }
4644 }
4645 }
4646
4647 // Add segment to buffer
4648 buffered_seg_part.add_new(NewSegment {
4649 kmer_front: key.kmer_front,
4650 kmer_back: key.kmer_back,
4651 sample_priority: raw_seg.sample_priority,
4652 sample_name: raw_seg.sample_name,
4653 contig_name: raw_seg.contig_name,
4654 seg_part_no: output_seg_part_no,
4655 data: segment_data,
4656 should_reverse,
4657 });
4658 seg_part_no += 1;
4659 }
4660 }
4661 } // end for raw_seg
4662 } // end for contig (sequential)
4663
4664 // DETERMINISM FIX: Process all new segments sequentially after parallel classification
4665 // This ensures group IDs are assigned in deterministic BTreeSet order (by k-mer pair)
4666 // regardless of how many threads were used during classification.
4667 {
4668 let mut map_seg = map_segments.write().unwrap();
4669 let mut ref_seg = reference_segments.write().unwrap();
4670 let mut term_map = map_segments_terminators.write().unwrap();
4671 let mut next_gid = group_counter.load(std::sync::atomic::Ordering::SeqCst);
4672
4673 let new_groups =
4674 buffered_seg_part.process_new(&mut map_seg, &mut next_gid, &mut ref_seg, &mut term_map);
4675
4676 // Update the atomic counter with the final value
4677 group_counter.store(next_gid, std::sync::atomic::Ordering::SeqCst);
4678
4679 if config.verbosity > 0 && new_groups > 0 {
4680 eprintln!(
4681 "CLASSIFY_RAW_BARRIER: Registered {} new groups deterministically",
4682 new_groups
4683 );
4684 }
4685 }
4686
4687 if config.verbosity > 0 {
4688 eprintln!("CLASSIFY_RAW_BARRIER: Classification complete");
4689 }
4690}
4691
/// Flush batch-local groups to global state (matches C++ AGC batch boundary).
///
/// Steps, in order:
/// 1. Early-return if nothing is pending (no batch-local groups, terminators,
///    or pending segments).
/// 2. Sort `pending_batch_segments` by (sample, contig, place) so group-id
///    assignment is deterministic (mirrors C++ AGC's BTreeSet iteration order).
/// 3. For each pending segment: assign a group id — orphans (both k-mers
///    MISSING) are round-robined across raw groups 0..NO_RAW_GROUPS (FIX 17/18),
///    otherwise lookup-or-create in the global `map_segments` — then register
///    the key in the batch-local maps/terminators and buffer the segment
///    (the first segment of a new LZ group is written as the group reference
///    immediately).
/// 4. Flush every group buffer at end of batch (FIX 4: batch-level sort+write,
///    not per-pack).
/// 5. Merge batch-local groups and terminators into the global registries,
///    then clear the batch-local state (like C++ AGC destroying m_kmers at
///    batch end).
///
/// Returns `Err` only if an end-of-batch `flush_pack` fails; a failed
/// reference write is logged and the segment is re-buffered instead.
///
/// NOTE(review): the `pending` mutex is held through the whole assignment
/// loop and `segment_groups` through the end-of-batch flush; callers should
/// presumably not hold any of these locks when calling — confirm at call sites.
fn flush_batch(
    segment_groups: &Arc<Mutex<BTreeMap<SegmentGroupKey, SegmentGroupBuffer>>>,
    pending_batch_segments: &Arc<Mutex<Vec<PendingSegment>>>,
    batch_local_groups: &Arc<Mutex<BTreeMap<SegmentGroupKey, u32>>>,
    batch_local_terminators: &Arc<Mutex<BTreeMap<u64, Vec<u64>>>>,
    map_segments: &Arc<RwLock<BTreeMap<SegmentGroupKey, u32>>>,
    group_counter: &Arc<AtomicU32>,
    raw_group_counter: &Arc<AtomicU32>, // FIX 17: Round-robin counter for raw groups (0-15)
    map_segments_terminators: &Arc<RwLock<BTreeMap<u64, Vec<u64>>>>,
    archive: &Arc<Mutex<Archive>>,
    collection: &Arc<Mutex<CollectionV3>>,
    reference_segments: &Arc<RwLock<BTreeMap<u32, Vec<u8>>>>,
    reference_orientations: &Arc<RwLock<BTreeMap<u32, bool>>>,
    #[cfg(feature = "cpp_agc")] grouping_engine: &Arc<Mutex<crate::ragc_ffi::GroupingEngine>>,
    config: &StreamingQueueConfig,
) -> Result<()> {
    use crate::segment::MISSING_KMER;

    // Get pending segments and check if anything needs flushing.
    // The batch-local locks are taken briefly just to read the lengths;
    // `pending` stays locked until after the assignment loop below.
    let mut pending = pending_batch_segments.lock().unwrap();
    let batch_map_len = batch_local_groups.lock().unwrap().len();
    let batch_terms_len = batch_local_terminators.lock().unwrap().len();

    if batch_map_len == 0 && batch_terms_len == 0 && pending.is_empty() {
        #[cfg(feature = "verbose_debug")]
        if config.verbosity > 0 {
            eprintln!("FLUSH_BATCH: No pending groups to flush");
        }
        return Ok(());
    }

    #[cfg(feature = "verbose_debug")]
    if config.verbosity > 0 {
        eprintln!("FLUSH_BATCH: Processing {} batch-local groups, {} pending segments, {} terminator keys",
            batch_map_len, pending.len(), batch_terms_len);
    }

    // CRITICAL: Sort pending segments by (sample, contig, place) before assigning group_ids
    // This matches C++ AGC's BTreeSet iteration order (agc_compressor.cpp process_new())
    pending.sort();

    if config.verbosity > 1 && !pending.is_empty() {
        eprintln!(
            "FLUSH_BATCH: Sorted {} pending segments for group_id assignment",
            pending.len()
        );
    }

    // Process sorted pending segments - assign group_ids and write to archive.
    // `groups_map` is held for the rest of the function (assignment loop +
    // end-of-batch flush) and explicitly dropped before the global merges.
    let mut groups_map = segment_groups.lock().unwrap();

    for pend in pending.iter() {
        // Assign group_id: Orphan segments (both k-mers MISSING) distributed across raw groups 0-15
        // For segments with k-mers: lookup existing group, or create new group if not found
        // FIX 17: Distribute orphan segments across groups 0-15 (round-robin) to match C++ AGC's
        // distribute_segments(0, 0, no_raw_groups) behavior (agc_compressor.cpp line 986)
        //
        // FIX 18: Create unique buffer keys for each raw group (0-15)
        // Previously all orphans used key (MISSING, MISSING), causing them to share one buffer.
        // Now orphans use key (raw_group_id, MISSING) so each raw group has its own buffer.
        let (group_id, buffer_key) = if pend.key.kmer_back == MISSING_KMER
            && pend.key.kmer_front == MISSING_KMER
        {
            // Round-robin distribution across raw groups 0-15
            let raw_group_id = raw_group_counter.fetch_add(1, Ordering::SeqCst) % NO_RAW_GROUPS;
            // Create unique buffer key for this raw group
            let unique_key = SegmentGroupKey {
                kmer_front: raw_group_id as u64, // Use raw_group_id to distinguish buffers
                kmer_back: MISSING_KMER,
            };
            // DEBUG: Trace orphan segment distribution
            if crate::env_cache::trace_group() {
                eprintln!("ORPHAN_SEGMENT: sample={} contig={} place={} raw_group_id={} buffer_key=({}, MISSING)",
                    pend.sample_name, pend.contig_name, pend.place, raw_group_id, raw_group_id);
            }
            (raw_group_id, unique_key)
        } else {
            // Check if this k-mer pair already has a group assigned.
            // Write lock taken up front so lookup-or-create is atomic; the
            // guard is dropped as soon as the id is known.
            let mut global_map = map_segments.write().unwrap();
            let gid = if let Some(&existing_group_id) = global_map.get(&pend.key) {
                // Use existing group
                if crate::env_cache::trace_group() {
                    eprintln!("GROUPING_LOOKUP_HIT: sample={} contig={} place={} front={} back={} found_group={}",
                        pend.sample_name, pend.contig_name, pend.place,
                        pend.key.kmer_front, pend.key.kmer_back, existing_group_id);
                }
                drop(global_map);
                existing_group_id
            } else {
                // Create new group
                let new_group_id = group_counter.fetch_add(1, Ordering::SeqCst);
                if crate::env_cache::trace_group() {
                    eprintln!("GROUPING_LOOKUP_MISS: sample={} contig={} place={} front={} back={} creating_group={} (map has {} entries)",
                        pend.sample_name, pend.contig_name, pend.place,
                        pend.key.kmer_front, pend.key.kmer_back, new_group_id, global_map.len());
                }
                global_map.insert(pend.key.clone(), new_group_id);
                drop(global_map);
                new_group_id
            };
            // For non-orphan segments, use pend.key as the buffer key
            (gid, pend.key.clone())
        };

        if config.verbosity > 2 {
            eprintln!(
                "FLUSH_BATCH_ASSIGN: group_id={} front={} back={} sample={} contig={} place={}",
                group_id,
                pend.key.kmer_front,
                pend.key.kmer_back,
                pend.sample_name,
                pend.contig_name,
                pend.place
            );
        }

        // Register orphan segments to global map (non-orphans already registered above)
        // NOTE(review): every orphan re-inserts the same (MISSING, MISSING) key
        // with its own round-robin id, so the map keeps only the last one —
        // presumably intentional since orphan lookup never uses this entry; confirm.
        if pend.key.kmer_back == MISSING_KMER && pend.key.kmer_front == MISSING_KMER {
            let mut global_map = map_segments.write().unwrap();
            global_map.insert(pend.key.clone(), group_id);
        }

        // TRACE: Log when segments from AAA#0 are registered
        if crate::env_cache::trace_group() && pend.sample_name.contains("AAA#0") {
            let global_map = map_segments.read().unwrap();
            eprintln!("TRACE_REGISTER: sample={} contig={} place={} front={} back={} group_id={} (map_segments now has {} entries)",
                pend.sample_name, pend.contig_name, pend.place,
                pend.key.kmer_front, pend.key.kmer_back, group_id, global_map.len());
        }

        // Register to batch-local map
        {
            let mut batch_map = batch_local_groups.lock().unwrap();
            batch_map.insert(pend.key.clone(), group_id);
        }

        // Register with FFI engine
        #[cfg(feature = "cpp_agc")]
        if pend.key.kmer_front != MISSING_KMER && pend.key.kmer_back != MISSING_KMER {
            let mut eng = grouping_engine.lock().unwrap();
            eng.register_group(pend.key.kmer_front, pend.key.kmer_back, group_id);
        }

        // Update batch-local terminators (will be merged to global below)
        // Only for LZ groups (both k-mers non-MISSING)
        if pend.key.kmer_front != MISSING_KMER && pend.key.kmer_back != MISSING_KMER {
            let mut term_map = batch_local_terminators.lock().unwrap();

            term_map
                .entry(pend.key.kmer_front)
                .or_insert_with(Vec::new)
                .push(pend.key.kmer_back);

            if pend.key.kmer_front != pend.key.kmer_back {
                term_map
                    .entry(pend.key.kmer_back)
                    .or_insert_with(Vec::new)
                    .push(pend.key.kmer_front);
            }
        }

        // Get or create SegmentGroupBuffer for this group
        // FIX 18: Use buffer_key (unique per raw group for orphans) instead of pend.key
        let buffer = groups_map.entry(buffer_key.clone()).or_insert_with(|| {
            // Register streams (archive lock is held only for the two
            // register_stream calls, then dropped)
            let archive_version = ragc_common::AGC_FILE_MAJOR * 1000 + ragc_common::AGC_FILE_MINOR;
            let delta_stream_name = ragc_common::stream_delta_name(archive_version, group_id);
            let ref_stream_name = ragc_common::stream_ref_name(archive_version, group_id);

            let mut arch = archive.lock().unwrap();
            let stream_id = arch.register_stream(&delta_stream_name);
            let ref_stream_id = arch.register_stream(&ref_stream_name);
            drop(arch);

            SegmentGroupBuffer::new(group_id, stream_id, ref_stream_id)
        });

        // Add to buffer or write as reference
        let is_raw_group = group_id < NO_RAW_GROUPS; // Groups 0-15 are raw groups (match C++ AGC)
        if !is_raw_group && buffer.reference_segment.is_none() && buffer.segments.is_empty() {
            // First segment in LZ group - write as reference immediately
            // Create BufferedSegment with original orientation (reference sets the group orientation)
            let buffered = BufferedSegment {
                sample_name: pend.sample_name.clone(),
                contig_name: pend.contig_name.clone(),
                seg_part_no: pend.place,
                data: pend.segment_data.clone(),
                is_rev_comp: pend.should_reverse,
                sample_priority: pend.sample_priority,
            };
            if let Err(e) = write_reference_immediately(
                &buffered,
                buffer,
                collection,
                archive,
                reference_segments,
                reference_orientations,
                config,
            ) {
                // Best-effort recovery: keep the segment buffered so it is not
                // lost; it will be flushed as a delta at end of batch.
                eprintln!("ERROR in flush_batch: Failed to write reference: {}", e);
                buffer.segments.push(buffered);
            }
        } else {
            // Delta segment (joining existing group)
            // FIX 18: Do NOT adjust orientation to match reference - C++ AGC stores each segment
            // with its own computed is_rev_comp based on k-mer comparison (front < back -> false,
            // front >= back -> true). Segments in the same group can have different is_rev_comp.
            let buffered = BufferedSegment {
                sample_name: pend.sample_name.clone(),
                contig_name: pend.contig_name.clone(),
                seg_part_no: pend.place,
                data: pend.segment_data.clone(),
                is_rev_comp: pend.should_reverse,
                sample_priority: pend.sample_priority,
            };
            buffer.segments.push(buffered);
        }

        // FIX 4: Removed mid-batch pack flush to match C++ AGC's batch-level sorting
        // C++ AGC calls sort_known() on ALL segments in batch BEFORE writing ANY
        // Flushing mid-batch would write segments in pack-level sorted order, not batch-level
        // All groups will be flushed at end of batch (after loop) instead
    }

    // Clear pending segments
    pending.clear();
    drop(pending);

    // FIX 4: Flush all group buffers at end of batch (match C++ AGC's sort_known + store_segments)
    // This ensures segments within each group are sorted globally across the entire batch,
    // not just within individual packs. Matches C++ AGC architecture:
    // - C++ AGC: register_segments() calls sort_known() on ALL segments, then store_segments() writes ALL
    // - RAGC: Accumulate all segments for batch, then flush_pack() sorts + writes at end
    for (_key, buffer) in groups_map.iter_mut() {
        if !buffer.segments.is_empty() || !buffer.ref_written {
            flush_pack(buffer, collection, archive, config, reference_segments)
                .context("Failed to flush pack at end of batch")?;
        }
    }

    drop(groups_map);

    // Update global registry with batch-local groups (from existing group processing)
    // `or_insert` keeps any id already present globally, so earlier batches win.
    {
        let batch_map = batch_local_groups.lock().unwrap();
        let mut global_map = map_segments.write().unwrap();
        for (key, group_id) in batch_map.iter() {
            global_map.entry(key.clone()).or_insert(*group_id);
        }
    }

    // CRITICAL: Merge batch-local terminators into global terminators
    // This is where C++ AGC makes terminators visible for find_middle in subsequent samples
    // Each entry is kept sorted and deduped after the merge.
    {
        let batch_terms = batch_local_terminators.lock().unwrap();
        let mut global_terms = map_segments_terminators.write().unwrap();
        for (kmer, connections) in batch_terms.iter() {
            let entry = global_terms.entry(*kmer).or_insert_with(Vec::new);
            entry.extend(connections.iter().cloned());
            entry.sort_unstable();
            entry.dedup();
        }
    }

    // Clear batch-local state (like C++ AGC destroying m_kmers)
    batch_local_groups.lock().unwrap().clear();
    batch_local_terminators.lock().unwrap().clear();

    #[cfg(feature = "verbose_debug")]
    if config.verbosity > 0 {
        eprintln!("FLUSH_BATCH: Batch flush complete, batch-local state cleared");
    }

    Ok(())
}
4970
4971/// Helper function to fix orientation of segment data to match reference orientation.
4972/// Returns (fixed_data, fixed_is_rev_comp) tuple.
4973/// Used for both normal segments and split segments to ensure consistent orientation within groups.
4974fn fix_orientation_for_group(
4975 data: &[u8],
4976 should_reverse: bool,
4977 _key: &SegmentGroupKey,
4978 _map_segments: &Arc<RwLock<BTreeMap<SegmentGroupKey, u32>>>,
4979 _batch_local_groups: &Arc<Mutex<BTreeMap<SegmentGroupKey, u32>>>,
4980 _reference_orientations: &Arc<RwLock<BTreeMap<u32, bool>>>,
4981) -> (Vec<u8>, bool) {
4982 // FIX 18: Do NOT adjust orientation to match reference - C++ AGC stores each segment
4983 // with its own computed is_rev_comp based on k-mer comparison. Segments in the same
4984 // group can have different is_rev_comp values.
4985 (data.to_vec(), should_reverse)
4986}
4987
4988/// Worker thread that pulls from queue and compresses
4989fn worker_thread(
4990 worker_id: usize,
4991 queue: Arc<MemoryBoundedQueue<ContigTask>>,
4992 collection: Arc<Mutex<CollectionV3>>,
4993 splitters: Arc<AHashSet<u64>>,
4994 ref_singletons: Arc<Vec<u64>>, // For dynamic splitter discovery (sorted)
4995 ref_duplicates: Arc<AHashSet<u64>>, // For dynamic splitter discovery
4996 archive: Arc<Mutex<Archive>>,
4997 segment_groups: Arc<Mutex<BTreeMap<SegmentGroupKey, SegmentGroupBuffer>>>,
4998 group_counter: Arc<AtomicU32>,
4999 raw_group_counter: Arc<AtomicU32>,
5000 reference_sample_name: Arc<Mutex<Option<String>>>,
5001 map_segments: Arc<RwLock<BTreeMap<SegmentGroupKey, u32>>>,
5002 map_segments_terminators: Arc<RwLock<BTreeMap<u64, Vec<u64>>>>,
5003 reference_segments: Arc<RwLock<BTreeMap<u32, Vec<u8>>>>,
5004 reference_orientations: Arc<RwLock<BTreeMap<u32, bool>>>,
5005 split_offsets: Arc<Mutex<BTreeMap<(String, String, usize), usize>>>,
5006 #[cfg(feature = "cpp_agc")] grouping_engine: Arc<Mutex<crate::ragc_ffi::GroupingEngine>>,
5007 batch_samples: Arc<Mutex<HashSet<String>>>,
5008 batch_local_groups: Arc<Mutex<BTreeMap<SegmentGroupKey, u32>>>,
5009 batch_local_terminators: Arc<Mutex<BTreeMap<u64, Vec<u64>>>>,
5010 pending_batch_segments: Arc<Mutex<Vec<PendingSegment>>>,
5011 buffered_seg_part: Arc<BufferedSegPart>, // Per-group buffers for parallel Phase 1
5012 map_fallback_minimizers: Arc<Mutex<BTreeMap<u64, Vec<(u64, u64)>>>>,
5013 raw_segment_buffers: Arc<Vec<Mutex<Vec<RawBufferedSegment>>>>, // Per-worker buffers for deferred classification
5014 barrier: Arc<std::sync::Barrier>, // Synchronization barrier for batch boundaries
5015 parallel_state: Arc<ParallelFlushState>, // Shared state for parallel Phase 3
5016 write_buffer: Arc<ParallelWriteBuffer>, // Per-stream buffers for parallel writes
5017 config: StreamingQueueConfig,
5018) -> Result<()> {
5019 let mut processed_count = 0;
5020
5021 // Create fallback filter from config
5022 let fallback_filter = FallbackFilter::new(config.fallback_frac);
5023
5024 // Timing accumulators for performance analysis
5025 let mut total_queue_wait = std::time::Duration::ZERO;
5026 let mut total_segment_processing = std::time::Duration::ZERO;
5027 let mut total_barrier_wait = std::time::Duration::ZERO;
5028 let mut total_sync_processing = std::time::Duration::ZERO;
5029 let mut contig_count = 0usize;
5030 let mut sync_count = 0usize;
5031
5032 loop {
5033 // Pull from queue (blocks if empty, returns None when closed)
5034 let queue_start = std::time::Instant::now();
5035 let Some(task) = queue.pull() else {
5036 // Print timing summary on exit
5037 if config.verbosity > 0 {
5038 eprintln!("Worker {} TIMING: queue_wait={:?} segment_proc={:?} barrier_wait={:?} sync_proc={:?} contigs={} syncs={}",
5039 worker_id, total_queue_wait, total_segment_processing, total_barrier_wait, total_sync_processing, contig_count, sync_count);
5040 }
5041 // Queue is closed and empty - flush any pending batch before exiting
5042 if config.verbosity > 0 {
5043 eprintln!("Worker {} flushing final batch before exit", worker_id);
5044 }
5045 flush_batch(
5046 &segment_groups,
5047 &pending_batch_segments,
5048 &batch_local_groups,
5049 &batch_local_terminators,
5050 &map_segments,
5051 &group_counter,
5052 &raw_group_counter, // FIX 17: Pass raw_group_counter for round-robin distribution
5053 &map_segments_terminators,
5054 &archive,
5055 &collection,
5056 &reference_segments,
5057 &reference_orientations,
5058 #[cfg(feature = "cpp_agc")]
5059 &grouping_engine,
5060 &config,
5061 )
5062 .ok(); // Ignore errors on final flush
5063
5064 if config.verbosity > 1 {
5065 eprintln!(
5066 "Worker {} finished ({} contigs processed)",
5067 worker_id, processed_count
5068 );
5069 }
5070 break;
5071 };
5072
5073 let queue_wait = queue_start.elapsed();
5074 total_queue_wait += queue_wait;
5075
5076 // Handle sync tokens with barrier synchronization (matches C++ AGC registration stage)
5077 if task.is_sync_token {
5078 let sync_start = std::time::Instant::now();
5079 sync_count += 1;
5080 if config.verbosity > 0 {
5081 eprintln!(
5082 "Worker {} hit sync token for sample {}",
5083 worker_id, task.sample_name
5084 );
5085 }
5086
5087 // =================================================================
5088 // C++ AGC 4-Phase Parallel Pattern
5089 // =================================================================
5090
5091 // Barrier 1: All workers arrive at sample boundary
5092 let barrier_start = std::time::Instant::now();
5093 barrier.wait();
5094 total_barrier_wait += barrier_start.elapsed();
5095
5096 // Phase 2 (Thread 0 only): Classify raw segments and prepare batch
5097 if worker_id == 0 {
5098 if config.verbosity > 0 {
5099 eprintln!(
5100 "Worker 0 preparing batch at sample boundary for {}",
5101 task.sample_name
5102 );
5103 }
5104
5105 let phase2_start = std::time::Instant::now();
5106
5107 // Step 1: Classify all raw segments (deferred from parallel segment loop)
5108 // This eliminates lock contention by doing classification single-threaded
5109 classify_raw_segments_at_barrier(
5110 &raw_segment_buffers,
5111 &buffered_seg_part,
5112 &map_segments,
5113 &map_segments_terminators,
5114 &segment_groups,
5115 &reference_segments,
5116 &fallback_filter,
5117 &map_fallback_minimizers,
5118 &group_counter,
5119 &raw_group_counter, // FIX 18: Pass raw_group_counter for orphan distribution
5120 &config,
5121 );
5122
5123 let classify_time = phase2_start.elapsed();
5124 if config.verbosity > 0 {
5125 eprintln!("TIMING: Classification took {:?}", classify_time);
5126 }
5127
5128 // Step 2: Prepare batch for parallel compression
5129 let prepare_start = std::time::Instant::now();
5130 prepare_batch_parallel(
5131 &segment_groups,
5132 &buffered_seg_part,
5133 &batch_local_groups,
5134 &batch_local_terminators,
5135 &map_segments,
5136 &map_segments_terminators,
5137 &group_counter,
5138 &raw_group_counter,
5139 &archive,
5140 &collection,
5141 &reference_segments,
5142 &reference_orientations,
5143 #[cfg(feature = "cpp_agc")]
5144 &grouping_engine,
5145 ¶llel_state,
5146 &config,
5147 )?;
5148 let prepare_time = prepare_start.elapsed();
5149 if config.verbosity > 0 {
5150 eprintln!("TIMING: Prepare took {:?}", prepare_time);
5151 }
5152 }
5153
5154 // Barrier 2: All workers see prepared buffers
5155 let barrier_start = std::time::Instant::now();
5156 barrier.wait();
5157 total_barrier_wait += barrier_start.elapsed();
5158
5159 let compress_start = std::time::Instant::now();
5160 // Phase 3a (ALL workers): Atomic work-stealing to COMPRESS and BUFFER writes
5161 // Workers compress segments and buffer archive writes (C++ AGC: AddPartBuffered)
5162 // Buffering is fast (memory only), flush happens after barrier
5163 loop {
5164 let Some(idx) = parallel_state.claim_next_idx() else {
5165 break;
5166 };
5167
5168 if let Some((key, mut buffer)) = parallel_state.get_buffer_at(idx) {
5169 // Compress this buffer
5170 if !buffer.segments.is_empty() || !buffer.ref_written {
5171 match flush_pack_compress_only(&mut buffer, &config) {
5172 Ok(mut result) => {
5173 // Buffer archive writes using per-stream mutexes (NO global lock!)
5174 // Workers on different streams can buffer concurrently
5175 for part in result.archive_writes.drain(..) {
5176 write_buffer.buffer_write(
5177 part.stream_id,
5178 part.data,
5179 part.metadata,
5180 );
5181 }
5182 // Store result (now without archive_writes)
5183 parallel_state.store_result(idx, result);
5184 }
5185 Err(e) => {
5186 eprintln!(
5187 "Worker {} error compressing group {}: {}",
5188 worker_id, buffer.group_id, e
5189 );
5190 }
5191 }
5192 }
5193 // Return buffer
5194 parallel_state.return_buffer(idx, key, buffer);
5195 }
5196 }
5197
5198 // Barrier 3: All workers done with compression and buffering
5199 let barrier_start = std::time::Instant::now();
5200 barrier.wait();
5201 total_barrier_wait += barrier_start.elapsed();
5202
5203 if worker_id == 0 && config.verbosity > 0 {
5204 eprintln!(
5205 "TIMING: Compression took {:?} (all workers)",
5206 compress_start.elapsed()
5207 );
5208 }
5209
5210 // Phase 3b + Phase 4 (Thread 0 only): Flush writes, registrations, and cleanup
5211 // Combined to reduce barrier overhead (was 2 separate barriers)
// Phase 3b + Phase 4 (Thread 0 only): flush buffered writes, process
// registrations, and run cleanup. Combined into one thread-0 section to
// reduce barrier overhead (was 2 separate barriers).
if worker_id == 0 {
    // Phase 3b: drain results in sorted group_id order so archive layout
    // and collection registration are deterministic across runs.
    let sorted_results = parallel_state.drain_results_sorted();

    // Take all three locks once up front rather than per result.
    let mut arch = archive.lock().unwrap();
    let mut ref_segs = reference_segments.write().unwrap();
    let mut coll = collection.lock().unwrap();

    // Flush all buffered writes from the per-stream buffer to the archive.
    // BTreeMap ensures sorted stream_id order for determinism.
    if let Err(e) = write_buffer.flush_to_archive(&mut *arch) {
        eprintln!("Thread 0 error flushing archive buffers: {}", e);
    }
    // Clear write buffer for next batch.
    write_buffer.clear();

    // Process ref_to_store and registrations in sorted group_id order.
    for result in sorted_results {
        // Store this group's reference segment in the global map, if any.
        if let Some((group_id, ref_data)) = result.ref_to_store {
            ref_segs.insert(group_id, ref_data);
        }

        // Register each compressed segment's placement in the collection.
        for reg in result.registrations {
            if let Err(e) = coll.add_segment_placed(
                &reg.sample_name,
                &reg.contig_name,
                reg.seg_part_no,
                reg.group_id,
                reg.in_group_id,
                reg.is_rev_comp,
                reg.raw_length,
            ) {
                eprintln!("Thread 0 error registering segment: {}", e);
            }
        }
    }
    // Release all three locks before cleanup (explicit drops for clarity;
    // guards would also be released at end of scope).
    drop(arch);
    drop(ref_segs);
    drop(coll);

    // Phase 4: cleanup of batch-scoped grouping state.
    cleanup_batch_parallel(
        &segment_groups,
        &batch_local_groups,
        &batch_local_terminators,
        &map_segments,
        &map_segments_terminators,
        &parallel_state,
        &config,
    );

    // Clear batch-local sample tracking after flush (start fresh for new sample).
    let mut samples = batch_samples.lock().unwrap();
    samples.clear();
}
5271
5272 // Barrier 4: All workers ready for next batch (reduced from 2 barriers)
5273 let barrier_start = std::time::Instant::now();
5274 barrier.wait();
5275 total_barrier_wait += barrier_start.elapsed();
5276
5277 // Track total sync token processing time
5278 total_sync_processing += sync_start.elapsed();
5279
5280 // Sync token processed - continue to next task
5281 continue;
5282 }
5283
5284 // Start timing for segment processing
5285 let segment_start = std::time::Instant::now();
5286 contig_count += 1;
5287
5288 // NOTE: Removed per-contig lock on batch_samples - not needed for deferred classification
5289 // The batch tracking is handled at the barrier level, not per-contig
5290
5291 // Split into segments
5292 // Dynamic splitter discovery for non-reference contigs (matches C++ AGC find_new_splitters)
5293 // OPTIMIZATION: Cache reference sample name check at thread level to avoid lock per contig
// Split the contig into segments, optionally discovering new splitters first.
// Dynamic splitter discovery applies to non-reference contigs only (matches
// C++ AGC find_new_splitters). The reference-sample check is derived from the
// priority boost rather than a name lookup to avoid taking a lock per contig.
let is_reference_sample = task.sample_priority >= 1_000_000; // Reference sample has boosted priority

let segments = if !is_reference_sample && !ref_singletons.is_empty() {
    // Non-reference contig with dynamic discovery enabled:
    // find NEW splitter k-mers unique to this contig (not in reference).
    // Position-based selection ensures only optimally-positioned k-mers
    // become splitters.
    let new_splitters = find_new_splitters_for_contig(
        &task.data,
        config.k,
        config.segment_size,
        &ref_singletons,
        &ref_duplicates,
    );

    // Combine base (reference-derived) splitters with the newly found ones.
    let mut combined_splitters = (*splitters).clone();
    combined_splitters.extend(new_splitters.iter());

    if config.verbosity > 2 && !new_splitters.is_empty() {
        eprintln!(
            "DYNAMIC_SPLITTER: {} found {} new splitters for {} (total: {})",
            task.sample_name,
            new_splitters.len(),
            task.contig_name,
            combined_splitters.len()
        );
    }

    split_at_splitters_with_size(
        &task.data,
        &combined_splitters,
        config.k,
        config.segment_size,
    )
} else {
    // Reference contig, or dynamic discovery disabled - use base splitters only.
    split_at_splitters_with_size(&task.data, &splitters, config.k, config.segment_size)
};
5332
5333 if config.verbosity > 2 {
5334 eprintln!(
5335 "Worker {} processing {} (split into {} segments)",
5336 worker_id,
5337 task.contig_name,
5338 segments.len()
5339 );
5340 }
5341
5342 // NOTE: split_offsets and local_splits are no longer needed in the parallel loop
5343 // since classification (including splits) is deferred to the barrier.
5344 // This eliminates a lock acquisition per contig that was causing contention.
5345
5346 // =================================================================
5347 // DEFERRED CLASSIFICATION: Buffer raw segments for parallel Phase 1
5348 // Classification is deferred to Thread 0 at the barrier to eliminate
5349 // lock contention from find_group_with_one_kmer and split logic.
5350 // =================================================================
5351 // OPTIMIZATION: Collect all segments for this contig locally, then push once
5352 // This reduces lock acquisitions from O(segments_per_contig) to O(1) per contig
// DEFERRED CLASSIFICATION: buffer raw segments for the parallel Phase 1.
// Classification is deferred to thread 0 at the barrier, eliminating lock
// contention from find_group_with_one_kmer and the split logic.
// OPTIMIZATION: collect all segments for this contig locally, then push once;
// this reduces lock acquisitions from O(segments_per_contig) to O(1) per contig.
let contig_segments: Vec<RawBufferedSegment> = segments
    .iter()
    .enumerate()
    .map(|(original_place, segment)| {
        // Precompute the reverse complement here (NO LOCKS - runs in parallel).
        // Segment data uses numeric encoding: 0=A, 1=C, 2=G, 3=T, so RC is
        // reverse order + complement of each base.
        let segment_data_rc: Vec<u8> = segment
            .data
            .iter()
            .rev()
            .map(|&base| {
                match base {
                    0 => 3, // A -> T
                    1 => 2, // C -> G
                    2 => 1, // G -> C
                    3 => 0, // T -> A
                    _ => base, // N or other non-ACGT: left unchanged
                }
            })
            .collect();

        // Carry everything classification will need (k-mers, orientation
        // flags, provenance, priority) so no re-parsing is required later.
        RawBufferedSegment {
            data: segment.data.clone(),
            data_rc: segment_data_rc,
            front_kmer: segment.front_kmer,
            back_kmer: segment.back_kmer,
            front_kmer_is_dir: segment.front_kmer_is_dir,
            back_kmer_is_dir: segment.back_kmer_is_dir,
            sample_name: task.sample_name.clone(),
            contig_name: task.contig_name.clone(),
            original_place,
            sample_priority: task.sample_priority,
        }
    })
    .collect();

// ONE lock acquisition for the entire contig (reduces contention significantly).
// Each worker pushes to its OWN buffer, so there is no cross-worker contention.
raw_segment_buffers[worker_id]
    .lock()
    .unwrap()
    .extend(contig_segments);
5395
5396 // End timing for segment processing
5397 total_segment_processing += segment_start.elapsed();
5398
5399 // OLD CODE BELOW - REPLACED BY DEFERRED CLASSIFICATION
5400 // This block is preserved but commented out for reference during the transition.
5401 // The classification logic has been moved to classify_raw_segments_at_barrier().
5402 #[cfg(feature = "old_immediate_classification")]
5403 for (original_place, segment) in std::iter::empty::<(usize, &crate::segment::Segment)>() {
5404 // Calculate adjusted place based on prior splits in this contig
5405 // (matches C++ AGC lines 2033-2036: increment seg_part_no twice when split occurs)
5406 // OPTIMIZATION: Count splits before current position from both prior and local sets
5407 let prior_count = prior_contig_splits.range(..original_place).count();
5408 let local_count = local_splits.range(..original_place).count();
5409 let place = original_place + prior_count + local_count;
5410
5411 // DEBUG: Output every segment for comparison with C++ AGC
5412 #[cfg(feature = "verbose_debug")]
5413 eprintln!(
5414 "RAGC_SEGMENT: sample={} contig={} part={} len={} front={} back={}",
5415 task.sample_name,
5416 task.contig_name,
5417 place,
5418 segment.data.len(),
5419 segment.front_kmer,
5420 segment.back_kmer
5421 );
5422
5423 // Match C++ AGC Case 2: Normalize segment group key by ensuring front <= back
5424 // (agc_compressor.cpp lines 1306-1327)
5425 use crate::segment::MISSING_KMER;
5426
5427 // Precompute reverse complement for all cases that might need it
5428 // Segment data uses numeric encoding: 0=A, 1=C, 2=G, 3=T
5429 let segment_data_rc: Vec<u8> = segment
5430 .data
5431 .iter()
5432 .rev()
5433 .map(|&base| {
5434 match base {
5435 0 => 3, // A -> T
5436 1 => 2, // C -> G
5437 2 => 1, // G -> C
5438 3 => 0, // T -> A
5439 _ => base, // N or other non-ACGT
5440 }
5441 })
5442 .collect();
5443
5444 let (key_front, key_back, should_reverse) = if segment.front_kmer != MISSING_KMER
5445 && segment.back_kmer != MISSING_KMER
5446 {
5447 // Both k-mers present
5448 // C++ AGC uses `<` not `<=`, which means degenerate k-mers (front == back)
5449 // go to the else branch and get store_rc=true (lines 1306-1313)
5450 if segment.front_kmer < segment.back_kmer {
5451 // Already normalized - keep original orientation
5452 if config.verbosity > 2 {
5453 #[cfg(feature = "verbose_debug")]
5454 eprintln!(
5455 "RAGC_CASE2_KEEP: sample={} front={} back={} len={}",
5456 task.sample_name,
5457 segment.front_kmer,
5458 segment.back_kmer,
5459 segment.data.len()
5460 );
5461 }
5462 (segment.front_kmer, segment.back_kmer, false)
5463 } else {
5464 // Swap k-mers and reverse complement data
5465 if config.verbosity > 2 {
5466 #[cfg(feature = "verbose_debug")]
5467 eprintln!(
5468 "RAGC_CASE2_SWAP: sample={} front={} back={} -> key=({},{}) len={}",
5469 task.sample_name,
5470 segment.front_kmer,
5471 segment.back_kmer,
5472 segment.back_kmer,
5473 segment.front_kmer,
5474 segment.data.len()
5475 );
5476 }
5477 (segment.back_kmer, segment.front_kmer, true)
5478 }
5479 } else if segment.front_kmer != MISSING_KMER {
5480 // Case 3a: Only front k-mer present, back is MISSING (terminator)
5481 // Match C++ AGC lines 1315-1336: reverse complement and find candidate with one splitter
5482 // Use the actual is_dir_oriented value from segment detection
5483 #[cfg(feature = "verbose_debug")]
5484 eprintln!("RAGC_CASE3A_TERMINATOR: sample={} front={} front_is_dir={} back=MISSING -> finding best group",
5485 task.sample_name, segment.front_kmer, segment.front_kmer_is_dir);
5486 // Debug: trace is_dir value before find_group call
5487 if crate::env_cache::debug_is_dir() {
5488 eprintln!("RAGC_CASE3A_CALL: contig={} seg_part={} front_kmer={} front_kmer_is_dir={}",
5489 task.contig_name, place, segment.front_kmer, segment.front_kmer_is_dir);
5490 }
5491 let (mut kf, mut kb, mut sr) = find_group_with_one_kmer(
5492 segment.front_kmer,
5493 segment.front_kmer_is_dir, // Use actual orientation from segment detection
5494 &segment.data,
5495 &segment_data_rc,
5496 &map_segments_terminators,
5497 &map_segments,
5498 &segment_groups,
5499 &reference_segments,
5500 &config,
5501 );
5502
5503 // Fallback: If Case 3a returned MISSING, try fallback minimizers (C++ AGC lines 1322-1334)
5504 if (kf == MISSING_KMER || kb == MISSING_KMER) && fallback_filter.is_enabled() {
5505 let (fb_kf, fb_kb, fb_sr) = find_cand_segment_using_fallback_minimizers(
5506 &segment.data,
5507 &segment_data_rc,
5508 config.k,
5509 5, // min_shared_kmers = 5 for Case 3 (matches C++ AGC)
5510 &fallback_filter,
5511 &map_fallback_minimizers,
5512 &map_segments,
5513 &segment_groups,
5514 &reference_segments,
5515 &config,
5516 );
5517 if fb_kf != MISSING_KMER && fb_kb != MISSING_KMER {
5518 if config.verbosity > 1 {
5519 #[cfg(feature = "verbose_debug")]
5520 eprintln!(
5521 "RAGC_CASE3A_FALLBACK: found ({},{}) rc={}",
5522 fb_kf, fb_kb, fb_sr
5523 );
5524 }
5525 kf = fb_kf;
5526 kb = fb_kb;
5527 sr = fb_sr;
5528 }
5529 }
5530 (kf, kb, sr)
5531 } else if segment.back_kmer != MISSING_KMER {
5532 // Case 3b: Only back k-mer present, front is MISSING (terminator)
5533 // Match C++ AGC lines 1337-1360: swap_dir_rc() inverts is_dir_oriented()
5534 //
5535 // C++ AGC calls kmer.swap_dir_rc() which swaps kmer_dir and kmer_rc fields,
5536 // effectively inverting is_dir_oriented() (which checks kmer_dir <= kmer_rc).
5537 // So if back_kmer was originally dir-oriented, after swap it becomes NOT dir-oriented.
5538 let kmer_is_dir_after_swap = !segment.back_kmer_is_dir;
5539 #[cfg(feature = "verbose_debug")]
5540 eprintln!("RAGC_CASE3B_TERMINATOR: sample={} front=MISSING back={} back_is_dir={} -> kmer_is_dir_after_swap={}",
5541 task.sample_name, segment.back_kmer, segment.back_kmer_is_dir, kmer_is_dir_after_swap);
5542
5543 // C++ AGC line 1344 passes (segment_rc, segment) to find_cand_segment_with_one_splitter
5544 // and then inverts the result: store_rc = !store_dir
5545 // So we swap the segment parameters here AND invert sr below
5546 let (mut kf, mut kb, mut sr) = find_group_with_one_kmer(
5547 segment.back_kmer, // Use original k-mer value
5548 kmer_is_dir_after_swap, // Inverted due to swap_dir_rc()
5549 &segment_data_rc, // SWAPPED: RC first (matches C++ AGC segment_rc param)
5550 &segment.data, // SWAPPED: Original second (matches C++ AGC segment param)
5551 &map_segments_terminators,
5552 &map_segments,
5553 &segment_groups,
5554 &reference_segments,
5555 &config,
5556 );
5557 // Invert sr to match C++ AGC's store_rc = !store_dir
5558 sr = !sr;
5559
5560 // Fallback: If Case 3b returned MISSING, try fallback minimizers (C++ AGC lines 1347-1359)
5561 // Note: C++ AGC uses segment_rc for fallback in Case 3b
5562 if (kf == MISSING_KMER || kb == MISSING_KMER) && fallback_filter.is_enabled() {
5563 let (fb_kf, fb_kb, fb_sr) = find_cand_segment_using_fallback_minimizers(
5564 &segment_data_rc, // Use RC for Case 3b (matches C++ AGC)
5565 &segment.data,
5566 config.k,
5567 5, // min_shared_kmers = 5 for Case 3 (matches C++ AGC)
5568 &fallback_filter,
5569 &map_fallback_minimizers,
5570 &map_segments,
5571 &segment_groups,
5572 &reference_segments,
5573 &config,
5574 );
5575 if fb_kf != MISSING_KMER && fb_kb != MISSING_KMER {
5576 if config.verbosity > 1 {
5577 #[cfg(feature = "verbose_debug")]
5578 eprintln!(
5579 "RAGC_CASE3B_FALLBACK: found ({},{}) rc={}",
5580 fb_kf, fb_kb, !fb_sr
5581 );
5582 }
5583 kf = fb_kf;
5584 kb = fb_kb;
5585 sr = !fb_sr; // C++ AGC: store_rc = !store_dir_alt
5586 }
5587 }
5588 (kf, kb, sr)
5589 } else {
5590 // Case 1: Both MISSING - try fallback minimizers (C++ AGC lines 1286-1298)
5591 let mut kf = MISSING_KMER;
5592 let mut kb = MISSING_KMER;
5593 let mut sr = false;
5594
5595 if fallback_filter.is_enabled() {
5596 let (fb_kf, fb_kb, fb_sr) = find_cand_segment_using_fallback_minimizers(
5597 &segment.data,
5598 &segment_data_rc,
5599 config.k,
5600 1, // min_shared_kmers = 1 for Case 1 (matches C++ AGC line 1293)
5601 &fallback_filter,
5602 &map_fallback_minimizers,
5603 &map_segments,
5604 &segment_groups,
5605 &reference_segments,
5606 &config,
5607 );
5608 if fb_kf != MISSING_KMER && fb_kb != MISSING_KMER {
5609 if config.verbosity > 1 {
5610 #[cfg(feature = "verbose_debug")]
5611 eprintln!(
5612 "RAGC_CASE1_FALLBACK: sample={} found ({},{}) rc={} len={}",
5613 task.sample_name,
5614 fb_kf,
5615 fb_kb,
5616 fb_sr,
5617 segment.data.len()
5618 );
5619 }
5620 kf = fb_kf;
5621 kb = fb_kb;
5622 sr = fb_sr;
5623 }
5624 }
5625
5626 (kf, kb, sr)
5627 };
5628
5629 // Create grouping key from normalized k-mers
5630 // For raw segments (both k-mers MISSING), use the same key for all
5631 // This matches C++ AGC: map_segments[make_pair(~0ull, ~0ull)] = 0
5632 // All raw segments share the same grouping key and will be assigned to the same group
5633 let key = SegmentGroupKey {
5634 kmer_front: key_front,
5635 kmer_back: key_back,
5636 };
5637
5638 // Reverse complement data if needed (matching C++ AGC lines 1315-1316, 1320-1321)
5639 let segment_data = if should_reverse {
5640 segment
5641 .data
5642 .iter()
5643 .rev()
5644 .map(|&base| {
5645 match base {
5646 0 => 3, // A -> T
5647 1 => 2, // C -> G
5648 2 => 1, // G -> C
5649 3 => 0, // T -> A
5650 _ => base, // N or other non-ACGT
5651 }
5652 })
5653 .collect()
5654 } else {
5655 segment.data.clone()
5656 };
5657
5658 // NOTE: Split check must happen BEFORE creating BufferedSegment
5659 // to avoid moving segment_data prematurely
5660 // PERF: Lock deferred - normal path doesn't need segment_groups lock
5661 {
5662 // Phase 1: Check if group already exists
5663 // (matches C++ AGC: seg_map_mtx.lock() then find at line 1020)
5664 let key_exists = {
5665 let seg_map = map_segments.read().unwrap();
5666 seg_map.contains_key(&key)
5667 };
5668
5669 // Phase 2: Try to split
5670 // C++ AGC only attempts splits when key doesn't exist (agc_compressor.cpp:1367)
5671 // This is the condition: p == map_segments.end() && both k-mers valid && both in terminators
5672 // Set RAGC_SPLIT_ALL=1 to try splitting even when key exists (experimental)
5673 // CRITICAL: C++ AGC lines 1374-1378 skip segment splitting when front == back!
5674 // When front == back, it just sets store_rc based on orientation, does NOT call
5675 // find_cand_segment_with_missing_middle_splitter. We must do the same.
5676 let split_allowed = if crate::env_cache::split_all() {
5677 true
5678 } else {
5679 !key_exists
5680 };
5681
5682 // Debug: trace split decision
5683 if crate::env_cache::debug_split()
5684 && task.contig_name.contains("chrVII")
5685 && place >= 2
5686 && place <= 5
5687 {
5688 eprintln!("RAGC_SPLIT_CHECK: contig={} seg={} key=({},{}) key_exists={} split_allowed={} front_missing={} back_missing={} front==back={}",
5689 task.contig_name, place, key_front, key_back, key_exists, split_allowed,
5690 key_front == MISSING_KMER, key_back == MISSING_KMER, key_front == key_back);
5691 }
5692
5693 if split_allowed
5694 && key_front != MISSING_KMER
5695 && key_back != MISSING_KMER
5696 && key_front != key_back
5697 {
5698 // CRITICAL: First attempt to find middle splitter
5699 // Use ONLY global terminators (not batch-local) to match C++ AGC behavior
5700 // C++ AGC only sees terminators from previous batches, not the current one
5701 let middle_kmer_opt = {
5702 let terminators = map_segments_terminators.read().unwrap();
5703 let result = find_middle_splitter(key_front, key_back, &terminators);
5704 // Debug: trace middle splitter result
5705 if crate::env_cache::debug_split()
5706 && task.contig_name.contains("chrVII")
5707 && place >= 2
5708 && place <= 5
5709 {
5710 let front_conn =
5711 terminators.get(&key_front).map(|v| v.len()).unwrap_or(0);
5712 let back_conn =
5713 terminators.get(&key_back).map(|v| v.len()).unwrap_or(0);
5714 eprintln!("RAGC_SPLIT_MIDDLE: contig={} seg={} key=({},{}) middle={:?} front_conn={} back_conn={}",
5715 task.contig_name, place, key_front, key_back, result, front_conn, back_conn);
5716 }
5717 result
5718 };
5719
5720 #[cfg(feature = "verbose_debug")]
5721 if config.verbosity > 0 {
5722 if middle_kmer_opt.is_some() {
5723 eprintln!(
5724 "DEBUG_SPLIT: Found middle k-mer for ({},{}) sample={}",
5725 key_front, key_back, task.sample_name
5726 );
5727 } else if config.verbosity > 1 {
5728 eprintln!(
5729 "SPLIT_NO_MIDDLE: ({},{}) sample={} place={} should_reverse={}",
5730 key_front, key_back, task.sample_name, place, should_reverse
5731 );
5732 }
5733 }
5734
5735 if let Some(middle_kmer) = middle_kmer_opt {
5736 // Found potential middle k-mer
5737 // Now check if BOTH split groups already exist in map_segments
5738 // (This is the key difference from just checking terminators!)
5739
5740 // Debug: trace middle found
5741 if crate::env_cache::debug_split() {
5742 eprintln!(
5743 "RAGC_SPLIT_FOUND_MIDDLE: contig={} seg={} middle={}",
5744 task.contig_name, place, middle_kmer
5745 );
5746 }
5747
5748 let left_key = if key_front <= middle_kmer {
5749 SegmentGroupKey {
5750 kmer_front: key_front,
5751 kmer_back: middle_kmer,
5752 }
5753 } else {
5754 SegmentGroupKey {
5755 kmer_front: middle_kmer,
5756 kmer_back: key_front,
5757 }
5758 };
5759
5760 let right_key = if middle_kmer <= key_back {
5761 SegmentGroupKey {
5762 kmer_front: middle_kmer,
5763 kmer_back: key_back,
5764 }
5765 } else {
5766 SegmentGroupKey {
5767 kmer_front: key_back,
5768 kmer_back: middle_kmer,
5769 }
5770 };
5771
5772 // CRITICAL: C++ AGC requires BOTH target groups to exist in map_segments
5773 // at split decision time (agc_compressor.cpp lines 1472, 1486 use .at() which throws)
5774 // If either group doesn't exist, C++ AGC aborts the split.
5775 // We must check map_segments (global), not batch_local_groups, to match C++ behavior.
5776 let (left_exists, right_exists) = {
5777 let global_map = map_segments.read().unwrap();
5778 (
5779 global_map.contains_key(&left_key),
5780 global_map.contains_key(&right_key),
5781 )
5782 };
5783
5784 // EXPERIMENTAL: Allow split even when groups don't exist
5785 // Set RAGC_SPLIT_CREATE_GROUPS=1 to enable creating new groups during split
5786 // This is needed for streaming mode where non-reference samples may create
5787 // new segment groups that the reference sample didn't have.
5788 let allow_create_groups = crate::env_cache::split_create_groups();
5789
5790 if !left_exists || !right_exists {
5791 // Skip split - one or both target groups don't exist yet
5792 // This matches C++ AGC behavior where .at() would throw
5793 // UNLESS we're in experimental mode where we allow creating groups
5794 if config.verbosity > 1 {
5795 eprintln!(
5796 "SPLIT_SKIP_NO_GROUP: left_key=({},{}) exists={} right_key=({},{}) exists={} allow_create={}",
5797 left_key.kmer_front, left_key.kmer_back, left_exists,
5798 right_key.kmer_front, right_key.kmer_back, right_exists, allow_create_groups
5799 );
5800 }
5801 if crate::env_cache::debug_split() {
5802 eprintln!(
5803 "RAGC_SPLIT_SKIP_NO_GROUP: left=({},{}) exists={} right=({},{}) exists={} allow_create={}",
5804 left_key.kmer_front, left_key.kmer_back, left_exists,
5805 right_key.kmer_front, right_key.kmer_back, right_exists, allow_create_groups
5806 );
5807 }
5808 if !allow_create_groups {
5809 // Don't attempt split - fall through to normal segment processing
5810 }
5811 }
5812
5813 // Proceed with split if groups exist OR if we allow creating groups
5814 if (left_exists && right_exists) || allow_create_groups {
5815 // Both groups exist - proceed with split cost calculation
5816 #[cfg(feature = "verbose_debug")]
5817 if config.verbosity > 0 {
5818 eprintln!("DEBUG_SPLIT: Attempting cost-based split for ({},{}) sample={}",
5819 key_front, key_back, task.sample_name);
5820 }
5821
5822 let split_result = try_split_segment_with_cost(
5823 &segment_data,
5824 key_front,
5825 key_back,
5826 middle_kmer,
5827 &left_key,
5828 &right_key,
5829 &map_segments,
5830 &map_segments_terminators,
5831 &reference_segments,
5832 &config,
5833 should_reverse,
5834 allow_create_groups, // Force split at middle k-mer position if refs are empty
5835 );
5836
5837 if let Some((left_data, right_data, _mid)) = split_result {
5838 // PERF: Acquire lock only for split path (rare case)
5839 // Normal segments bypass this entirely for better parallelism
5840 let mut groups = segment_groups.lock().unwrap();
5841
5842 // FIX 27 v4: Compute separate orientations for left and right parts
5843 // C++ AGC lines 1526-1536 and 1540-1550:
5844 // store_rc = (kmer_front.data() >= split_match.first) -- for left
5845 // store2_rc = (split_match.first >= kmer_back.data()) -- for right
5846 //
5847 // When should_reverse=true, the segment was RC'd before splitting,
5848 // so "left" in the split is from original RIGHT, and "right" is from original LEFT.
5849 // We need to swap the k-mer comparisons accordingly.
5850 let (left_should_rc, right_should_rc) = if should_reverse {
5851 // Segment was RC'd: left is from original right, right is from original left
5852 // Swap the k-mer associations
5853 let left_should_rc = middle_kmer >= segment.back_kmer; // use back_kmer for "left"
5854 let right_should_rc = segment.front_kmer >= middle_kmer; // use front_kmer for "right"
5855 (left_should_rc, right_should_rc)
5856 } else {
5857 // Normal: left is from original left, right is from original right
5858 let left_should_rc = segment.front_kmer >= middle_kmer;
5859 let right_should_rc = middle_kmer >= segment.back_kmer;
5860 (left_should_rc, right_should_rc)
5861 };
5862
5863 // Transform data if needed: current state is `should_reverse`
5864 // If target state differs, we RC the data
5865 let left_data = if left_should_rc != should_reverse {
5866 left_data
5867 .iter()
5868 .rev()
5869 .map(|&base| match base {
5870 0 => 3,
5871 1 => 2,
5872 2 => 1,
5873 3 => 0,
5874 _ => base,
5875 })
5876 .collect::<Vec<u8>>()
5877 } else {
5878 left_data
5879 };
5880 let right_data = if right_should_rc != should_reverse {
5881 right_data
5882 .iter()
5883 .rev()
5884 .map(|&base| match base {
5885 0 => 3,
5886 1 => 2,
5887 2 => 1,
5888 3 => 0,
5889 _ => base,
5890 })
5891 .collect::<Vec<u8>>()
5892 } else {
5893 right_data
5894 };
5895
5896 // Check if this is a degenerate split (one side empty)
5897 let is_degenerate_left = left_data.is_empty();
5898 let is_degenerate_right = right_data.is_empty();
5899
5900 if config.verbosity > 1 {
5901 if is_degenerate_right {
5902 eprintln!(
5903 "SPLIT_DEGENERATE_RIGHT: ({},{}) -> left_only=({},{})",
5904 key_front,
5905 key_back,
5906 left_key.kmer_front,
5907 left_key.kmer_back
5908 );
5909 } else if is_degenerate_left {
5910 eprintln!(
5911 "SPLIT_DEGENERATE_LEFT: ({},{}) -> right_only=({},{})",
5912 key_front,
5913 key_back,
5914 right_key.kmer_front,
5915 right_key.kmer_back
5916 );
5917 } else {
5918 eprintln!(
5919 "SPLIT: original=({},{}) -> left=({},{}) right=({},{})",
5920 key_front,
5921 key_back,
5922 left_key.kmer_front,
5923 left_key.kmer_back,
5924 right_key.kmer_front,
5925 right_key.kmer_back
5926 );
5927 }
5928 }
5929
5930 // Determine emission order. By default match C++ logic:
5931 // - Normal orientation (should_reverse=false): emit left then right
5932 // - Reversed orientation (should_reverse=true): emit right then left
5933 // Allow env override for diagnostics:
5934 // RAGC_EMIT_ORDER=left -> force left-first
5935 // RAGC_EMIT_ORDER=right -> force right-first
5936 // RAGC_EMIT_ORDER=flip -> invert default
5937 // RAGC_EMIT_ORDER=auto -> default behavior (or if unset)
5938 let emit_left_first = match std::env::var("RAGC_EMIT_ORDER") {
5939 Ok(val) => match val.to_ascii_lowercase().as_str() {
5940 "left" | "left-first" => true,
5941 "right" | "right-first" => false,
5942 "flip" => should_reverse, // invert default (!should_reverse)
5943 _ => !should_reverse, // auto/default
5944 },
5945 Err(_) => !should_reverse,
5946 };
5947 if config.verbosity > 1 {
5948 eprintln!(
5949 "EMIT_ORDER: should_reverse={} -> emit_left_first={} (env RAGC_EMIT_ORDER)",
5950 should_reverse, emit_left_first
5951 );
5952 }
5953
5954 // Optional targeted split trace for a specific (sample, contig, index)
5955 if let (Ok(ts), Ok(tc), Ok(ti)) = (
5956 std::env::var("RAGC_TRACE_SAMPLE"),
5957 std::env::var("RAGC_TRACE_CONTIG"),
5958 std::env::var("RAGC_TRACE_INDEX").and_then(|s| {
5959 s.parse::<usize>()
5960 .map_err(|e| std::env::VarError::NotPresent)
5961 }),
5962 ) {
5963 if ts == task.sample_name
5964 && tc == task.contig_name
5965 && ti == place
5966 {
5967 // Derive seg2_start from lengths (robust for both FFI and local mapping)
5968 let seg_len = segment_data.len();
5969 let right_len = right_data.len();
5970 let left_len = left_data.len();
5971 let seg2_start_derived = seg_len.saturating_sub(right_len);
5972 let left_end_derived = seg2_start_derived
5973 .saturating_add(config.k)
5974 .min(seg_len);
5975 eprintln!(
5976 "TRACE_SPLIT: {}/{} idx={} rev={} emit_left_first={} degL={} degR={} seg2_start={} left_end={} left_len={} right_len={}",
5977 task.sample_name, task.contig_name, place, should_reverse, emit_left_first,
5978 is_degenerate_left, is_degenerate_right, seg2_start_derived, left_end_derived, left_len, right_len
5979 );
5980 }
5981 }
5982
5983 // Emit in correct contig order
5984 if emit_left_first {
5985 // left first
5986 if !is_degenerate_left {
5987 let left_buffer =
5988 groups.entry(left_key.clone()).or_insert_with(|| {
5989 // OPTIMIZATION: Read-check-write pattern to reduce lock contention
5990 // First check with read lock (fast path - most groups already exist)
5991 let group_id = {
5992 let global_map = map_segments.read().unwrap();
5993 if let Some(&existing_id) =
5994 global_map.get(&left_key)
5995 {
5996 existing_id
5997 } else {
5998 drop(global_map);
5999 // Group doesn't exist - upgrade to write lock
6000 let mut global_map =
6001 map_segments.write().unwrap();
6002 // Double-check after acquiring write lock (race condition)
6003 if let Some(&existing_id) =
6004 global_map.get(&left_key)
6005 {
6006 existing_id
6007 } else {
6008 // Create new group ID and register IMMEDIATELY to global map
6009 let new_id = group_counter
6010 .fetch_add(1, Ordering::SeqCst);
6011 global_map
6012 .insert(left_key.clone(), new_id);
6013 drop(global_map);
6014 // Also register to batch-local for flush tracking
6015 let mut batch_map =
6016 batch_local_groups.lock().unwrap();
6017 batch_map
6018 .insert(left_key.clone(), new_id);
6019 new_id
6020 }
6021 }
6022 };
6023 // Register with FFI engine
6024 #[cfg(feature = "cpp_agc")]
6025 if left_key.kmer_front != MISSING_KMER
6026 && left_key.kmer_back != MISSING_KMER
6027 {
6028 let mut eng = grouping_engine.lock().unwrap();
6029 eng.register_group(
6030 left_key.kmer_front,
6031 left_key.kmer_back,
6032 group_id,
6033 );
6034 }
6035 // Update GLOBAL terminators map IMMEDIATELY (matches C++ AGC)
6036 if left_key.kmer_front != MISSING_KMER
6037 && left_key.kmer_back != MISSING_KMER
6038 {
6039 let mut term_map =
6040 map_segments_terminators.write().unwrap();
6041 term_map
6042 .entry(left_key.kmer_front)
6043 .or_insert_with(Vec::new)
6044 .push(left_key.kmer_back);
6045 if left_key.kmer_front != left_key.kmer_back {
6046 term_map
6047 .entry(left_key.kmer_back)
6048 .or_insert_with(Vec::new)
6049 .push(left_key.kmer_front);
6050 }
6051 if let Some(front_vec) =
6052 term_map.get_mut(&left_key.kmer_front)
6053 {
6054 front_vec.sort_unstable();
6055 front_vec.dedup();
6056 }
6057 if left_key.kmer_front != left_key.kmer_back {
6058 if let Some(back_vec) =
6059 term_map.get_mut(&left_key.kmer_back)
6060 {
6061 back_vec.sort_unstable();
6062 back_vec.dedup();
6063 }
6064 }
6065 }
6066 // Register streams for this group
6067 let archive_version = ragc_common::AGC_FILE_MAJOR
6068 * 1000
6069 + ragc_common::AGC_FILE_MINOR;
6070 let delta_stream_name =
6071 ragc_common::stream_delta_name(
6072 archive_version,
6073 group_id,
6074 );
6075 let ref_stream_name = ragc_common::stream_ref_name(
6076 archive_version,
6077 group_id,
6078 );
6079 let mut arch = archive.lock().unwrap();
6080 let stream_id =
6081 arch.register_stream(&delta_stream_name);
6082 let ref_stream_id =
6083 arch.register_stream(&ref_stream_name);
6084 drop(arch);
6085 SegmentGroupBuffer::new(
6086 group_id,
6087 stream_id,
6088 ref_stream_id,
6089 )
6090 });
6091 // FIX 27 v4: Use left_should_rc instead of should_reverse
6092 let (fixed_left_data, fixed_left_rc) =
6093 fix_orientation_for_group(
6094 &left_data,
6095 left_should_rc,
6096 &left_key,
6097 &map_segments,
6098 &batch_local_groups,
6099 &reference_orientations,
6100 );
6101 let left_buffered = BufferedSegment {
6102 sample_name: task.sample_name.clone(),
6103 contig_name: task.contig_name.clone(),
6104 seg_part_no: place,
6105 data: fixed_left_data,
6106 is_rev_comp: fixed_left_rc,
6107 sample_priority: task.sample_priority,
6108 };
6109 left_buffer.segments.push(left_buffered);
6110 // Flush pack if full (matches C++ AGC write-as-you-go behavior)
6111 if left_buffer.should_flush_pack(config.pack_size) {
6112 flush_pack(
6113 left_buffer,
6114 &collection,
6115 &archive,
6116 &config,
6117 &reference_segments,
6118 )
6119 .context("Failed to flush left pack")?;
6120 }
6121 }
6122 if !is_degenerate_right {
6123 let right_buffer =
6124 groups.entry(right_key.clone()).or_insert_with(|| {
6125 // OPTIMIZATION: Read-check-write pattern to reduce lock contention
6126 // First check with read lock (fast path - most groups already exist)
6127 let group_id = {
6128 let global_map = map_segments.read().unwrap();
6129 if let Some(&existing_id) =
6130 global_map.get(&right_key)
6131 {
6132 existing_id
6133 } else {
6134 drop(global_map);
6135 // Group doesn't exist - upgrade to write lock
6136 let mut global_map =
6137 map_segments.write().unwrap();
6138 // Double-check after acquiring write lock (race condition)
6139 if let Some(&existing_id) =
6140 global_map.get(&right_key)
6141 {
6142 existing_id
6143 } else {
6144 // Create new group ID and register IMMEDIATELY to global map
6145 let new_id = group_counter
6146 .fetch_add(1, Ordering::SeqCst);
6147 global_map
6148 .insert(right_key.clone(), new_id);
6149 drop(global_map);
6150 // Also register to batch-local for flush tracking
6151 let mut batch_map =
6152 batch_local_groups.lock().unwrap();
6153 batch_map
6154 .insert(right_key.clone(), new_id);
6155 new_id
6156 }
6157 }
6158 };
6159 // Register with FFI engine
6160 #[cfg(feature = "cpp_agc")]
6161 if right_key.kmer_front != MISSING_KMER
6162 && right_key.kmer_back != MISSING_KMER
6163 {
6164 let mut eng = grouping_engine.lock().unwrap();
6165 eng.register_group(
6166 right_key.kmer_front,
6167 right_key.kmer_back,
6168 group_id,
6169 );
6170 }
6171 // Update GLOBAL terminators map IMMEDIATELY (matches C++ AGC)
6172 if right_key.kmer_front != MISSING_KMER
6173 && right_key.kmer_back != MISSING_KMER
6174 {
6175 let mut term_map =
6176 map_segments_terminators.write().unwrap();
6177 term_map
6178 .entry(right_key.kmer_front)
6179 .or_insert_with(Vec::new)
6180 .push(right_key.kmer_back);
6181 if right_key.kmer_front != right_key.kmer_back {
6182 term_map
6183 .entry(right_key.kmer_back)
6184 .or_insert_with(Vec::new)
6185 .push(right_key.kmer_front);
6186 }
6187 if let Some(front_vec) =
6188 term_map.get_mut(&right_key.kmer_front)
6189 {
6190 front_vec.sort_unstable();
6191 front_vec.dedup();
6192 }
6193 if right_key.kmer_front != right_key.kmer_back {
6194 if let Some(back_vec) =
6195 term_map.get_mut(&right_key.kmer_back)
6196 {
6197 back_vec.sort_unstable();
6198 back_vec.dedup();
6199 }
6200 }
6201 }
6202 // Register streams for this group
6203 let archive_version = ragc_common::AGC_FILE_MAJOR
6204 * 1000
6205 + ragc_common::AGC_FILE_MINOR;
6206 let delta_stream_name =
6207 ragc_common::stream_delta_name(
6208 archive_version,
6209 group_id,
6210 );
6211 let ref_stream_name = ragc_common::stream_ref_name(
6212 archive_version,
6213 group_id,
6214 );
6215 let mut arch = archive.lock().unwrap();
6216 let stream_id =
6217 arch.register_stream(&delta_stream_name);
6218 let ref_stream_id =
6219 arch.register_stream(&ref_stream_name);
6220 drop(arch);
6221 SegmentGroupBuffer::new(
6222 group_id,
6223 stream_id,
6224 ref_stream_id,
6225 )
6226 });
6227 let seg_part =
6228 if is_degenerate_left { place } else { place + 1 };
6229 // FIX 27 v4: Use right_should_rc instead of should_reverse
6230 let (fixed_right_data, fixed_right_rc) =
6231 fix_orientation_for_group(
6232 &right_data,
6233 right_should_rc,
6234 &right_key,
6235 &map_segments,
6236 &batch_local_groups,
6237 &reference_orientations,
6238 );
6239 let right_buffered = BufferedSegment {
6240 sample_name: task.sample_name.clone(),
6241 contig_name: task.contig_name.clone(),
6242 seg_part_no: seg_part,
6243 data: fixed_right_data,
6244 is_rev_comp: fixed_right_rc,
6245 sample_priority: task.sample_priority,
6246 };
6247 right_buffer.segments.push(right_buffered);
6248 // Flush pack if full (matches C++ AGC write-as-you-go behavior)
6249 if right_buffer.should_flush_pack(config.pack_size) {
6250 flush_pack(
6251 right_buffer,
6252 &collection,
6253 &archive,
6254 &config,
6255 &reference_segments,
6256 )
6257 .context("Failed to flush right pack")?;
6258 }
6259 }
6260 } else {
6261 // reversed: right first
6262 if !is_degenerate_right {
6263 let right_buffer = groups.entry(right_key.clone()).or_insert_with(|| {
6264 // BATCH-LOCAL: Check global first, then batch-local (group must exist from earlier)
6265 let group_id = {
6266 let global_map = map_segments.read().unwrap();
6267 if let Some(&id) = global_map.get(&right_key) {
6268 id
6269 } else {
6270 drop(global_map);
6271 let batch_map = batch_local_groups.lock().unwrap();
6272 *batch_map.get(&right_key).expect("Split right group must exist in batch_local_groups or map_segments")
6273 }
6274 };
6275 let archive_version = ragc_common::AGC_FILE_MAJOR * 1000 + ragc_common::AGC_FILE_MINOR;
6276 let delta_stream_name = ragc_common::stream_delta_name(archive_version, group_id);
6277 let ref_stream_name = ragc_common::stream_ref_name(archive_version, group_id);
6278 let mut arch = archive.lock().unwrap();
6279 let stream_id = arch.register_stream(&delta_stream_name);
6280 let ref_stream_id = arch.register_stream(&ref_stream_name);
6281 drop(arch);
6282 SegmentGroupBuffer::new(group_id, stream_id, ref_stream_id)
6283 });
6284 // FIX 27 v4: Use right_should_rc instead of should_reverse
6285 let (fixed_right_data, fixed_right_rc) =
6286 fix_orientation_for_group(
6287 &right_data,
6288 right_should_rc,
6289 &right_key,
6290 &map_segments,
6291 &batch_local_groups,
6292 &reference_orientations,
6293 );
6294 let right_buffered = BufferedSegment {
6295 sample_name: task.sample_name.clone(),
6296 contig_name: task.contig_name.clone(),
6297 seg_part_no: place,
6298 data: fixed_right_data,
6299 is_rev_comp: fixed_right_rc,
6300 sample_priority: task.sample_priority,
6301 };
6302 right_buffer.segments.push(right_buffered);
6303 // Flush pack if full (matches C++ AGC write-as-you-go behavior)
6304 if right_buffer.should_flush_pack(config.pack_size) {
6305 flush_pack(
6306 right_buffer,
6307 &collection,
6308 &archive,
6309 &config,
6310 &reference_segments,
6311 )
6312 .context("Failed to flush right pack")?;
6313 }
6314 }
6315 if !is_degenerate_left {
6316 let left_buffer = groups.entry(left_key.clone()).or_insert_with(|| {
6317 // BATCH-LOCAL: Check global first, then batch-local (group must exist from earlier)
6318 let group_id = {
6319 let global_map = map_segments.read().unwrap();
6320 if let Some(&id) = global_map.get(&left_key) {
6321 id
6322 } else {
6323 drop(global_map);
6324 let batch_map = batch_local_groups.lock().unwrap();
6325 *batch_map.get(&left_key).expect("Split left group must exist in batch_local_groups or map_segments")
6326 }
6327 };
6328 let archive_version = ragc_common::AGC_FILE_MAJOR * 1000 + ragc_common::AGC_FILE_MINOR;
6329 let delta_stream_name = ragc_common::stream_delta_name(archive_version, group_id);
6330 let ref_stream_name = ragc_common::stream_ref_name(archive_version, group_id);
6331 let mut arch = archive.lock().unwrap();
6332 let stream_id = arch.register_stream(&delta_stream_name);
6333 let ref_stream_id = arch.register_stream(&ref_stream_name);
6334 drop(arch);
6335 SegmentGroupBuffer::new(group_id, stream_id, ref_stream_id)
6336 });
6337 let seg_part = if is_degenerate_right {
6338 place
6339 } else {
6340 place + 1
6341 };
6342 // FIX 27 v4: Use left_should_rc instead of should_reverse
6343 let (fixed_left_data, fixed_left_rc) =
6344 fix_orientation_for_group(
6345 &left_data,
6346 left_should_rc,
6347 &left_key,
6348 &map_segments,
6349 &batch_local_groups,
6350 &reference_orientations,
6351 );
6352 let left_buffered = BufferedSegment {
6353 sample_name: task.sample_name.clone(),
6354 contig_name: task.contig_name.clone(),
6355 seg_part_no: seg_part,
6356 data: fixed_left_data,
6357 is_rev_comp: fixed_left_rc,
6358 sample_priority: task.sample_priority,
6359 };
6360 left_buffer.segments.push(left_buffered);
6361 // Flush pack if full (matches C++ AGC write-as-you-go behavior)
6362 if left_buffer.should_flush_pack(config.pack_size) {
6363 flush_pack(
6364 left_buffer,
6365 &collection,
6366 &archive,
6367 &config,
6368 &reference_segments,
6369 )
6370 .context("Failed to flush left pack")?;
6371 }
6372 }
6373 }
6374
6375 // Optional: assert lengths vs C++ archive if provided
6376 if let Some(assert_path) = crate::env_cache::assert_cpp_archive() {
6377 use crate::{Decompressor, DecompressorConfig};
6378 let mut dec = match Decompressor::open(
6379 &assert_path,
6380 DecompressorConfig { verbosity: 0 },
6381 ) {
6382 Ok(d) => d,
6383 Err(_) => {
6384 if config.verbosity > 1 {
6385 eprintln!(
6386 "ASSERT_SKIP: cannot open {}",
6387 assert_path
6388 );
6389 }
6390 return Ok(());
6391 }
6392 };
6393 if let Ok(all) = dec.get_all_segments() {
6394 if let Some((_, _, segs)) =
6395 all.into_iter().find(|(s, c, _)| {
6396 *s == task.sample_name && *c == task.contig_name
6397 })
6398 {
6399 // Compute our emitted lens and expected lens at indices
6400 let mut checks: Vec<(usize, usize)> = Vec::new();
6401 if emit_left_first {
6402 if !is_degenerate_left {
6403 checks.push((place, left_data.len()));
6404 }
6405 if !is_degenerate_right {
6406 checks.push((
6407 if is_degenerate_left {
6408 place
6409 } else {
6410 place + 1
6411 },
6412 right_data.len(),
6413 ));
6414 }
6415 } else {
6416 if !is_degenerate_right {
6417 checks.push((place, right_data.len()));
6418 }
6419 if !is_degenerate_left {
6420 checks.push((
6421 if is_degenerate_right {
6422 place
6423 } else {
6424 place + 1
6425 },
6426 left_data.len(),
6427 ));
6428 }
6429 }
6430
6431 // Derive segmentation geometry for detailed diagnostics
6432 let seg_len = segment_data.len();
6433 let right_len = right_data.len();
6434 let left_len = left_data.len();
6435 let seg2_start_derived =
6436 seg_len.saturating_sub(right_len);
6437 let left_end_derived = seg2_start_derived
6438 .saturating_add(config.k)
6439 .min(seg_len);
6440 let emit_idx_left = if emit_left_first {
6441 place
6442 } else {
6443 if is_degenerate_right {
6444 place
6445 } else {
6446 place + 1
6447 }
6448 };
6449 let emit_idx_right = if emit_left_first {
6450 if is_degenerate_left {
6451 place
6452 } else {
6453 place + 1
6454 }
6455 } else {
6456 place
6457 };
6458
6459 for (idx, got) in checks {
6460 if idx < segs.len() {
6461 let exp = segs[idx].raw_length as usize;
6462 if exp != got {
6463 eprintln!("ASSERT_LEN_MISMATCH: {}/{} idx={} got={} exp={} keys L=({:#x},{:#x}) R=({:#x},{:#x})",
6464 task.sample_name, task.contig_name, idx, got, exp,
6465 left_key.kmer_front, left_key.kmer_back,
6466 right_key.kmer_front, right_key.kmer_back);
6467 // Extended context (guarded by env to limit noise)
6468 if crate::env_cache::assert_verbose() {
6469 eprintln!(" CONTEXT: place={} orig_place={} emit_left_first={} should_reverse={}",
6470 place, original_place, emit_left_first, should_reverse);
6471 eprintln!(" GEOM: seg_len={} left_len={} right_len={} seg2_start={} left_end={}",
6472 seg_len, left_len, right_len, seg2_start_derived, left_end_derived);
6473 eprintln!(" EMIT_IDX: left_at={} right_at={}", emit_idx_left, emit_idx_right);
6474 }
6475 }
6476 } else {
6477 eprintln!(
6478 "ASSERT_IDX_OOB: {}/{} idx={} (segs={})",
6479 task.sample_name,
6480 task.contig_name,
6481 idx,
6482 segs.len()
6483 );
6484 }
6485 }
6486 }
6487 }
6488 }
6489
6490 // Record this split so subsequent segments from this contig get shifted
6491 // (matches C++ AGC lines 2033-2036: ++seg_part_no twice when split)
6492 // For degenerate splits, only increment once (no actual split)
6493 if !is_degenerate_left && !is_degenerate_right {
6494 // OPTIMIZATION: Track locally for this task AND globally for other workers
6495 local_splits.insert(original_place);
6496 let mut offsets = split_offsets.lock().unwrap();
6497 offsets.insert(
6498 (
6499 task.sample_name.clone(),
6500 task.contig_name.clone(),
6501 original_place,
6502 ),
6503 1,
6504 );
6505 }
6506
6507 // Skip adding original segment - we've added the split/reclassified segment
6508 continue;
6509 }
6510 // If split_result was None, fall through to normal path
6511 } // end of else { both groups exist }
6512 }
6513 }
6514
6515 // Phase 2.5: Secondary fallback attempt (C++ AGC lines 1477-1494)
6516 // If the group doesn't exist yet, try fallback minimizers one more time with min_shared=2
6517 // This helps segments find existing groups that share internal k-mers
6518 let (key, key_front, key_back, should_reverse) = {
6519 // Re-check if key exists (may have changed since split logic ran)
6520 let key_exists_now = {
6521 let seg_map = map_segments.read().unwrap();
6522 seg_map.contains_key(&key)
6523 };
6524
6525 // Debug: count how many segments could be eligible for secondary fallback
6526 if crate::env_cache::debug_fallback2_enabled() {
6527 if !key_exists_now
6528 && key.kmer_front != MISSING_KMER
6529 && key.kmer_back != MISSING_KMER
6530 {
6531 eprintln!(
6532 "SECONDARY_FB_CANDIDATE: sample={} contig={} place={} key=({},{})",
6533 task.sample_name,
6534 task.contig_name,
6535 place,
6536 key.kmer_front,
6537 key.kmer_back
6538 );
6539 }
6540 }
6541
6542 if !key_exists_now
6543 && key.kmer_front != MISSING_KMER
6544 && key.kmer_back != MISSING_KMER
6545 && fallback_filter.is_enabled()
6546 {
6547 // Generate reverse complement for fallback lookup
6548 let segment_data_rc_fb: Vec<u8> = segment_data
6549 .iter()
6550 .rev()
6551 .map(|&b| if b > 3 { b } else { 3 - b })
6552 .collect();
6553
6554 let (fb_kf, fb_kb, fb_sr) = find_cand_segment_using_fallback_minimizers(
6555 &segment_data,
6556 &segment_data_rc_fb,
6557 config.k,
6558 2, // min_shared_kmers = 2 for secondary fallback (C++ AGC line 1482)
6559 &fallback_filter,
6560 &map_fallback_minimizers,
6561 &map_segments,
6562 &segment_groups,
6563 &reference_segments,
6564 &config,
6565 );
6566
6567 if crate::env_cache::debug_fallback2_enabled() {
6568 if fb_kf == MISSING_KMER || fb_kb == MISSING_KMER {
6569 eprintln!(
6570 "SECONDARY_FB_NO_MATCH: orig_key=({},{})",
6571 key.kmer_front, key.kmer_back
6572 );
6573 } else {
6574 eprintln!(
6575 "SECONDARY_FB_FOUND: orig=({},{}) found=({},{}) rc={}",
6576 key.kmer_front, key.kmer_back, fb_kf, fb_kb, fb_sr
6577 );
6578 }
6579 }
6580
6581 if fb_kf != MISSING_KMER && fb_kb != MISSING_KMER {
6582 // Verify the found group actually exists
6583 let found_key = SegmentGroupKey {
6584 kmer_front: fb_kf,
6585 kmer_back: fb_kb,
6586 };
6587 let found_exists = {
6588 let seg_map = map_segments.read().unwrap();
6589 seg_map.contains_key(&found_key)
6590 };
6591
6592 if found_exists {
6593 if config.verbosity > 1 {
6594 eprintln!(
6595 "SECONDARY_FALLBACK_SUCCESS: ({},{}) -> ({},{}) sr={}->{}",
6596 key_front, key_back, fb_kf, fb_kb, should_reverse, fb_sr
6597 );
6598 }
6599 (found_key, fb_kf, fb_kb, fb_sr)
6600 } else {
6601 // Fallback found k-mers but group doesn't exist - keep original
6602 (key, key_front, key_back, should_reverse)
6603 }
6604 } else {
6605 // Fallback didn't find anything - keep original
6606 (key, key_front, key_back, should_reverse)
6607 }
6608 } else {
6609 // Group exists or not eligible for fallback - keep original
6610 (key, key_front, key_back, should_reverse)
6611 }
6612 };
6613
6614 // Phase 3: Normal path - add segment to group as-is (group exists, or split failed/impossible)
6615
6616 // FIX 18: Do NOT adjust orientation to match reference - C++ AGC stores each segment
6617 // with its own computed is_rev_comp based on k-mer comparison. Segments in the same
6618 // group can have different is_rev_comp values.
6619 let (final_should_reverse, final_segment_data) = (should_reverse, segment_data);
6620
6621 if config.verbosity > 2 {
6622 eprintln!(
6623 "DEFER_SEGMENT: front={} back={} sample={} contig={} place={}",
6624 key_front, key_back, task.sample_name, task.contig_name, place
6625 );
6626 }
6627
6628 // PHASE 1 (PARALLEL): Add segment to buffered_seg_part
6629 // Check if group exists (brief read lock on map_segments)
6630 let group_id_opt = {
6631 let seg_map = map_segments.read().unwrap();
6632 seg_map.get(&key).copied()
6633 };
6634
6635 if let Some(group_id) = group_id_opt {
6636 // KNOWN: add to per-group buffer (per-group lock only - PARALLEL)
6637 buffered_seg_part.add_known(
6638 group_id,
6639 BufferedSegment {
6640 sample_name: task.sample_name.clone(),
6641 contig_name: task.contig_name.clone(),
6642 seg_part_no: place,
6643 data: final_segment_data,
6644 is_rev_comp: final_should_reverse,
6645 sample_priority: task.sample_priority,
6646 },
6647 );
6648 } else {
6649 // NEW: add to s_seg_part (brief global lock on BTreeSet)
6650 buffered_seg_part.add_new(NewSegment {
6651 kmer_front: key.kmer_front,
6652 kmer_back: key.kmer_back,
6653 sample_priority: task.sample_priority,
6654 sample_name: task.sample_name.clone(),
6655 contig_name: task.contig_name.clone(),
6656 seg_part_no: place,
6657 data: final_segment_data,
6658 should_reverse: final_should_reverse,
6659 });
6660 }
6661
6662 // Segment will be handled in flush_batch at barrier synchronization point
6663 }
6664 }
6665
6666 processed_count += 1;
6667 }
6668
6669 Ok(())
6670}
6671
6672// ========== SEGMENT SPLITTING HELPER FUNCTIONS ==========
6673// (Phase 3-6 implementation)
6674
6675/// Phase 3: Find a k-mer that connects both front and back
6676/// Returns the first k-mer that appears in the terminator lists of BOTH front and back
6677/// (matches C++ AGC find_cand_segment_with_missing_middle_splitter lines 1531-1554)
6678fn find_middle_splitter(
6679 front_kmer: u64,
6680 back_kmer: u64,
6681 terminators: &BTreeMap<u64, Vec<u64>>,
6682) -> Option<u64> {
6683 let front_connections = terminators.get(&front_kmer)?;
6684 let back_connections = terminators.get(&back_kmer)?;
6685
6686 #[cfg(feature = "cpp_agc")]
6687 {
6688 if let Some(m) = crate::ragc_ffi::find_middle(front_connections, back_connections) {
6689 return Some(m);
6690 }
6691 if crate::env_cache::debug_split_find() {
6692 eprintln!(
6693 "DEBUG_FIND_MIDDLE_MISS: front={} back={} front_conn={} back_conn={} shared=0",
6694 front_kmer,
6695 back_kmer,
6696 front_connections.len(),
6697 back_connections.len()
6698 );
6699 }
6700 None
6701 }
6702
6703 #[cfg(not(feature = "cpp_agc"))]
6704 {
6705 // Fallback: local set_intersection
6706 let mut i = 0;
6707 let mut j = 0;
6708 while i < front_connections.len() && j < back_connections.len() {
6709 let a = front_connections[i];
6710 let b = back_connections[j];
6711 if a == b {
6712 if a != MISSING_KMER {
6713 return Some(a);
6714 }
6715 i += 1;
6716 j += 1;
6717 } else if a < b {
6718 i += 1;
6719 } else {
6720 j += 1;
6721 }
6722 }
6723 if crate::env_cache::debug_split_find() {
6724 eprintln!(
6725 "DEBUG_FIND_MIDDLE_MISS: front={} back={} front_conn={} back_conn={} shared=0",
6726 front_kmer,
6727 back_kmer,
6728 front_connections.len(),
6729 back_connections.len()
6730 );
6731 eprintln!(
6732 " front_connections: {:?}",
6733 &front_connections[..front_connections.len().min(5)]
6734 );
6735 eprintln!(
6736 " back_connections: {:?}",
6737 &back_connections[..back_connections.len().min(5)]
6738 );
6739 }
6740 None
6741 }
6742}
6743
6744/// Find split position by searching for k-mers from the right reference segment
6745/// This is more robust than searching for a single middle k-mer, as mutations
6746/// may eliminate that specific k-mer but preserve nearby ones.
6747///
6748/// The algorithm:
6749/// 1. Extract the first few k-mers from the right reference segment
6750/// 2. Search for these k-mers in the MIDDLE portion of the current segment
6751/// 3. Return the position closest to the expected split (based on reference proportions)
6752fn find_split_by_kmer_match(segment_data: &[u8], right_ref_data: &[u8], k: usize) -> Option<usize> {
6753 use crate::kmer::{Kmer, KmerMode};
6754 use ahash::AHashSet;
6755
6756 let seg_len = segment_data.len();
6757
6758 // Both parts must be at least this size for a valid split
6759 let min_segment_size = 500; // Minimum 500 bytes per part
6760
6761 if seg_len < 2 * min_segment_size || right_ref_data.len() < k {
6762 return None;
6763 }
6764
6765 // Valid split range: ensure both parts are >= min_segment_size
6766 let min_pos = min_segment_size;
6767 let max_pos = seg_len.saturating_sub(min_segment_size);
6768
6769 if max_pos <= min_pos {
6770 return None;
6771 }
6772
6773 // Extract first N k-mers from the right reference segment
6774 // These are k-mers that should appear at the START of the right part
6775 let num_ref_kmers = 50.min(right_ref_data.len() / 2); // First 50 k-mers or half of ref
6776 let mut ref_kmers: AHashSet<u64> = AHashSet::new();
6777 let mut ref_kmer = Kmer::new(k as u32, KmerMode::Canonical);
6778
6779 for &base in right_ref_data.iter().take(num_ref_kmers + k) {
6780 ref_kmer.insert(base as u64);
6781 if ref_kmer.is_full() {
6782 ref_kmers.insert(ref_kmer.data());
6783 }
6784 }
6785
6786 if ref_kmers.is_empty() {
6787 return None;
6788 }
6789
6790 // Search for k-mers only in the VALID RANGE of the segment
6791 // This ensures we don't create tiny segments
6792 let mut seg_kmer = Kmer::new(k as u32, KmerMode::Canonical);
6793 let mut best_match: Option<usize> = None;
6794
6795 for (pos, &base) in segment_data.iter().enumerate() {
6796 seg_kmer.insert(base as u64);
6797
6798 if seg_kmer.is_full() {
6799 let current_kmer = seg_kmer.data();
6800 let split_pos = pos + 1;
6801
6802 if split_pos >= min_pos && split_pos <= max_pos && ref_kmers.contains(¤t_kmer) {
6803 // Found a valid match - return the first one (earliest valid split)
6804 best_match = Some(split_pos);
6805 break;
6806 }
6807 }
6808 }
6809
6810 best_match
6811}
6812
/// Result of cost-based split analysis
/// Matches C++ AGC's find_cand_segment_with_missing_middle_splitter behavior (lines 1400-1454)
///
/// NOTE: the left/right mapping is intentionally inverted relative to `best_pos`:
/// a best split at position 0 leaves nothing for the left group, so the whole
/// segment goes RIGHT, and vice versa (see `find_split_by_cost`'s final branch).
#[derive(Debug, Clone, Copy)]
enum SplitDecision {
    /// Assign entire segment to left group (chosen when best_pos == seg_len,
    /// i.e. the optimal cut leaves nothing for the right group)
    AssignToLeft,
    /// Assign entire segment to right group (chosen when best_pos == 0,
    /// i.e. the optimal cut leaves nothing for the left group)
    AssignToRight,
    /// Actually split at this byte position
    SplitAt(usize),
    /// Cannot determine (refs empty or segment too small)
    NoDecision,
}
6826
/// Find optimal split position using LZ encoding cost
/// Matches C++ AGC's find_cand_segment_with_missing_middle_splitter (lines 1502-1621)
///
/// This computes the LZ encoding cost at every position for both the left and right
/// reference segments, then finds the position where the total cost is minimized.
///
/// # Arguments
/// * `segment_dir` - The segment in original (forward) orientation
/// * `segment_rc` - The segment in reverse complement orientation
/// * `left_ref` - Reference data for the left segment (front_kmer -> middle_kmer)
/// * `right_ref` - Reference data for the right segment (middle_kmer -> back_kmer)
/// * `kmer_front` - Original front k-mer (not normalized)
/// * `kmer_back` - Original back k-mer (not normalized)
/// * `middle` - The middle splitter k-mer
/// * `k` - K-mer length
/// * `min_match_len` - Minimum match length for LZ encoding
///
/// # Returns
/// SplitDecision indicating whether to assign to left, right, or split at position
fn find_split_by_cost(
    segment_dir: &[u8],
    segment_rc: &[u8],
    left_ref: &[u8],
    right_ref: &[u8],
    kmer_front: u64,
    kmer_back: u64,
    middle: u64,
    k: usize,
    min_match_len: u32,
) -> SplitDecision {
    use crate::lz_diff::LZDiff;

    let seg_len = segment_dir.len();

    // C++ AGC uses kmer_length + 1 as minimum split size
    let min_size = k + 1;

    // Need enough data on both sides
    if seg_len < 2 * min_size || left_ref.is_empty() || right_ref.is_empty() {
        return SplitDecision::NoDecision;
    }

    // Compute left costs (seg1): cost of encoding segment against left reference
    // C++ AGC lines 1539-1548: choose orientation based on kmer_front vs middle
    let mut lz_left = LZDiff::new(min_match_len);
    lz_left.prepare(&left_ref.to_vec());

    // left_cumsum[i] = cumulative cost of encoding the first i+1 positions
    // against the left reference (orientation-adjusted).
    let left_cumsum: Vec<u32> = if kmer_front < middle {
        // C++ AGC line 1540: use segment_dir with prefix_costs=true
        let left_costs = lz_left.get_coding_cost_vector(&segment_dir.to_vec(), true);
        // Apply partial_sum forward
        // (saturating_add prevents u32 overflow on pathological cost vectors)
        let mut cumsum_vec = Vec::with_capacity(left_costs.len());
        let mut cumsum = 0u32;
        for &cost in &left_costs {
            cumsum = cumsum.saturating_add(cost);
            cumsum_vec.push(cumsum);
        }
        cumsum_vec
    } else {
        // C++ AGC lines 1543-1545: use segment_rc with prefix_costs=false
        // IMPORTANT: reverse the COSTS first, then partial_sum
        let mut left_costs = lz_left.get_coding_cost_vector(&segment_rc.to_vec(), false);
        // Reverse the costs BEFORE partial_sum (C++ line 1544)
        left_costs.reverse();
        // Apply partial_sum forward (C++ line 1547)
        let mut cumsum_vec = Vec::with_capacity(left_costs.len());
        let mut cumsum = 0u32;
        for &cost in &left_costs {
            cumsum = cumsum.saturating_add(cost);
            cumsum_vec.push(cumsum);
        }
        cumsum_vec
    };

    // Compute right costs (seg2): cost of encoding segment against right reference
    // C++ AGC lines 1563-1573: choose orientation based on middle vs kmer_back
    let mut lz_right = LZDiff::new(min_match_len);
    lz_right.prepare(&right_ref.to_vec());

    // right_cumsum[i] = cumulative cost of encoding positions i..end against
    // the right reference (suffix sums, orientation-adjusted).
    let right_cumsum: Vec<u32> = if middle < kmer_back {
        // C++ AGC lines 1565-1566: use segment_dir with prefix_costs=false
        // then partial_sum in reverse direction
        let right_costs = lz_right.get_coding_cost_vector(&segment_dir.to_vec(), false);
        // Apply partial_sum from the right (reverse direction)
        let mut cumsum_vec = vec![0u32; right_costs.len()];
        let mut cumsum = 0u32;
        for (i, &cost) in right_costs.iter().enumerate().rev() {
            cumsum = cumsum.saturating_add(cost);
            cumsum_vec[i] = cumsum;
        }
        cumsum_vec
    } else {
        // C++ AGC lines 1570-1572: use segment_rc with prefix_costs=true
        // partial_sum forward, then reverse
        let right_costs = lz_right.get_coding_cost_vector(&segment_rc.to_vec(), true);
        // Apply partial_sum forward
        let mut cumsum_vec = Vec::with_capacity(right_costs.len());
        let mut cumsum = 0u32;
        for &cost in &right_costs {
            cumsum = cumsum.saturating_add(cost);
            cumsum_vec.push(cumsum);
        }
        // Reverse the cumulative sums
        cumsum_vec.reverse();
        cumsum_vec
    };

    // Find position with minimum combined cost
    // C++ AGC lines 1606-1614: loop over ALL positions, not just valid split range
    let mut best_sum = u32::MAX;
    let mut best_pos = 0usize;

    // IMPORTANT: C++ AGC loops from 0 to size(), then post-processes
    // We must do the same to get equivalent AssignToLeft/AssignToRight decisions
    // NOTE(review): both cumsum vectors are presumably the same length (both
    // derived from the same segment); the min() guard protects against a
    // mismatch from get_coding_cost_vector — TODO confirm against LZDiff.
    let cost_len = left_cumsum.len().min(right_cumsum.len());
    for i in 0..cost_len {
        let cs = left_cumsum[i].saturating_add(right_cumsum[i]);
        if cs < best_sum {
            best_sum = cs;
            best_pos = i;
        }
    }

    // Post-process: if best_pos is too close to edges, set to 0 or seg_len
    // C++ AGC lines 1616-1619
    if best_pos < min_size {
        best_pos = 0;
    }
    if best_pos + min_size > seg_len {
        best_pos = seg_len;
    }

    // Return decision based on best_pos
    // C++ AGC lines 1400-1454: left_size==0 means assign to right, right_size==0 means assign to left
    if best_pos == 0 {
        SplitDecision::AssignToRight
    } else if best_pos >= seg_len {
        SplitDecision::AssignToLeft
    } else {
        SplitDecision::SplitAt(best_pos)
    }
}
6969
6970/// Phase 4: Find split position by scanning for middle k-mer
6971/// Scans the segment to find where the middle k-mer actually occurs
6972/// Returns the split position (in bytes) at the END of the middle k-mer
6973#[allow(dead_code)]
6974fn find_split_position(
6975 segment_data: &[u8],
6976 middle_kmer: u64,
6977 segment_len: usize,
6978 k: usize,
6979) -> Option<usize> {
6980 use crate::kmer::{Kmer, KmerMode};
6981
6982 // Ensure we don't split too close to the ends
6983 // Need at least k+1 bytes on each side for valid segments
6984 if segment_len < 2 * (k + 1) {
6985 return None;
6986 }
6987
6988 // Scan segment to find where middle_kmer occurs
6989 // Use data() to get canonical k-mer (matching how segment boundaries are computed)
6990 let mut kmer = Kmer::new(k as u32, KmerMode::Canonical);
6991
6992 for (pos, &base) in segment_data.iter().enumerate() {
6993 kmer.insert(base as u64);
6994
6995 if kmer.is_full() {
6996 let current_kmer = kmer.data();
6997
6998 if current_kmer == middle_kmer {
6999 // Found the middle k-mer! Position is at the end of the k-mer
7000 let split_pos = pos + 1;
7001
7002 // Validate: ensure we have enough space on both sides
7003 let left_size = split_pos;
7004 let right_size = segment_len - split_pos + k;
7005
7006 if left_size >= k + 1 && right_size >= k + 1 {
7007 return Some(split_pos);
7008 }
7009 }
7010 }
7011 }
7012
7013 // Not found - that's OK, the middle k-mer may not exist in this sample due to mutations
7014 None
7015}
7016
/// Phase 5: Split segment into two overlapping segments
/// Returns (left_segment, right_segment) with k bytes of overlap
/// (matches C++ AGC lines 1461-1464)
///
/// C++ AGC layout:
///   seg2_start_pos = split_pos - ceil(k / 2)   (right part starts here)
///   left part ends at seg2_start_pos + k
/// which yields k bytes of overlap around `split_pos`.
///
/// Both slice boundaries are clamped to the segment length — consistent with
/// `split_segment_from_start` — so an out-of-range `split_pos` returns
/// degenerate (possibly empty) parts instead of panicking on the slice.
fn split_segment_at_position(
    segment_data: &[u8],
    split_pos: usize,
    k: usize,
) -> (Vec<u8>, Vec<u8>) {
    // ceil(k / 2): the right part starts this many bytes before the split point.
    let half_ceil = (k + 1) / 2;
    let seg2_start_pos = split_pos
        .saturating_sub(half_ceil)
        .min(segment_data.len());

    // Right segment: [seg2_start_pos .. end]
    let right = segment_data[seg2_start_pos..].to_vec();

    // Left segment: [0 .. seg2_start_pos + k], clamped to the segment length
    // (previously `seg2_start_pos + k` could exceed the slice and panic).
    let left_end = seg2_start_pos.saturating_add(k).min(segment_data.len());
    let left = segment_data[..left_end].to_vec();

    (left, right)
}
7042
/// Split using the `seg2_start` byte index (the start of the right segment),
/// matching the C++ layout: the left part extends k bytes past the cut so
/// the two parts overlap, and all boundaries are clamped to the data length.
fn split_segment_from_start(
    segment_data: &[u8],
    seg2_start: usize,
    k: usize,
) -> (Vec<u8>, Vec<u8>) {
    let len = segment_data.len();
    // Clamp the cut point so an oversized index yields an empty right part.
    let cut = if seg2_start < len { seg2_start } else { len };
    // Left part keeps a k-byte overlap beyond the cut, capped at the end.
    let overlap_end = cut.saturating_add(k).min(len);
    (
        segment_data[..overlap_end].to_vec(),
        segment_data[cut..].to_vec(),
    )
}
7055
7056/// Phase 6: Attempt to split using compression cost heuristic (EXACT C++ AGC algorithm)
7057/// Matches agc_compressor.cpp lines 1387-1503 and 1531-1663
7058/// Returns Some((left_data, right_data, middle_kmer)) if split is beneficial
7059/// Returns None if split would be degenerate (creates segments too small)
7060fn try_split_segment_with_cost(
7061 segment_data: &Contig,
7062 front_kmer: u64,
7063 back_kmer: u64,
7064 middle_kmer: u64,
7065 left_key: &SegmentGroupKey,
7066 right_key: &SegmentGroupKey,
7067 map_segments: &Arc<RwLock<BTreeMap<SegmentGroupKey, u32>>>,
7068 map_segments_terminators: &Arc<RwLock<BTreeMap<u64, Vec<u64>>>>,
7069 reference_segments: &Arc<RwLock<BTreeMap<u32, Vec<u8>>>>,
7070 config: &StreamingQueueConfig,
7071 should_reverse: bool,
7072 force_split_on_empty_refs: bool, // When true, split at middle k-mer position even if FFI says no
7073) -> Option<(Vec<u8>, Vec<u8>, u64)> {
7074 if config.verbosity > 1 {
7075 eprintln!(
7076 "SPLIT_ATTEMPT: front={} back={} middle={}",
7077 front_kmer, back_kmer, middle_kmer
7078 );
7079 }
7080
7081 // Debug: trace split attempt
7082 if crate::env_cache::debug_split() {
7083 eprintln!(
7084 "RAGC_SPLIT_TRY: front={} back={} middle={} left_key=({},{}) right_key=({},{})",
7085 front_kmer,
7086 back_kmer,
7087 middle_kmer,
7088 left_key.kmer_front,
7089 left_key.kmer_back,
7090 right_key.kmer_front,
7091 right_key.kmer_back
7092 );
7093 }
7094
7095 // Prepare LZDiff for both groups from persistent storage
7096 // C++ AGC uses global v_segments[segment_id] (agc_compressor.cpp:1535-1536)
7097 // RAGC uses reference_segments HashMap - ALWAYS prepare on-demand
7098 // Don't require groups to be in local buffer (other workers may have created them)
7099
7100 // Helper to prepare LZDiff from global reference_segments
7101 let prepare_on_demand = |key: &SegmentGroupKey, label: &str| -> Option<LZDiff> {
7102 let map_segments_locked = map_segments.read().unwrap();
7103 let ref_segments_locked = reference_segments.read().unwrap();
7104
7105 // C++ AGC uses map_segments[key] which returns 0 (default) if key doesn't exist.
7106 // v_segments[0] is a raw group initialized with empty_ctg = { 0x7f } (1 byte).
7107 // This gives maximum LZ cost (no compression) for non-existent groups.
7108 // To match C++ AGC behavior, use the actual reference if available,
7109 // otherwise use empty reference (gives max cost like C++ AGC's v_segments[0]).
7110 let segment_id = map_segments_locked.get(key).copied();
7111
7112 if let Some(ref_data) = segment_id.and_then(|id| ref_segments_locked.get(&id)) {
7113 // Reference exists! Prepare LZDiff on-demand
7114 if crate::env_cache::debug_split_ref() {
7115 eprintln!(
7116 "RAGC_SPLIT_REF: {}_key=({},{}) segment_id={:?} ref_size={} (ACTUAL)",
7117 label,
7118 key.kmer_front,
7119 key.kmer_back,
7120 segment_id,
7121 ref_data.len()
7122 );
7123 }
7124
7125 let mut lz = LZDiff::new(config.min_match_len as u32);
7126 lz.prepare(ref_data);
7127 return Some(lz);
7128 } else {
7129 // No reference data available for this group
7130 // C++ AGC uses v_segments[0] which is initialized with empty_ctg = { 0x7f } (1 byte)
7131 // This gives maximum LZ cost (no compression matches possible)
7132 // Return LZDiff prepared with empty reference to match C++ AGC behavior
7133 if crate::env_cache::debug_split_ref() {
7134 eprintln!(
7135 "RAGC_SPLIT_REF: {}_key=({},{}) segment_id={:?} ref_size=1 (EMPTY FALLBACK)",
7136 label, key.kmer_front, key.kmer_back, segment_id
7137 );
7138 }
7139
7140 // Use 1-byte dummy reference like C++ AGC's empty_ctg = { 0x7f }
7141 let empty_ref: Vec<u8> = vec![0x7f];
7142 let mut lz = LZDiff::new(config.min_match_len as u32);
7143 lz.prepare(&empty_ref);
7144 return Some(lz);
7145 }
7146 };
7147
7148 // Build segment in both orientations once
7149 let segment_dir = segment_data; // &Vec<u8>
7150 // Reverse-complement once
7151 let segment_rc_vec: Vec<u8> = reverse_complement_sequence(segment_data);
7152
7153 // Calculate compression costs and best split position using C++ FFI if enabled
7154 // Falls back to Rust implementation otherwise
7155 let maybe_best: Option<(usize, usize)> = None; // (best_pos, seg2_start)
7156 #[cfg(feature = "cpp_agc")]
7157 {
7158 // Inspect availability of left/right references and log keys
7159 let (left_seg_id_opt, right_seg_id_opt) = {
7160 let map_segments_locked = map_segments.read().unwrap();
7161 (
7162 map_segments_locked.get(left_key).copied(),
7163 map_segments_locked.get(right_key).copied(),
7164 )
7165 };
7166 let (left_have_ref, right_have_ref) = {
7167 let ref_segments_locked = reference_segments.read().unwrap();
7168 (
7169 left_seg_id_opt
7170 .and_then(|id| ref_segments_locked.get(&id))
7171 .is_some(),
7172 right_seg_id_opt
7173 .and_then(|id| ref_segments_locked.get(&id))
7174 .is_some(),
7175 )
7176 };
7177
7178 if config.verbosity > 1 {
7179 eprintln!(
7180 "SPLIT_KEYS: left=({:#x},{:#x}) right=({:#x},{:#x}) left_seg_id={:?} right_seg_id={:?} have_left_ref={} have_right_ref={}",
7181 left_key.kmer_front, middle_kmer, middle_kmer, right_key.kmer_back,
7182 left_seg_id_opt, right_seg_id_opt, left_have_ref, right_have_ref
7183 );
7184 }
7185
7186 // Prepare neighbor lists for FFI decision
7187 let (front_neighbors, back_neighbors) = {
7188 let term_map = map_segments_terminators.read().unwrap();
7189 (
7190 term_map.get(&front_kmer).cloned().unwrap_or_default(),
7191 term_map.get(&back_kmer).cloned().unwrap_or_default(),
7192 )
7193 };
7194
7195 // Always attempt FFI decision; if refs are missing, C++ will decide no-split
7196 let (ref_left_opt, ref_right_opt) = {
7197 let ref_segments_locked = reference_segments.read().unwrap();
7198 let l = left_seg_id_opt.and_then(|id| ref_segments_locked.get(&id).cloned());
7199 let r = right_seg_id_opt.and_then(|id| ref_segments_locked.get(&id).cloned());
7200 (l, r)
7201 };
7202 let empty: Vec<u8> = Vec::new();
7203 let ref_left = ref_left_opt.as_ref().unwrap_or(&empty);
7204 let ref_right = ref_right_opt.as_ref().unwrap_or(&empty);
7205
7206 if let Some((has_mid, mid, bp, s2, should)) = crate::ragc_ffi::decide_split(
7207 &front_neighbors,
7208 &back_neighbors,
7209 ref_left,
7210 ref_right,
7211 segment_dir,
7212 front_kmer,
7213 back_kmer,
7214 config.min_match_len as u32,
7215 config.k as u32,
7216 should_reverse,
7217 ) {
7218 if config.verbosity > 1 {
7219 eprintln!("FFI_DECIDE: has_middle={} middle={:#x} best_pos={} seg2_start={} should_split={} refs L={} R={}", has_mid, mid, bp, s2, should, ref_left.len(), ref_right.len());
7220 }
7221 if !has_mid {
7222 return None;
7223 }
7224
7225 // FFI found middle k-mer but may have said !should due to empty refs
7226 if should {
7227 maybe_best = Some((bp, s2));
7228 } else if force_split_on_empty_refs && ref_left.is_empty() && ref_right.is_empty() {
7229 // FALLBACK: FFI can't compute costs because refs are empty, but we want to
7230 // create new groups. Search for ANY terminator k-mer in the segment that can serve as a split point.
7231 // This handles the case where the exact middle_kmer from reference has a mutation in this sample.
7232 if config.verbosity > 1 {
7233 eprintln!("SPLIT_FALLBACK: FFI said no but force_split_on_empty_refs=true, searching for any terminator k-mer in segment");
7234 }
7235
7236 // Build a set of potential middle k-mers from both neighbor lists
7237 let mut potential_middles: AHashSet<u64> = AHashSet::new();
7238 for &kmer in front_neighbors.iter() {
7239 if kmer != MISSING_KMER && kmer != front_kmer && kmer != back_kmer {
7240 potential_middles.insert(kmer);
7241 }
7242 }
7243 for &kmer in back_neighbors.iter() {
7244 if kmer != MISSING_KMER && kmer != front_kmer && kmer != back_kmer {
7245 potential_middles.insert(kmer);
7246 }
7247 }
7248
7249 if config.verbosity > 1 {
7250 eprintln!(
7251 "SPLIT_FALLBACK: {} potential middle k-mers from terminators",
7252 potential_middles.len()
7253 );
7254 for &pm in potential_middles.iter().take(5) {
7255 eprintln!(" potential_middle: {:#x}", pm);
7256 }
7257 }
7258
7259 // Search for ANY terminator k-mer in the segment
7260 let k = config.k;
7261 if segment_dir.len() >= k && !potential_middles.is_empty() {
7262 let mut found_pos: Option<(usize, u64)> = None; // (pos, kmer)
7263 let mut kmer_obj =
7264 crate::kmer::Kmer::new(k as u32, crate::kmer::KmerMode::Canonical);
7265 for (i, &base) in segment_dir.iter().enumerate() {
7266 if base > 3 {
7267 kmer_obj.reset();
7268 } else {
7269 kmer_obj.insert(base as u64);
7270 if kmer_obj.is_full() {
7271 let kmer_at_pos = kmer_obj.data();
7272 let pos = i + 1 - k; // Position of k-mer start
7273 // Check if this k-mer is in our set of potential middles
7274 if potential_middles.contains(&kmer_at_pos) {
7275 // Ensure we're not at the very beginning or end
7276 if pos > k && pos + k + k < segment_dir.len() {
7277 found_pos = Some((pos, kmer_at_pos));
7278 break;
7279 }
7280 }
7281 }
7282 }
7283 }
7284
7285 if let Some((pos, found_kmer)) = found_pos {
7286 // Split at position just after the found k-mer
7287 let split_pos = pos + k;
7288 if split_pos > k + 1 && split_pos + k + 1 < segment_dir.len() {
7289 if config.verbosity > 1 {
7290 eprintln!("SPLIT_FALLBACK_FOUND: terminator kmer={:#x} found at pos={}, splitting at {}", found_kmer, pos, split_pos);
7291 }
7292 maybe_best = Some((split_pos, split_pos));
7293 } else if config.verbosity > 1 {
7294 eprintln!(
7295 "SPLIT_FALLBACK_DEGENERATE: pos={} split_pos={} segment_len={}",
7296 pos,
7297 split_pos,
7298 segment_dir.len()
7299 );
7300 }
7301 } else {
7302 // FALLBACK 2: Terminators not found - discover a NEW singleton k-mer in the segment
7303 // Similar to C++ AGC's find_new_splitters() but simpler: just find any singleton
7304 if config.verbosity > 1 {
7305 eprintln!("SPLIT_FALLBACK_DISCOVER: trying to find singleton k-mer in segment (len={})", segment_dir.len());
7306 }
7307
7308 // Collect all k-mers in the middle region of the segment
7309 let min_margin = k * 2; // Don't split too close to edges
7310 let search_start = min_margin;
7311 let search_end = segment_dir.len().saturating_sub(min_margin);
7312
7313 if search_end > search_start + k {
7314 // Enumerate k-mers and find singletons
7315 let mut kmer_positions: Vec<(u64, usize)> = Vec::new();
7316 let mut kmer_obj2 =
7317 crate::kmer::Kmer::new(k as u32, crate::kmer::KmerMode::Canonical);
7318
7319 for (i, &base) in
7320 segment_dir[search_start..search_end].iter().enumerate()
7321 {
7322 if base > 3 {
7323 kmer_obj2.reset();
7324 } else {
7325 kmer_obj2.insert(base as u64);
7326 if kmer_obj2.is_full() {
7327 let kmer_val = kmer_obj2.data();
7328 let pos = search_start + i + 1 - k;
7329 kmer_positions.push((kmer_val, pos));
7330 }
7331 }
7332 }
7333
7334 // Sort by k-mer value to find duplicates
7335 kmer_positions.sort_by_key(|&(kmer, _)| kmer);
7336
7337 // Find first singleton (k-mer that appears exactly once)
7338 let mut singleton_pos: Option<usize> = None;
7339 let mut i = 0;
7340 while i < kmer_positions.len() {
7341 let (kmer, pos) = kmer_positions[i];
7342 let mut j = i + 1;
7343 while j < kmer_positions.len() && kmer_positions[j].0 == kmer {
7344 j += 1;
7345 }
7346 // If exactly one occurrence, it's a singleton
7347 if j == i + 1 {
7348 singleton_pos = Some(pos);
7349 if config.verbosity > 1 {
7350 eprintln!("SPLIT_FALLBACK_SINGLETON: found singleton kmer={:#x} at pos={}", kmer, pos);
7351 }
7352 break;
7353 }
7354 i = j;
7355 }
7356
7357 if let Some(pos) = singleton_pos {
7358 let split_pos = pos + k;
7359 if config.verbosity > 1 {
7360 eprintln!(
7361 "SPLIT_FALLBACK_SINGLETON_SPLIT: splitting at {}",
7362 split_pos
7363 );
7364 }
7365 maybe_best = Some((split_pos, split_pos));
7366 } else if config.verbosity > 1 {
7367 eprintln!("SPLIT_FALLBACK_NO_SINGLETON: no singleton k-mers found in middle region");
7368 }
7369 } else if config.verbosity > 1 {
7370 eprintln!(
7371 "SPLIT_FALLBACK_TOO_SHORT: segment too short for singleton search"
7372 );
7373 }
7374 }
7375 }
7376 } else {
7377 return None;
7378 }
7379 } else if config.verbosity > 1 {
7380 eprintln!("FFI_DECIDE: unavailable (decide_split returned None)");
7381 }
7382 }
7383
7384 // If FFI provided best position, use it; otherwise compute costs in Rust
7385 let mut v_costs1 = if maybe_best.is_none() {
7386 if let Some(lz_left) = prepare_on_demand(left_key, "left") {
7387 #[cfg(feature = "cpp_agc")]
7388 {
7389 // Unused path when FFI returns best split; kept for completeness
7390 let ref_left = {
7391 let map_segments_locked = map_segments.read().unwrap();
7392 let ref_segments_locked = reference_segments.read().unwrap();
7393 let seg_id = map_segments_locked.get(left_key).copied().unwrap_or(0);
7394 ref_segments_locked.get(&seg_id).cloned()
7395 };
7396 if let Some(ref_data) = ref_left {
7397 if front_kmer < middle_kmer {
7398 crate::ragc_ffi::cost_vector(
7399 true,
7400 &ref_data,
7401 segment_dir,
7402 config.min_match_len as u32,
7403 )
7404 } else {
7405 let mut v = crate::ragc_ffi::cost_vector(
7406 false,
7407 &ref_data,
7408 &segment_rc_vec,
7409 config.min_match_len as u32,
7410 );
7411 v.reverse();
7412 v
7413 }
7414 } else {
7415 if config.verbosity > 1 {
7416 eprintln!("SPLIT_SKIP: left group has no reference yet");
7417 }
7418 return None;
7419 }
7420 }
7421 #[cfg(not(feature = "cpp_agc"))]
7422 {
7423 if front_kmer < middle_kmer {
7424 lz_left.get_coding_cost_vector(segment_dir, true)
7425 } else {
7426 let mut v = lz_left.get_coding_cost_vector(&segment_rc_vec, false);
7427 v.reverse();
7428 v
7429 }
7430 }
7431 } else {
7432 if config.verbosity > 1 {
7433 eprintln!("SPLIT_SKIP: left group has no reference yet");
7434 }
7435 if crate::env_cache::debug_split() {
7436 eprintln!(
7437 "RAGC_SPLIT_SKIP_LEFT: left_key=({},{}) has no reference",
7438 left_key.kmer_front, left_key.kmer_back
7439 );
7440 }
7441 return None;
7442 }
7443 } else {
7444 Vec::new()
7445 };
7446
7447 // Cumulative sum forward for v_costs1
7448 let mut sum = 0u32;
7449 for cost in v_costs1.iter_mut() {
7450 sum = sum.saturating_add(*cost);
7451 *cost = sum;
7452 }
7453
7454 let v_costs2 = if maybe_best.is_none() {
7455 if let Some(lz_right) = prepare_on_demand(right_key, "right") {
7456 #[cfg(feature = "cpp_agc")]
7457 {
7458 let ref_right = {
7459 let map_segments_locked = map_segments.read().unwrap();
7460 let ref_segments_locked = reference_segments.read().unwrap();
7461 let seg_id = map_segments_locked.get(right_key).copied().unwrap_or(0);
7462 ref_segments_locked.get(&seg_id).cloned()
7463 };
7464 if let Some(ref_data) = ref_right {
7465 let mut v = if middle_kmer < back_kmer {
7466 // Suffix placement, cumulative sum right-to-left
7467 crate::ragc_ffi::cost_vector(
7468 false,
7469 &ref_data,
7470 segment_dir,
7471 config.min_match_len as u32,
7472 )
7473 } else {
7474 // RC + prefix placement; cumulative sum left-to-right then reverse
7475 crate::ragc_ffi::cost_vector(
7476 true,
7477 &ref_data,
7478 &segment_rc_vec,
7479 config.min_match_len as u32,
7480 )
7481 };
7482 if middle_kmer < back_kmer {
7483 // Reverse cumulative sum
7484 let mut acc = 0u32;
7485 for cost in v.iter_mut().rev() {
7486 acc = acc.saturating_add(*cost);
7487 *cost = acc;
7488 }
7489 v
7490 } else {
7491 // Forward cumulative then reverse
7492 let mut acc = 0u32;
7493 for cost in v.iter_mut() {
7494 acc = acc.saturating_add(*cost);
7495 *cost = acc;
7496 }
7497 v.reverse();
7498 v
7499 }
7500 } else {
7501 if config.verbosity > 1 {
7502 eprintln!("SPLIT_SKIP: right group has no reference yet");
7503 }
7504 return None;
7505 }
7506 }
7507 #[cfg(not(feature = "cpp_agc"))]
7508 {
7509 if middle_kmer < back_kmer {
7510 let mut v = lz_right.get_coding_cost_vector(segment_dir, false);
7511 let mut acc = 0u32;
7512 for cost in v.iter_mut().rev() {
7513 acc = acc.saturating_add(*cost);
7514 *cost = acc;
7515 }
7516 v
7517 } else {
7518 let mut v = lz_right.get_coding_cost_vector(&segment_rc_vec, true);
7519 let mut acc = 0u32;
7520 for cost in v.iter_mut() {
7521 acc = acc.saturating_add(*cost);
7522 *cost = acc;
7523 }
7524 v.reverse();
7525 v
7526 }
7527 }
7528 } else {
7529 if config.verbosity > 1 {
7530 eprintln!("SPLIT_SKIP: right group has no reference yet");
7531 }
7532 return None;
7533 }
7534 } else {
7535 Vec::new()
7536 };
7537
7538 if maybe_best.is_none() && (v_costs1.is_empty() || v_costs2.is_empty()) {
7539 if config.verbosity > 1 {
7540 eprintln!("SPLIT_SKIP: cost vectors empty");
7541 }
7542 return None;
7543 }
7544
7545 if maybe_best.is_none() && v_costs1.len() != v_costs2.len() {
7546 if config.verbosity > 1 {
7547 eprintln!("SPLIT_SKIP: cost vector length mismatch");
7548 }
7549 return None;
7550 }
7551
7552 // Find position with minimum combined cost
7553 // Matches C++ AGC agc_compressor.cpp:1663-1674
7554 let mut best_pos = if let Some((p, _)) = maybe_best {
7555 p
7556 } else {
7557 let mut best_sum = u32::MAX;
7558 let mut pos = 0usize;
7559 for i in 0..v_costs1.len() {
7560 let cs = v_costs1[i].saturating_add(v_costs2[i]);
7561 if cs < best_sum {
7562 best_sum = cs;
7563 pos = i;
7564 }
7565 }
7566 pos
7567 };
7568
7569 #[cfg(feature = "verbose_debug")]
7570 if crate::env_cache::debug_split_map() && maybe_best.is_none() {
7571 let start = best_pos.saturating_sub(3);
7572 let end = (best_pos + 4).min(v_costs1.len());
7573 eprintln!(
7574 "RAGC_COST_WINDOW: len={} best_pos={}",
7575 v_costs1.len(),
7576 best_pos
7577 );
7578 for i in start..end {
7579 eprintln!(
7580 " i={} Lcum={} Rcum={} Sum={}{}",
7581 i,
7582 v_costs1[i],
7583 v_costs2[i],
7584 v_costs1[i].saturating_add(v_costs2[i]),
7585 if i == best_pos { " <--" } else { "" }
7586 );
7587 }
7588 }
7589
7590 // Apply degenerate position rules ALWAYS to prevent tiny segments at boundaries.
7591 // Even when FFI or fallback paths provide best_pos, we must enforce this constraint
7592 // to match C++ AGC behavior (agc_compressor.cpp:1685-1688).
7593 let k = config.k;
7594 let original_best_pos = best_pos; // Save for logging
7595 if best_pos < k + 1 {
7596 best_pos = 0; // Too close to start
7597 }
7598 if best_pos + k + 1 > v_costs1.len() {
7599 best_pos = v_costs1.len(); // Too close to end
7600 }
7601
7602 if config.verbosity > 1 && original_best_pos != best_pos {
7603 eprintln!(
7604 "BOUNDARY_CLAMP: original_best_pos={} clamped_to={} (len={}, k+1={}) source={}",
7605 original_best_pos,
7606 best_pos,
7607 v_costs1.len(),
7608 k + 1,
7609 if maybe_best.is_some() {
7610 "FFI/fallback"
7611 } else {
7612 "cost_calc"
7613 }
7614 );
7615 }
7616
7617 // Check if split is degenerate (C++ AGC agc_compressor.cpp:1400-1415)
7618 // C++ AGC ACCEPTS degenerate splits and assigns whole segment to one group
7619 // First compute sizes with exact best_pos; map to bytes afterward.
7620 let left_size_pre = best_pos;
7621 let right_size_pre = segment_data.len().saturating_sub(best_pos);
7622
7623 if left_size_pre == 0 {
7624 // Degenerate: whole segment matches RIGHT group
7625 // Return empty left, full segment as right (C++ AGC line 1400-1407)
7626 if config.verbosity > 1 {
7627 eprintln!("SPLIT_DEGENERATE_RIGHT: best_pos=0, assigning whole segment to RIGHT group");
7628 }
7629 return Some((Vec::new(), segment_data.to_vec(), middle_kmer));
7630 }
7631
7632 if right_size_pre == 0 {
7633 // Degenerate: whole segment matches LEFT group
7634 // Return full segment as left, empty right (C++ AGC line 1408-1415)
7635 if config.verbosity > 1 {
7636 eprintln!("SPLIT_DEGENERATE_LEFT: best_pos=len, assigning whole segment to LEFT group");
7637 }
7638 return Some((segment_data.to_vec(), Vec::new(), middle_kmer));
7639 }
7640
7641 // Non-degenerate split: use FFI seg2_start directly (it already accounts for orientation)
7642 let (left_data, right_data) = if let Some((bp, s2)) = maybe_best {
7643 if config.verbosity > 1 {
7644 eprintln!(
7645 "SPLIT_GEOM_SELECT(FFI): best_pos={} seg2_start={} should_reverse={}",
7646 bp, s2, should_reverse
7647 );
7648 }
7649 split_segment_from_start(segment_data.as_slice(), s2, config.k)
7650 } else {
7651 let half = if should_reverse {
7652 (config.k + 1) / 2
7653 } else {
7654 config.k / 2
7655 };
7656 let seg2_start = best_pos.saturating_sub(half);
7657 if config.verbosity > 1 {
7658 eprintln!(
7659 "SPLIT_GEOM_SELECT(local): best_pos={} k={} half={} seg2_start={} should_reverse={}",
7660 best_pos, config.k, half, seg2_start, should_reverse
7661 );
7662 }
7663 split_segment_from_start(segment_data.as_slice(), seg2_start, config.k)
7664 };
7665
7666 if config.verbosity > 1 {
7667 eprintln!(
7668 "SPLIT_SUCCESS: best_pos={} cost={} left_len={} right_len={}",
7669 best_pos,
7670 0u32, // best_sum not available under FFI path; placeholder
7671 left_data.len(),
7672 right_data.len()
7673 );
7674 }
7675
7676 Some((left_data, right_data, middle_kmer))
7677}
7678
#[cfg(test)]
mod tests {
    use super::*;

    /// Constructing a compressor with default config and no splitters succeeds.
    #[test]
    fn test_create_compressor() {
        let result = StreamingQueueCompressor::with_splitters(
            "/tmp/test_stream.agc",
            StreamingQueueConfig::default(),
            AHashSet::new(),
        );
        assert!(result.is_ok());
    }

    /// A freshly created compressor reports an empty, open queue with the
    /// default 2 GiB capacity.
    #[test]
    fn test_queue_stats() {
        let c = StreamingQueueCompressor::with_splitters(
            "/tmp/test_stats.agc",
            StreamingQueueConfig::default(),
            AHashSet::new(),
        )
        .unwrap();

        let stats = c.queue_stats();
        assert_eq!(stats.current_items, 0);
        assert_eq!(stats.current_size_bytes, 0);
        assert_eq!(stats.capacity_bytes, 2 * 1024 * 1024 * 1024);
        assert!(!stats.is_closed);
    }

    /// Pushing a single small contig and finalizing completes without error.
    #[test]
    fn test_push_and_finalize() {
        let quiet = StreamingQueueConfig {
            verbosity: 0, // Quiet for tests
            ..Default::default()
        };
        let mut c =
            StreamingQueueCompressor::with_splitters("/tmp/test_push.agc", quiet, AHashSet::new())
                .unwrap();

        // One kilobase of 'A' is enough to exercise the push path.
        c.push("sample1".to_string(), "chr1".to_string(), vec![b'A'; 1000])
            .unwrap();

        c.finalize().unwrap();
    }
}
7727}