// bbnorm_rs — engine.rs
1use crate::cli::{CARDINALITY_MAX_BUCKETS, Config};
2use crate::kmer::{
3    KmerKey, canonical_short_code, for_each_kmer_for_record, unfiltered_kmer_windows_for_record,
4};
5use crate::peaks::write_peaks;
6use crate::seqio::{
7    BaseSettings, QualitySettings, SeqFormat, SequenceReader, SequenceRecord, SequenceSettings,
8    SequenceWriter, create_output_with_append, detect_interleaved_input_with_gzip_threads,
9};
10use anyhow::{Context, Result, bail, ensure};
11use rayon::prelude::*;
12use rustc_hash::FxHashMap;
13use std::alloc::{Layout, alloc_zeroed};
14use std::cmp::Ordering as CmpOrdering;
15use std::collections::{BTreeMap, BinaryHeap};
16use std::fs;
17use std::io::{BufReader, BufWriter, ErrorKind, Read, Write};
18use std::path::{Path, PathBuf};
19use std::process::{Child, ChildStdin, ChildStdout, Command, Stdio};
20use std::sync::{
21    Mutex, OnceLock,
22    atomic::{AtomicU32, AtomicU64, AtomicUsize, Ordering},
23};
24use std::time::{Instant, SystemTime, UNIX_EPOCH};
25
/// Exact per-k-mer occurrence counts (fast FxHash map keyed by canonical k-mer key).
pub type CountMap = FxHashMap<KmerKey, u64>;
27
/// Result of the HyperLogLog-style distinct-k-mer estimation: the k that was
/// counted, the number of registers used, and the rounded estimate.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CardinalityEstimate {
    pub k: usize,
    pub buckets: usize,
    pub estimated_unique_kmers: u64,
}

/// Streaming HyperLogLog-style estimator of distinct k-mers. Each register
/// (one byte per bucket) stores the maximum "rank" (leading-zero count + 1)
/// observed among keys hashed into that bucket.
struct KmerCardinalityEstimator {
    k: usize,
    buckets: usize,
    seed: u64,
    registers: Vec<u8>,
}
41
impl KmerCardinalityEstimator {
    /// Builds an estimator from the CLI config. The register count is clamped
    /// to [1, CARDINALITY_MAX_BUCKETS]; the cardinality k defaults to the
    /// main k when not set explicitly.
    fn from_config(config: &Config) -> Self {
        let buckets = config.cardinality.buckets.clamp(1, CARDINALITY_MAX_BUCKETS);
        Self {
            k: config.cardinality.k.unwrap_or(config.k),
            buckets,
            seed: config.cardinality.seed,
            registers: vec![0; buckets],
        }
    }

    /// Feeds both mates of a pair into the estimator (mate 2 may be absent).
    fn observe_pair(&mut self, config: &Config, r1: &SequenceRecord, r2: Option<&SequenceRecord>) {
        self.observe_record(config, r1);
        if let Some(mate) = r2 {
            self.observe_record(config, mate);
        }
    }

    /// Feeds every k-mer of a single record into the estimator.
    fn observe_record(&mut self, config: &Config, record: &SequenceRecord) {
        for_each_kmer_for_record(record, config, |kmer| self.observe_key(&kmer));
    }

    /// HyperLogLog register update for one k-mer key.
    fn observe_key(&mut self, key: &KmerKey) {
        let raw = raw_kmer_key(key);
        // Salt by key kind so short codes and long hashes cannot collide
        // systematically in the same hash space.
        let kind_salt = match key {
            KmerKey::Short(_) => 0x9E37_79B9_7F4A_7C15,
            KmerKey::LongHash(_) => 0xD1B5_4A32_D192_ED03,
        };
        let hash = mix_seed(raw ^ self.seed ^ kind_salt);
        // Multiply-high maps the 64-bit hash uniformly onto [0, buckets).
        let bucket = (((hash as u128) * (self.buckets as u128)) >> 64) as usize;
        // Rank = leading zeros of a second, independent hash, plus one,
        // capped at 64 (a zero hash would otherwise yield 65).
        let rank_hash = mix_seed(hash ^ 0x94D0_49BB_1331_11EB);
        let rank = rank_hash.leading_zeros().saturating_add(1).min(64) as u8;
        if let Some(slot) = self.registers.get_mut(bucket) {
            *slot = (*slot).max(rank);
        }
    }

    /// Standard HyperLogLog estimate: alpha(m) * m^2 / sum(2^-register), with
    /// the linear-counting correction applied when the raw estimate is small
    /// (<= 2.5 * m) and empty registers remain.
    fn estimate(&self) -> CardinalityEstimate {
        let m = self.buckets as f64;
        let zero_count = self
            .registers
            .iter()
            .filter(|&&register| register == 0)
            .count();
        let inverse_sum: f64 = self
            .registers
            .iter()
            .map(|&register| 2f64.powi(-(i32::from(register))))
            .sum();
        let raw_estimate = hll_alpha(self.buckets) * m * m / inverse_sum.max(f64::MIN_POSITIVE);
        let corrected = if raw_estimate <= 2.5 * m && zero_count > 0 {
            // Linear counting: m * ln(m / V) where V = number of empty registers.
            m * (m / zero_count as f64).ln()
        } else {
            raw_estimate
        };
        CardinalityEstimate {
            k: self.k,
            buckets: self.buckets,
            estimated_unique_kmers: corrected.round().max(0.0) as u64,
        }
    }
}
104
/// Bias-correction constant alpha(m) for a HyperLogLog estimator with `m`
/// registers. The small register counts use the published fixed constants;
/// anything larger uses the closed-form approximation 0.7213 / (1 + 1.079/m).
fn hll_alpha(buckets: usize) -> f64 {
    if buckets == 16 {
        0.673
    } else if buckets == 32 {
        0.697
    } else if buckets == 64 {
        0.709
    } else {
        0.7213 / (1.0 + 1.079 / buckets as f64)
    }
}
113
/// Read-only interface over any k-mer count store (exact map or sketch).
trait CountLookup: Sync {
    /// Stored/estimated depth for `key`; 0 when the key was never counted.
    fn depth(&self, key: &KmerKey) -> u64;
    /// Number of distinct k-mers tracked (exact for maps, estimated for sketches).
    fn unique_kmers(&self) -> usize;
    /// Number of distinct k-mers whose depth is at least `min_depth`.
    fn unique_kmers_at_least(&self, min_depth: u64) -> usize;
}
119
120impl CountLookup for CountMap {
121    fn depth(&self, key: &KmerKey) -> u64 {
122        self.get(key).copied().unwrap_or(0)
123    }
124
125    fn unique_kmers(&self) -> usize {
126        self.len()
127    }
128
129    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
130        if min_depth <= 1 {
131            return self.len();
132        }
133        self.values().filter(|&&depth| depth >= min_depth).count()
134    }
135}
136
/// Backing store for input-side k-mer depths. `Exact` keeps a full hash map;
/// the sketch variants trade exactness for bounded memory. In
/// `PrefilteredSketch`, shallow keys are answered by the small `prefilter`
/// sketch; only keys whose prefilter depth reaches `limit` are resolved by
/// the `main` store (see the CountLookup impl below).
enum InputCounts {
    Exact(CountMap),
    Sketch(PackedCountMinSketch),
    AtomicSketch(AtomicCountMinSketch),
    AtomicPackedSketch(AtomicPackedCountMinSketch),
    PrefilteredSketch {
        prefilter: PrefilterCountMinSketch,
        limit: u64,
        main: Box<InputCounts>,
    },
}
148
149#[derive(Clone, Copy)]
150struct PrefilterGate<'a> {
151    sketch: &'a PrefilterCountMinSketch,
152    limit: u64,
153}
154
155impl<'a> PrefilterGate<'a> {
156    fn new(sketch: &'a PrefilterCountMinSketch, limit: u64) -> Self {
157        Self {
158            sketch,
159            limit: limit.min(sketch.max_count()),
160        }
161    }
162
163    fn should_count_in_main(&self, key: &KmerKey) -> bool {
164        self.sketch.depth(key) >= self.limit
165    }
166}
167
impl CountLookup for InputCounts {
    /// Depth lookup. For the prefiltered variant: if the prefilter's value is
    /// still below the promotion limit, the prefilter is authoritative (the
    /// key never entered the main table); otherwise the main table answers.
    fn depth(&self, key: &KmerKey) -> u64 {
        match self {
            Self::Exact(counts) => counts.depth(key),
            Self::Sketch(sketch) => sketch.depth(key),
            Self::AtomicSketch(sketch) => sketch.depth(key),
            Self::AtomicPackedSketch(sketch) => sketch.depth(key),
            Self::PrefilteredSketch {
                prefilter,
                limit,
                main,
            } => {
                let prefilter_depth = prefilter.depth(key);
                if prefilter_depth < *limit {
                    prefilter_depth
                } else {
                    main.depth(key)
                }
            }
        }
    }

    /// Total distinct-k-mer estimate; the prefilter sees every key, so it is
    /// the source of truth for the prefiltered variant.
    fn unique_kmers(&self) -> usize {
        match self {
            Self::Exact(counts) => counts.unique_kmers(),
            Self::Sketch(sketch) => sketch.unique_kmers(),
            Self::AtomicSketch(sketch) => sketch.unique_kmers(),
            Self::AtomicPackedSketch(sketch) => sketch.unique_kmers(),
            Self::PrefilteredSketch { prefilter, .. } => prefilter.unique_kmers(),
        }
    }

    /// Distinct k-mers with depth >= min_depth. For the prefiltered variant,
    /// thresholds below the promotion limit are answerable by the prefilter;
    /// deeper thresholds require the main table.
    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
        match self {
            Self::Exact(counts) => counts.unique_kmers_at_least(min_depth),
            Self::Sketch(sketch) => sketch.unique_kmers_at_least(min_depth),
            Self::AtomicSketch(sketch) => sketch.unique_kmers_at_least(min_depth),
            Self::AtomicPackedSketch(sketch) => sketch.unique_kmers_at_least(min_depth),
            Self::PrefilteredSketch {
                prefilter,
                limit,
                main,
            } => {
                if min_depth < *limit {
                    prefilter.unique_kmers_at_least(min_depth)
                } else {
                    main.unique_kmers_at_least(min_depth)
                }
            }
        }
    }
}
220
impl InputCounts {
    // Test-only convenience: just the optional low/high split of the estimate.
    #[cfg(test)]
    fn unique_kmer_estimate_split(&self) -> Option<UniqueKmerEstimateSplit> {
        self.unique_kmer_estimate().1
    }

    /// Total distinct-k-mer estimate, plus — for the prefiltered variant — a
    /// split into "low depth" keys (seen only by the prefilter, up to its
    /// saturation count) and "high depth" keys (promoted into the main table).
    /// The low-depth count is derived by subtracting the high-depth count
    /// from the prefilter's total.
    fn unique_kmer_estimate(&self) -> (usize, Option<UniqueKmerEstimateSplit>) {
        match self {
            Self::PrefilteredSketch {
                prefilter, main, ..
            } => {
                let low_depth_max = prefilter.max_count();
                let high_depth_min = low_depth_max.saturating_add(1);
                let total = prefilter.unique_kmers();
                let high_depth_kmers = main.unique_kmers_at_least(high_depth_min);
                (
                    total,
                    Some(UniqueKmerEstimateSplit {
                        low_depth_max,
                        low_depth_kmers: total.saturating_sub(high_depth_kmers),
                        high_depth_min,
                        high_depth_kmers,
                    }),
                )
            }
            _ => (self.unique_kmers(), None),
        }
    }

    /// Layout summaries for every sketch backing this store (empty for Exact).
    fn sketch_layouts(&self) -> Vec<SketchLayoutSummary> {
        let mut layouts = Vec::new();
        self.append_sketch_layouts(&mut layouts, "input_main");
        layouts
    }

    /// Recursively collects layout summaries; the prefiltered variant reports
    /// its prefilter (tagged with the promotion limit) before the main table.
    fn append_sketch_layouts(&self, layouts: &mut Vec<SketchLayoutSummary>, table: &'static str) {
        match self {
            Self::Exact(_) => {}
            Self::Sketch(sketch) => layouts.push(sketch.layout_summary(table, None)),
            Self::AtomicSketch(sketch) => layouts.push(sketch.layout_summary(table, None)),
            Self::AtomicPackedSketch(sketch) => layouts.push(sketch.layout_summary(table, None)),
            Self::PrefilteredSketch {
                prefilter,
                limit,
                main,
            } => {
                layouts.push(prefilter.layout_summary("input_prefilter", Some(*limit)));
                main.append_sketch_layouts(layouts, "input_main");
            }
        }
    }
}
273
/// Backing store for output-side k-mer depths; same trade-offs as
/// `InputCounts` but without the prefiltered variant.
enum OutputCounts {
    Exact(CountMap),
    Sketch(PackedCountMinSketch),
    AtomicSketch(AtomicCountMinSketch),
}
279
/// Pure delegation: each method forwards to whichever store backs the enum.
impl CountLookup for OutputCounts {
    fn depth(&self, key: &KmerKey) -> u64 {
        match self {
            Self::Exact(counts) => counts.depth(key),
            Self::Sketch(sketch) => sketch.depth(key),
            Self::AtomicSketch(sketch) => sketch.depth(key),
        }
    }

    fn unique_kmers(&self) -> usize {
        match self {
            Self::Exact(counts) => counts.unique_kmers(),
            Self::Sketch(sketch) => sketch.unique_kmers(),
            Self::AtomicSketch(sketch) => sketch.unique_kmers(),
        }
    }

    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
        match self {
            Self::Exact(counts) => counts.unique_kmers_at_least(min_depth),
            Self::Sketch(sketch) => sketch.unique_kmers_at_least(min_depth),
            Self::AtomicSketch(sketch) => sketch.unique_kmers_at_least(min_depth),
        }
    }
}
305
306impl OutputCounts {
307    #[cfg(test)]
308    #[cfg(test)]
309    #[cfg(test)]
310    fn depth_hist(&self, hist_len: usize) -> Vec<u64> {
311        match self {
312            Self::Exact(counts) => count_map_depth_hist(counts, hist_len),
313            Self::Sketch(sketch) => sketch.depth_hist(hist_len),
314            Self::AtomicSketch(sketch) => sketch.depth_hist(hist_len),
315        }
316    }
317
318    fn sparse_depth_hist(&self, hist_len: usize) -> SparseHist {
319        match self {
320            Self::Exact(counts) => count_map_sparse_depth_hist(counts, hist_len),
321            Self::Sketch(sketch) => sketch.sparse_depth_hist(hist_len),
322            Self::AtomicSketch(sketch) => sketch.sparse_depth_hist(hist_len),
323        }
324    }
325
326    fn append_sketch_layouts(&self, layouts: &mut Vec<SketchLayoutSummary>, table: &'static str) {
327        match self {
328            Self::Exact(_) => {}
329            Self::Sketch(sketch) => layouts.push(sketch.layout_summary(table, None)),
330            Self::AtomicSketch(sketch) => layouts.push(sketch.layout_summary(table, None)),
331        }
332    }
333}
334
/// Single-threaded count-min sketch whose counters are bit-packed (`bits`
/// bits per cell, saturating at `max_count`) into a `Vec<u64>` word array.
/// `tracked_slots` optionally records touched slots for faster histogramming.
#[derive(Debug, Clone)]
struct PackedCountMinSketch {
    cells: usize,
    hashes: usize,
    bits: u8,
    max_count: u64,
    layout: KCountArrayLayout,
    update_mode: CountMinUpdateMode,
    words: Vec<u64>,
    increments: u64,
    occupied_slots: usize,
    tracked_slots: Option<Vec<usize>>,
}
348
/// Count-min update policy: `Conservative` (increment only the minimal
/// cells) or `Independent` (increment every hashed cell).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CountMinUpdateMode {
    Conservative,
    Independent,
}

impl CountMinUpdateMode {
    /// Human-readable label used in layout summaries.
    fn as_str(self) -> &'static str {
        if self == Self::Conservative {
            "conservative"
        } else {
            "independent"
        }
    }
}
363
/// Thread-safe count-min sketch with one `AtomicU32` per cell and a stripe
/// of mutexes for operations that need exclusive access.
struct AtomicCountMinSketch {
    cells: usize,
    hashes: usize,
    max_count: u32,
    layout: KCountArrayLayout,
    update_mode: CountMinUpdateMode,
    parallel_replay: bool,
    cells_by_hash: Vec<AtomicU32>,
    locks: Vec<Mutex<()>>,
    increments: AtomicU64,
    occupied_slots: AtomicUsize,
}

/// Prefilter table: either the single-threaded packed sketch or its
/// atomically updated counterpart.
enum PrefilterCountMinSketch {
    Packed(PackedCountMinSketch),
    AtomicPacked(AtomicPackedCountMinSketch),
}

/// Bit-packed count-min sketch whose word array is made of `AtomicU64`s,
/// plus a lock stripe; the concurrent analogue of `PackedCountMinSketch`.
struct AtomicPackedCountMinSketch {
    cells: usize,
    hashes: usize,
    bits: u8,
    max_count: u64,
    layout: KCountArrayLayout,
    update_mode: CountMinUpdateMode,
    words: Vec<AtomicU64>,
    locks: Vec<Mutex<()>>,
    increments: AtomicU64,
    occupied_slots: AtomicUsize,
}

// Hash-mask table parameters. Naming suggests these mirror BBTools'
// KCountArray hashing scheme — confirm against the BBTools Java source.
const BBTOOLS_HASH_BITS: u32 = 6;
const BBTOOLS_HASH_ARRAY_LENGTH: usize = 1 << BBTOOLS_HASH_BITS;
const BBTOOLS_HASH_CELL_MASK: u64 = (BBTOOLS_HASH_ARRAY_LENGTH as u64) - 1;
const BBTOOLS_LONG_MAX_VALUE: u64 = i64::MAX as u64;
// 8 rows of 64 precomputed masks, interned per seed via the cache below.
type BbtoolsHashMaskTable = [[u64; BBTOOLS_HASH_ARRAY_LENGTH]; 8];
type BbtoolsHashMaskRef = &'static BbtoolsHashMaskTable;
type BbtoolsHashMaskCache = FxHashMap<u64, BbtoolsHashMaskRef>;

/// Precomputed array/mask geometry shared by all sketch implementations.
#[derive(Debug, Clone, Copy)]
struct KCountArrayLayout {
    array_mask: u64,
    array_bits: u32,
    cells_per_array: usize,
    mask_seed: u64,
    masks: BbtoolsHashMaskRef,
}
411
// Chunk/batch sizes for the parallel counting, histogram, and normalization
// pipelines.
const COUNT_PARALLEL_CHUNK_SIZE: usize = 8192;
const COUNT_CHUNK_LOCAL_MAP_MAX_CAPACITY: usize = 131_072;
// External-sort parameters for countup mode (run sizes, merge fan-in, I/O
// buffering, prepass chunking).
const COUNTUP_SORT_RUN_PAIR_LIMIT: usize = 65_536;
const COUNTUP_SORT_RUN_BYTE_LIMIT: usize = 64 * 1024 * 1024;
const COUNTUP_SORT_MERGE_FANIN: usize = 128;
const COUNTUP_RUN_IO_BUFFER_CAPACITY: usize = 1024 * 1024;
const COUNTUP_PREPASS_CHUNK_PAIR_LIMIT: usize = 1024;
const COUNTUP_PREPASS_CHUNK_BYTE_LIMIT: usize = 16 * 1024 * 1024;
const HIST_PARALLEL_CHUNK_SIZE: usize = 1024;
const NORMALIZE_PARALLEL_CHUNK_SIZE: usize = 1024;
const PAIRED_ANALYSIS_JOIN_MIN_BASES: usize = 1024;
const COVERAGE_PAR_SORT_MIN_WINDOWS: usize = 4096;
// Overlap auto-detection sampling: sample 1 of every INTERVAL pairs up to
// SAMPLE_PAIRS; enable when the overlapping fraction exceeds the threshold.
const OVERLAP_AUTO_SAMPLE_PAIRS: u64 = 1_000_000;
const ATOMIC_SKETCH_PAR_REPLAY_MIN_KEYS: usize = 16_384;
const PACKED_SKETCH_TRACKED_SLOT_LIMIT: usize = 8_000_000;
const OVERLAP_AUTO_SAMPLE_INTERVAL: u64 = 100;
const OVERLAP_AUTO_ENABLE_FRACTION: f64 = 0.25;
// Sketch sizing defaults and memory budgets (FRACTION_MICROS values are
// fractions expressed in millionths, e.g. 350_000 = 0.35).
const DEFAULT_PREFILTER_CELLS: usize = 1 << 20;
const DEFAULT_PREFILTER_BITS: u8 = 2;
const DEFAULT_PREFILTER_FRACTION_MICROS: u32 = 350_000;
const OUTPUT_COUNT_MIN_AUTO_FRACTION_MICROS: u32 = 250_000;
const OUTPUT_COUNT_MIN_AUTO_MIN_MEMORY_BYTES: usize = 64 * 1024 * 1024;
const AUTO_COUNT_MIN_FALLBACK_MEMORY_BYTES: usize = 2 * 1024 * 1024 * 1024;
const AUTO_COUNT_MIN_MAX_MEMORY_BYTES: usize = 2 * 1024 * 1024 * 1024;
const AUTO_COUNT_MIN_MIN_MEMORY_BYTES: usize = 256 * 1024 * 1024;
const BBTOOLS_MEMORY_HEADROOM_BYTES: usize = 96_000_000;
const EXPLICIT_COUNT_MIN_SAFE_MEMORY_PERCENT: usize = 85;
// KCountArray geometry limits mirroring BBTools (names suggest parity;
// confirm against the Java source).
const BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS: usize = 2;
const BBTOOLS_KCOUNT_ARRAY_SHARD_MIN_CELLS: usize = 64;
const BBTOOLS_KCOUNT_ARRAY_MAX_HASHES: usize = 8;
const BBTOOLS_KCOUNT_ARRAY_LOCKS: usize = 1999;
const BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED: u64 = 0;
const BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED: u64 = 7;
const BBTOOLS_KCOUNT_ARRAY_MASK_SEED_STEP: u64 = 7;
const BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED: u64 =
    BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED + BBTOOLS_KCOUNT_ARRAY_MASK_SEED_STEP;
const PEAK_COMPACT_ZERO_TAIL: usize = 32;
// Process-wide counter mixed into non-deterministic seeds (see
// nondeterministic_seed).
static NONDETERMINISTIC_SEED_COUNTER: AtomicU64 = AtomicU64::new(0);
type AnalysisPair = (SequenceRecord, Option<SequenceRecord>, Option<f64>);
type NormalizationInput = (usize, SequenceRecord, Option<SequenceRecord>, f64);
// Sparse histograms keyed by bin index.
type SparseHist = FxHashMap<usize, u64>;
type SparseReadDepthHist = FxHashMap<usize, (u64, u64)>;
454
/// Optional mutable sinks for input-side histograms; `None` means the caller
/// did not request that histogram.
struct InputHistSinks<'a> {
    depth: Option<&'a mut SparseHist>,
    read: Option<&'a mut SparseReadDepthHist>,
}

/// Low/high split of the distinct-k-mer estimate produced by a prefiltered
/// input store: keys with depth <= low_depth_max vs depth >= high_depth_min.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct UniqueKmerEstimateSplit {
    pub low_depth_max: u64,
    pub low_depth_kmers: usize,
    pub high_depth_min: u64,
    pub high_depth_kmers: usize,
}

/// Reportable description of one sketch's geometry and memory footprint,
/// tagged with which table it backs (e.g. "input_main", "input_prefilter").
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SketchLayoutSummary {
    pub table: &'static str,
    pub kind: &'static str,
    pub cells: usize,
    pub hashes: usize,
    pub bits: u8,
    pub arrays: usize,
    pub cells_per_array: usize,
    pub mask_seed: u64,
    pub update_mode: &'static str,
    pub max_count: u64,
    pub memory_bytes: usize,
    pub prefilter_limit: Option<u64>,
}
483
/// Wall-clock duration of one named pipeline stage, in microseconds.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct StageTiming {
    pub name: &'static str,
    pub elapsed_micros: u128,
}
489
/// Bookkeeping for countup mode's external-sort spill phase: run files
/// written in the initial and merge passes, total bytes spilled, and the
/// peak/final number of live (not-yet-deleted) bytes on disk.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct CountupSpillSummary {
    pub initial_runs: usize,
    pub merge_runs: usize,
    pub final_runs: usize,
    pub bytes_written: u64,
    pub peak_live_bytes: u64,
    pub final_live_bytes: u64,
}

impl CountupSpillSummary {
    /// True when anything was spilled to disk at all during the run.
    pub fn has_spills(&self) -> bool {
        self.bytes_written > 0 || self.initial_runs > 0 || self.merge_runs > 0
    }

    /// Records one freshly written first-pass run of `bytes` bytes.
    fn note_initial_run(&mut self, bytes: u64) {
        self.initial_runs = self.initial_runs.saturating_add(1);
        self.note_written(bytes);
    }

    /// Records one freshly written merge-pass run of `bytes` bytes.
    fn note_merge_run(&mut self, bytes: u64) {
        self.merge_runs = self.merge_runs.saturating_add(1);
        self.note_written(bytes);
    }

    /// Accounts for newly spilled bytes and refreshes the live/peak totals.
    fn note_written(&mut self, bytes: u64) {
        let live = self.final_live_bytes.saturating_add(bytes);
        self.bytes_written = self.bytes_written.saturating_add(bytes);
        self.final_live_bytes = live;
        if live > self.peak_live_bytes {
            self.peak_live_bytes = live;
        }
    }

    /// Accounts for a run file deleted after it was merged away.
    fn note_removed(&mut self, bytes: u64) {
        self.final_live_bytes = self.final_live_bytes.saturating_sub(bytes);
    }
}
525
/// Aggregate statistics for one complete engine run: read/base throughput,
/// keep/toss tallies, unique-k-mer estimates (with optional low/high split),
/// optional cardinality estimates, sketch layouts, and stage timings.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RunSummary {
    pub reads_in: u64,
    pub bases_in: u64,
    pub reads_kept: u64,
    pub reads_tossed: u64,
    pub bases_kept: u64,
    pub bases_tossed: u64,
    pub unique_kmers_in: usize,
    pub unique_kmers_in_split: Option<UniqueKmerEstimateSplit>,
    pub unique_kmers_out: Option<usize>,
    pub cardinality_in: Option<CardinalityEstimate>,
    pub cardinality_out: Option<CardinalityEstimate>,
    pub sketch_layouts: Vec<SketchLayoutSummary>,
    pub stage_timings: Vec<StageTiming>,
    pub countup_spill: CountupSpillSummary,
}
543
/// Per-read k-mer depth statistics collected during analysis: optional depth
/// summaries, low vs total k-mer counts, an error flag, and the coverage
/// values (name suggests sorted descending — confirm at the fill site).
#[derive(Debug, Clone, Default)]
struct ReadAnalysis {
    depth_al: Option<u64>,
    true_depth: Option<u64>,
    min_true_depth: Option<u64>,
    low_kmer_count: usize,
    total_kmer_count: usize,
    error: bool,
    had_kmer_windows: bool,
    coverage_desc: Vec<i64>,
}

/// Combined analysis of a read pair, with per-mate details plus pooled
/// low/total k-mer counts and per-mate error flags.
#[derive(Debug, Clone, Default)]
struct PairAnalysis {
    read1: ReadAnalysis,
    read2: Option<ReadAnalysis>,
    depth_proxy_al: Option<u64>,
    max_true_depth: Option<u64>,
    low_kmer_count: usize,
    total_kmer_count: usize,
    error1: bool,
    error2: bool,
}

/// Keep/toss verdict for one pair together with the analysis it was based on.
#[derive(Debug, Clone, Default)]
struct PairDecision {
    toss: bool,
    analysis: PairAnalysis,
}

/// Countup-mode plan: whether to toss the pair and which of its k-mer key
/// indices are eligible for counting.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct CountupDecisionPlan {
    toss: bool,
    eligible_key_indices: Vec<usize>,
}

/// A fully processed pair: original records, (possibly corrected) output
/// records, the decision taken, and read/base tallies for accounting.
#[derive(Debug, Clone)]
struct NormalizedPair {
    input_list_index: usize,
    r1: SequenceRecord,
    r2: Option<SequenceRecord>,
    out_r1: SequenceRecord,
    out_r2: Option<SequenceRecord>,
    decision: PairDecision,
    uncorrectable: bool,
    read_count: u64,
    base_count: u64,
}

/// Pair queued for countup processing together with its ordering key.
#[derive(Debug, Clone)]
struct CountupWorkPair {
    input_list_index: usize,
    sort_key: CountupSortKey,
    r1: SequenceRecord,
    r2: Option<SequenceRecord>,
}

/// Candidate pair considered for countup work, keeping its original input
/// position and a random value (`rand`) for sampling/tiebreaking.
#[derive(Debug, Clone)]
struct CountupWorkCandidate {
    input_list_index: usize,
    original_index: usize,
    rand: f64,
    r1: SequenceRecord,
    r2: Option<SequenceRecord>,
}
609
/// Everything produced by the countup work-building phase: the pair source
/// (in memory or spilled), optional input histograms plus the time spent on
/// them, the detected input formats, and spill accounting.
struct CountupWorkBuild {
    source: CountupWorkSource,
    input_hist: Option<SparseHist>,
    input_read_hist: Option<SparseReadDepthHist>,
    input_hist_elapsed_micros: u128,
    format1: SeqFormat,
    format2: Option<SeqFormat>,
    spill_summary: CountupSpillSummary,
}

/// Per-chunk result of building countup work: the pairs plus the chunk's
/// contribution to the depth and read histograms.
struct CountupChunkBuild {
    work_pairs: Vec<CountupWorkPair>,
    depth_hist: SparseHist,
    read_hist: SparseReadDepthHist,
}

/// Mutable accumulator for input histograms; the `wants_*` flags gate
/// whether each sink is actually updated.
struct CountupInputHistAccumulator<'a> {
    wants_depth_hist: bool,
    wants_read_hist: bool,
    depth_hist: &'a mut SparseHist,
    read_hist: &'a mut SparseReadDepthHist,
}

/// Ordering key for countup pairs: error count, total length, expected
/// errors, then input ids as stable tiebreakers.
#[derive(Debug, Clone)]
struct CountupSortKey {
    errors: usize,
    total_len: usize,
    expected_errors: f64,
    numeric_id: u64,
    original_index: usize,
}

/// Outcome of the countup prepass for one pair: include it or not, with the
/// analysis used for sorting when available.
struct CountupPrepassResult {
    include: bool,
    sort_analysis: Option<PairAnalysis>,
}

/// Source of countup pairs; owns the temp dir so spilled run files stay
/// alive as long as the source does.
struct CountupWorkSource {
    temp_dir: Option<tempfile::TempDir>,
    inner: CountupWorkSourceInner,
}

/// Pairs either held fully in memory or spilled to sorted run files on disk.
enum CountupWorkSourceInner {
    Memory(Vec<CountupWorkPair>),
    Spilled(Vec<PathBuf>),
}
656
/// Iterator over countup pairs; retains the temp dir (unused field) so the
/// backing spill files are not deleted mid-iteration.
struct CountupWorkIter {
    _temp_dir: Option<tempfile::TempDir>,
    inner: CountupWorkIterInner,
}

/// Either drains the in-memory vector or streams a k-way merge of spilled runs.
enum CountupWorkIterInner {
    Memory(std::vec::IntoIter<CountupWorkPair>),
    Spilled(CountupRunMerger),
}

/// K-way merge over sorted run files, driven by a heap of run heads.
struct CountupRunMerger {
    readers: Vec<CountupRunReader>,
    heap: BinaryHeap<CountupRunHead>,
}

/// Buffered reader over one spilled run file.
struct CountupRunReader {
    reader: BufReader<fs::File>,
}

/// Current front pair of one run, tagged with the run it came from so the
/// merger knows which reader to refill.
struct CountupRunHead {
    pair: CountupWorkPair,
    run_index: usize,
}

/// Outcome of error-correcting one read: how many bases were corrected or
/// merely marked, and whether the read was beyond repair.
#[derive(Debug, Clone, Copy, Default)]
struct CorrectionResult {
    corrected: usize,
    marked: usize,
    uncorrectable: bool,
}

/// Depth thresholds steering error correction (semantics of the individual
/// bounds are defined at the use sites).
#[derive(Debug, Clone, Copy)]
struct CorrectionTarget {
    low: i64,
    lower_bound: i64,
    upper_bound: i64,
    mult: i64,
}

/// Parallel lists of input files: first mates, and optionally second mates.
#[derive(Debug, Clone)]
struct InputLists {
    first: Vec<PathBuf>,
    second: Option<Vec<PathBuf>>,
}
701
/// Parallel histograms indexed by depth bin: `reads[i]` and `bases[i]`
/// accumulate the read and base totals falling into bin `i`.
#[derive(Debug, Clone, Default)]
struct ReadDepthHistogram {
    reads: Vec<u64>,
    bases: Vec<u64>,
}

impl ReadDepthHistogram {
    /// Creates a histogram with `len` zeroed bins in both tracks.
    fn new(len: usize) -> Self {
        let zeroed = vec![0u64; len];
        Self {
            reads: zeroed.clone(),
            bases: zeroed,
        }
    }
}
716
/// Tallies of the four nucleotides plus N for one position or read set.
#[derive(Debug, Clone, Copy, Default)]
struct BaseCounts {
    a: u64,
    c: u64,
    g: u64,
    t: u64,
    n: u64,
}

impl BaseCounts {
    /// Sum of all five tallies (A + C + G + T + N).
    fn total(self) -> u64 {
        [self.a, self.c, self.g, self.t, self.n].into_iter().sum()
    }
}
731
/// Per-position base composition for first and second mates.
#[derive(Debug, Clone, Default)]
struct BaseContentHistogram {
    first: Vec<BaseCounts>,
    second: Vec<BaseCounts>,
}

/// Per-position match/N tallies used by the alignment-fallback statistics.
#[derive(Debug, Clone, Copy, Default)]
struct MatchCounts {
    matches: u64,
    n: u64,
}

/// Alignment-derived histograms (per-position matches per mate, quality vs
/// match counts) plus read/base/pair totals and a paired-input flag.
#[derive(Debug, Clone, Default)]
struct AlignmentFallbackHistograms {
    first_match: Vec<MatchCounts>,
    second_match: Vec<MatchCounts>,
    quality_match: Vec<u64>,
    read_count: u64,
    base_count: u64,
    pair_count: u64,
    paired: bool,
}

/// Quality-score histograms split by mate: overall, per-mate counts and
/// averages, and per-position score tables.
#[derive(Debug, Clone, Default)]
struct QualitySideHistograms {
    overall: Vec<u64>,
    first_counts: Vec<u64>,
    second_counts: Vec<u64>,
    first_avg: Vec<u64>,
    second_avg: Vec<u64>,
    first_by_pos: Vec<Vec<u64>>,
    second_by_pos: Vec<Vec<u64>>,
    paired: bool,
}

/// Bundle of optional per-read histograms; `None` means that histogram was
/// not requested for this run.
#[derive(Debug, Clone, Default)]
struct ReadLocalSideHistograms {
    quality: Option<QualitySideHistograms>,
    length: Option<ReadDepthHistogram>,
    gc: Option<ReadDepthHistogram>,
    base: Option<BaseContentHistogram>,
    entropy: Option<Vec<u64>>,
    identity: Option<ReadDepthHistogram>,
    alignment: Option<AlignmentFallbackHistograms>,
    barcodes: Option<BTreeMap<String, u64>>,
}
778
/// PRNG whose state update matches the published xoshiro256+ `next()`
/// (output = s0 + s3). The name suggests it mirrors a Java implementation's
/// seeding/warm-up behavior — confirm against the reference it imitates.
#[derive(Debug, Clone)]
struct JavaXoshiro {
    s0: u64,
    s1: u64,
    s2: u64,
    s3: u64,
}

impl JavaXoshiro {
    /// Derives the four state words from `seed` by chained mixing, repairs
    /// the forbidden all-zero state with fixed constants, then discards four
    /// outputs to warm the generator up.
    fn new(seed: u64) -> Self {
        let mut rng = Self {
            s0: seed,
            s1: mix_seed(seed),
            s2: 0,
            s3: 0,
        };
        rng.s2 = mix_seed(rng.s1);
        rng.s3 = mix_seed(rng.s2);
        if rng.s0 == 0 && rng.s1 == 0 && rng.s2 == 0 && rng.s3 == 0 {
            // xoshiro cannot escape the all-zero state; substitute arbitrary
            // nonzero constants.
            rng.s0 = 0x5DEECE66D;
            rng.s1 = 0xB;
            rng.s2 = 0xCCA;
            rng.s3 = 0xF00;
        }
        for _ in 0..4 {
            rng.next_long();
        }
        rng
    }

    /// One xoshiro256+ step: output s0 + s3, then apply the xor/shift/rotate
    /// state transition. The statement order is part of the algorithm.
    fn next_long(&mut self) -> u64 {
        let result = self.s0.wrapping_add(self.s3);
        let t = self.s1 << 17;

        self.s2 ^= self.s0;
        self.s3 ^= self.s1;
        self.s1 ^= self.s2;
        self.s0 ^= self.s3;

        self.s2 ^= t;
        self.s3 = self.s3.rotate_left(45);

        result
    }

    /// Uniform double in [0, 1) built from the top 53 bits of one output.
    fn next_double(&mut self) -> f64 {
        ((self.next_long() >> 11) as f64) * (1.0 / ((1u64 << 53) as f64))
    }
}
828
/// Seed for a run's RNG: a fixed 0 in deterministic mode, otherwise a fresh
/// non-deterministic value.
fn run_random_seed(config: &Config) -> u64 {
    if config.deterministic {
        0
    } else {
        nondeterministic_seed()
    }
}

/// Produces a fresh seed by combining wall-clock nanoseconds, the process id,
/// and a mixed per-process counter, so repeated calls (even within one
/// nanosecond) yield distinct values.
fn nondeterministic_seed() -> u64 {
    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|duration| duration.as_nanos() as u64)
        .unwrap_or(0);
    let counter = NONDETERMINISTIC_SEED_COUNTER.fetch_add(1, Ordering::Relaxed);
    nanos ^ ((std::process::id() as u64) << 32) ^ mix_seed(counter.wrapping_add(0x9E37_79B9))
}
845
/// SplitMix64 step (Vigna): advance by the golden-gamma increment, then two
/// xor-shift-multiply rounds and a final xor-shift. Bijective on u64, so
/// distinct inputs always map to distinct outputs.
fn mix_seed(x: u64) -> u64 {
    const GOLDEN_GAMMA: u64 = 0x9E37_79B9_7F4A_7C15;
    const MIX_A: u64 = 0xBF58_476D_1CE4_E5B9;
    const MIX_B: u64 = 0x94D0_49BB_1331_11EB;
    let mut z = x.wrapping_add(GOLDEN_GAMMA);
    z = (z ^ (z >> 30)).wrapping_mul(MIX_A);
    z = (z ^ (z >> 27)).wrapping_mul(MIX_B);
    z ^ (z >> 31)
}
852
/// Paired input stream over the primary inputs: either a single (possibly
/// interleaved) reader, an explicit in1/in2 pair, or a list of files read in
/// sequence. Tracks the per-file pair limit and assigns numeric ids to
/// interleaved pairs.
struct PrimaryReaders {
    r1: SequenceReader,
    r2: Option<SequenceReader>,
    interleaved: bool,
    input_list1: Vec<PathBuf>,
    input_list2: Option<Vec<PathBuf>>,
    input_list_index: usize,
    settings: SequenceSettings,
    limit_per_file: Option<u64>,
    pairs_seen_in_file: u64,
    format1: SeqFormat,
    format2: Option<SeqFormat>,
    next_pair_numeric_id: u64,
    gzip_threads: Option<usize>,
}
868
869impl PrimaryReaders {
    /// Opens the primary input readers from the config: resolves the first
    /// (and optional second) input path, possibly from input lists, decides
    /// whether input is interleaved, and validates that paired inputs share
    /// one FASTA/FASTQ format. `limit_per_file` caps pairs read per file.
    fn open(config: &Config, limit_per_file: Option<u64>) -> Result<Self> {
        let in1 = config.in1.as_ref().context("missing in1")?;
        let sequence_settings = sequence_settings(config);
        let input_list = primary_input_lists(config);
        // With an input list, start from its first file; otherwise use in1.
        let first_path = input_list
            .as_ref()
            .and_then(|paths| paths.first.first())
            .unwrap_or(in1);
        let r2_path = input_list
            .as_ref()
            .and_then(|paths| paths.second.as_ref())
            .and_then(|paths| paths.first())
            .or(config.in2.as_ref());
        let gzip_threads = gzip_threads_for_paths(
            config.gzip_threads,
            [Some(first_path.as_path()), r2_path.map(PathBuf::as_path)],
        );
        let r1 =
            open_sequence_reader_with_gzip_threads(first_path, sequence_settings, gzip_threads)?;
        // Interleaving only applies to a single, unpaired input; it is either
        // forced by config or auto-detected when test_interleaved is set.
        let interleaved = input_list.is_none()
            && config.in2.is_none()
            && (config.interleaved
                || (config.test_interleaved
                    && detect_interleaved_input_with_gzip_threads(
                        first_path,
                        sequence_settings,
                        config.gzip_threads,
                    )?));
        let r2 = r2_path
            .map(|path| {
                open_sequence_reader_with_gzip_threads(path, sequence_settings, gzip_threads)
            })
            .transpose()?;
        if let Some(r2_ref) = &r2
            && r1.format() != r2_ref.format()
        {
            bail!("paired inputs must use the same FASTA/FASTQ format");
        }
        let format1 = r1.format();
        let format2 = if interleaved {
            // Interleaved mates share the first reader's format by definition.
            Some(format1)
        } else {
            r2.as_ref().map(SequenceReader::format)
        };
        Ok(Self {
            r1,
            r2,
            interleaved,
            input_list1: input_list
                .as_ref()
                .map(|paths| paths.first.clone())
                .unwrap_or_default(),
            input_list2: input_list.and_then(|paths| paths.second),
            input_list_index: 0,
            settings: sequence_settings,
            limit_per_file,
            pairs_seen_in_file: 0,
            format1,
            format2,
            next_pair_numeric_id: 0,
            gzip_threads: config.gzip_threads,
        })
    }
933
    /// Format of the first (or only) input stream.
    fn format1(&self) -> SeqFormat {
        self.format1
    }

    /// Format of the second stream: mirrors format1 for interleaved input,
    /// otherwise the detected format of in2 (if present).
    fn format2(&self) -> Option<SeqFormat> {
        self.format2
    }

    /// Index of the file currently being read when iterating an input list.
    fn input_list_index(&self) -> usize {
        self.input_list_index
    }
945
    /// Returns the next read pair, or `Ok(None)` at end of input. Dispatches
    /// to the list reader when an input list is configured; otherwise honors
    /// the per-file pair limit, consumes two consecutive records per pair in
    /// interleaved mode (assigning both the same numeric id), and errors if
    /// in2 outlives in1.
    fn next_pair(&mut self) -> Result<Option<(SequenceRecord, Option<SequenceRecord>)>> {
        if !self.input_list1.is_empty() {
            return self.next_list_record();
        }
        if limit_reached(self.limit_per_file, self.pairs_seen_in_file) {
            return Ok(None);
        }
        let r1 = self.r1.next_record()?;
        if self.interleaved {
            return match r1 {
                Some(mut record) => {
                    // The mate is the immediately following record in the
                    // same stream; an odd record count is a hard error.
                    let mut mate = self
                        .r1
                        .next_record()?
                        .context("interleaved input ended after an unmatched first mate record")?;
                    record.numeric_id = self.next_pair_numeric_id;
                    mate.numeric_id = self.next_pair_numeric_id;
                    self.next_pair_numeric_id += 1;
                    self.pairs_seen_in_file += 1;
                    Ok(Some((record, Some(mate))))
                }
                None => Ok(None),
            };
        }

        let r2 = match &mut self.r2 {
            Some(reader) => reader.next_record()?,
            None => None,
        };

        match (r1, r2) {
            (None, None) => Ok(None),
            (Some(record), mate) => {
                self.pairs_seen_in_file += 1;
                Ok(Some((record, mate)))
            }
            // Note: in2 running *short* is tolerated here (mate = None);
            // only a longer in2 is rejected.
            (None, Some(_)) => bail!("in2 has more records than in1"),
        }
    }
985
    /// Next pair when reading from an input list: advances to the next file
    /// when the per-file limit or end of file is reached, and — unlike
    /// `next_pair` — requires first and second lists to stay in lockstep
    /// (errors if either side runs short while the other continues).
    fn next_list_record(&mut self) -> Result<Option<(SequenceRecord, Option<SequenceRecord>)>> {
        loop {
            if limit_reached(self.limit_per_file, self.pairs_seen_in_file) {
                if !self.advance_list_reader()? {
                    return Ok(None);
                }
                continue;
            }
            // Remember whether a second reader existed before pulling records,
            // so a (Some, None) result can be classified correctly below.
            let had_r2 = self.r2.is_some();
            let r1 = self.r1.next_record()?;
            let r2 = match &mut self.r2 {
                Some(reader) => reader.next_record()?,
                None => None,
            };
            match (r1, r2) {
                (Some(record), Some(mate)) => {
                    self.pairs_seen_in_file += 1;
                    return Ok(Some((record, Some(mate))));
                }
                (Some(record), None) if !had_r2 => {
                    self.pairs_seen_in_file += 1;
                    return Ok(Some((record, None)));
                }
                (Some(_), None) => bail!("in2 has fewer records than in1"),
                (None, Some(_)) => bail!("in2 has more records than in1"),
                (None, None) => {
                    // Both files exhausted together: move to the next list entry.
                    if !self.advance_list_reader()? {
                        return Ok(None);
                    }
                }
            }
        }
    }
1019
    /// Moves to the next comma-separated input-list entry, opening fresh
    /// readers and resetting the per-file pair counter. Returns `Ok(false)`
    /// when the list is exhausted.
    fn advance_list_reader(&mut self) -> Result<bool> {
        if self.input_list_index + 1 >= self.input_list1.len() {
            return Ok(false);
        }
        self.input_list_index += 1;
        let path = &self.input_list1[self.input_list_index];
        let second_path = self
            .input_list2
            .as_ref()
            .and_then(|paths| paths.get(self.input_list_index));
        // Split the gzip-thread budget across both new input streams.
        let gzip_threads = gzip_threads_for_paths(
            self.gzip_threads,
            [Some(path.as_path()), second_path.map(PathBuf::as_path)],
        );
        let reader =
            SequenceReader::from_path_with_gzip_threads(path, self.settings, gzip_threads)?;
        // Every list entry must keep the format detected for the first entry.
        if reader.format() != self.format1 {
            bail!("comma-separated input list entries must use the same FASTA/FASTQ format");
        }
        self.r2 = self
            .input_list2
            .as_ref()
            .and_then(|paths| paths.get(self.input_list_index))
            .map(|path| {
                SequenceReader::from_path_with_gzip_threads(path, self.settings, gzip_threads)
            })
            .transpose()?;
        if let Some(r2_ref) = &self.r2
            && Some(r2_ref.format()) != self.format2
        {
            bail!("comma-separated paired input list entries must use the same FASTA/FASTQ format");
        }
        self.r1 = reader;
        self.pairs_seen_in_file = 0;
        Ok(true)
    }
1056}
1057
/// Opens a single sequence reader using the config-wide gzip-thread setting.
fn open_sequence_reader(
    config: &Config,
    path: &Path,
    settings: SequenceSettings,
) -> Result<SequenceReader> {
    SequenceReader::from_path_with_gzip_threads(path, settings, config.gzip_threads)
}
1065
/// Opens a single sequence reader with an explicit per-stream gzip-thread
/// budget (used when the budget has already been split across streams).
fn open_sequence_reader_with_gzip_threads(
    path: &Path,
    settings: SequenceSettings,
    gzip_threads: Option<usize>,
) -> Result<SequenceReader> {
    SequenceReader::from_path_with_gzip_threads(path, settings, gzip_threads)
}
1073
1074fn open_paired_sequence_readers(
1075    config: &Config,
1076    path1: &Path,
1077    path2: &Path,
1078    settings: SequenceSettings,
1079) -> Result<(SequenceReader, SequenceReader)> {
1080    let gzip_threads = gzip_threads_for_paths(config.gzip_threads, [Some(path1), Some(path2)]);
1081    let reader1 = open_sequence_reader_with_gzip_threads(path1, settings, gzip_threads)?;
1082    let reader2 = open_sequence_reader_with_gzip_threads(path2, settings, gzip_threads)?;
1083    Ok((reader1, reader2))
1084}
1085
1086fn gzip_threads_for_paths<'a>(
1087    gzip_threads: Option<usize>,
1088    paths: impl IntoIterator<Item = Option<&'a Path>>,
1089) -> Option<usize> {
1090    let gzip_streams = paths
1091        .into_iter()
1092        .flatten()
1093        .filter(|path| path_uses_gzip(path))
1094        .count();
1095    gzip_threads_for_streams(gzip_threads, gzip_streams)
1096}
1097
/// Divides a gzip-thread budget across `gzip_streams` concurrent streams,
/// guaranteeing each stream at least one thread. A budget of `None` (unset)
/// stays `None`; a budget of 0 or 1, or a single stream, is passed through.
fn gzip_threads_for_streams(gzip_threads: Option<usize>, gzip_streams: usize) -> Option<usize> {
    let threads = gzip_threads?;
    let per_stream = if threads > 1 && gzip_streams > 1 {
        (threads / gzip_streams).max(1)
    } else {
        threads
    };
    Some(per_stream)
}
1107
/// True when the path's final extension is `gz` (case-insensitive).
fn path_uses_gzip(path: &Path) -> bool {
    let extension = path.extension().and_then(|ext| ext.to_str());
    matches!(extension, Some(ext) if ext.eq_ignore_ascii_case("gz"))
}
1113
/// Writer set for all optional output streams; each writer is `None` when the
/// corresponding output path was not configured.
struct OptionalWriters {
    // True for paired input. When set and a stream has no `*2` writer, the
    // mate record is interleaved into the `*1` writer instead.
    interleaved_output: bool,
    // Which comma-separated input-list entry the fan-out writers are open for.
    current_output_list_index: usize,
    // Path plans (possibly fanned out per input-list entry), one per stream.
    keep_plan: OutputPathPlan,
    toss_plan: OutputPathPlan,
    low_plan: OutputPathPlan,
    mid_plan: OutputPathPlan,
    high_plan: OutputPathPlan,
    uncorrected_plan: OutputPathPlan,
    // Open writers; the `1`/`2` suffix is the read-1/read-2 side of each stream.
    keep1: Option<SequenceWriter>,
    keep2: Option<SequenceWriter>,
    toss1: Option<SequenceWriter>,
    toss2: Option<SequenceWriter>,
    low1: Option<SequenceWriter>,
    low2: Option<SequenceWriter>,
    mid1: Option<SequenceWriter>,
    mid2: Option<SequenceWriter>,
    high1: Option<SequenceWriter>,
    high2: Option<SequenceWriter>,
    uncorrected1: Option<SequenceWriter>,
    uncorrected2: Option<SequenceWriter>,
}
1136
impl OptionalWriters {
    /// Opens every configured output stream for input-list entry 0.
    ///
    /// `format2` doubles as the "input is paired" signal: `Some` means mate
    /// records will exist. Configuring any `*2` output without paired input is
    /// rejected up front, since no mate records could ever be written to it.
    fn open(config: &Config, _format1: SeqFormat, format2: Option<SeqFormat>) -> Result<Self> {
        if format2.is_none() && has_second_output(config) {
            bail!(
                "second-output paths require paired input; interleaved auto-detection did not detect paired records"
            );
        }
        let paired = format2.is_some();
        // Number of comma-separated primary inputs; output plans may fan out
        // one destination per input entry.
        let input_list_len = primary_input_lists(config)
            .map(|paths| paths.first.len())
            .unwrap_or(1);
        let keep_plan = prepare_output_path_plan(
            config.out1.as_deref(),
            config.out2.as_deref(),
            paired,
            input_list_len,
        )?;
        let toss_plan = prepare_output_path_plan(
            config.out_toss1.as_deref(),
            config.out_toss2.as_deref(),
            paired,
            input_list_len,
        )?;
        let low_plan = prepare_output_path_plan(
            config.out_low1.as_deref(),
            config.out_low2.as_deref(),
            paired,
            input_list_len,
        )?;
        let mid_plan = prepare_output_path_plan(
            config.out_mid1.as_deref(),
            config.out_mid2.as_deref(),
            paired,
            input_list_len,
        )?;
        let high_plan = prepare_output_path_plan(
            config.out_high1.as_deref(),
            config.out_high2.as_deref(),
            paired,
            input_list_len,
        )?;
        let uncorrected_plan = prepare_output_path_plan(
            config.out_uncorrected1.as_deref(),
            config.out_uncorrected2.as_deref(),
            paired,
            input_list_len,
        )?;
        // Share the gzip-thread budget across every gzip output of entry 0.
        let output_gzip_threads = output_gzip_threads_for_plans(
            config.gzip_threads,
            [
                &keep_plan,
                &toss_plan,
                &low_plan,
                &mid_plan,
                &high_plan,
                &uncorrected_plan,
            ],
            0,
        )?;
        let (keep1, keep2) = open_output_pair(
            keep_plan.pair_for_index(0)?,
            config.overwrite,
            config.append,
            config.quality_out_offset,
            config.fake_quality,
            config.fasta_wrap,
            output_gzip_threads,
        )?;
        let (toss1, toss2) = open_output_pair(
            toss_plan.pair_for_index(0)?,
            config.overwrite,
            config.append,
            config.quality_out_offset,
            config.fake_quality,
            config.fasta_wrap,
            output_gzip_threads,
        )?;
        let (low1, low2) = open_output_pair(
            low_plan.pair_for_index(0)?,
            config.overwrite,
            config.append,
            config.quality_out_offset,
            config.fake_quality,
            config.fasta_wrap,
            output_gzip_threads,
        )?;
        let (mid1, mid2) = open_output_pair(
            mid_plan.pair_for_index(0)?,
            config.overwrite,
            config.append,
            config.quality_out_offset,
            config.fake_quality,
            config.fasta_wrap,
            output_gzip_threads,
        )?;
        let (high1, high2) = open_output_pair(
            high_plan.pair_for_index(0)?,
            config.overwrite,
            config.append,
            config.quality_out_offset,
            config.fake_quality,
            config.fasta_wrap,
            output_gzip_threads,
        )?;
        let (uncorrected1, uncorrected2) = open_output_pair(
            uncorrected_plan.pair_for_index(0)?,
            config.overwrite,
            config.append,
            config.quality_out_offset,
            config.fake_quality,
            config.fasta_wrap,
            output_gzip_threads,
        )?;
        Ok(Self {
            interleaved_output: paired,
            current_output_list_index: 0,
            keep_plan,
            toss_plan,
            low_plan,
            mid_plan,
            high_plan,
            uncorrected_plan,
            keep1,
            keep2,
            toss1,
            toss2,
            low1,
            low2,
            mid1,
            mid2,
            high1,
            high2,
            uncorrected1,
            uncorrected2,
        })
    }

    /// Switches fan-out writers to the paths for input-list entry `index`,
    /// flushing all current writers first. No-op when already on `index`.
    /// Plans without fan-out keep writing to their single destination.
    fn sync_to_input_list_index(&mut self, config: &Config, index: usize) -> Result<()> {
        if self.current_output_list_index == index {
            return Ok(());
        }
        self.flush()?;
        // Recompute the gzip budget for the new entry's set of gzip outputs.
        let output_gzip_threads = output_gzip_threads_for_plans(
            config.gzip_threads,
            [
                &self.keep_plan,
                &self.toss_plan,
                &self.low_plan,
                &self.mid_plan,
                &self.high_plan,
                &self.uncorrected_plan,
            ],
            index,
        )?;
        reopen_output_pair_if_fanout(
            &self.keep_plan,
            index,
            &mut self.keep1,
            &mut self.keep2,
            config,
            output_gzip_threads,
        )?;
        reopen_output_pair_if_fanout(
            &self.toss_plan,
            index,
            &mut self.toss1,
            &mut self.toss2,
            config,
            output_gzip_threads,
        )?;
        reopen_output_pair_if_fanout(
            &self.low_plan,
            index,
            &mut self.low1,
            &mut self.low2,
            config,
            output_gzip_threads,
        )?;
        reopen_output_pair_if_fanout(
            &self.mid_plan,
            index,
            &mut self.mid1,
            &mut self.mid2,
            config,
            output_gzip_threads,
        )?;
        reopen_output_pair_if_fanout(
            &self.high_plan,
            index,
            &mut self.high1,
            &mut self.high2,
            config,
            output_gzip_threads,
        )?;
        reopen_output_pair_if_fanout(
            &self.uncorrected_plan,
            index,
            &mut self.uncorrected1,
            &mut self.uncorrected2,
            config,
            output_gzip_threads,
        )?;
        self.current_output_list_index = index;
        Ok(())
    }

    /// Routes a pair to either the toss or the keep stream.
    fn write_pair(
        &mut self,
        toss: bool,
        r1: &SequenceRecord,
        r2: Option<&SequenceRecord>,
    ) -> Result<()> {
        if toss {
            write_to_optional_pair(
                &mut self.toss1,
                &mut self.toss2,
                self.interleaved_output,
                r1,
                r2,
            )?;
        } else {
            write_to_optional_pair(
                &mut self.keep1,
                &mut self.keep2,
                self.interleaved_output,
                r1,
                r2,
            )?;
        }
        Ok(())
    }

    /// Classifies a pair by per-read depth and writes it to the low/mid/high
    /// binned outputs. A missing mate depth is encoded as -1 (see `bin_depth`),
    /// so unpaired reads with `d1 < low_bin_depth` land in the Low bin.
    ///
    /// NOTE(review): the High branch compares `d1 > high_bin_depth` but
    /// `d2 >= high_bin_depth` — the asymmetry looks unintentional; confirm
    /// against the reference implementation before changing it.
    fn write_depth_bin(
        &mut self,
        config: &Config,
        analysis: &PairAnalysis,
        r1: &SequenceRecord,
        r2: Option<&SequenceRecord>,
    ) -> Result<()> {
        let d1 = bin_depth(analysis.read1.depth_al);
        let d2 = analysis
            .read2
            .as_ref()
            .map(|read| bin_depth(read.depth_al))
            .unwrap_or(-1);
        let target = if d1 < config.low_bin_depth && d2 < config.low_bin_depth {
            DepthBin::Low
        } else if (d1 < config.low_bin_depth || d1 > config.high_bin_depth)
            && (d2 < config.low_bin_depth || d2 >= config.high_bin_depth)
        {
            DepthBin::High
        } else {
            DepthBin::Mid
        };

        match target {
            DepthBin::Low => write_to_optional_pair(
                &mut self.low1,
                &mut self.low2,
                self.interleaved_output,
                r1,
                r2,
            )?,
            DepthBin::Mid => write_to_optional_pair(
                &mut self.mid1,
                &mut self.mid2,
                self.interleaved_output,
                r1,
                r2,
            )?,
            DepthBin::High => write_to_optional_pair(
                &mut self.high1,
                &mut self.high2,
                self.interleaved_output,
                r1,
                r2,
            )?,
        }
        Ok(())
    }

    /// Writes a pair to the uncorrected-output stream.
    fn write_uncorrected(
        &mut self,
        r1: &SequenceRecord,
        r2: Option<&SequenceRecord>,
    ) -> Result<()> {
        write_to_optional_pair(
            &mut self.uncorrected1,
            &mut self.uncorrected2,
            self.interleaved_output,
            r1,
            r2,
        )
    }

    /// Flushes every open writer.
    fn flush(&mut self) -> Result<()> {
        for writer in [
            self.keep1.as_mut(),
            self.keep2.as_mut(),
            self.toss1.as_mut(),
            self.toss2.as_mut(),
            self.low1.as_mut(),
            self.low2.as_mut(),
            self.mid1.as_mut(),
            self.mid2.as_mut(),
            self.high1.as_mut(),
            self.high2.as_mut(),
            self.uncorrected1.as_mut(),
            self.uncorrected2.as_mut(),
        ]
        .into_iter()
        .flatten()
        {
            writer.flush()?;
        }
        Ok(())
    }
}
1455
/// Depth classification used by the low/mid/high binned outputs.
#[derive(Debug, Clone, Copy)]
enum DepthBin {
    Low,
    Mid,
    High,
}
1462
1463fn write_to_optional_pair(
1464    writer1: &mut Option<SequenceWriter>,
1465    writer2: &mut Option<SequenceWriter>,
1466    interleaved_output: bool,
1467    r1: &SequenceRecord,
1468    r2: Option<&SequenceRecord>,
1469) -> Result<()> {
1470    if let Some(writer) = writer1.as_mut() {
1471        writer.write_record(r1)?;
1472        if interleaved_output && writer2.is_none() {
1473            if let Some(mate) = r2 {
1474                writer.write_record(mate)?;
1475            }
1476            return Ok(());
1477        }
1478    }
1479    if let (Some(writer), Some(mate)) = (writer2.as_mut(), r2) {
1480        writer.write_record(mate)?;
1481    }
1482    Ok(())
1483}
1484
1485fn has_second_output(config: &Config) -> bool {
1486    config.out2.is_some()
1487        || config.out_toss2.is_some()
1488        || config.out_low2.is_some()
1489        || config.out_mid2.is_some()
1490        || config.out_high2.is_some()
1491        || config.out_uncorrected2.is_some()
1492}
1493
1494fn depth_bin_outputs_enabled(config: &Config) -> bool {
1495    config.out_low1.is_some()
1496        || config.out_low2.is_some()
1497        || config.out_mid1.is_some()
1498        || config.out_mid2.is_some()
1499        || config.out_high1.is_some()
1500        || config.out_high2.is_some()
1501}
1502
1503fn needs_output_pair_analysis(config: &Config) -> bool {
1504    config.rename_reads || depth_bin_outputs_enabled(config)
1505}
1506
/// Destination paths for one output stream: read-1 side and optional
/// read-2 side; either may be absent.
#[derive(Debug, Clone)]
struct OutputPathPair {
    first: Option<PathBuf>,
    second: Option<PathBuf>,
}
1512
/// Output destinations for a stream. With `fanout` set, `pairs` holds one
/// entry per comma-separated input; otherwise a single shared entry.
#[derive(Debug, Clone)]
struct OutputPathPlan {
    pairs: Vec<OutputPathPair>,
    fanout: bool,
}
1518
1519impl OutputPathPlan {
1520    fn pair_for_index(&self, index: usize) -> Result<&OutputPathPair> {
1521        if self.fanout {
1522            self.pairs
1523                .get(index)
1524                .with_context(|| format!("missing output path list entry for input {}", index + 1))
1525        } else {
1526            self.pairs.first().context("missing output path plan entry")
1527        }
1528    }
1529}
1530
1531fn prepare_output_paths(
1532    first: Option<&Path>,
1533    second: Option<&Path>,
1534    paired: bool,
1535) -> OutputPathPair {
1536    let second = match second {
1537        Some(path) => Some(path.to_path_buf()),
1538        None if paired => first.and_then(|path| replace_hash_in_path(path, "2")),
1539        None => None,
1540    };
1541    let first =
1542        first.map(|path| replace_hash_in_path(path, "1").unwrap_or_else(|| path.to_path_buf()));
1543    OutputPathPair { first, second }
1544}
1545
/// Expands the configured output path(s) into a per-input plan.
///
/// Three cases, in priority order:
/// 1. Multiple inputs and `first` is a multi-entry list: fan out one pair per
///    entry, pairing with the matching `second` entry or a `#`-derived mate.
/// 2. Multiple inputs and only `second` is a multi-entry list: no fan-out;
///    keep a single pair using the first `second` entry.
/// 3. Otherwise: a single pair via `prepare_output_paths`.
fn prepare_output_path_plan(
    first: Option<&Path>,
    second: Option<&Path>,
    paired: bool,
    input_list_len: usize,
) -> Result<OutputPathPlan> {
    if input_list_len > 1
        && let Some(first_values) = output_path_values(first)
        && first_values.len() > 1
    {
        let second_values = output_path_values(second);
        // Fan out only as far as both lists reach.
        let fanout_len = second_values
            .as_ref()
            .map(|values| first_values.len().min(values.len()))
            .unwrap_or(first_values.len());
        let mut pairs = Vec::with_capacity(fanout_len);
        for index in 0..fanout_len {
            let mut first_path = first_values[index].clone();
            let second_path = if let Some(values) = &second_values {
                Some(values[index].clone())
            } else if paired {
                // Derive the mate from `#` substitution; only rewrite the
                // primary path to "1" when a "2" mate was actually derived.
                if let Some(second_path) = replace_hash_in_path(&first_path, "2") {
                    first_path = replace_hash_in_path(&first_path, "1").unwrap_or(first_path);
                    Some(second_path)
                } else {
                    None
                }
            } else {
                None
            };
            pairs.push(OutputPathPair {
                first: Some(first_path),
                second: second_path,
            });
        }
        return Ok(OutputPathPlan {
            pairs,
            fanout: true,
        });
    }

    if input_list_len > 1
        && let Some(second_values) = output_path_values(second)
        && second_values.len() > 1
    {
        let first_path =
            first.map(|path| replace_hash_in_path(path, "1").unwrap_or_else(|| path.to_path_buf()));
        return Ok(OutputPathPlan {
            pairs: vec![OutputPathPair {
                first: first_path,
                second: Some(second_values[0].clone()),
            }],
            fanout: false,
        });
    }

    Ok(OutputPathPlan {
        pairs: vec![prepare_output_paths(first, second, paired)],
        fanout: false,
    })
}
1607
1608fn output_path_values(path: Option<&Path>) -> Option<Vec<PathBuf>> {
1609    let path = path?;
1610    if path.exists() {
1611        return Some(vec![path.to_path_buf()]);
1612    }
1613    let text = path.to_string_lossy();
1614    if text.contains(',') {
1615        let paths = split_path_list(&text);
1616        if paths.len() > 1 {
1617            return Some(paths);
1618        }
1619    }
1620    Some(vec![path.to_path_buf()])
1621}
1622
1623fn reopen_output_pair_if_fanout(
1624    plan: &OutputPathPlan,
1625    index: usize,
1626    first: &mut Option<SequenceWriter>,
1627    second: &mut Option<SequenceWriter>,
1628    config: &Config,
1629    gzip_threads: Option<usize>,
1630) -> Result<()> {
1631    if !plan.fanout {
1632        return Ok(());
1633    }
1634    *first = None;
1635    *second = None;
1636    let (new_first, new_second) = open_output_pair(
1637        plan.pair_for_index(index)?,
1638        config.overwrite,
1639        config.append,
1640        config.quality_out_offset,
1641        config.fake_quality,
1642        config.fasta_wrap,
1643        gzip_threads,
1644    )?;
1645    *first = new_first;
1646    *second = new_second;
1647    Ok(())
1648}
1649
1650fn output_gzip_threads_for_plans<'a>(
1651    gzip_threads: Option<usize>,
1652    plans: impl IntoIterator<Item = &'a OutputPathPlan>,
1653    index: usize,
1654) -> Result<Option<usize>> {
1655    let mut gzip_streams = 0usize;
1656    for plan in plans {
1657        gzip_streams =
1658            gzip_streams.saturating_add(output_pair_gzip_streams(plan.pair_for_index(index)?));
1659    }
1660    Ok(gzip_threads_for_streams(gzip_threads, gzip_streams))
1661}
1662
1663fn output_pair_gzip_streams(pair: &OutputPathPair) -> usize {
1664    [pair.first.as_deref(), pair.second.as_deref()]
1665        .into_iter()
1666        .flatten()
1667        .filter(|path| path_uses_gzip(path))
1668        .count()
1669}
1670
1671fn open_output_pair(
1672    pair: &OutputPathPair,
1673    overwrite: bool,
1674    append: bool,
1675    quality_out_offset: u8,
1676    fake_quality: u8,
1677    fasta_wrap: usize,
1678    gzip_threads: Option<usize>,
1679) -> Result<(Option<SequenceWriter>, Option<SequenceWriter>)> {
1680    let first = open_sequence_writer(
1681        pair.first.as_deref(),
1682        overwrite,
1683        append,
1684        quality_out_offset,
1685        fake_quality,
1686        fasta_wrap,
1687        gzip_threads,
1688    )?;
1689    let second = open_sequence_writer(
1690        pair.second.as_deref(),
1691        overwrite,
1692        append,
1693        quality_out_offset,
1694        fake_quality,
1695        fasta_wrap,
1696        gzip_threads,
1697    )?;
1698    Ok((first, second))
1699}
1700
/// Replaces the first `#` in the path with `replacement` (BBTools-style
/// paired-file templating); returns `None` when the path has no `#`.
fn replace_hash_in_path(path: &Path, replacement: &str) -> Option<PathBuf> {
    let text = path.to_string_lossy();
    if !text.contains('#') {
        return None;
    }
    Some(PathBuf::from(text.replacen('#', replacement, 1)))
}
1709
/// Converts an optional depth to a signed bin value; missing depth or a value
/// that does not fit in i64 maps to -1.
fn bin_depth(depth: Option<u64>) -> i64 {
    match depth {
        Some(value) => i64::try_from(value).unwrap_or(-1),
        None => -1,
    }
}
1715
/// Top-level entry point: resolves the `overlap_error_correct_auto` flag to a
/// concrete setting, then dispatches to the multipass or single-pass driver.
pub fn run(config: &Config) -> Result<RunSummary> {
    // Keep the resolved clone alive next to the borrowed original so `config`
    // is uniformly a `&Config` below.
    let resolved_config;
    let config = if config.overlap_error_correct_auto {
        resolved_config = resolve_overlap_error_correct_auto(config)?;
        &resolved_config
    } else {
        config
    };
    if config.passes > 1 {
        return run_multipass(config);
    }
    run_single_pass(config)
}
1729
1730fn resolve_overlap_error_correct_auto(config: &Config) -> Result<Config> {
1731    let mut resolved = config.clone();
1732    resolved.overlap_error_correct_auto = false;
1733    resolved.overlap_error_correct = sampled_overlap_fraction(config)?
1734        .is_some_and(|fraction| fraction > OVERLAP_AUTO_ENABLE_FRACTION);
1735    Ok(resolved)
1736}
1737
/// Samples every `OVERLAP_AUTO_SAMPLE_INTERVAL`-th pair from the primary input
/// and returns the fraction of sampled pairs with a detectable overlap.
/// Returns `Ok(None)` for unpaired input or when nothing was sampled.
/// (The `Some(OVERLAP_AUTO_SAMPLE_PAIRS)` limit is presumably a per-file pair
/// cap on the reader — confirm against `PrimaryReaders::open`.)
fn sampled_overlap_fraction(config: &Config) -> Result<Option<f64>> {
    let mut readers = PrimaryReaders::open(config, Some(OVERLAP_AUTO_SAMPLE_PAIRS))?;
    let mut sampled = 0u64;
    let mut seen = 0u64;
    let mut mergeable = 0u64;
    while let Some((r1, r2)) = readers.next_pair()? {
        // Unpaired input: overlap sampling is meaningless.
        let Some(r2) = r2 else {
            return Ok(None);
        };
        seen += 1;
        // Subsample: only test every OVERLAP_AUTO_SAMPLE_INTERVAL-th pair.
        if !seen.is_multiple_of(OVERLAP_AUTO_SAMPLE_INTERVAL) {
            continue;
        }
        sampled += 1;
        if best_pair_overlap(&r1, &r2).is_some() {
            mergeable += 1;
        }
    }
    if sampled == 0 {
        Ok(None)
    } else {
        Ok(Some(mergeable as f64 / sampled as f64))
    }
}
1762
1763fn run_multipass(config: &Config) -> Result<RunSummary> {
1764    let mut multipass_config = config.clone();
1765    apply_bbtools_multipass_cell_bits_cap(&mut multipass_config);
1766    let config = &multipass_config;
1767    let temp_dir = managed_temp_dir(config, "bbnorm-rs-multipass-")?;
1768    let paired = config.in2.is_some() || config.interleaved;
1769    let separate_pair_outputs = paired && config.out2.is_some();
1770    let temp_ext = temp_sequence_extension(config);
1771    let mut last_in1 = config.in1.clone().context("missing in1")?;
1772    let mut last_in2 = config.in2.clone();
1773    let mut last_interleaved = config.interleaved;
1774
1775    for pass in 1..config.passes {
1776        let temp1 = temp_dir.path().join(format!("pass{pass}.r1.{temp_ext}"));
1777        let temp2 = separate_pair_outputs
1778            .then(|| temp_dir.path().join(format!("pass{pass}.r2.{temp_ext}")));
1779        let mut pass_config = pass_config_for_intermediate(
1780            config,
1781            pass,
1782            &last_in1,
1783            last_in2.as_deref(),
1784            last_interleaved,
1785            temp1.clone(),
1786            temp2.clone(),
1787            None,
1788            None,
1789        );
1790        run_single_pass(&pass_config)
1791            .with_context(|| format!("running Rust multipass intermediate pass {pass}"))?;
1792
1793        last_in1 = temp1;
1794        last_in2 = temp2;
1795        last_interleaved = paired && last_in2.is_none();
1796        pass_config.notes.clear();
1797    }
1798
1799    let mut final_config = config.clone();
1800    final_config.in1 = Some(last_in1);
1801    final_config.in2 = last_in2;
1802    final_config.interleaved = last_interleaved;
1803    final_config.test_interleaved = !last_interleaved && final_config.in2.is_none();
1804    final_config.extra.clear();
1805    final_config.hist_in = None;
1806    final_config.rhist_in = None;
1807    final_config.peaks_in = None;
1808    final_config.match_hist_out = None;
1809    final_config.insert_hist_out = None;
1810    final_config.quality_accuracy_hist_out = None;
1811    final_config.indel_hist_out = None;
1812    final_config.error_hist_out = None;
1813    final_config.quality_hist_out = None;
1814    final_config.base_quality_hist_out = None;
1815    final_config.quality_count_hist_out = None;
1816    final_config.average_quality_hist_out = None;
1817    final_config.overall_base_quality_hist_out = None;
1818    final_config.length_hist_out = None;
1819    final_config.gc_hist_out = None;
1820    final_config.base_hist_out = None;
1821    final_config.entropy_hist_out = None;
1822    final_config.identity_hist_out = None;
1823    final_config.target_bad_percent_low = 1.0;
1824    final_config.target_bad_percent_high = 1.0;
1825    final_config.error_correct = config.error_correct_final;
1826    final_config.overlap_error_correct = config.overlap_error_correct && config.error_correct_final;
1827    final_config.passes = 1;
1828    let final_toss1 = config.out_toss1.as_ref().map(|_| {
1829        temp_dir
1830            .path()
1831            .join(format!("pass{}.final.toss1.{temp_ext}", config.passes))
1832    });
1833    let final_toss2 = config.out_toss2.as_ref().map(|_| {
1834        temp_dir
1835            .path()
1836            .join(format!("pass{}.final.toss2.{temp_ext}", config.passes))
1837    });
1838    final_config.out_toss1 = final_toss1.clone();
1839    final_config.out_toss2 = final_toss2.clone();
1840
1841    let summary = run_single_pass(&final_config).context("running Rust multipass final pass")?;
1842
1843    if let Some(path) = final_toss1
1844        && let Some(output) = config.out_toss1.as_deref()
1845    {
1846        write_multipass_fragments(
1847            &[path],
1848            output,
1849            config.overwrite,
1850            config.append,
1851            "multipass toss output",
1852        )?;
1853    }
1854    if let Some(path) = final_toss2
1855        && let Some(output) = config.out_toss2.as_deref()
1856    {
1857        write_multipass_fragments(
1858            &[path],
1859            output,
1860            config.overwrite,
1861            config.append,
1862            "multipass paired toss output",
1863        )?;
1864    }
1865    Ok(summary)
1866}
1867
1868fn apply_bbtools_multipass_cell_bits_cap(config: &mut Config) {
1869    if config.passes > 1 && config.count_min.bits.unwrap_or(32) > 16 {
1870        config.count_min.bits = Some(16);
1871    }
1872}
1873
1874fn managed_temp_dir(config: &Config, prefix: &str) -> Result<tempfile::TempDir> {
1875    let mut builder = tempfile::Builder::new();
1876    builder.prefix(prefix);
1877    if config.use_temp_dir
1878        && let Some(dir) = config.temp_dir.as_deref()
1879    {
1880        fs::create_dir_all(dir)
1881            .with_context(|| format!("creating temporary directory parent {}", dir.display()))?;
1882        return builder
1883            .tempdir_in(dir)
1884            .with_context(|| format!("creating managed temporary directory in {}", dir.display()));
1885    }
1886    builder
1887        .tempdir()
1888        .context("creating managed temporary directory")
1889}
1890
1891fn write_multipass_fragments(
1892    fragments: &[PathBuf],
1893    output: &Path,
1894    overwrite: bool,
1895    append: bool,
1896    label: &str,
1897) -> Result<()> {
1898    let mut writer = create_output_with_append(output, overwrite, append)
1899        .with_context(|| format!("opening {label} {}", output.display()))?;
1900    for fragment in fragments {
1901        if fragment.exists() {
1902            let mut input = std::fs::File::open(fragment)
1903                .with_context(|| format!("opening multipass fragment {}", fragment.display()))?;
1904            std::io::copy(&mut input, &mut writer)
1905                .with_context(|| format!("copying multipass fragment {}", fragment.display()))?;
1906        }
1907    }
1908    writer
1909        .flush()
1910        .with_context(|| format!("flushing {label} {}", output.display()))?;
1911    Ok(())
1912}
1913
1914#[allow(clippy::too_many_arguments)]
1915fn pass_config_for_intermediate(
1916    config: &Config,
1917    pass: usize,
1918    in1: &Path,
1919    in2: Option<&Path>,
1920    interleaved: bool,
1921    out1: PathBuf,
1922    out2: Option<PathBuf>,
1923    out_toss1: Option<PathBuf>,
1924    out_toss2: Option<PathBuf>,
1925) -> Config {
1926    let mut pass_config = config.clone();
1927    let target = intermediate_target_depth(config, pass);
1928    let (target_bad_low, target_bad_high) = intermediate_bad_depth_targets(config, pass, target);
1929    pass_config.in1 = Some(in1.to_path_buf());
1930    pass_config.in2 = in2.map(Path::to_path_buf);
1931    pass_config.interleaved = interleaved;
1932    pass_config.test_interleaved = !interleaved && pass_config.in2.is_none();
1933    pass_config.extra = if pass == 1 {
1934        config.extra.clone()
1935    } else {
1936        Vec::new()
1937    };
1938    pass_config.out1 = Some(out1);
1939    pass_config.out2 = out2;
1940    pass_config.out_toss1 = out_toss1;
1941    pass_config.out_toss2 = out_toss2;
1942    pass_config.out_low1 = None;
1943    pass_config.out_low2 = None;
1944    pass_config.out_mid1 = None;
1945    pass_config.out_mid2 = None;
1946    pass_config.out_high1 = None;
1947    pass_config.out_high2 = None;
1948    pass_config.out_uncorrected1 = None;
1949    pass_config.out_uncorrected2 = None;
1950    pass_config.hist_in = (pass == 1).then(|| config.hist_in.clone()).flatten();
1951    pass_config.rhist_in = (pass == 1).then(|| config.rhist_in.clone()).flatten();
1952    pass_config.peaks_in = (pass == 1).then(|| config.peaks_in.clone()).flatten();
1953    pass_config.match_hist_out = (pass == 1).then(|| config.match_hist_out.clone()).flatten();
1954    pass_config.insert_hist_out = (pass == 1)
1955        .then(|| config.insert_hist_out.clone())
1956        .flatten();
1957    pass_config.quality_accuracy_hist_out = (pass == 1)
1958        .then(|| config.quality_accuracy_hist_out.clone())
1959        .flatten();
1960    pass_config.indel_hist_out = (pass == 1).then(|| config.indel_hist_out.clone()).flatten();
1961    pass_config.error_hist_out = (pass == 1).then(|| config.error_hist_out.clone()).flatten();
1962    pass_config.quality_hist_out = (pass == 1)
1963        .then(|| config.quality_hist_out.clone())
1964        .flatten();
1965    pass_config.base_quality_hist_out = (pass == 1)
1966        .then(|| config.base_quality_hist_out.clone())
1967        .flatten();
1968    pass_config.quality_count_hist_out = (pass == 1)
1969        .then(|| config.quality_count_hist_out.clone())
1970        .flatten();
1971    pass_config.average_quality_hist_out = (pass == 1)
1972        .then(|| config.average_quality_hist_out.clone())
1973        .flatten();
1974    pass_config.overall_base_quality_hist_out = (pass == 1)
1975        .then(|| config.overall_base_quality_hist_out.clone())
1976        .flatten();
1977    pass_config.length_hist_out = (pass == 1)
1978        .then(|| config.length_hist_out.clone())
1979        .flatten();
1980    pass_config.gc_hist_out = (pass == 1).then(|| config.gc_hist_out.clone()).flatten();
1981    pass_config.base_hist_out = (pass == 1).then(|| config.base_hist_out.clone()).flatten();
1982    pass_config.entropy_hist_out = (pass == 1)
1983        .then(|| config.entropy_hist_out.clone())
1984        .flatten();
1985    pass_config.identity_hist_out = (pass == 1)
1986        .then(|| config.identity_hist_out.clone())
1987        .flatten();
1988    pass_config.hist_out = None;
1989    pass_config.rhist_out = None;
1990    pass_config.peaks_out = None;
1991    if let Some(bits) = config.count_min_bits_first {
1992        pass_config.count_min.bits = Some(bits);
1993    }
1994    pass_config.target_depth = target;
1995    pass_config.target_bad_percent_low = target_bad_low as f64 / target as f64;
1996    pass_config.target_bad_percent_high = target_bad_high as f64 / target as f64;
1997    pass_config.max_depth = Some(target + target / 4);
1998    pass_config.min_depth =
1999        config
2000            .min_depth
2001            .min(if config.passes > 2 && pass < config.passes - 1 {
2002                2
2003            } else {
2004                3
2005            });
2006    pass_config.min_kmers_over_min_depth = if config.passes > 2 && pass < config.passes - 1 {
2007        config.min_kmers_over_min_depth.min(5)
2008    } else {
2009        config.min_kmers_over_min_depth
2010    };
2011    pass_config.depth_percentile = (config.depth_percentile.max(0.4) * 1.2).min(0.8);
2012    pass_config.toss_error_reads = if config.passes > 2 && pass < config.passes - 1 {
2013        false
2014    } else {
2015        config.toss_error_reads_first
2016    };
2017    pass_config.discard_bad_only = if config.passes > 2 && pass < config.passes - 1 {
2018        true
2019    } else {
2020        config.discard_bad_only_first
2021    };
2022    pass_config.low_percentile = if config.passes > 2 && pass < config.passes - 1 {
2023        0.0
2024    } else {
2025        config.low_percentile
2026    };
2027    pass_config.error_detect_ratio = if config.passes > 2 && pass < config.passes - 1 {
2028        if config.error_detect_ratio > 100 {
2029            100 + (config.error_detect_ratio - 100) / 2
2030        } else {
2031            config.error_detect_ratio
2032        }
2033    } else {
2034        config.error_detect_ratio
2035    };
2036    pass_config.fix_spikes = false;
2037    pass_config.count_up = false;
2038    pass_config.error_correct = config.error_correct_first;
2039    pass_config.overlap_error_correct = config.overlap_error_correct && config.error_correct_first;
2040    pass_config.rename_reads = false;
2041    pass_config.overwrite = true;
2042    pass_config.append = false;
2043    pass_config.passes = 1;
2044    pass_config.notes.clear();
2045    pass_config
2046}
2047
2048fn intermediate_target_depth(config: &Config, pass: usize) -> u64 {
2049    if config.passes > 2 && pass == config.passes - 1 {
2050        config
2051            .target_depth_first
2052            .unwrap_or_else(|| config.target_depth.saturating_mul(2))
2053    } else if config.passes > 2 {
2054        config
2055            .target_depth_first
2056            .map(|target| target.saturating_mul(2))
2057            .unwrap_or_else(|| config.target_depth.saturating_mul(4))
2058    } else {
2059        config
2060            .target_depth_first
2061            .unwrap_or_else(|| config.target_depth.saturating_mul(4))
2062    }
2063}
2064
2065fn intermediate_bad_depth_targets(config: &Config, pass: usize, target: u64) -> (u64, u64) {
2066    let early_multiplier = if config.passes > 2 && pass < config.passes - 1 {
2067        1.5
2068    } else {
2069        1.0
2070    };
2071    let target_f = config.target_depth as f64;
2072    let low = (target_f * config.target_bad_percent_low * early_multiplier)
2073        .ceil()
2074        .max(1.0) as u64;
2075    let high = (target_f * config.target_bad_percent_high * early_multiplier)
2076        .ceil()
2077        .max(1.0) as u64;
2078    let low = low.min(target);
2079    let high = high.min(target).max(low);
2080    (low, high)
2081}
2082
2083fn temp_sequence_extension(config: &Config) -> &'static str {
2084    for path in [
2085        config.out1.as_ref(),
2086        config.in1.as_ref(),
2087        config.out2.as_ref(),
2088        config.in2.as_ref(),
2089    ]
2090    .into_iter()
2091    .flatten()
2092    {
2093        let text = path.to_string_lossy().to_ascii_lowercase();
2094        if text.ends_with(".fa")
2095            || text.ends_with(".fasta")
2096            || text.ends_with(".fna")
2097            || text.ends_with(".fa.gz")
2098            || text.ends_with(".fasta.gz")
2099            || text.ends_with(".fna.gz")
2100        {
2101            return "fa";
2102        }
2103    }
2104    "fq"
2105}
2106
2107fn cardinality_kmer_config(config: &Config) -> Config {
2108    let mut cardinality_config = config.clone();
2109    if let Some(k) = config.cardinality.k {
2110        cardinality_config.k = k;
2111    }
2112    if config.cardinality.min_probability > 0.0 {
2113        cardinality_config.min_prob = config.cardinality.min_probability;
2114    }
2115    cardinality_config
2116}
2117
2118fn estimate_primary_cardinality(
2119    config: &Config,
2120    cardinality_config: &Config,
2121) -> Result<CardinalityEstimate> {
2122    let mut estimator = KmerCardinalityEstimator::from_config(config);
2123    let mut readers = PrimaryReaders::open(config, config.table_reads)?;
2124    let mut chunk = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);
2125
2126    while let Some((r1, r2)) = readers.next_pair()? {
2127        chunk.push((r1, r2));
2128        if chunk.len() >= HIST_PARALLEL_CHUNK_SIZE {
2129            observe_cardinality_chunk(&mut estimator, cardinality_config, &chunk);
2130            chunk.clear();
2131        }
2132    }
2133    if !chunk.is_empty() {
2134        observe_cardinality_chunk(&mut estimator, cardinality_config, &chunk);
2135    }
2136    Ok(estimator.estimate())
2137}
2138
2139fn observe_cardinality_chunk(
2140    estimator: &mut KmerCardinalityEstimator,
2141    config: &Config,
2142    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
2143) {
2144    for (r1, r2) in pairs {
2145        estimator.observe_pair(config, r1, r2.as_ref());
2146    }
2147}
2148
/// Runs one complete normalization pass: builds the input k-mer table,
/// optionally estimates input/output cardinality, writes the requested
/// input-side histograms/peaks, normalizes the primary reads, then writes
/// the output-side histograms/peaks. Per-stage wall times accumulate into
/// the returned summary's `stage_timings`.
fn run_single_pass(config: &Config) -> Result<RunSummary> {
    // Count-up mode is an entirely separate pipeline.
    if config.count_up {
        return run_countup(config);
    }
    let mut stage_timings = Vec::new();
    let cardinality_config = cardinality_kmer_config(config);
    let random_seed = run_random_seed(config);
    let input_counts = build_input_counts_with_stage_timings(config, &mut stage_timings)?;
    let input_cardinality = if config.cardinality.input {
        let started = Instant::now();
        let estimate = estimate_primary_cardinality(config, &cardinality_config)?;
        record_stage_timing(&mut stage_timings, "input_cardinality", started);
        Some(estimate)
    } else {
        None
    };

    // Input-side histograms can be "fused" into the normalize pass (the
    // sinks passed to normalize_primary below) so the input is not read an
    // extra time. Fusing is skipped when trim_after_marking is set —
    // NOTE(review): presumably because trimming would alter the depths
    // seen during normalization; confirm against normalize_primary.
    let wants_input_hist = config.hist_in.is_some() || config.peaks_in.is_some();
    let wants_input_rhist = config.rhist_in.is_some();
    let fuse_input_hist_with_normalize =
        (wants_input_hist || wants_input_rhist) && !config.trim_after_marking;
    let mut input_rhist_written_with_hist = false;
    let started = Instant::now();
    let mut fused_input_hist = fuse_input_hist_with_normalize.then(SparseHist::default);
    let mut fused_input_read_hist =
        fuse_input_hist_with_normalize.then(SparseReadDepthHist::default);
    if fuse_input_hist_with_normalize {
        // Fused: histograms are written after normalize_primary fills the
        // sinks; mark the rhist as covered so the standalone pass below is
        // skipped.
        input_rhist_written_with_hist = wants_input_rhist;
    } else if wants_input_hist && wants_input_rhist {
        // Both depth and read histograms wanted: collect them in a single
        // combined scan of the input.
        let (hist, read_hist) =
            collect_primary_sparse_hist_and_read_hist(config, &input_counts, None, random_seed)?;
        if let Some(path) = &config.hist_in {
            write_sparse_depth_hist(path, &hist, config.hist_len, config)?;
        }
        if let Some(path) = &config.peaks_in {
            let dense_hist = sparse_hist_to_peak_dense(&hist, config.hist_len);
            write_peaks(path, &dense_hist, config)?;
        }
        if let Some(path) = &config.rhist_in {
            write_sparse_read_depth_hist(path, &read_hist, config.hist_len, config)?;
            input_rhist_written_with_hist = true;
        }
    } else if wants_input_hist {
        // Depth histogram (and/or peaks) only.
        let hist = collect_primary_sparse_hist(config, &input_counts, None, random_seed)?;
        if let Some(path) = &config.hist_in {
            write_sparse_depth_hist(path, &hist, config.hist_len, config)?;
        }
        if let Some(path) = &config.peaks_in {
            let dense_hist = sparse_hist_to_peak_dense(&hist, config.hist_len);
            write_peaks(path, &dense_hist, config)?;
        }
    }
    record_stage_timing(&mut stage_timings, "input_hist", started);

    if input_rhist_written_with_hist {
        // Already covered above (or deferred to the fused sinks); record a
        // zero-duration stage so the timing list keeps a stable shape.
        record_stage_timing(&mut stage_timings, "input_rhist", Instant::now());
    } else if let Some(path) = &config.rhist_in {
        // Read-depth histogram requested on its own: dedicated scan.
        let started = Instant::now();
        let hist = collect_primary_sparse_read_hist(config, &input_counts, None, random_seed)?;
        write_sparse_read_depth_hist(path, &hist, config.hist_len, config)?;
        record_stage_timing(&mut stage_timings, "input_rhist", started);
    }

    let started = Instant::now();
    emit_read_local_side_outputs(config)?;
    record_stage_timing(&mut stage_timings, "side_outputs", started);

    // Output-side k-mer counts are only tracked when some output histogram
    // or peaks file will need them.
    let started = Instant::now();
    let mut output_counts =
        if config.hist_out.is_some() || config.rhist_out.is_some() || config.peaks_out.is_some() {
            Some(new_output_counts(config)?)
        } else {
            None
        };
    let mut output_cardinality = config
        .cardinality
        .output
        .then(|| KmerCardinalityEstimator::from_config(config));
    record_stage_timing(&mut stage_timings, "output_count_init", started);

    // The main normalization pass; fused input-histogram sinks (if any)
    // are filled as a side effect.
    let started = Instant::now();
    let mut summary = normalize_primary(
        config,
        &input_counts,
        output_counts.as_mut(),
        output_cardinality.as_mut(),
        &cardinality_config,
        random_seed,
        InputHistSinks {
            depth: fused_input_hist.as_mut(),
            read: fused_input_read_hist.as_mut(),
        },
    )?;
    record_stage_timing(&mut stage_timings, "normalize", started);

    // Write the fused input histograms now that normalization filled them.
    if let Some(hist) = fused_input_hist.as_ref() {
        if let Some(path) = &config.hist_in {
            write_sparse_depth_hist(path, hist, config.hist_len, config)?;
        }
        if let Some(path) = &config.peaks_in {
            let dense_hist = sparse_hist_to_peak_dense(hist, config.hist_len);
            write_peaks(path, &dense_hist, config)?;
        }
    }
    if let (Some(path), Some(read_hist)) = (&config.rhist_in, fused_input_read_hist.as_ref()) {
        write_sparse_read_depth_hist(path, read_hist, config.hist_len, config)?;
    }

    // Fill in the summary's k-mer/cardinality bookkeeping.
    let started = Instant::now();
    (summary.unique_kmers_in, summary.unique_kmers_in_split) = input_counts.unique_kmer_estimate();
    summary.cardinality_in = input_cardinality;
    summary.cardinality_out = output_cardinality
        .as_ref()
        .map(KmerCardinalityEstimator::estimate);
    summary.sketch_layouts = input_counts.sketch_layouts();
    // Adjustments must happen before unique_kmers() is read below.
    if let Some(counts) = output_counts.as_mut() {
        apply_output_count_adjustments(config, counts);
    }
    summary.unique_kmers_out = output_counts.as_ref().map(CountLookup::unique_kmers);
    if let Some(counts) = output_counts.as_ref() {
        counts.append_sketch_layouts(&mut summary.sketch_layouts, "output_kept");
    }
    record_stage_timing(&mut stage_timings, "summary_counts", started);

    // Output-side histograms mirror the input-side logic above, but read
    // from the kept-output counts (with the input counts as reference).
    let wants_output_hist = config.hist_out.is_some() || config.peaks_out.is_some();
    let wants_output_rhist = config.rhist_out.is_some();
    let mut output_rhist_written_with_hist = false;
    let started = Instant::now();
    if let Some(counts) = &output_counts {
        if wants_output_hist && wants_output_rhist {
            // Single combined scan when both histogram kinds are wanted.
            let (hist, read_hist) = collect_primary_sparse_hist_and_read_hist(
                config,
                counts,
                Some(&input_counts),
                random_seed,
            )?;
            if let Some(path) = &config.hist_out {
                write_sparse_depth_hist(path, &hist, config.hist_len, config)?;
            }
            if let Some(path) = &config.peaks_out {
                let dense_hist = sparse_hist_to_peak_dense(&hist, config.hist_len);
                write_peaks(path, &dense_hist, config)?;
            }
            if let Some(path) = &config.rhist_out {
                write_sparse_read_depth_hist(path, &read_hist, config.hist_len, config)?;
                output_rhist_written_with_hist = true;
            }
        } else if wants_output_hist {
            let hist =
                collect_primary_sparse_hist(config, counts, Some(&input_counts), random_seed)?;
            if let Some(path) = &config.hist_out {
                write_sparse_depth_hist(path, &hist, config.hist_len, config)?;
            }
            if let Some(path) = &config.peaks_out {
                let dense_hist = sparse_hist_to_peak_dense(&hist, config.hist_len);
                write_peaks(path, &dense_hist, config)?;
            }
        }
    }
    record_stage_timing(&mut stage_timings, "output_hist", started);

    if output_rhist_written_with_hist {
        // Zero-duration placeholder; the rhist was emitted above.
        record_stage_timing(&mut stage_timings, "output_rhist", Instant::now());
    } else if let (Some(path), Some(counts)) = (&config.rhist_out, &output_counts) {
        let started = Instant::now();
        let hist =
            collect_primary_sparse_read_hist(config, counts, Some(&input_counts), random_seed)?;
        write_sparse_read_depth_hist(path, &hist, config.hist_len, config)?;
        record_stage_timing(&mut stage_timings, "output_rhist", started);
    }

    summary.stage_timings = stage_timings;
    Ok(summary)
}
2323
/// Runs the "count up" pipeline: instead of down-sampling from full input
/// counts, it builds a presorted work source of read pairs (possibly
/// spilled to disk), then greedily keeps pairs whose k-mers are still
/// under-represented in the kept-counts table, applying optional error
/// correction, trimming, renaming, and depth-binned outputs along the way.
fn run_countup(config: &Config) -> Result<RunSummary> {
    let mut stage_timings = Vec::new();
    let cardinality_config = cardinality_kmer_config(config);
    let input_counts = build_input_counts_with_stage_timings(config, &mut stage_timings)?;
    let input_cardinality = if config.cardinality.input {
        let started = Instant::now();
        let estimate = estimate_primary_cardinality(config, &cardinality_config)?;
        record_stage_timing(&mut stage_timings, "input_cardinality", started);
        Some(estimate)
    } else {
        None
    };

    let wants_input_hist = config.hist_in.is_some() || config.peaks_in.is_some();
    let wants_input_rhist = config.rhist_in.is_some();

    let started = Instant::now();
    emit_read_local_side_outputs(config)?;
    record_stage_timing(&mut stage_timings, "side_outputs", started);

    // Build the work source; input histograms are accumulated during the
    // same scan, and their time is reported separately from the work-source
    // build proper.
    let random_seed = run_random_seed(config);
    let started = Instant::now();
    let work_build = collect_countup_work_source(
        config,
        &input_counts,
        random_seed,
        wants_input_hist,
        wants_input_rhist,
    )?;
    let countup_work_elapsed = started
        .elapsed()
        .as_micros()
        .saturating_sub(work_build.input_hist_elapsed_micros);
    record_stage_timing_micros(
        &mut stage_timings,
        "input_hist",
        work_build.input_hist_elapsed_micros,
    );
    if let (Some(path), Some(hist)) = (&config.hist_in, &work_build.input_hist) {
        write_sparse_depth_hist(path, hist, config.hist_len, config)?;
    }
    if let (Some(path), Some(hist)) = (&config.peaks_in, &work_build.input_hist) {
        let dense_hist = sparse_hist_to_peak_dense(hist, config.hist_len);
        write_peaks(path, &dense_hist, config)?;
    }
    if let (Some(path), Some(hist)) = (&config.rhist_in, &work_build.input_read_hist) {
        write_sparse_read_depth_hist(path, hist, config.hist_len, config)?;
    }
    // The rhist was folded into "input_hist" above; record a zero entry to
    // keep the timing list's shape consistent with other pipelines.
    record_stage_timing_micros(&mut stage_timings, "input_rhist", 0);
    record_stage_timing_micros(
        &mut stage_timings,
        "countup_work_source",
        countup_work_elapsed,
    );
    let format1 = work_build.format1;
    let format2 = work_build.format2;
    let countup_spill = work_build.spill_summary;
    let mut work_pairs = work_build.source.into_iter()?;
    let mut writers = OptionalWriters::open(config, format1, format2)?;
    let mut summary = RunSummary {
        cardinality_in: input_cardinality,
        countup_spill,
        ..RunSummary::default()
    };
    // kept_counts tracks k-mers of reads kept so far; keep/toss decisions
    // compare it against adjusted_target (95% of the configured target,
    // minimum 1).
    let mut kept_counts = new_output_counts(config)?;
    let mut output_cardinality = config
        .cardinality
        .output
        .then(|| KmerCardinalityEstimator::from_config(config));
    let adjusted_target = ((config.target_depth as f64) * 0.95).round().max(1.0) as u64;
    let started = Instant::now();

    // Main keep/toss loop over the presorted work pairs.
    while let Some(CountupWorkPair {
        input_list_index,
        mut r1,
        mut r2,
        ..
    }) = work_pairs.next_pair()?
    {
        // Pairs may arrive out of input-file order; keep writers aligned
        // with the pair's originating input file.
        writers.sync_to_input_list_index(config, input_list_index)?;
        let keys = unique_pair_kmers(config, &r1, r2.as_ref());
        let mut decision_plan =
            countup_decision_plan(config, &input_counts, &kept_counts, &keys, adjusted_target);
        // Length-based tossing overrides a keep decision.
        if countup_length_toss(config, &r1, r2.as_ref()) {
            decision_plan.toss = true;
        }
        update_countup_kept_counts_for_plan(config, &mut kept_counts, &keys, &decision_plan);

        // Analysis is only computed when some output (renaming, depth
        // bins) will consume it.
        let output_analysis = needs_output_pair_analysis(config)
            .then(|| analyze_pair(config, &input_counts, &r1, r2.as_ref()));
        let mut correction = CorrectionResult::default();
        // Tossed pairs are never corrected.
        if config.error_correct && !decision_plan.toss {
            correction =
                correct_pair_errors_with_rollback(config, &input_counts, &mut r1, r2.as_mut());
        }
        if config.trim_after_marking && config.error_correct {
            trim_pair(config, &mut r1, r2.as_mut());
        }
        let (out_r1, out_r2) = match output_analysis.as_ref() {
            Some(analysis) => maybe_rename_pair(config, &r1, r2.as_ref(), analysis),
            None => (r1.clone(), r2.clone()),
        };
        let read_count = 1 + u64::from(r2.is_some());
        let base_count = r1.len() as u64 + r2.as_ref().map(|r| r.len() as u64).unwrap_or(0);

        summary.reads_in += read_count;
        summary.bases_in += base_count;
        if decision_plan.toss {
            summary.reads_tossed += read_count;
            summary.bases_tossed += base_count;
        } else {
            summary.reads_kept += read_count;
            summary.bases_kept += base_count;
            // Output cardinality observes only kept (post-correction) reads.
            if let Some(estimator) = output_cardinality.as_mut() {
                estimator.observe_pair(&cardinality_config, &r1, r2.as_ref());
            }
        }
        writers.write_pair(decision_plan.toss, &out_r1, out_r2.as_ref())?;
        if correction.uncorrectable {
            writers.write_uncorrected(&r1, r2.as_ref())?;
        }
        if let Some(analysis) = output_analysis.as_ref()
            && depth_bin_outputs_enabled(config)
        {
            writers.write_depth_bin(config, analysis, &out_r1, out_r2.as_ref())?;
        }
    }
    writers.flush()?;
    record_stage_timing(&mut stage_timings, "countup_normalize", started);

    // Adjust kept counts only when some output histogram will read them.
    let started = Instant::now();
    if config.hist_out.is_some() || config.peaks_out.is_some() || config.rhist_out.is_some() {
        apply_output_count_adjustments(config, &mut kept_counts);
    }
    record_stage_timing(&mut stage_timings, "output_count_adjust", started);

    let started = Instant::now();
    let output_hist = if config.hist_out.is_some() || config.peaks_out.is_some() {
        Some(kept_counts.sparse_depth_hist(config.hist_len))
    } else {
        None
    };
    if let (Some(path), Some(hist)) = (&config.hist_out, &output_hist) {
        write_sparse_depth_hist(path, hist, config.hist_len, config)?;
    }
    if let (Some(path), Some(hist)) = (&config.peaks_out, &output_hist) {
        let dense_hist = sparse_hist_to_peak_dense(hist, config.hist_len);
        write_peaks(path, &dense_hist, config)?;
    }
    record_stage_timing(&mut stage_timings, "output_hist", started);

    if let Some(path) = &config.rhist_out {
        let started = Instant::now();
        // NOTE(review): seed 0 here, unlike the seeded collections above —
        // presumably intentional for the output pass; confirm.
        let hist = collect_primary_sparse_read_hist(config, &kept_counts, Some(&input_counts), 0)?;
        write_sparse_read_depth_hist(path, &hist, config.hist_len, config)?;
        record_stage_timing(&mut stage_timings, "output_rhist", started);
    }

    // Final summary bookkeeping.
    let started = Instant::now();
    (summary.unique_kmers_in, summary.unique_kmers_in_split) = input_counts.unique_kmer_estimate();
    summary.unique_kmers_out = Some(kept_counts.unique_kmers());
    summary.cardinality_out = output_cardinality
        .as_ref()
        .map(KmerCardinalityEstimator::estimate);
    summary.sketch_layouts = input_counts.sketch_layouts();
    kept_counts.append_sketch_layouts(&mut summary.sketch_layouts, "countup_kept");
    record_stage_timing(&mut stage_timings, "summary_counts", started);
    summary.stage_timings = stage_timings;
    Ok(summary)
}
2494
2495fn record_stage_timing(timings: &mut Vec<StageTiming>, name: &'static str, started: Instant) {
2496    timings.push(StageTiming {
2497        name,
2498        elapsed_micros: started.elapsed().as_micros(),
2499    });
2500}
2501
2502fn record_stage_timing_micros(
2503    timings: &mut Vec<StageTiming>,
2504    name: &'static str,
2505    elapsed_micros: u128,
2506) {
2507    timings.push(StageTiming {
2508        name,
2509        elapsed_micros,
2510    });
2511}
2512
/// Scans the primary input (capped at `max_reads` pairs) and builds the
/// count-up work source: candidate pairs are processed in memory-bounded
/// chunks, input histograms are accumulated along the way when requested,
/// and accumulated work pairs are spilled to sorted temp-file runs when
/// they exceed the in-memory budget. Returns the work source (in-memory
/// sorted vector, or merged spill runs) plus the histograms, detected
/// formats, and spill statistics.
fn collect_countup_work_source(
    config: &Config,
    input_counts: &dyn CountLookup,
    random_seed: u64,
    wants_input_hist: bool,
    wants_input_rhist: bool,
) -> Result<CountupWorkBuild> {
    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
    let format1 = readers.format1();
    let format2 = readers.format2();
    let presort_config = countup_prepass_config(config);
    // Seeded RNG: each candidate gets a deterministic random key used in
    // the work-pair ordering.
    let mut rng = JavaXoshiro::new(random_seed);
    let mut work_pairs = Vec::new();
    let mut work_pair_bytes = 0usize;
    let mut run_paths = Vec::new();
    // Temp dir is created lazily, only if a spill actually happens.
    let mut temp_dir = None;
    let mut spill_summary = CountupSpillSummary::default();
    let mut input_hist = wants_input_hist.then(SparseHist::default);
    let mut input_read_hist = wants_input_rhist.then(SparseReadDepthHist::default);
    // Histogram merge time is tracked separately so the caller can report
    // it apart from the work-source build itself.
    let mut input_hist_elapsed_micros = 0u128;
    let mut candidates = Vec::with_capacity(COUNTUP_PREPASS_CHUNK_PAIR_LIMIT);
    let mut candidate_bytes = 0usize;
    let mut original_index = 0usize;
    while let Some((r1, r2)) = readers.next_pair()? {
        let candidate = CountupWorkCandidate {
            input_list_index: readers.input_list_index(),
            original_index,
            rand: rng.next_double(),
            r1,
            r2,
        };
        candidate_bytes =
            candidate_bytes.saturating_add(countup_work_candidate_memory_hint(&candidate));
        candidates.push(candidate);
        // Flush the chunk when either the pair-count or byte budget is hit.
        if countup_prepass_chunk_ready(candidates.len(), candidate_bytes) {
            let chunk = std::mem::take(&mut candidates);
            // Parallel per-chunk processing: converts candidates to work
            // pairs and (optionally) accumulates per-chunk histograms.
            let chunk_build = process_countup_work_candidate_chunk(
                config,
                &presort_config,
                input_counts,
                wants_input_hist,
                wants_input_rhist,
                chunk,
            );
            let hist_started = Instant::now();
            if let Some(input_hist) = input_hist.as_mut() {
                merge_sparse_hist(input_hist, chunk_build.depth_hist);
            }
            if let Some(input_read_hist) = input_read_hist.as_mut() {
                merge_sparse_read_depth_hist(input_read_hist, chunk_build.read_hist);
            }
            input_hist_elapsed_micros =
                input_hist_elapsed_micros.saturating_add(hist_started.elapsed().as_micros());
            // Accumulate work pairs, spilling to a sorted run on disk when
            // the in-memory budget is exceeded.
            append_countup_work_pairs(
                config,
                &mut temp_dir,
                &mut run_paths,
                &mut spill_summary,
                &mut work_pairs,
                &mut work_pair_bytes,
                chunk_build.work_pairs,
            )?;
            candidates = Vec::with_capacity(COUNTUP_PREPASS_CHUNK_PAIR_LIMIT);
            candidate_bytes = 0;
        }
        original_index += 1;
    }
    // Process the final partial chunk the same way.
    if !candidates.is_empty() {
        let chunk_build = process_countup_work_candidate_chunk(
            config,
            &presort_config,
            input_counts,
            wants_input_hist,
            wants_input_rhist,
            candidates,
        );
        let hist_started = Instant::now();
        if let Some(input_hist) = input_hist.as_mut() {
            merge_sparse_hist(input_hist, chunk_build.depth_hist);
        }
        if let Some(input_read_hist) = input_read_hist.as_mut() {
            merge_sparse_read_depth_hist(input_read_hist, chunk_build.read_hist);
        }
        input_hist_elapsed_micros =
            input_hist_elapsed_micros.saturating_add(hist_started.elapsed().as_micros());
        append_countup_work_pairs(
            config,
            &mut temp_dir,
            &mut run_paths,
            &mut spill_summary,
            &mut work_pairs,
            &mut work_pair_bytes,
            chunk_build.work_pairs,
        )?;
    }
    let source = if run_paths.is_empty() {
        // Everything fit in memory: sort once and serve from the vector.
        work_pairs.sort_by(compare_countup_work_pairs);
        CountupWorkSource {
            temp_dir: None,
            inner: CountupWorkSourceInner::Memory(work_pairs),
        }
    } else {
        // At least one spill happened: flush the in-memory remainder as a
        // final run, compact runs, and serve a merge over the run files.
        if !work_pairs.is_empty() {
            spill_countup_run(
                config,
                &mut temp_dir,
                &mut run_paths,
                &mut spill_summary,
                &mut work_pairs,
            )?;
        }
        compact_countup_runs(config, &mut run_paths, &mut spill_summary)?;
        spill_summary.final_runs = run_paths.len();
        enforce_countup_spill_limits(config, &spill_summary, run_paths.len())?;
        CountupWorkSource {
            temp_dir,
            inner: CountupWorkSourceInner::Spilled(run_paths),
        }
    };
    Ok(CountupWorkBuild {
        source,
        input_hist,
        input_read_hist,
        input_hist_elapsed_micros,
        format1,
        format2,
        spill_summary,
    })
}
2642
2643fn countup_prepass_chunk_ready(candidate_count: usize, candidate_bytes: usize) -> bool {
2644    candidate_count >= COUNTUP_PREPASS_CHUNK_PAIR_LIMIT
2645        || candidate_bytes >= COUNTUP_PREPASS_CHUNK_BYTE_LIMIT
2646}
2647
2648fn process_countup_work_candidates(
2649    config: &Config,
2650    presort_config: &Config,
2651    input_counts: &dyn CountLookup,
2652    candidates: Vec<CountupWorkCandidate>,
2653) -> Vec<CountupWorkPair> {
2654    candidates
2655        .into_par_iter()
2656        .filter_map(|candidate| {
2657            countup_work_pair_from_candidate(config, presort_config, input_counts, candidate)
2658        })
2659        .collect()
2660}
2661
/// Processes one chunk of candidates in parallel, producing the surviving
/// work pairs plus (optionally) per-chunk depth and read-depth histograms
/// of the input.
///
/// When neither histogram is requested this delegates to the cheaper
/// `process_countup_work_candidates` path and returns empty histograms.
fn process_countup_work_candidate_chunk(
    config: &Config,
    presort_config: &Config,
    input_counts: &dyn CountLookup,
    wants_depth_hist: bool,
    wants_read_hist: bool,
    candidates: Vec<CountupWorkCandidate>,
) -> CountupChunkBuild {
    if !wants_depth_hist && !wants_read_hist {
        return CountupChunkBuild {
            work_pairs: process_countup_work_candidates(
                config,
                presort_config,
                input_counts,
                candidates,
            ),
            depth_hist: SparseHist::default(),
            read_hist: SparseReadDepthHist::default(),
        };
    }

    // Rayon fold/reduce: each worker accumulates work pairs and histograms
    // into a thread-local build, then partial builds are merged pairwise.
    candidates
        .into_par_iter()
        .fold(
            || CountupChunkBuild {
                work_pairs: Vec::new(),
                depth_hist: SparseHist::default(),
                read_hist: SparseReadDepthHist::default(),
            },
            |mut local, candidate| {
                // Borrow the local histograms through the accumulator so the
                // per-candidate helper can update them in place.
                let mut hist = CountupInputHistAccumulator {
                    wants_depth_hist,
                    wants_read_hist,
                    depth_hist: &mut local.depth_hist,
                    read_hist: &mut local.read_hist,
                };
                if let Some(work_pair) = countup_work_pair_from_candidate_with_input_hists(
                    config,
                    presort_config,
                    input_counts,
                    candidate,
                    &mut hist,
                ) {
                    local.work_pairs.push(work_pair);
                }
                local
            },
        )
        .reduce(
            || CountupChunkBuild {
                work_pairs: Vec::new(),
                depth_hist: SparseHist::default(),
                read_hist: SparseReadDepthHist::default(),
            },
            |mut left, mut right| {
                left.work_pairs.append(&mut right.work_pairs);
                merge_sparse_hist(&mut left.depth_hist, right.depth_hist);
                merge_sparse_read_depth_hist(&mut left.read_hist, right.read_hist);
                left
            },
        )
}
2724
2725fn countup_work_pair_from_candidate(
2726    config: &Config,
2727    presort_config: &Config,
2728    input_counts: &dyn CountLookup,
2729    mut candidate: CountupWorkCandidate,
2730) -> Option<CountupWorkPair> {
2731    if !config.trim_after_marking {
2732        trim_pair(config, &mut candidate.r1, candidate.r2.as_mut());
2733    }
2734    let prepass_result = countup_prepass_pair(
2735        presort_config,
2736        config.add_bad_reads_countup,
2737        input_counts,
2738        &mut candidate.r1,
2739        candidate.r2.as_mut(),
2740        candidate.rand,
2741    );
2742    countup_work_pair_from_prepass_result(presort_config, input_counts, candidate, prepass_result)
2743}
2744
2745fn countup_work_pair_from_candidate_with_input_hists(
2746    config: &Config,
2747    presort_config: &Config,
2748    input_counts: &dyn CountLookup,
2749    mut candidate: CountupWorkCandidate,
2750    hist: &mut CountupInputHistAccumulator<'_>,
2751) -> Option<CountupWorkPair> {
2752    if config.trim_after_marking {
2753        let mut hist_r1 = candidate.r1.clone();
2754        let mut hist_r2 = candidate.r2.clone();
2755        trim_pair(config, &mut hist_r1, hist_r2.as_mut());
2756        let hist_analysis = analyze_pair(config, input_counts, &hist_r1, hist_r2.as_ref());
2757        increment_countup_input_hists_from_analysis(
2758            config,
2759            hist,
2760            &hist_r1,
2761            hist_r2.as_ref(),
2762            &hist_analysis,
2763        );
2764    } else {
2765        trim_pair(config, &mut candidate.r1, candidate.r2.as_mut());
2766        let (hist_analysis, prepass_analysis) = analyze_pair_for_two_configs(
2767            config,
2768            presort_config,
2769            input_counts,
2770            &candidate.r1,
2771            candidate.r2.as_ref(),
2772        );
2773        increment_countup_input_hists_from_analysis(
2774            config,
2775            hist,
2776            &candidate.r1,
2777            candidate.r2.as_ref(),
2778            &hist_analysis,
2779        );
2780        let prepass_result = countup_prepass_pair_from_analysis(
2781            presort_config,
2782            config.add_bad_reads_countup,
2783            input_counts,
2784            &mut candidate.r1,
2785            candidate.r2.as_mut(),
2786            candidate.rand,
2787            prepass_analysis,
2788        );
2789        return countup_work_pair_from_prepass_result(
2790            presort_config,
2791            input_counts,
2792            candidate,
2793            prepass_result,
2794        );
2795    }
2796
2797    if !config.trim_after_marking {
2798        trim_pair(config, &mut candidate.r1, candidate.r2.as_mut());
2799    }
2800    let prepass_result = countup_prepass_pair(
2801        presort_config,
2802        config.add_bad_reads_countup,
2803        input_counts,
2804        &mut candidate.r1,
2805        candidate.r2.as_mut(),
2806        candidate.rand,
2807    );
2808    countup_work_pair_from_prepass_result(presort_config, input_counts, candidate, prepass_result)
2809}
2810
2811fn increment_countup_input_hists_from_analysis(
2812    config: &Config,
2813    hist: &mut CountupInputHistAccumulator<'_>,
2814    r1: &SequenceRecord,
2815    r2: Option<&SequenceRecord>,
2816    analysis: &PairAnalysis,
2817) {
2818    if hist.wants_depth_hist {
2819        increment_sparse_hist_from_analysis(hist.depth_hist, &analysis.read1, config.hist_len);
2820        if let Some(read2_analysis) = &analysis.read2 {
2821            increment_sparse_hist_from_analysis(hist.depth_hist, read2_analysis, config.hist_len);
2822        }
2823    }
2824    if hist.wants_read_hist {
2825        increment_sparse_read_hist(hist.read_hist, &analysis.read1, r1.len(), config.hist_len);
2826        if let (Some(read2_analysis), Some(read2)) = (&analysis.read2, r2) {
2827            increment_sparse_read_hist(
2828                hist.read_hist,
2829                read2_analysis,
2830                read2.len(),
2831                config.hist_len,
2832            );
2833        }
2834    }
2835}
2836
2837fn countup_work_pair_from_prepass_result(
2838    presort_config: &Config,
2839    input_counts: &dyn CountLookup,
2840    candidate: CountupWorkCandidate,
2841    prepass_result: CountupPrepassResult,
2842) -> Option<CountupWorkPair> {
2843    if !prepass_result.include {
2844        return None;
2845    }
2846    let sort_key = prepass_result.sort_analysis.as_ref().map_or_else(
2847        || {
2848            countup_sort_key(
2849                presort_config,
2850                input_counts,
2851                &candidate.r1,
2852                candidate.r2.as_ref(),
2853                candidate.original_index,
2854            )
2855        },
2856        |analysis| {
2857            countup_sort_key_from_analysis(
2858                &candidate.r1,
2859                candidate.r2.as_ref(),
2860                candidate.original_index,
2861                analysis,
2862            )
2863        },
2864    );
2865    Some(CountupWorkPair {
2866        input_list_index: candidate.input_list_index,
2867        sort_key,
2868        r1: candidate.r1,
2869        r2: candidate.r2,
2870    })
2871}
2872
2873fn append_countup_work_pairs(
2874    config: &Config,
2875    temp_dir: &mut Option<tempfile::TempDir>,
2876    run_paths: &mut Vec<PathBuf>,
2877    spill_summary: &mut CountupSpillSummary,
2878    work_pairs: &mut Vec<CountupWorkPair>,
2879    work_pair_bytes: &mut usize,
2880    new_pairs: Vec<CountupWorkPair>,
2881) -> Result<()> {
2882    for work_pair in new_pairs {
2883        *work_pair_bytes =
2884            (*work_pair_bytes).saturating_add(countup_work_pair_memory_hint(&work_pair));
2885        work_pairs.push(work_pair);
2886        if work_pairs.len() >= COUNTUP_SORT_RUN_PAIR_LIMIT
2887            || *work_pair_bytes >= COUNTUP_SORT_RUN_BYTE_LIMIT
2888        {
2889            spill_countup_run(config, temp_dir, run_paths, spill_summary, work_pairs)?;
2890            *work_pair_bytes = 0;
2891        }
2892    }
2893    Ok(())
2894}
2895
2896fn countup_work_pair_memory_hint(pair: &CountupWorkPair) -> usize {
2897    std::mem::size_of::<CountupWorkPair>()
2898        .saturating_add(countup_sort_key_memory_hint(&pair.sort_key))
2899        .saturating_add(sequence_record_memory_hint(&pair.r1))
2900        .saturating_add(pair.r2.as_ref().map_or(0, sequence_record_memory_hint))
2901}
2902
2903fn countup_work_candidate_memory_hint(candidate: &CountupWorkCandidate) -> usize {
2904    std::mem::size_of::<CountupWorkCandidate>()
2905        .saturating_add(sequence_record_memory_hint(&candidate.r1))
2906        .saturating_add(candidate.r2.as_ref().map_or(0, sequence_record_memory_hint))
2907}
2908
2909fn countup_sort_key_memory_hint(key: &CountupSortKey) -> usize {
2910    let _ = key;
2911    std::mem::size_of::<CountupSortKey>()
2912}
2913
2914fn sequence_record_memory_hint(record: &SequenceRecord) -> usize {
2915    std::mem::size_of::<SequenceRecord>()
2916        .saturating_add(record.id.capacity())
2917        .saturating_add(record.bases.capacity())
2918        .saturating_add(record.qualities.as_ref().map_or(0, Vec::capacity))
2919}
2920
2921fn spill_countup_run(
2922    config: &Config,
2923    temp_dir: &mut Option<tempfile::TempDir>,
2924    run_paths: &mut Vec<PathBuf>,
2925    spill_summary: &mut CountupSpillSummary,
2926    work_pairs: &mut Vec<CountupWorkPair>,
2927) -> Result<()> {
2928    if work_pairs.is_empty() {
2929        return Ok(());
2930    }
2931    let dir = match temp_dir {
2932        Some(dir) => dir,
2933        None => temp_dir.insert(managed_temp_dir(config, "bbnorm-rs-countup-")?),
2934    };
2935    work_pairs.sort_by(compare_countup_work_pairs);
2936    let path = dir
2937        .path()
2938        .join(format!("countup-run-{:06}.bin", run_paths.len()));
2939    let bytes = write_countup_run(&path, work_pairs)?;
2940    spill_summary.note_initial_run(bytes);
2941    run_paths.push(path);
2942    enforce_countup_spill_limits(config, spill_summary, run_paths.len())?;
2943    work_pairs.clear();
2944    Ok(())
2945}
2946
/// Repeatedly merges spill runs in groups of `COUNTUP_SORT_MERGE_FANIN`
/// until at most that many runs remain, deleting each round's input runs
/// after their groups have been merged.
fn compact_countup_runs(
    config: &Config,
    run_paths: &mut Vec<PathBuf>,
    spill_summary: &mut CountupSpillSummary,
) -> Result<()> {
    if run_paths.len() <= COUNTUP_SORT_MERGE_FANIN {
        return Ok(());
    }
    // All runs live in the same directory; reuse it for merged outputs.
    let run_dir = run_paths
        .first()
        .and_then(|path| path.parent())
        .context("count-up spill runs had no parent directory")?
        .to_path_buf();
    let mut round = 0usize;
    while run_paths.len() > COUNTUP_SORT_MERGE_FANIN {
        // Take this round's inputs; merged outputs repopulate `run_paths`.
        let old_paths = std::mem::take(run_paths);
        for (group_index, group) in old_paths.chunks(COUNTUP_SORT_MERGE_FANIN).enumerate() {
            let merged_path =
                run_dir.join(format!("countup-merge-{round:03}-{group_index:06}.bin"));
            let merged_bytes = merge_countup_run_group(group, &merged_path)?;
            spill_summary.note_merge_run(merged_bytes);
            run_paths.push(merged_path);
            // Check limits as soon as each merged run is live.
            enforce_countup_spill_limits(config, spill_summary, run_paths.len())?;
        }
        // Remove this round's inputs; a missing file is tolerated so a
        // repeated cleanup does not fail.
        for path in old_paths {
            let removed_bytes = path.metadata().map(|metadata| metadata.len()).unwrap_or(0);
            match fs::remove_file(&path) {
                Ok(()) => spill_summary.note_removed(removed_bytes),
                Err(err) if err.kind() == ErrorKind::NotFound => {}
                Err(err) => {
                    return Err(err).with_context(|| {
                        format!("removing compacted count-up run {}", path.display())
                    });
                }
            }
        }
        round += 1;
    }
    Ok(())
}
2987
2988fn enforce_countup_spill_limits(
2989    config: &Config,
2990    spill_summary: &CountupSpillSummary,
2991    live_run_count: usize,
2992) -> Result<()> {
2993    if let Some(limit) = config.max_countup_spill_initial_runs
2994        && spill_summary.initial_runs > limit
2995    {
2996        bail!(
2997            "count-up spill exceeded maxcountupspillinitialruns: initial spill runs {} > limit {}",
2998            spill_summary.initial_runs,
2999            limit
3000        );
3001    }
3002    if let Some(limit) = config.max_countup_spill_merge_runs
3003        && spill_summary.merge_runs > limit
3004    {
3005        bail!(
3006            "count-up spill exceeded maxcountupspillmergeruns: merge spill runs {} > limit {}",
3007            spill_summary.merge_runs,
3008            limit
3009        );
3010    }
3011    if let Some(limit) = config.max_countup_spill_final_runs
3012        && live_run_count > limit
3013    {
3014        bail!(
3015            "count-up spill exceeded maxcountupspillfinalruns: live spill runs {} > limit {}",
3016            live_run_count,
3017            limit
3018        );
3019    }
3020    if let Some(limit) = config.max_countup_spill_live_bytes
3021        && spill_summary.peak_live_bytes > limit
3022    {
3023        bail!(
3024            "count-up spill exceeded maxcountupspillbytes: peak live spill bytes {} > limit {}",
3025            spill_summary.peak_live_bytes,
3026            limit
3027        );
3028    }
3029    if let Some(limit) = config.max_countup_spill_final_live_bytes
3030        && spill_summary.final_live_bytes > limit
3031    {
3032        bail!(
3033            "count-up spill exceeded maxcountupspillfinallivebytes: current/final live spill bytes {} > limit {}",
3034            spill_summary.final_live_bytes,
3035            limit
3036        );
3037    }
3038    if let Some(limit) = config.max_countup_spill_write_bytes
3039        && spill_summary.bytes_written > limit
3040    {
3041        bail!(
3042            "count-up spill exceeded maxcountupspillwritebytes: cumulative spill bytes written {} > limit {}",
3043            spill_summary.bytes_written,
3044            limit
3045        );
3046    }
3047    Ok(())
3048}
3049
3050fn merge_countup_run_group(paths: &[PathBuf], output_path: &Path) -> Result<u64> {
3051    let mut merger = CountupRunMerger::new(paths)?;
3052    let file = fs::File::create(output_path)
3053        .with_context(|| format!("creating compacted count-up run {}", output_path.display()))?;
3054    let mut writer = BufWriter::with_capacity(COUNTUP_RUN_IO_BUFFER_CAPACITY, file);
3055    while let Some(pair) = merger.next_pair()? {
3056        write_countup_work_pair(&mut writer, &pair)?;
3057    }
3058    writer
3059        .flush()
3060        .with_context(|| format!("flushing compacted count-up run {}", output_path.display()))?;
3061    output_path
3062        .metadata()
3063        .map(|metadata| metadata.len())
3064        .with_context(|| format!("checking compacted count-up run {}", output_path.display()))
3065}
3066
3067impl CountupWorkSource {
3068    fn into_iter(self) -> Result<CountupWorkIter> {
3069        let CountupWorkSource { temp_dir, inner } = self;
3070        let inner = match inner {
3071            CountupWorkSourceInner::Memory(work_pairs) => {
3072                CountupWorkIterInner::Memory(work_pairs.into_iter())
3073            }
3074            CountupWorkSourceInner::Spilled(paths) => {
3075                CountupWorkIterInner::Spilled(CountupRunMerger::new(&paths)?)
3076            }
3077        };
3078        Ok(CountupWorkIter {
3079            _temp_dir: temp_dir,
3080            inner,
3081        })
3082    }
3083}
3084
3085impl CountupWorkIter {
3086    fn next_pair(&mut self) -> Result<Option<CountupWorkPair>> {
3087        match &mut self.inner {
3088            CountupWorkIterInner::Memory(iter) => Ok(iter.next()),
3089            CountupWorkIterInner::Spilled(merger) => merger.next_pair(),
3090        }
3091    }
3092}
3093
3094impl CountupRunMerger {
3095    fn new(paths: &[PathBuf]) -> Result<Self> {
3096        let mut readers = Vec::with_capacity(paths.len());
3097        let mut heap = BinaryHeap::new();
3098        for path in paths {
3099            let mut reader = CountupRunReader::open(path)?;
3100            if let Some(pair) = reader.next_pair()? {
3101                heap.push(CountupRunHead {
3102                    pair,
3103                    run_index: readers.len(),
3104                });
3105            }
3106            readers.push(reader);
3107        }
3108        Ok(Self { readers, heap })
3109    }
3110
3111    fn next_pair(&mut self) -> Result<Option<CountupWorkPair>> {
3112        let Some(head) = self.heap.pop() else {
3113            return Ok(None);
3114        };
3115        let pair = head.pair;
3116        if let Some(next) = self.readers[head.run_index].next_pair()? {
3117            self.heap.push(CountupRunHead {
3118                pair: next,
3119                run_index: head.run_index,
3120            });
3121        }
3122        Ok(Some(pair))
3123    }
3124}
3125
3126impl CountupRunReader {
3127    fn open(path: &Path) -> Result<Self> {
3128        let file = fs::File::open(path)
3129            .with_context(|| format!("opening count-up run {}", path.display()))?;
3130        Ok(Self {
3131            reader: BufReader::with_capacity(COUNTUP_RUN_IO_BUFFER_CAPACITY, file),
3132        })
3133    }
3134
3135    fn next_pair(&mut self) -> Result<Option<CountupWorkPair>> {
3136        read_countup_work_pair(&mut self.reader)
3137    }
3138}
3139
// `CountupRunHead` entries live in a `BinaryHeap` (a max-heap). `Ord` is
// implemented with the comparison operands swapped so the heap pops the
// *smallest* pair first, giving an ascending k-way merge.
impl PartialEq for CountupRunHead {
    fn eq(&self, other: &Self) -> bool {
        compare_countup_work_pairs(&self.pair, &other.pair) == CmpOrdering::Equal
            && self.run_index == other.run_index
    }
}

impl Eq for CountupRunHead {}

impl PartialOrd for CountupRunHead {
    fn partial_cmp(&self, other: &Self) -> Option<CmpOrdering> {
        Some(self.cmp(other))
    }
}

impl Ord for CountupRunHead {
    // Reversed pair order, then reversed run index, so among equal pairs
    // the lower run index is popped first.
    fn cmp(&self, other: &Self) -> CmpOrdering {
        compare_countup_work_pairs(&other.pair, &self.pair)
            .then_with(|| other.run_index.cmp(&self.run_index))
    }
}
3161
3162fn write_countup_run(path: &Path, work_pairs: &[CountupWorkPair]) -> Result<u64> {
3163    let file = fs::File::create(path)
3164        .with_context(|| format!("creating count-up run {}", path.display()))?;
3165    let mut writer = BufWriter::with_capacity(COUNTUP_RUN_IO_BUFFER_CAPACITY, file);
3166    for pair in work_pairs {
3167        write_countup_work_pair(&mut writer, pair)?;
3168    }
3169    writer
3170        .flush()
3171        .with_context(|| format!("flushing count-up run {}", path.display()))?;
3172    path.metadata()
3173        .map(|metadata| metadata.len())
3174        .with_context(|| format!("checking count-up run {}", path.display()))
3175}
3176
3177fn write_countup_work_pair(writer: &mut impl Write, pair: &CountupWorkPair) -> Result<()> {
3178    write_usize(writer, pair.input_list_index)?;
3179    write_usize(writer, pair.sort_key.errors)?;
3180    write_usize(writer, pair.sort_key.total_len)?;
3181    writer.write_all(&pair.sort_key.expected_errors.to_le_bytes())?;
3182    writer.write_all(&pair.sort_key.numeric_id.to_le_bytes())?;
3183    write_usize(writer, pair.sort_key.original_index)?;
3184    write_sequence_record(writer, &pair.r1)?;
3185    write_bool(writer, pair.r2.is_some())?;
3186    if let Some(r2) = &pair.r2 {
3187        write_sequence_record(writer, r2)?;
3188    }
3189    Ok(())
3190}
3191
3192fn read_countup_work_pair(reader: &mut impl Read) -> Result<Option<CountupWorkPair>> {
3193    let Some(input_list_index) = read_usize_opt(reader)? else {
3194        return Ok(None);
3195    };
3196    let errors = read_usize(reader)?;
3197    let total_len = read_usize(reader)?;
3198    let expected_errors = read_f64(reader)?;
3199    let numeric_id = read_u64(reader)?;
3200    let original_index = read_usize(reader)?;
3201    let r1 = read_sequence_record(reader)?;
3202    let has_r2 = read_bool(reader)?;
3203    let r2 = has_r2.then(|| read_sequence_record(reader)).transpose()?;
3204    Ok(Some(CountupWorkPair {
3205        input_list_index,
3206        sort_key: CountupSortKey {
3207            errors,
3208            total_len,
3209            expected_errors,
3210            numeric_id,
3211            original_index,
3212        },
3213        r1,
3214        r2,
3215    }))
3216}
3217
3218fn write_sequence_record(writer: &mut impl Write, record: &SequenceRecord) -> Result<()> {
3219    write_string(writer, &record.id)?;
3220    writer.write_all(&record.numeric_id.to_le_bytes())?;
3221    write_bytes(writer, &record.bases)?;
3222    write_bool(writer, record.qualities.is_some())?;
3223    if let Some(qualities) = &record.qualities {
3224        write_bytes(writer, qualities)?;
3225    }
3226    Ok(())
3227}
3228
3229fn read_sequence_record(reader: &mut impl Read) -> Result<SequenceRecord> {
3230    let id = read_string(reader)?;
3231    let numeric_id = read_u64(reader)?;
3232    let bases = read_bytes(reader)?;
3233    let has_qualities = read_bool(reader)?;
3234    let qualities = has_qualities.then(|| read_bytes(reader)).transpose()?;
3235    Ok(SequenceRecord {
3236        id,
3237        numeric_id,
3238        bases,
3239        qualities,
3240    })
3241}
3242
3243fn write_string(writer: &mut impl Write, value: &str) -> Result<()> {
3244    write_bytes(writer, value.as_bytes())
3245}
3246
3247fn read_string(reader: &mut impl Read) -> Result<String> {
3248    let bytes = read_bytes(reader)?;
3249    String::from_utf8(bytes).context("count-up run contained invalid UTF-8 id")
3250}
3251
3252fn write_bytes(writer: &mut impl Write, bytes: &[u8]) -> Result<()> {
3253    write_usize(writer, bytes.len())?;
3254    writer.write_all(bytes)?;
3255    Ok(())
3256}
3257
3258fn read_bytes(reader: &mut impl Read) -> Result<Vec<u8>> {
3259    let len = read_usize(reader)?;
3260    let mut bytes = vec![0; len];
3261    reader.read_exact(&mut bytes)?;
3262    Ok(bytes)
3263}
3264
3265fn write_bool(writer: &mut impl Write, value: bool) -> Result<()> {
3266    writer.write_all(&[u8::from(value)])?;
3267    Ok(())
3268}
3269
3270fn read_bool(reader: &mut impl Read) -> Result<bool> {
3271    let mut buf = [0; 1];
3272    reader.read_exact(&mut buf)?;
3273    Ok(buf[0] != 0)
3274}
3275
3276fn write_usize(writer: &mut impl Write, value: usize) -> Result<()> {
3277    writer.write_all(&(value as u64).to_le_bytes())?;
3278    Ok(())
3279}
3280
3281fn read_usize(reader: &mut impl Read) -> Result<usize> {
3282    let value = read_u64(reader)?;
3283    usize::try_from(value).context("count-up run usize field exceeded this platform")
3284}
3285
3286fn read_usize_opt(reader: &mut impl Read) -> Result<Option<usize>> {
3287    let Some(value) = read_u64_opt(reader)? else {
3288        return Ok(None);
3289    };
3290    Ok(Some(usize::try_from(value).context(
3291        "count-up run usize field exceeded this platform",
3292    )?))
3293}
3294
3295fn read_u64(reader: &mut impl Read) -> Result<u64> {
3296    let mut buf = [0; 8];
3297    reader.read_exact(&mut buf)?;
3298    Ok(u64::from_le_bytes(buf))
3299}
3300
3301fn read_u64_opt(reader: &mut impl Read) -> Result<Option<u64>> {
3302    let mut buf = [0; 8];
3303    match reader.read_exact(&mut buf) {
3304        Ok(()) => Ok(Some(u64::from_le_bytes(buf))),
3305        Err(err) if err.kind() == ErrorKind::UnexpectedEof => Ok(None),
3306        Err(err) => Err(err.into()),
3307    }
3308}
3309
3310fn read_f64(reader: &mut impl Read) -> Result<f64> {
3311    let mut buf = [0; 8];
3312    reader.read_exact(&mut buf)?;
3313    Ok(f64::from_le_bytes(buf))
3314}
3315
/// Builds the relaxed configuration used for the count-up prepass,
/// mirroring BBNorm's Java behavior of temporarily disabling COUNTUP while
/// loosening the keep/toss thresholds.
fn countup_prepass_config(config: &Config) -> Config {
    let mut prepass = config.clone();
    prepass.count_up = false;
    // Java sets REQUIRE_BOTH_BAD=(rbb || COUNTUP) before temporarily
    // disabling COUNTUP for the relaxed count-up prepass.
    prepass.require_both_bad = true;
    // Quadruple the depth targets (keeping them at least 1) and shrink the
    // bad-percent window to a quarter of its configured values.
    prepass.target_depth = config.target_depth.saturating_mul(4).max(1);
    prepass.target_bad_percent_low = config.target_bad_percent_low / 4.0;
    prepass.target_bad_percent_high = config.target_bad_percent_high / 4.0;
    prepass.max_depth = config.max_depth.map(|depth| depth.saturating_mul(4).max(1));
    // Halve the minimum-depth requirements so marginal pairs survive the
    // prepass and can still be judged by the strict main pass.
    prepass.min_depth = config.min_depth / 2;
    prepass.min_kmers_over_min_depth = config.min_kmers_over_min_depth / 2;
    prepass.low_percentile = 0.20;
    prepass
}
3331
3332fn countup_prepass_pair(
3333    prepass_config: &Config,
3334    add_bad_reads_countup: bool,
3335    input_counts: &dyn CountLookup,
3336    r1: &mut SequenceRecord,
3337    r2: Option<&mut SequenceRecord>,
3338    rand: f64,
3339) -> CountupPrepassResult {
3340    let analysis = analyze_pair(prepass_config, input_counts, r1, r2.as_deref());
3341    countup_prepass_pair_from_analysis(
3342        prepass_config,
3343        add_bad_reads_countup,
3344        input_counts,
3345        r1,
3346        r2,
3347        rand,
3348        analysis,
3349    )
3350}
3351
/// Decides whether a pair survives the relaxed prepass and, when error
/// correction is enabled, applies it before the main pass.
///
/// Returns the keep verdict plus the analysis used for sorting when it is
/// still valid (i.e. when the reads were not modified afterwards).
fn countup_prepass_pair_from_analysis(
    prepass_config: &Config,
    add_bad_reads_countup: bool,
    input_counts: &dyn CountLookup,
    r1: &mut SequenceRecord,
    mut r2: Option<&mut SequenceRecord>,
    rand: f64,
    analysis: PairAnalysis,
) -> CountupPrepassResult {
    let decision =
        decide_pair_from_analysis(prepass_config, r1, r2.as_deref(), analysis, Some(rand));
    // `add_bad_reads_countup` forces inclusion even when the pair was tossed.
    let include = !decision.toss || add_bad_reads_countup;
    if prepass_config.error_correct && !decision.toss {
        let correction =
            correct_pair_errors_with_rollback(prepass_config, input_counts, r1, r2.as_deref_mut());
        if (!correction.uncorrectable || prepass_config.mark_uncorrectable_errors)
            && prepass_config.trim_after_marking
        {
            trim_pair(prepass_config, r1, r2);
        }
        // Correction/trimming may have altered the reads, so the earlier
        // analysis can no longer be reused to build the sort key.
        return CountupPrepassResult {
            include,
            sort_analysis: None,
        };
    }
    CountupPrepassResult {
        include,
        sort_analysis: include.then_some(decision.analysis),
    }
}
3382
3383fn compare_countup_work_pairs(left: &CountupWorkPair, right: &CountupWorkPair) -> CmpOrdering {
3384    left.input_list_index
3385        .cmp(&right.input_list_index)
3386        .then_with(|| compare_countup_sort_key(&left.sort_key, &right.sort_key))
3387        .then_with(|| left.r1.id.cmp(&right.r1.id))
3388        .then_with(|| {
3389            left.sort_key
3390                .original_index
3391                .cmp(&right.sort_key.original_index)
3392        })
3393}
3394
3395fn compare_countup_sort_key(left: &CountupSortKey, right: &CountupSortKey) -> CmpOrdering {
3396    left.errors
3397        .cmp(&right.errors)
3398        .then_with(|| right.total_len.cmp(&left.total_len))
3399        .then_with(|| {
3400            left.expected_errors
3401                .partial_cmp(&right.expected_errors)
3402                .unwrap_or(CmpOrdering::Equal)
3403        })
3404        .then_with(|| left.numeric_id.cmp(&right.numeric_id))
3405}
3406
3407fn countup_sort_key(
3408    config: &Config,
3409    input_counts: &dyn CountLookup,
3410    r1: &SequenceRecord,
3411    r2: Option<&SequenceRecord>,
3412    original_index: usize,
3413) -> CountupSortKey {
3414    let analysis = analyze_pair(config, input_counts, r1, r2);
3415    countup_sort_key_from_analysis(r1, r2, original_index, &analysis)
3416}
3417
3418fn countup_sort_key_from_analysis(
3419    r1: &SequenceRecord,
3420    r2: Option<&SequenceRecord>,
3421    original_index: usize,
3422    analysis: &PairAnalysis,
3423) -> CountupSortKey {
3424    CountupSortKey {
3425        errors: analysis.low_kmer_count,
3426        total_len: r1.len() + r2.map(SequenceRecord::len).unwrap_or(0),
3427        expected_errors: expected_errors(r1) + r2.map(expected_errors).unwrap_or(0.0),
3428        numeric_id: r1.numeric_id,
3429        original_index,
3430    }
3431}
3432
3433fn expected_errors(record: &SequenceRecord) -> f64 {
3434    let Some(qualities) = &record.qualities else {
3435        return 0.0;
3436    };
3437    record
3438        .bases
3439        .iter()
3440        .zip(qualities)
3441        .map(|(&base, &quality)| {
3442            let q = if is_defined_base(base) {
3443                quality.saturating_sub(33)
3444            } else {
3445                0
3446            };
3447            phred_error_probability(q)
3448        })
3449        .sum()
3450}
3451
/// Converts a Phred quality score to an error probability. Scores 0 and 1
/// use fixed capped values rather than the 10^(-q/10) formula.
fn phred_error_probability(q: u8) -> f64 {
    if q == 0 {
        return 0.75;
    }
    if q == 1 {
        return 0.70;
    }
    10f64.powf(f64::from(q) * -0.1)
}
3459
3460fn unique_pair_kmers(
3461    config: &Config,
3462    r1: &SequenceRecord,
3463    r2: Option<&SequenceRecord>,
3464) -> Vec<KmerKey> {
3465    let mut keys = Vec::with_capacity(pair_kmer_window_capacity(config, r1, r2));
3466    fill_unique_pair_kmers(config, r1, r2, &mut keys);
3467    keys
3468}
3469
3470fn fill_unique_pair_kmers(
3471    config: &Config,
3472    r1: &SequenceRecord,
3473    r2: Option<&SequenceRecord>,
3474    keys: &mut Vec<KmerKey>,
3475) {
3476    keys.clear();
3477    let required = pair_kmer_window_capacity(config, r1, r2);
3478    if keys.capacity() < required {
3479        keys.reserve(required - keys.capacity());
3480    }
3481    for_each_kmer_for_record(r1, config, |key| keys.push(key));
3482    if let Some(mate) = r2 {
3483        for_each_kmer_for_record(mate, config, |key| keys.push(key));
3484    }
3485    keys.sort_unstable();
3486    keys.dedup();
3487}
3488
3489fn pair_kmer_window_capacity(
3490    config: &Config,
3491    r1: &SequenceRecord,
3492    r2: Option<&SequenceRecord>,
3493) -> usize {
3494    record_kmer_window_capacity(config.k, r1)
3495        .saturating_add(r2.map_or(0, |mate| record_kmer_window_capacity(config.k, mate)))
3496}
3497
3498fn record_kmer_window_capacity(k: usize, record: &SequenceRecord) -> usize {
3499    if k == 0 {
3500        0
3501    } else {
3502        record.bases.len().saturating_sub(k).saturating_add(1)
3503    }
3504}
3505
/// Test-only shorthand: returns just the toss verdict of the full
/// count-up decision plan.
#[cfg(test)]
fn decide_countup_pair(
    config: &Config,
    input_counts: &dyn CountLookup,
    kept_counts: &dyn CountLookup,
    keys: &[KmerKey],
    target_depth: u64,
) -> bool {
    let plan = countup_decision_plan(config, input_counts, kept_counts, keys, target_depth);
    plan.toss
}
3516
/// Decides whether a pair is kept during the count-up main pass and which
/// of its k-mers are eligible for kept-count increments.
///
/// A pair is kept when enough eligible k-mers are still below the target
/// depth in the kept counts ("needed"/"badly needed"), subject to a
/// minimum number of k-mers over `min_depth`; with `toss_error_reads`,
/// pairs whose sorted input-depth profile shows a suspicious jump can be
/// vetoed unless they are strongly needed. `keep_all` overrides any toss.
fn countup_decision_plan(
    config: &Config,
    input_counts: &dyn CountLookup,
    kept_counts: &dyn CountLookup,
    keys: &[KmerKey],
    target_depth: u64,
) -> CountupDecisionPlan {
    let unique = keys.len();
    if unique == 0 {
        // No k-mers at all: tossed unless keep_all is set.
        return CountupDecisionPlan {
            toss: !config.keep_all,
            eligible_key_indices: Vec::new(),
        };
    }

    let mut desired = 0usize; // k-mers at/above min_depth in the input
    let mut needed = 0usize; // eligible k-mers still below target in kept counts
    let mut badly_needed = 0usize; // eligible k-mers well below target
    // Input depths are only collected when error-read tossing needs them.
    let mut input_depths = config.toss_error_reads.then(Vec::new);
    let mut eligible_key_indices = Vec::with_capacity(keys.len());
    for (index, key) in keys.iter().enumerate() {
        let input_depth = input_counts.depth(key);
        if let Some(depths) = &mut input_depths {
            depths.push(input_depth);
        }
        if input_depth >= config.min_depth {
            desired += 1;
            eligible_key_indices.push(index);
            let kept_depth = kept_counts.depth(key);
            if kept_depth < target_depth {
                needed += 1;
                // "Badly needed": kept depth below 3/4 of the attainable
                // target (capped by what the input actually contains).
                if kept_depth < target_depth.min(input_depth).saturating_mul(3) / 4 {
                    badly_needed += 1;
                }
            }
        }
    }

    // Keep thresholds scale with the number of unique k-mers.
    let threshold_needed = 8usize.max(unique.div_ceil(6));
    let threshold_badly_needed = 2usize.max(unique.div_ceil(24));
    let keep = (needed >= threshold_needed || badly_needed >= threshold_badly_needed)
        && (desired >= config.min_kmers_over_min_depth || unique < config.min_kmers_over_min_depth);
    let mut toss = !keep;
    if config.toss_error_reads
        && let Some(mut depths) = input_depths
    {
        let errors = countup_error_count(&mut depths, config);
        // Error-heavy pairs are tossed unless they are strongly needed.
        if errors > 8 && needed < 2 * threshold_needed && badly_needed < 2 * threshold_badly_needed
        {
            toss = true;
        }
        if errors > unique / 2
            && needed < 3 * threshold_needed
            && badly_needed < 4 * threshold_badly_needed
        {
            toss = true;
        }
    }
    CountupDecisionPlan {
        toss: if config.keep_all { false } else { toss },
        eligible_key_indices,
    }
}
3580
3581fn countup_error_count(depths: &mut [u64], config: &Config) -> usize {
3582    depths.sort_unstable();
3583    let mut previous: Option<u64> = None;
3584    for (index, &depth) in depths.iter().enumerate() {
3585        if let Some(prev) = previous
3586            && ((depth >= config.high_thresh && prev <= config.low_thresh)
3587                || depth >= prev.saturating_mul(config.error_detect_ratio))
3588        {
3589            return depths.len() - index;
3590        }
3591        previous = Some(depth);
3592    }
3593    0
3594}
3595
/// Increments kept-output counts for every key whose input depth passes the
/// `min_depth` floor.
///
/// Atomic-sketch increments are tallied locally and flushed once at the end
/// via `add_key_increments` to avoid per-key bookkeeping on the shared total.
#[cfg(test)]
fn increment_countup_kept_counts(
    config: &Config,
    kept_counts: &mut OutputCounts,
    input_counts: &dyn CountLookup,
    keys: &[KmerKey],
) {
    let mut atomic_increments = 0u64;
    for key in keys {
        // Only confidently-observed k-mers contribute to kept counts.
        if input_counts.depth(key) >= config.min_depth {
            match kept_counts {
                OutputCounts::Exact(counts) => {
                    *counts.entry(key.clone()).or_insert(0) += 1;
                }
                OutputCounts::Sketch(sketch) => sketch.increment(key),
                OutputCounts::AtomicSketch(sketch) => {
                    sketch.increment_key(key);
                    atomic_increments = atomic_increments.saturating_add(1);
                }
            }
        }
    }
    if let OutputCounts::AtomicSketch(sketch) = kept_counts {
        sketch.add_key_increments(atomic_increments);
    }
}
3622
/// Test-only helper: applies a toss decision to the kept counts; tossed
/// pairs are skipped unless `add_bad_reads_countup` forces counting.
#[cfg(test)]
fn update_countup_kept_counts_for_decision(
    config: &Config,
    kept_counts: &mut OutputCounts,
    input_counts: &dyn CountLookup,
    keys: &[KmerKey],
    toss: bool,
) {
    if toss && !config.add_bad_reads_countup {
        return;
    }
    increment_countup_kept_counts(config, kept_counts, input_counts, keys);
}
3635
/// Applies a `CountupDecisionPlan` to the kept-output counts.
///
/// Tossed pairs are skipped unless `add_bad_reads_countup` forces counting.
/// Only the plan's pre-screened eligible key indices are incremented, so the
/// `min_depth` check from planning is not repeated here. Atomic-sketch
/// increments are tallied locally and flushed once at the end.
fn update_countup_kept_counts_for_plan(
    config: &Config,
    kept_counts: &mut OutputCounts,
    keys: &[KmerKey],
    plan: &CountupDecisionPlan,
) {
    if plan.toss && !config.add_bad_reads_countup {
        return;
    }
    let mut atomic_increments = 0u64;
    for &index in &plan.eligible_key_indices {
        // Defensive: ignore indices that no longer fit the key slice.
        let Some(key) = keys.get(index) else {
            continue;
        };
        match kept_counts {
            OutputCounts::Exact(counts) => {
                *counts.entry(key.clone()).or_insert(0) += 1;
            }
            OutputCounts::Sketch(sketch) => sketch.increment(key),
            OutputCounts::AtomicSketch(sketch) => {
                sketch.increment_key(key);
                atomic_increments = atomic_increments.saturating_add(1);
            }
        }
    }
    if let OutputCounts::AtomicSketch(sketch) = kept_counts {
        sketch.add_key_increments(atomic_increments);
    }
}
3665
3666fn countup_length_toss(config: &Config, r1: &SequenceRecord, r2: Option<&SequenceRecord>) -> bool {
3667    !config.keep_all
3668        && (r1.len() < config.min_length || r2.is_some_and(|mate| mate.len() < config.min_length))
3669}
3670
/// Test-only dense depth histogram: each k-mer adds its depth into the
/// bucket at `min(depth, hist_len - 1)` (the last bucket absorbs overflow).
#[cfg(test)]
fn count_map_depth_hist(counts: &CountMap, hist_len: usize) -> Vec<u64> {
    let top = hist_len.saturating_sub(1);
    counts.values().fold(vec![0u64; hist_len], |mut hist, &depth| {
        let bucket = (depth as usize).min(top);
        hist[bucket] += depth;
        hist
    })
}
3680
3681fn count_map_sparse_depth_hist(counts: &CountMap, hist_len: usize) -> SparseHist {
3682    let Some(last_index) = hist_len.checked_sub(1) else {
3683        return SparseHist::default();
3684    };
3685    let mut hist = SparseHist::default();
3686    for &depth in counts.values() {
3687        add_depth_to_sparse_hist(&mut hist, depth, last_index);
3688    }
3689    hist
3690}
3691
/// Test-only: adds `depth` into a growable histogram, clamping the bucket to
/// `last_index` and growing the vector on demand; zero depths are ignored.
#[cfg(test)]
fn add_depth_to_dynamic_hist(local: &mut Vec<u64>, depth: u64, last_index: usize) {
    if depth > 0 {
        let bucket = usize_from_u64_saturating(depth).min(last_index);
        if local.len() <= bucket {
            local.resize(bucket + 1, 0);
        }
        local[bucket] = local[bucket].saturating_add(depth);
    }
}
3703
/// Test-only: element-wise saturating sum of two dynamic histograms; the
/// result is as long as the longer input.
#[cfg(test)]
fn merge_dynamic_depth_hist(mut left: Vec<u64>, right: Vec<u64>) -> Vec<u64> {
    left.resize(left.len().max(right.len()), 0);
    for (slot, value) in left.iter_mut().zip(right) {
        *slot = slot.saturating_add(value);
    }
    left
}
3714
3715fn add_depth_to_sparse_hist(local: &mut SparseHist, depth: u64, last_index: usize) {
3716    if depth == 0 {
3717        return;
3718    }
3719    let idx = usize_from_u64_saturating(depth).min(last_index);
3720    let entry = local.entry(idx).or_insert(0);
3721    *entry = entry.saturating_add(depth);
3722}
3723
3724fn merge_sparse_depth_hist(mut left: SparseHist, right: SparseHist) -> SparseHist {
3725    merge_sparse_hist(&mut left, right);
3726    left
3727}
3728
/// Test-only wrapper around input counting that discards stage timings.
#[cfg(test)]
fn build_input_counts(config: &Config) -> Result<InputCounts> {
    let mut discarded_timings = Vec::new();
    build_input_counts_with_stage_timings(config, &mut discarded_timings)
}
3734
3735fn build_input_counts_with_stage_timings(
3736    config: &Config,
3737    stage_timings: &mut Vec<StageTiming>,
3738) -> Result<InputCounts> {
3739    let started = Instant::now();
3740    let counts = build_input_counts_inner(config, stage_timings)?;
3741    record_stage_timing(stage_timings, "input_counting", started);
3742    Ok(counts)
3743}
3744
/// Builds input counts: dispatches to bounded sketches when configured,
/// otherwise accumulates exact counts over the primary input plus every
/// `extra` file, then applies the trusted-build, prefilter-collision, and
/// count-min-collision adjustments in that order.
fn build_input_counts_inner(
    config: &Config,
    stage_timings: &mut Vec<StageTiming>,
) -> Result<InputCounts> {
    if use_bounded_input_sketch(config) {
        return build_sketch_input_counts(config, stage_timings);
    }
    let started = Instant::now();
    let mut counts = new_count_map(config);
    count_primary(config, &mut counts)?;
    for extra in &config.extra {
        count_single_file(config, extra, &mut counts, None)?;
    }
    // Post-count adjustments; ordering mirrors the counting pipeline.
    apply_trusted_build_pass_filter(config, &mut counts);
    apply_prefilter_collision_estimates(config, &mut counts);
    apply_count_min_collision_estimates(config, &mut counts);
    record_stage_timing(stage_timings, "input_exact_counting", started);
    Ok(InputCounts::Exact(counts))
}
3764
3765fn use_bounded_input_sketch(config: &Config) -> bool {
3766    if config.force_exact_counts {
3767        return false;
3768    }
3769    config.count_min.cells.is_some()
3770        || config.count_min.memory_bytes.is_some()
3771        || automatic_count_min_should_use(config)
3772}
3773
3774fn gpu_counting_supported(config: &Config) -> bool {
3775    config.gpu_counting
3776        && config.gpu_helper.is_some()
3777        && config.k <= 31
3778        && !use_prefilter_collision_estimates(config)
3779}
3780
/// Builds bounded (sketch-based) input counts, choosing among the
/// prefiltered, atomic, atomic-packed, and exclusive-update packed layouts.
///
/// Each counting pass records its own stage timing so reports can separate
/// prefilter time from main counting time. Second-pass sketches use the
/// SECOND mask seed so they hash differently from the prefilter table.
fn build_sketch_input_counts(
    config: &Config,
    stage_timings: &mut Vec<StageTiming>,
) -> Result<InputCounts> {
    validate_gpu_counting_request(config)?;
    if use_prefilter_collision_estimates(config) {
        // Pass 1: prefilter sketch over the primary input and all extras.
        let started = Instant::now();
        let mut prefilter = new_input_prefilter_count_min_sketch(config)?;
        count_primary_prefilter_sketch(config, &mut prefilter)?;
        for extra in &config.extra {
            count_single_file_prefilter_sketch(config, extra, &mut prefilter, None)?;
        }
        let prefilter_limit = prefilter.max_count();
        record_stage_timing(stage_timings, "input_prefilter_counting", started);

        if use_atomic_count_min_sketch(config) {
            // Pass 2 (32-bit cells): atomic main sketch gated by the prefilter.
            let started = Instant::now();
            let sketch = new_atomic_count_min_sketch_with_mask_seed(
                config,
                BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED,
            )?;
            count_primary_atomic_sketch(
                config,
                &sketch,
                Some(PrefilterGate::new(&prefilter, prefilter_limit)),
            )?;
            for extra in &config.extra {
                count_single_file_atomic_sketch(
                    config,
                    extra,
                    &sketch,
                    None,
                    Some(PrefilterGate::new(&prefilter, prefilter_limit)),
                )?;
            }
            record_stage_timing(stage_timings, "input_main_counting", started);
            return Ok(InputCounts::PrefilteredSketch {
                prefilter,
                limit: prefilter_limit,
                main: Box::new(InputCounts::AtomicSketch(sketch)),
            });
        }

        // Pass 2 (packed cells): exclusive-update main sketch, same gate.
        let started = Instant::now();
        let mut sketch = new_bounded_count_min_sketch_with_mask_seed(
            config,
            BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED,
        )?;
        count_primary_sketch(
            config,
            &mut sketch,
            Some(PrefilterGate::new(&prefilter, prefilter_limit)),
        )?;
        for extra in &config.extra {
            count_single_file_sketch(
                config,
                extra,
                &mut sketch,
                None,
                Some(PrefilterGate::new(&prefilter, prefilter_limit)),
            )?;
        }
        record_stage_timing(stage_timings, "input_main_counting", started);
        return Ok(InputCounts::PrefilteredSketch {
            prefilter,
            limit: prefilter_limit,
            main: Box::new(InputCounts::Sketch(sketch)),
        });
    }

    if use_atomic_count_min_sketch(config) {
        // No prefilter: 32-bit atomic sketch, optionally fed via the GPU path.
        let started = Instant::now();
        let sketch = new_atomic_count_min_sketch(config)?;
        if gpu_counting_supported(config) {
            count_primary_gpu_reduced_runs_atomic_sketch(config, &sketch)?;
        } else {
            count_primary_atomic_sketch(config, &sketch, None)?;
        }
        for extra in &config.extra {
            count_single_file_atomic_sketch(config, extra, &sketch, None, None)?;
        }
        record_stage_timing(stage_timings, "input_main_counting", started);
        return Ok(InputCounts::AtomicSketch(sketch));
    }
    if use_atomic_packed_input_sketch(config) {
        // Sub-32-bit cells with atomic updates (non-deterministic runs only).
        let started = Instant::now();
        let sketch = new_atomic_packed_count_min_sketch(config)?;
        count_primary_atomic_packed_sketch(config, &sketch)?;
        for extra in &config.extra {
            count_single_file_atomic_packed_sketch(config, extra, &sketch, None)?;
        }
        record_stage_timing(stage_timings, "input_main_counting", started);
        return Ok(InputCounts::AtomicPackedSketch(sketch));
    }
    // Fallback: packed sketch updated through exclusive (&mut) access.
    let started = Instant::now();
    let mut sketch = new_bounded_count_min_sketch(config)?;
    if gpu_counting_supported(config) {
        count_primary_gpu_reduced_runs_sketch(config, &mut sketch)?;
    } else {
        count_primary_sketch(config, &mut sketch, None)?;
    }
    for extra in &config.extra {
        count_single_file_sketch(config, extra, &mut sketch, None, None)?;
    }
    record_stage_timing(stage_timings, "input_main_counting", started);
    Ok(InputCounts::Sketch(sketch))
}
3888
3889fn validate_gpu_counting_request(config: &Config) -> Result<()> {
3890    if !config.gpu_counting {
3891        return Ok(());
3892    }
3893    ensure!(
3894        config.gpu_helper.is_some(),
3895        "gpucounting=t requires gpuhelper=<cuda_kmer_reduce_runs binary>"
3896    );
3897    ensure!(
3898        config.k <= 31,
3899        "gpucounting=t currently supports short k-mers only (k<=31)"
3900    );
3901    ensure!(
3902        !use_prefilter_collision_estimates(config),
3903        "gpucounting=t currently supports the main bounded sketch without prefilter=t"
3904    );
3905    Ok(())
3906}
3907
3908fn new_output_counts(config: &Config) -> Result<OutputCounts> {
3909    if use_bounded_input_sketch(config) {
3910        if config.count_up {
3911            return new_countup_output_counts(config);
3912        }
3913        if use_atomic_count_min_sketch(config) {
3914            new_atomic_output_count_min_sketch(config).map(OutputCounts::AtomicSketch)
3915        } else {
3916            new_bounded_output_count_min_sketch(config).map(OutputCounts::Sketch)
3917        }
3918    } else {
3919        Ok(OutputCounts::Exact(new_count_map(config)))
3920    }
3921}
3922
/// Builds the 32-bit atomic count-min sketch for kept-output counting.
///
/// Resolves the hash count (default 3, clamped to the KCountArray limit),
/// validates the table against the memory budget, then constructs the sketch
/// with the kept-output mask seed. Parallel replay is enabled unless the run
/// is deterministic.
fn new_atomic_output_count_min_sketch(config: &Config) -> Result<AtomicCountMinSketch> {
    let hashes = config
        .count_min
        .hashes
        .unwrap_or(3)
        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
    // Kept-output cells are always 32-bit on this path.
    let total_cells = output_count_min_total_cells(config, 32);
    ensure_count_min_budget_fits_memory(
        "output_kept",
        total_cells,
        32,
        output_count_min_memory_bytes(config, 32),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells = count_min_table_cells_from_total_bits_with_min_arrays(total_cells, 32, min_arrays);
    let update_mode = count_min_update_mode(config, 32, hashes);
    AtomicCountMinSketch::new_with_min_arrays_and_update_mode(
        cells,
        hashes,
        min_arrays,
        update_mode,
        kept_output_mask_seed(config),
    )
    .map(|sketch| sketch.with_parallel_replay(!config.deterministic))
}
3948
/// Builds the packed (configurable cell width) count-min sketch for
/// kept-output counting.
///
/// Mirrors the atomic variant: resolve hashes and bits, validate against the
/// memory budget, then construct with the kept-output mask seed and the
/// configured update mode.
fn new_bounded_output_count_min_sketch(config: &Config) -> Result<PackedCountMinSketch> {
    let hashes = config
        .count_min
        .hashes
        .unwrap_or(3)
        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
    let bits = config.count_min.bits.unwrap_or(32);
    let total_cells = output_count_min_total_cells(config, bits);
    ensure_count_min_budget_fits_memory(
        "output_kept",
        total_cells,
        bits,
        output_count_min_memory_bytes(config, bits),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells =
        count_min_table_cells_from_total_bits_with_min_arrays(total_cells, bits, min_arrays);
    PackedCountMinSketch::new_with_min_arrays_and_mask_seed(
        cells,
        hashes,
        bits,
        min_arrays,
        kept_output_mask_seed(config),
    )
    .map(|sketch| sketch.with_update_mode(count_min_update_mode(config, bits, hashes)))
}
3975
/// Builds the kept-counts sketch for count-up mode.
///
/// Cell width is sized from the adjusted target depth (4/8/16 bits), the
/// hash count is fixed at 3, and the table is validated against the
/// configured or automatic memory budget before construction.
fn new_countup_output_counts(config: &Config) -> Result<OutputCounts> {
    let bits = countup_output_count_bits(config);
    let hashes = 3;
    let total_cells = countup_output_total_cells(config, bits);
    ensure_count_min_budget_fits_memory(
        "count-up output",
        total_cells,
        bits,
        config
            .count_min
            .memory_bytes
            .or_else(|| automatic_count_min_memory_bytes(config)),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells =
        count_min_table_cells_from_total_bits_with_min_arrays(total_cells, bits, min_arrays);
    PackedCountMinSketch::new_with_min_arrays_and_mask_seed(
        cells,
        hashes,
        bits,
        min_arrays,
        countup_output_mask_seed(config),
    )
    .map(|sketch| sketch.with_update_mode(count_min_update_mode(config, bits, hashes)))
    .map(OutputCounts::Sketch)
}
4002
4003fn countup_output_count_bits(config: &Config) -> u8 {
4004    let target = countup_adjusted_target_depth(config);
4005    if target <= 15 {
4006        4
4007    } else if target <= 255 {
4008        8
4009    } else {
4010        16
4011    }
4012}
4013
4014fn countup_adjusted_target_depth(config: &Config) -> u64 {
4015    ((config.target_depth as f64) * 0.95).round().max(1.0) as u64
4016}
4017
4018fn countup_output_total_cells(config: &Config, bits: u8) -> usize {
4019    config
4020        .count_min
4021        .cells
4022        .unwrap_or_else(|| count_min_cells_from_memory(count_min_memory_bytes(config), bits))
4023        .max(1)
4024}
4025
/// Mask seed for the count-up kept-counts table; it occupies the same table
/// slot as the regular kept-output sketch.
fn countup_output_mask_seed(config: &Config) -> u64 {
    kept_output_mask_seed(config)
}
4029
4030fn kept_output_mask_seed(config: &Config) -> u64 {
4031    let preceding_tables = if use_prefilter_collision_estimates(config) {
4032        2
4033    } else {
4034        1
4035    };
4036    BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED
4037        .saturating_add(BBTOOLS_KCOUNT_ARRAY_MASK_SEED_STEP.saturating_mul(preceding_tables))
4038}
4039
/// Total cell budget for the kept-output count-min table.
///
/// Starts from an explicit `cells=` value or a memory-derived estimate.
/// When a prefilter memory fraction is active the output table gets the
/// remaining (1 - fraction) share; otherwise the budget is capped to the
/// number of distinct short k-mers.
fn output_count_min_total_cells(config: &Config, bits: u8) -> usize {
    let base = config
        .count_min
        .cells
        .unwrap_or_else(|| {
            count_min_cells_from_memory(output_count_min_memory_bytes(config, bits), bits)
        })
        .max(1);
    let Some(fraction_micros) = prefilter_memory_fraction_micros(config) else {
        return cap_main_cells_to_short_kmer_space(config, base);
    };
    // fraction is expressed in micros (parts per 1e6).
    let main_fraction = 1_000_000usize.saturating_sub(fraction_micros as usize);
    base.saturating_mul(main_fraction)
        .checked_div(1_000_000)
        .unwrap_or(0)
        .max(1)
}
4057
/// Memory budget used to size/validate the kept-output table.
///
/// Priority order: with explicit `cells=`, fall back through configured,
/// auto-derived, then automatic memory figures (budget check only); with
/// explicit memory, use it directly; otherwise scale the automatic budget
/// down to the output table's share.
fn output_count_min_memory_bytes(config: &Config, _bits: u8) -> Option<usize> {
    if config.count_min.cells.is_some() {
        return config
            .count_min
            .memory_bytes
            .or(config.auto_count_min_memory_bytes)
            .or_else(|| automatic_count_min_memory_bytes(config));
    }
    if config.count_min.memory_bytes.is_some() {
        return count_min_memory_bytes(config);
    }
    automatic_count_min_memory_bytes(config).map(output_count_min_auto_memory_bytes)
}
4071
4072fn output_count_min_auto_memory_bytes(memory_bytes: usize) -> usize {
4073    let min_memory = OUTPUT_COUNT_MIN_AUTO_MIN_MEMORY_BYTES.min(memory_bytes);
4074    scale_by_micros(memory_bytes, OUTPUT_COUNT_MIN_AUTO_FRACTION_MICROS)
4075        .max(min_memory)
4076        .min(memory_bytes)
4077        .max(1)
4078}
4079
4080fn use_atomic_count_min_sketch(config: &Config) -> bool {
4081    config.count_min.bits.unwrap_or(32) == 32
4082}
4083
4084fn use_atomic_packed_input_sketch(config: &Config) -> bool {
4085    !config.deterministic
4086        && config.count_min.bits.unwrap_or(32) < 32
4087        && !use_prefilter_collision_estimates(config)
4088        && !gpu_counting_supported(config)
4089}
4090
/// Builds the primary (first-table) atomic input sketch using the first
/// BBTools mask seed.
fn new_atomic_count_min_sketch(config: &Config) -> Result<AtomicCountMinSketch> {
    new_atomic_count_min_sketch_with_mask_seed(config, BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED)
}
4094
/// Builds a 32-bit atomic input count-min sketch with the given mask seed.
///
/// Hash count defaults to 3 and is clamped to the KCountArray limit; the
/// table is validated against the configured/auto memory budget before
/// construction. Parallel replay is enabled unless the run is deterministic.
fn new_atomic_count_min_sketch_with_mask_seed(
    config: &Config,
    mask_seed: u64,
) -> Result<AtomicCountMinSketch> {
    let hashes = config
        .count_min
        .hashes
        .unwrap_or(3)
        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
    let total_cells = main_count_min_total_cells(config, 32);
    ensure_count_min_budget_fits_memory(
        "main",
        total_cells,
        32,
        config
            .count_min
            .memory_bytes
            .or(config.auto_count_min_memory_bytes),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells = count_min_table_cells_from_total_bits_with_min_arrays(total_cells, 32, min_arrays);
    let update_mode = count_min_update_mode(config, 32, hashes);
    AtomicCountMinSketch::new_with_min_arrays_and_update_mode(
        cells,
        hashes,
        min_arrays,
        update_mode,
        mask_seed,
    )
    .map(|sketch| sketch.with_parallel_replay(!config.deterministic))
}
4126
/// Builds the primary (first-table) atomic packed input sketch using the
/// first BBTools mask seed.
fn new_atomic_packed_count_min_sketch(config: &Config) -> Result<AtomicPackedCountMinSketch> {
    new_atomic_packed_count_min_sketch_with_mask_seed(config, BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED)
}
4130
4131fn new_atomic_packed_count_min_sketch_with_mask_seed(
4132    config: &Config,
4133    mask_seed: u64,
4134) -> Result<AtomicPackedCountMinSketch> {
4135    let bits = config.count_min.bits.unwrap_or(32);
4136    let hashes = config
4137        .count_min
4138        .hashes
4139        .unwrap_or(BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS);
4140    let total_cells = main_count_min_total_cells(config, bits);
4141    ensure_count_min_budget_fits_memory(
4142        "count-min sketch",
4143        total_cells,
4144        bits,
4145        config
4146            .count_min
4147            .memory_bytes
4148            .or(config.auto_count_min_memory_bytes),
4149    )?;
4150    let min_arrays = hashes.max(BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS);
4151    let cells =
4152        count_min_table_cells_from_total_bits_with_min_arrays(total_cells, bits, min_arrays);
4153    AtomicPackedCountMinSketch::new_with_min_arrays_and_update_mode(
4154        cells,
4155        hashes,
4156        bits,
4157        min_arrays,
4158        count_min_update_mode(config, bits, hashes),
4159        mask_seed,
4160    )
4161}
4162
/// Builds the primary (first-table) packed input sketch using the first
/// BBTools mask seed.
fn new_bounded_count_min_sketch(config: &Config) -> Result<PackedCountMinSketch> {
    new_bounded_count_min_sketch_with_mask_seed(config, BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED)
}
4166
/// Builds a packed (configurable cell width) input count-min sketch with the
/// given mask seed.
///
/// Hash count defaults to 3 and is clamped to the KCountArray limit; the
/// table is validated against the configured/auto memory budget, then built
/// with the configured update mode.
fn new_bounded_count_min_sketch_with_mask_seed(
    config: &Config,
    mask_seed: u64,
) -> Result<PackedCountMinSketch> {
    let hashes = config
        .count_min
        .hashes
        .unwrap_or(3)
        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
    let bits = config.count_min.bits.unwrap_or(32);
    let total_cells = main_count_min_total_cells(config, bits);
    ensure_count_min_budget_fits_memory(
        "main",
        total_cells,
        bits,
        config
            .count_min
            .memory_bytes
            .or(config.auto_count_min_memory_bytes),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells =
        count_min_table_cells_from_total_bits_with_min_arrays(total_cells, bits, min_arrays);
    PackedCountMinSketch::new_with_min_arrays_and_mask_seed(
        cells, hashes, bits, min_arrays, mask_seed,
    )
    .map(|sketch| sketch.with_update_mode(count_min_update_mode(config, bits, hashes)))
}
4195
/// Builds the non-atomic (exclusive-update) prefilter count-min sketch.
///
/// Hashes default to half the main table's hash count; bits default to the
/// prefilter's low-bit width. The memory budget falls back through the
/// prefilter, count-min, and automatic settings in that order.
fn new_prefilter_count_min_sketch(config: &Config) -> Result<PackedCountMinSketch> {
    let hashes = config
        .prefilter
        .hashes
        .unwrap_or_else(|| default_prefilter_hashes(config))
        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
    let bits = config.prefilter.bits.unwrap_or(DEFAULT_PREFILTER_BITS);
    let total_cells = prefilter_total_cells(config, bits).max(1);
    ensure_count_min_budget_fits_memory(
        "prefilter",
        total_cells,
        bits,
        config
            .prefilter
            .memory_bytes
            .or(config.count_min.memory_bytes)
            .or(config.auto_count_min_memory_bytes),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells =
        count_min_table_cells_from_total_bits_with_min_arrays(total_cells, bits, min_arrays);
    PackedCountMinSketch::new_with_min_arrays(cells, hashes, bits, min_arrays)
        .map(|sketch| sketch.with_update_mode(count_min_update_mode(config, bits, hashes)))
}
4220
4221fn new_input_prefilter_count_min_sketch(config: &Config) -> Result<PrefilterCountMinSketch> {
4222    if config.deterministic {
4223        return new_prefilter_count_min_sketch(config).map(PrefilterCountMinSketch::Packed);
4224    }
4225    new_atomic_packed_prefilter_count_min_sketch(config).map(PrefilterCountMinSketch::AtomicPacked)
4226}
4227
/// Builds the atomic-packed prefilter count-min sketch (non-deterministic
/// runs).
///
/// Same sizing and budget rules as the packed prefilter variant; always uses
/// the first BBTools mask seed since the prefilter is the first table built.
fn new_atomic_packed_prefilter_count_min_sketch(
    config: &Config,
) -> Result<AtomicPackedCountMinSketch> {
    let hashes = config
        .prefilter
        .hashes
        .unwrap_or_else(|| default_prefilter_hashes(config))
        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
    let bits = config.prefilter.bits.unwrap_or(DEFAULT_PREFILTER_BITS);
    let total_cells = prefilter_total_cells(config, bits).max(1);
    ensure_count_min_budget_fits_memory(
        "prefilter",
        total_cells,
        bits,
        config
            .prefilter
            .memory_bytes
            .or(config.count_min.memory_bytes)
            .or(config.auto_count_min_memory_bytes),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells =
        count_min_table_cells_from_total_bits_with_min_arrays(total_cells, bits, min_arrays);
    AtomicPackedCountMinSketch::new_with_min_arrays_and_update_mode(
        cells,
        hashes,
        bits,
        min_arrays,
        count_min_update_mode(config, bits, hashes),
        BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
    )
}
4260
4261fn default_prefilter_hashes(config: &Config) -> usize {
4262    let main_hashes = config
4263        .count_min
4264        .hashes
4265        .unwrap_or(3)
4266        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
4267    main_hashes.div_ceil(2)
4268}
4269
4270fn count_min_update_mode(config: &Config, bits: u8, hashes: usize) -> CountMinUpdateMode {
4271    // KCountArray7MTA defaults to locked/symmetric writes for multi-hash,
4272    // multi-bit cells; lockedincrement=f / symmetricwrite=f opts into the
4273    // faster independent-row update path.
4274    if bits > 1 && hashes > 1 && config.locked_increment.unwrap_or(true) {
4275        CountMinUpdateMode::Conservative
4276    } else {
4277        CountMinUpdateMode::Independent
4278    }
4279}
4280
4281fn count_min_memory_bytes(config: &Config) -> Option<usize> {
4282    config
4283        .count_min
4284        .memory_bytes
4285        .or_else(|| automatic_count_min_memory_bytes(config))
4286}
4287
/// Total cell budget for the main input count-min table.
///
/// Starts from an explicit `cells=` value or a memory-derived estimate.
/// When a prefilter memory fraction is active, the main table gets the
/// remaining (1 - fraction) share; otherwise the budget is capped to the
/// number of distinct short k-mers.
fn main_count_min_total_cells(config: &Config, bits: u8) -> usize {
    let base = config
        .count_min
        .cells
        .unwrap_or_else(|| count_min_cells_from_memory(count_min_memory_bytes(config), bits))
        .max(1);
    let Some(fraction_micros) = prefilter_memory_fraction_micros(config) else {
        return cap_main_cells_to_short_kmer_space(config, base);
    };
    // fraction is expressed in micros (parts per 1e6).
    let main_fraction = 1_000_000usize.saturating_sub(fraction_micros as usize);
    base.saturating_mul(main_fraction)
        .checked_div(1_000_000)
        .unwrap_or(0)
        .max(1)
}
4303
4304fn cap_main_cells_to_short_kmer_space(config: &Config, cells: usize) -> usize {
4305    if use_prefilter_collision_estimates(config) {
4306        return cells;
4307    }
4308    short_kmer_space_cells(config.k)
4309        .map(|cap| cells.min(cap))
4310        .unwrap_or(cells)
4311        .max(1)
4312}
4313
/// Size of the short-k-mer space (4^k) when it fits in a usize shift,
/// `None` for long k-mers (k >= 32) or on shift overflow.
fn short_kmer_space_cells(k: usize) -> Option<usize> {
    match k {
        0..=31 => 1usize.checked_shl((2 * k) as u32),
        _ => None,
    }
}
4320
/// Fraction (in micros, parts per 1e6) of the cell budget reserved for the
/// prefilter table, or `None` when no fractional split applies.
///
/// Explicit prefilter cell/memory settings (or a forced disable) suppress
/// the split; an explicit positive fraction wins; otherwise an enabled
/// prefilter with a bounded input sketch uses the default fraction.
fn prefilter_memory_fraction_micros(config: &Config) -> Option<u32> {
    if config.prefilter.force_disabled {
        return None;
    }
    if config.prefilter.cells.is_some() || config.prefilter.memory_bytes.is_some() {
        return None;
    }
    if let Some(fraction) = config
        .prefilter
        .memory_fraction_micros
        .filter(|fraction| *fraction > 0)
    {
        return Some(fraction);
    }
    if config.prefilter.enabled && use_bounded_input_sketch(config) {
        return Some(DEFAULT_PREFILTER_FRACTION_MICROS);
    }
    None
}
4340
/// Scales `value` by a fraction expressed in micros (parts per 1e6), with
/// saturating multiplication.
fn scale_by_micros(value: usize, micros: u32) -> usize {
    // The divisor is a non-zero constant, so plain division cannot panic.
    value.saturating_mul(micros as usize) / 1_000_000
}
4347
/// Allocates a `Vec<u64>` of `len` zeroed elements directly from the
/// allocator, avoiding element-by-element initialization.
fn zeroed_u64_vec(len: usize) -> Result<Vec<u64>> {
    // SAFETY: an all-zero bit pattern is a valid initialized `u64`.
    unsafe { zeroed_vec_with_layout::<u64>(len, "u64") }
}
4352
/// Allocates a `Vec<AtomicU32>` of `len` zeroed elements directly from the
/// allocator, avoiding element-by-element initialization.
fn zeroed_atomic_u32_vec(len: usize) -> Result<Vec<AtomicU32>> {
    // SAFETY: an all-zero bit pattern is a valid initialized `AtomicU32`.
    unsafe { zeroed_vec_with_layout::<AtomicU32>(len, "AtomicU32") }
}
4357
/// Allocates a `Vec<AtomicU64>` of `len` zeroed elements directly from the
/// allocator, avoiding element-by-element initialization.
fn zeroed_atomic_u64_vec(len: usize) -> Result<Vec<AtomicU64>> {
    // SAFETY: an all-zero bit pattern is a valid initialized `AtomicU64`.
    unsafe { zeroed_vec_with_layout::<AtomicU64>(len, "AtomicU64") }
}
4362
/// Allocates a zero-initialized `Vec<T>` directly from the global allocator.
///
/// `type_name` is used only in error messages. An empty request returns an
/// empty `Vec` without touching the allocator.
///
/// # Safety
///
/// The caller must guarantee that an all-zero bit pattern is a valid,
/// initialized value of `T`.
unsafe fn zeroed_vec_with_layout<T>(len: usize, type_name: &str) -> Result<Vec<T>> {
    if len == 0 {
        return Ok(Vec::new());
    }
    let layout = Layout::array::<T>(len)
        .with_context(|| format!("allocating zeroed {type_name} vector layout"))?;
    // SAFETY: The caller guarantees an all-zero bit pattern is valid for `T`.
    // The requested layout is for exactly `len` values of `T`, and the returned
    // pointer is converted back into a Vec with the same length and capacity.
    let ptr = unsafe { alloc_zeroed(layout) };
    if ptr.is_null() {
        bail!("allocating zeroed {type_name} vector failed for {len} elements");
    }
    // SAFETY: `ptr` came from the global allocator with `Layout::array::<T>(len)`,
    // is non-null, and points to zero-initialized memory valid for `T` by the
    // caller contract. Vec will deallocate it with the matching layout.
    Ok(unsafe { Vec::from_raw_parts(ptr.cast::<T>(), len, len) })
}
4381
4382fn count_min_cells_from_memory(memory_bytes: Option<usize>, bits: u8) -> usize {
4383    let Some(memory_bytes) = memory_bytes else {
4384        return DEFAULT_PREFILTER_CELLS;
4385    };
4386    let bits_total = memory_bytes.saturating_mul(8);
4387    let bits_per_cell = bits.max(1) as usize;
4388    (bits_total / bits_per_cell).max(1)
4389}
4390
4391fn count_min_total_bytes(total_cells: usize, bits: u8) -> Result<usize> {
4392    let total_cells = total_cells.max(1);
4393    let bits = bits.max(1) as usize;
4394    let total_bits = total_cells
4395        .checked_mul(bits)
4396        .context("bounded count-min sketch size overflowed")?;
4397    Ok(total_bits.div_ceil(8).max(1))
4398}
4399
/// Whether a packed sketch of this size should keep an explicit list of
/// occupied slots (worthwhile only up to `PACKED_SKETCH_TRACKED_SLOT_LIMIT`).
fn packed_sketch_should_track_slots(cells: usize) -> bool {
    cells <= PACKED_SKETCH_TRACKED_SLOT_LIMIT
}
4403
4404fn safe_explicit_count_min_bytes(available: usize) -> usize {
4405    available
4406        .saturating_mul(EXPLICIT_COUNT_MIN_SAFE_MEMORY_PERCENT)
4407        .checked_div(100)
4408        .unwrap_or(0)
4409        .max(1)
4410}
4411
4412fn count_min_safe_budget_bytes(
4413    configured_memory_bytes: Option<usize>,
4414    available_memory_bytes: Option<usize>,
4415) -> Option<usize> {
4416    let safe_available = available_memory_bytes.map(safe_explicit_count_min_bytes);
4417    match (configured_memory_bytes, safe_available) {
4418        (Some(configured), Some(available)) => Some(configured.min(available)),
4419        (Some(configured), None) => Some(configured),
4420        (None, Some(available)) => Some(available),
4421        (None, None) => None,
4422    }
4423}
4424
4425fn ensure_count_min_budget_fits_ceiling(
4426    label: &str,
4427    total_cells: usize,
4428    bits: u8,
4429    safe_budget: usize,
4430) -> Result<()> {
4431    let requested = count_min_total_bytes(total_cells, bits)?;
4432    if requested > safe_budget {
4433        bail!(
4434            "{label} count-min table requests {requested} bytes ({total_cells} cells x {} bits), above safe memory budget {safe_budget} bytes; reduce cells/matrixbits/sketchmemory/mem",
4435            bits.max(1)
4436        );
4437    }
4438    Ok(())
4439}
4440
4441fn ensure_count_min_budget_fits_memory(
4442    label: &str,
4443    total_cells: usize,
4444    bits: u8,
4445    configured_memory_bytes: Option<usize>,
4446) -> Result<()> {
4447    if let Some(safe_budget) =
4448        count_min_safe_budget_bytes(configured_memory_bytes, system_available_memory_bytes())
4449    {
4450        ensure_count_min_budget_fits_ceiling(label, total_cells, bits, safe_budget)
4451    } else {
4452        count_min_total_bytes(total_cells, bits).map(|_| ())
4453    }
4454}
4455
#[cfg(test)]
fn count_min_table_cells_from_total(total_cells: usize, hashes: usize) -> usize {
    // `hashes` is accepted for signature compatibility only; table sizing
    // does not depend on the hash count.
    let _ignored = hashes;
    count_min_table_cells_from_total_bits(total_cells, 32)
}
4461
#[cfg(test)]
fn count_min_table_cells_from_total_bits(total_cells: usize, bits: u8) -> usize {
    // Test helper pinned to the default minimum shard count.
    let min_arrays = BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS;
    count_min_table_cells_from_total_bits_with_min_arrays(total_cells, bits, min_arrays)
}
4470
4471fn count_min_table_cells_from_total_bits_with_min_arrays(
4472    total_cells: usize,
4473    bits: u8,
4474    min_arrays: usize,
4475) -> usize {
4476    let total_cells = total_cells.max(1);
4477    let arrays = kcount_array_count(total_cells, bits, min_arrays);
4478    if arrays <= 1 {
4479        return prime_at_most(total_cells);
4480    }
4481    prime_at_most(total_cells.div_ceil(arrays)).saturating_mul(arrays)
4482}
4483
4484fn kcount_array_min_arrays(config: &Config) -> usize {
4485    kcount_array_min_arrays_for_threads(config.threads.unwrap_or_else(rayon::current_num_threads))
4486}
4487
4488fn kcount_array_min_arrays_for_threads(threads: usize) -> usize {
4489    let target = threads.max(BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS);
4490    let mut arrays = BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS;
4491    while arrays < target {
4492        let next = arrays.saturating_mul(2);
4493        if next == arrays {
4494            break;
4495        }
4496        arrays = next;
4497    }
4498    arrays
4499}
4500
4501fn kcount_array_lock_index(key: &KmerKey) -> usize {
4502    let raw = match key {
4503        KmerKey::Short(raw) | KmerKey::LongHash(raw) => *raw,
4504    };
4505    ((raw & (i64::MAX as u64)) % BBTOOLS_KCOUNT_ARRAY_LOCKS as u64) as usize
4506}
4507
/// Decide how many parallel arrays to shard `desired_cells` across,
/// mirroring the BBTools KCountArray sizing rules.
fn kcount_array_count(desired_cells: usize, bits: u8, min_arrays: usize) -> usize {
    // Tiny tables are not worth sharding at all.
    if desired_cells < BBTOOLS_KCOUNT_ARRAY_SHARD_MIN_CELLS {
        return 1;
    }
    let bits = bits.clamp(1, 64) as usize;
    // Round the requested minimum up to the power-of-two array schedule.
    let min_arrays = kcount_array_min_arrays_for_threads(min_arrays);
    // Number of 32-bit words needed for all cells at `bits` bits per cell.
    // `checked_div(32)` can never fail for a non-zero constant divisor; the
    // `unwrap_or(usize::MAX)` fallback is purely defensive.
    let words = desired_cells
        .saturating_mul(bits)
        .saturating_add(31)
        .checked_div(32)
        .unwrap_or(usize::MAX)
        .max(min_arrays);
    let mut arrays = min_arrays;
    // Double until each shard's word count fits under i32::MAX — presumably
    // matching BBTools' Java int-indexed backing arrays (TODO confirm).
    while words / arrays >= i32::MAX as usize {
        arrays = arrays.saturating_mul(2);
    }
    // Never use more arrays than there are cells.
    while arrays > desired_cells {
        arrays /= 2;
    }
    arrays.max(1)
}
4529
4530fn prime_at_most(value: usize) -> usize {
4531    if value <= 2 {
4532        return value.max(1);
4533    }
4534
4535    let mut candidate = if value.is_multiple_of(2) {
4536        value - 1
4537    } else {
4538        value
4539    };
4540    while candidate > 2 {
4541        if is_prime(candidate) {
4542            return candidate;
4543        }
4544        candidate -= 2;
4545    }
4546    2
4547}
4548
/// Deterministic trial-division primality test using the 6k±1 wheel.
/// The `divisor <= value / divisor` bound avoids squaring overflow.
fn is_prime(value: usize) -> bool {
    match value {
        0 | 1 => false,
        2 | 3 => true,
        _ if value % 2 == 0 || value % 3 == 0 => false,
        _ => {
            let mut divisor = 5usize;
            while divisor <= value / divisor {
                if value % divisor == 0 || value % (divisor + 2) == 0 {
                    return false;
                }
                divisor += 6;
            }
            true
        }
    }
}
4566
4567fn automatic_count_min_should_use(config: &Config) -> bool {
4568    if !config.auto_count_min || config.force_exact_counts {
4569        return false;
4570    }
4571    if config
4572        .table_reads
4573        .or(config.max_reads)
4574        .is_some_and(|reads| reads >= config.auto_count_min_read_threshold)
4575    {
4576        return true;
4577    }
4578    input_metadata_bytes(config)
4579        .is_some_and(|bytes| bytes >= config.auto_count_min_input_bytes as u64)
4580}
4581
4582fn automatic_count_min_memory_bytes(config: &Config) -> Option<usize> {
4583    if !automatic_count_min_should_use(config) {
4584        return None;
4585    }
4586    let raw_memory = config
4587        .auto_count_min_memory_bytes
4588        .unwrap_or_else(default_auto_count_min_memory_bytes);
4589    Some(automatic_count_min_filter_memory_bytes(config, raw_memory))
4590}
4591
4592fn automatic_count_min_filter_memory_bytes(config: &Config, raw_memory: usize) -> usize {
4593    let usable = bbtools_usable_table_memory_bytes(config, raw_memory).max(1);
4594    if config.count_up {
4595        (usable / 2).max(1)
4596    } else {
4597        usable
4598    }
4599}
4600
4601fn default_auto_count_min_memory_bytes() -> usize {
4602    system_available_memory_bytes()
4603        .map(|bytes| {
4604            (bytes / 4).clamp(
4605                AUTO_COUNT_MIN_MIN_MEMORY_BYTES,
4606                AUTO_COUNT_MIN_MAX_MEMORY_BYTES,
4607            )
4608        })
4609        .unwrap_or(AUTO_COUNT_MIN_FALLBACK_MEMORY_BYTES)
4610}
4611
/// Estimate how many of `memory_bytes` can actually back k-mer tables,
/// mirroring BBTools' heuristics (fixed headroom, 73%/45% fractions).
fn bbtools_usable_table_memory_bytes(config: &Config, memory_bytes: usize) -> usize {
    // Usable memory is the larger of (total - headroom) * 0.73 and total * 0.45,
    // floored at one byte before truncating back to usize.
    let after_headroom = memory_bytes.saturating_sub(BBTOOLS_MEMORY_HEADROOM_BYTES) as f64 * 0.73;
    let fraction = memory_bytes as f64 * 0.45;
    let mut usable = after_headroom.max(fraction).max(1.0) as usize;
    if histogram_memory_is_reserved(config) {
        // Reserve hist_len u64 slots per worker thread plus one extra copy.
        let threads = config
            .threads
            .unwrap_or_else(rayon::current_num_threads)
            .max(1);
        let hist_bytes = config
            .hist_len
            .saturating_mul(8)
            .saturating_mul(threads.saturating_add(1));
        usable = usable.saturating_sub(hist_bytes);
    }
    // Halve the budget when more than one build pass is configured.
    if config.build_passes > 1 {
        usable /= 2;
    }
    usable.max(1)
}
4632
4633fn histogram_memory_is_reserved(config: &Config) -> bool {
4634    config.hist_in.is_some()
4635        || config.hist_out.is_some()
4636        || config.peaks_in.is_some()
4637        || config.peaks_out.is_some()
4638}
4639
/// Read `MemAvailable` from `/proc/meminfo` and convert kB to bytes.
/// Returns `None` off Linux, on parse failure, or on multiply overflow.
fn system_available_memory_bytes() -> Option<usize> {
    let meminfo = fs::read_to_string("/proc/meminfo").ok()?;
    meminfo
        .lines()
        .find_map(|line| line.strip_prefix("MemAvailable:"))
        .and_then(|rest| rest.split_whitespace().next())
        .and_then(|kb| kb.parse::<usize>().ok())
        .and_then(|kb| kb.checked_mul(1024))
}
4650
4651fn input_metadata_bytes(config: &Config) -> Option<u64> {
4652    let mut total = 0u64;
4653    let mut found = false;
4654    for path in input_metadata_paths(config) {
4655        let Ok(metadata) = fs::metadata(path) else {
4656            continue;
4657        };
4658        if metadata.is_file() {
4659            total = total.saturating_add(metadata.len());
4660            found = true;
4661        }
4662    }
4663    found.then_some(total)
4664}
4665
4666fn input_metadata_paths(config: &Config) -> Vec<PathBuf> {
4667    let mut paths = Vec::new();
4668    if let Some(path) = &config.in1 {
4669        paths.extend(metadata_path_expansion(path));
4670    }
4671    if let Some(path) = &config.in2 {
4672        paths.extend(metadata_path_expansion(path));
4673    }
4674    for path in &config.extra {
4675        paths.extend(metadata_path_expansion(path));
4676    }
4677    paths
4678}
4679
4680fn metadata_path_expansion(path: &Path) -> Vec<PathBuf> {
4681    if path.exists() {
4682        return vec![path.to_path_buf()];
4683    }
4684    let text = path.to_string_lossy();
4685    if text.contains(',') {
4686        split_path_list(&text)
4687    } else {
4688        vec![path.to_path_buf()]
4689    }
4690}
4691
4692fn apply_output_count_adjustments(config: &Config, counts: &mut OutputCounts) {
4693    let OutputCounts::Exact(counts) = counts else {
4694        return;
4695    };
4696    apply_trusted_build_pass_filter(config, counts);
4697    apply_prefilter_collision_estimates(config, counts);
4698    apply_count_min_collision_estimates(config, counts);
4699}
4700
4701fn apply_trusted_build_pass_filter(config: &Config, counts: &mut CountMap) {
4702    if config.build_passes <= 1 || counts.len() < 2 {
4703        return;
4704    }
4705    let decrement = (config.build_passes as u64).saturating_sub(1);
4706    for count in counts.values_mut() {
4707        if *count > 1 {
4708            *count = count.saturating_sub(decrement).max(1);
4709        }
4710    }
4711}
4712
4713fn apply_prefilter_collision_estimates(config: &Config, counts: &mut CountMap) {
4714    if config.force_exact_counts {
4715        return;
4716    }
4717    if !use_prefilter_collision_estimates(config) {
4718        return;
4719    };
4720    if counts.len() < 2 {
4721        return;
4722    }
4723    let entries = sorted_count_entries(counts);
4724    let Ok(mut sketch) = new_prefilter_count_min_sketch(config) else {
4725        return;
4726    };
4727    sketch.add_key_counts(counts);
4728
4729    for (key, exact) in entries {
4730        let estimate = sketch.depth(&key);
4731        if estimate < sketch.max_count {
4732            counts.insert(key, estimate);
4733        } else {
4734            counts.insert(key, exact);
4735        }
4736    }
4737}
4738
4739fn use_prefilter_collision_estimates(config: &Config) -> bool {
4740    if config.prefilter.force_disabled {
4741        return false;
4742    }
4743    config.prefilter.cells.is_some()
4744        || config.prefilter.hashes.is_some()
4745        || config.prefilter.memory_bytes.is_some()
4746        || config
4747            .prefilter
4748            .memory_fraction_micros
4749            .is_some_and(|fraction| fraction > 0)
4750        || (config.prefilter.enabled && use_bounded_input_sketch(config))
4751}
4752
/// Resolve the prefilter count-min table size in cells at `bits` bits each.
///
/// Priority: explicit cell count, then explicit byte budget, then a fraction
/// of the main table's size, then `DEFAULT_PREFILTER_CELLS`.
fn prefilter_total_cells(config: &Config, bits: u8) -> usize {
    if let Some(cells) = config.prefilter.cells {
        return cells.max(1);
    }
    if let Some(memory_bytes) = config.prefilter.memory_bytes {
        return count_min_cells_from_memory(Some(memory_bytes), bits);
    }
    if let Some(fraction_micros) = prefilter_memory_fraction_micros(config) {
        // Size relative to the main count-min table when its cell count is known.
        if let Some(total_cells) = config.count_min.cells {
            let main_bits = config.count_min.bits.unwrap_or(32).max(1) as usize;
            // Scale the main table's bit footprint by the fraction, never
            // dropping below one prefilter cell's worth of bits.
            let prefilter_bits = scale_by_micros(
                total_cells.max(1).saturating_mul(main_bits),
                fraction_micros,
            )
            .max(bits.max(1) as usize);
            return (prefilter_bits / bits.max(1) as usize).max(1);
        }
        // Otherwise scale whichever memory budget is configured.
        if let Some(memory_bytes) =
            count_min_memory_bytes(config).or(config.auto_count_min_memory_bytes)
        {
            let prefilter_memory = scale_by_micros(memory_bytes, fraction_micros).max(1);
            return count_min_cells_from_memory(Some(prefilter_memory), bits);
        }
    }
    DEFAULT_PREFILTER_CELLS
}
4779
4780fn apply_count_min_collision_estimates(config: &Config, counts: &mut CountMap) {
4781    if config.force_exact_counts {
4782        return;
4783    }
4784    let Some(cells) = config.count_min.cells else {
4785        return;
4786    };
4787    if cells == 0 || counts.len() < 2 {
4788        return;
4789    }
4790    let entries = sorted_count_entries(counts);
4791    let Ok(mut sketch) = new_bounded_count_min_sketch(config) else {
4792        return;
4793    };
4794    sketch.add_key_counts(counts);
4795
4796    for (key, exact) in entries {
4797        let exact = exact.min(sketch.max_count);
4798        let estimate = sketch.depth(&key).max(exact).min(sketch.max_count);
4799        counts.insert(key, estimate);
4800    }
4801}
4802
4803fn sorted_count_entries(counts: &CountMap) -> Vec<(KmerKey, u64)> {
4804    let mut entries: Vec<_> = counts
4805        .iter()
4806        .map(|(key, &count)| (key.clone(), count))
4807        .collect();
4808    entries.sort_unstable_by(|(left, _), (right, _)| left.cmp(right));
4809    entries
4810}
4811
impl PackedCountMinSketch {
    /// Test-only constructor using the default minimum array count.
    #[cfg(test)]
    fn new(cells: usize, hashes: usize, bits: u8) -> Result<Self> {
        Self::new_with_min_arrays(cells, hashes, bits, BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS)
    }

    /// Construct with an explicit minimum shard count and the default mask seed.
    fn new_with_min_arrays(
        cells: usize,
        hashes: usize,
        bits: u8,
        min_arrays: usize,
    ) -> Result<Self> {
        Self::new_with_min_arrays_and_mask_seed(
            cells,
            hashes,
            bits,
            min_arrays,
            BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
        )
    }

    /// Build a packed count-min sketch.
    ///
    /// `cells` is clamped to at least 1, `hashes` to
    /// `1..=BBTOOLS_KCOUNT_ARRAY_MAX_HASHES`, and `bits` (cell width) to
    /// `1..=64`. Cells are bit-packed into a zero-initialized `u64` array.
    ///
    /// # Errors
    ///
    /// Fails when the bit count overflows `usize` or allocation fails.
    fn new_with_min_arrays_and_mask_seed(
        cells: usize,
        hashes: usize,
        bits: u8,
        min_arrays: usize,
        mask_seed: u64,
    ) -> Result<Self> {
        let cells = cells.max(1);
        let hashes = hashes.clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
        let bits = bits.clamp(1, 64);
        let layout = KCountArrayLayout::new_with_min_arrays_and_mask_seed(
            cells, bits, min_arrays, mask_seed,
        );
        // 64-bit cells map one-to-one onto words; narrower cells are packed.
        let word_count = if bits == 64 {
            cells
        } else {
            let total_bits = cells
                .checked_mul(bits as usize)
                .context("bounded sketch bit count overflowed")?;
            total_bits.div_ceil(64)
        };
        let words = zeroed_u64_vec(word_count).context("allocating bounded count-min sketch")?;
        Ok(Self {
            cells,
            hashes,
            bits,
            max_count: count_min_max_count(bits),
            layout,
            update_mode: CountMinUpdateMode::Conservative,
            words,
            increments: 0,
            occupied_slots: 0,
            // Track individual occupied slots only while the table is small
            // enough for the list to pay off.
            tracked_slots: packed_sketch_should_track_slots(cells).then(Vec::new),
        })
    }

    /// Builder-style setter for the update strategy.
    fn with_update_mode(mut self, update_mode: CountMinUpdateMode) -> Self {
        self.update_mode = update_mode;
        self
    }

    /// Summarize this sketch's layout for diagnostics/logging.
    fn layout_summary(
        &self,
        table: &'static str,
        prefilter_limit: Option<u64>,
    ) -> SketchLayoutSummary {
        SketchLayoutSummary {
            table,
            kind: "packed",
            cells: self.cells,
            hashes: self.hashes,
            bits: self.bits,
            arrays: self.layout.array_count(),
            cells_per_array: self.layout.cells_per_array,
            mask_seed: self.layout.mask_seed,
            update_mode: self.update_mode.as_str(),
            max_count: self.max_count,
            memory_bytes: self.estimated_memory_bytes(),
            prefilter_limit,
        }
    }

    /// Approximate heap usage: the packed words plus the tracked-slot list.
    fn estimated_memory_bytes(&self) -> usize {
        self.words
            .len()
            .saturating_mul(std::mem::size_of::<u64>())
            .saturating_add(self.tracked_slot_memory_bytes())
    }

    /// Bytes held by the optional tracked-slot list (0 when tracking is off).
    fn tracked_slot_memory_bytes(&self) -> usize {
        self.tracked_slots.as_ref().map_or(0, |slots| {
            slots
                .capacity()
                .saturating_mul(std::mem::size_of::<usize>())
        })
    }

    /// Record one observation of `key` and bump the total increment counter.
    fn increment(&mut self, key: &KmerKey) {
        self.add_key_count(key, 1);
        self.increments = self.increments.saturating_add(1);
    }

    /// Record `count` observations of `key`, discarding the pre-update depth.
    fn add_key_count(&mut self, key: &KmerKey, count: u64) {
        let _ = self.increment_and_return_unincremented(key, count);
    }

    /// Record `count` observations of `key` and return the depth *before*
    /// the update (minimum over the key's cells). Dispatches to specialized
    /// fast paths for common (bits, hashes) combinations.
    fn increment_and_return_unincremented(&mut self, key: &KmerKey, count: u64) -> u64 {
        if count == 0 {
            return self.depth(key);
        }
        if self.update_mode == CountMinUpdateMode::Independent {
            return self.increment_independent_and_return_unincremented(key, count);
        }
        if self.bits == 2 && self.hashes == 2 {
            return self.increment_2bit_2hash_conservative_and_return_unincremented(key, count);
        }
        if self.bits == 16 && self.hashes == 3 {
            return self.increment_16bit_3hash_conservative_and_return_unincremented(key, count);
        }
        // Generic conservative update: raise only the cells below the new
        // target (previous minimum + increment), capped at max_count.
        let target_increment = count.min(self.max_count);
        let mut slots = [0usize; 16];
        let mut min_depth = self.max_count;
        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
        for slot in slots.iter().take(self.hashes) {
            min_depth = min_depth.min(self.cell(*slot));
        }
        if min_depth >= self.max_count {
            // Already saturated; nothing to write.
            return min_depth;
        }
        let target = min_depth
            .saturating_add(target_increment)
            .min(self.max_count);
        let mut previous_min = self.max_count;
        for slot in slots.iter().take(self.hashes) {
            let previous = self.cell(*slot);
            previous_min = previous_min.min(previous);
            if previous < target {
                self.set_cell_with_previous(*slot, previous, target);
            }
        }
        previous_min
    }

    /// Conservative-update fast path for 16-bit cells with 3 hashes.
    fn increment_16bit_3hash_conservative_and_return_unincremented(
        &mut self,
        key: &KmerKey,
        count: u64,
    ) -> u64 {
        let [first, second, third] = count_min_three_buckets_raw(raw_kmer_key(key), self.layout);
        let first_depth = self.cell_16bit(first);
        let second_depth = self.cell_16bit(second);
        let third_depth = self.cell_16bit(third);
        let min_depth = first_depth.min(second_depth).min(third_depth);
        if min_depth >= self.max_count {
            return min_depth;
        }
        let target = min_depth
            .saturating_add(count.min(self.max_count))
            .min(self.max_count);
        // Only cells below the target are raised (conservative update).
        if first_depth < target {
            self.set_cell_16bit_with_previous(first, first_depth, target);
        }
        if second_depth < target {
            self.set_cell_16bit_with_previous(second, second_depth, target);
        }
        if third_depth < target {
            self.set_cell_16bit_with_previous(third, third_depth, target);
        }
        min_depth
    }

    /// Conservative-update fast path for 2-bit cells with 2 hashes.
    fn increment_2bit_2hash_conservative_and_return_unincremented(
        &mut self,
        key: &KmerKey,
        count: u64,
    ) -> u64 {
        let [first, second] = count_min_two_buckets(key, self.layout);
        let first_depth = self.cell_2bit(first);
        let second_depth = self.cell_2bit(second);
        let min_depth = first_depth.min(second_depth);
        if min_depth >= self.max_count {
            return min_depth;
        }
        let target = min_depth
            .saturating_add(count.min(self.max_count))
            .min(self.max_count);
        if first_depth < target {
            self.set_cell_2bit_with_previous(first, first_depth, target);
        }
        if second_depth < target {
            self.set_cell_2bit_with_previous(second, second_depth, target);
        }
        min_depth
    }

    /// Independent update: every hashed cell is increased by `count`
    /// (classic count-min), rather than only those below the minimum+count.
    fn increment_independent_and_return_unincremented(&mut self, key: &KmerKey, count: u64) -> u64 {
        if count == 0 {
            return self.depth(key);
        }
        let increment = count.min(self.max_count);
        let mut previous_min = self.max_count;
        let mut slots = [0usize; 16];
        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
        for slot in slots.iter().take(self.hashes) {
            let previous = self.cell(*slot);
            previous_min = previous_min.min(previous);
            let next = previous.saturating_add(increment).min(self.max_count);
            self.set_cell_with_previous(*slot, previous, next);
        }
        previous_min
    }

    /// Bulk-add an exact count map, using the 16-bit/3-hash fast path when
    /// the configuration matches.
    fn add_key_counts(&mut self, counts: &CountMap) {
        if self.update_mode == CountMinUpdateMode::Conservative
            && self.bits == 16
            && self.hashes == 3
        {
            for (key, count) in counts {
                let _ =
                    self.increment_16bit_3hash_conservative_and_return_unincremented(key, *count);
            }
            return;
        }
        for (key, count) in counts {
            self.add_key_count(key, *count);
        }
    }

    /// Account for externally performed increments in the running total.
    fn add_key_increments(&mut self, key_increments: u64) {
        self.increments = self.increments.saturating_add(key_increments);
    }

    /// Depth query fast path for 16-bit cells with 3 hashes.
    fn depth_16bit_3hash(&self, key: &KmerKey) -> u64 {
        let [first, second, third] = count_min_three_buckets_raw(raw_kmer_key(key), self.layout);
        self.cell_16bit(first)
            .min(self.cell_16bit(second))
            .min(self.cell_16bit(third))
    }

    /// Count slots whose stored depth is at least `min_depth`, using the
    /// tracked-slot list when available and a parallel full scan otherwise.
    fn occupied_slots_at_least(&self, min_depth: u64) -> usize {
        if min_depth > self.max_count {
            return 0;
        }
        if min_depth <= 1 {
            // Every occupied slot trivially qualifies.
            return self.occupied_slots;
        }
        // min_depth is already >= 2 here, so this max(1) is a defensive no-op.
        let min_depth = min_depth.max(1);
        if let Some(slots) = &self.tracked_slots {
            return slots
                .par_iter()
                .filter(|&&slot| self.cell(slot) >= min_depth)
                .count();
        }
        (0..self.cells)
            .into_par_iter()
            .filter(|&slot| self.cell(slot) >= min_depth)
            .count()
    }

    /// Read one packed cell, dispatching to the aligned fast paths for
    /// 64/16/2-bit widths and handling word-straddling cells generically.
    fn cell(&self, slot: usize) -> u64 {
        if self.bits == 64 {
            return self.words[slot];
        }
        if self.bits == 16 {
            return self.cell_16bit(slot);
        }
        if self.bits == 2 {
            return self.cell_2bit(slot);
        }
        let bit = slot * self.bits as usize;
        let word = bit / 64;
        let offset = bit % 64;
        let mask = (1u64 << self.bits) - 1;
        if offset + self.bits as usize <= 64 {
            // Cell fits entirely within one word.
            (self.words[word] >> offset) & mask
        } else {
            // Cell straddles a word boundary; stitch the two halves together.
            let low_bits = 64 - offset;
            let high_bits = self.bits as usize - low_bits;
            let low = self.words[word] >> offset;
            let high = self.words[word + 1] & ((1u64 << high_bits) - 1);
            ((high << low_bits) | low) & mask
        }
    }

    /// Read a 16-bit cell (4 cells per word, always word-aligned).
    fn cell_16bit(&self, slot: usize) -> u64 {
        let word = slot >> 2;
        let offset = (slot & 3) << 4;
        (self.words[word] >> offset) & 0xffff
    }

    /// Read a 2-bit cell (32 cells per word, always word-aligned).
    fn cell_2bit(&self, slot: usize) -> u64 {
        let word = slot >> 5;
        let offset = (slot & 31) << 1;
        (self.words[word] >> offset) & 3
    }

    /// Test-only setter that reads the previous value itself.
    #[cfg(test)]
    fn set_cell(&mut self, slot: usize, value: u64) {
        let previous = self.cell(slot);
        self.set_cell_with_previous(slot, previous, value);
    }

    /// Write a cell (clamped to max_count) and update occupancy bookkeeping.
    fn set_cell_with_previous(&mut self, slot: usize, previous: u64, value: u64) {
        let value = value.min(self.max_count);
        self.set_cell_raw(slot, value);
        self.note_cell_transition(previous, value, slot);
    }

    /// Raw cell write without clamping or occupancy tracking; mirrors the
    /// width dispatch in `cell`.
    fn set_cell_raw(&mut self, slot: usize, value: u64) {
        if self.bits == 64 {
            self.words[slot] = value;
            return;
        }
        if self.bits == 16 {
            self.set_cell_16bit_raw(slot, value);
            return;
        }
        if self.bits == 2 {
            self.set_cell_2bit_raw(slot, value);
            return;
        }
        let bit = slot * self.bits as usize;
        let word = bit / 64;
        let offset = bit % 64;
        let mask = (1u64 << self.bits) - 1;
        if offset + self.bits as usize <= 64 {
            let shifted_mask = mask << offset;
            self.words[word] = (self.words[word] & !shifted_mask) | ((value & mask) << offset);
        } else {
            // Split the write across the two words the cell straddles.
            let low_bits = 64 - offset;
            let high_bits = self.bits as usize - low_bits;
            let low_mask = ((1u64 << low_bits) - 1) << offset;
            self.words[word] =
                (self.words[word] & !low_mask) | ((value & ((1u64 << low_bits) - 1)) << offset);
            let high_mask = (1u64 << high_bits) - 1;
            self.words[word + 1] =
                (self.words[word + 1] & !high_mask) | ((value >> low_bits) & high_mask);
        }
    }

    /// Raw 16-bit cell write.
    fn set_cell_16bit_raw(&mut self, slot: usize, value: u64) {
        let word = slot >> 2;
        let offset = (slot & 3) << 4;
        let shifted_mask = 0xffffu64 << offset;
        self.words[word] = (self.words[word] & !shifted_mask) | ((value & 0xffff) << offset);
    }

    /// 16-bit cell write with clamping and occupancy bookkeeping.
    fn set_cell_16bit_with_previous(&mut self, slot: usize, previous: u64, value: u64) {
        let value = value.min(self.max_count);
        self.set_cell_16bit_raw(slot, value);
        self.note_cell_transition(previous, value, slot);
    }

    /// 2-bit cell write with clamping and occupancy bookkeeping.
    fn set_cell_2bit_with_previous(&mut self, slot: usize, previous: u64, value: u64) {
        let value = value.min(self.max_count);
        self.set_cell_2bit_raw(slot, value);
        self.note_cell_transition(previous, value, slot);
    }

    /// Raw 2-bit cell write.
    fn set_cell_2bit_raw(&mut self, slot: usize, value: u64) {
        let word = slot >> 5;
        let offset = (slot & 31) << 1;
        let shifted_mask = 3u64 << offset;
        self.words[word] = (self.words[word] & !shifted_mask) | ((value & 3) << offset);
    }

    /// Maintain `occupied_slots` and the tracked-slot list across a cell's
    /// zero/non-zero transition.
    fn note_cell_transition(&mut self, previous: u64, value: u64, slot: usize) {
        match (previous == 0, value == 0) {
            (true, false) => {
                self.occupied_slots = self.occupied_slots.saturating_add(1);
                if let Some(slots) = &mut self.tracked_slots {
                    if slots.len() < PACKED_SKETCH_TRACKED_SLOT_LIMIT {
                        slots.push(slot);
                    } else {
                        // Too many occupied slots to track individually;
                        // fall back to full-table scans from here on.
                        self.tracked_slots = None;
                    }
                }
            }
            (false, true) => {
                self.occupied_slots = self.occupied_slots.saturating_sub(1);
                if let Some(slots) = &mut self.tracked_slots
                    && let Some(index) = slots.iter().position(|&tracked| tracked == slot)
                {
                    slots.swap_remove(index);
                }
            }
            _ => {}
        }
    }

    /// Test-only: build a dense depth histogram of length `hist_len`,
    /// with the last bucket absorbing deeper values.
    #[cfg(test)]
    fn depth_hist(&self, hist_len: usize) -> Vec<u64> {
        let Some(last_index) = hist_len.checked_sub(1) else {
            return Vec::new();
        };
        if let Some(slots) = &self.tracked_slots {
            let mut hist = slots
                .par_iter()
                .fold(Vec::new, |mut local, &slot| {
                    add_depth_to_dynamic_hist(&mut local, self.cell(slot), last_index);
                    local
                })
                .reduce(Vec::new, merge_dynamic_depth_hist);
            hist.resize(hist_len, 0);
            return hist;
        }
        let mut hist = (0..self.cells)
            .into_par_iter()
            .fold(Vec::new, |mut local, slot| {
                add_depth_to_dynamic_hist(&mut local, self.cell(slot), last_index);
                local
            })
            .reduce(Vec::new, merge_dynamic_depth_hist);
        hist.resize(hist_len, 0);
        hist
    }

    /// Build a sparse depth histogram, preferring the tracked-slot list and
    /// falling back to a parallel full-table scan.
    fn sparse_depth_hist(&self, hist_len: usize) -> SparseHist {
        let Some(last_index) = hist_len.checked_sub(1) else {
            return SparseHist::default();
        };
        if let Some(slots) = &self.tracked_slots {
            return slots
                .par_iter()
                .fold(SparseHist::default, |mut local, &slot| {
                    add_depth_to_sparse_hist(&mut local, self.cell(slot), last_index);
                    local
                })
                .reduce(SparseHist::default, merge_sparse_depth_hist);
        }
        (0..self.cells)
            .into_par_iter()
            .fold(SparseHist::default, |mut local, slot| {
                add_depth_to_sparse_hist(&mut local, self.cell(slot), last_index);
                local
            })
            .reduce(SparseHist::default, merge_sparse_depth_hist)
    }
}
5252
5253impl PrefilterCountMinSketch {
5254    fn max_count(&self) -> u64 {
5255        match self {
5256            Self::Packed(sketch) => sketch.max_count,
5257            Self::AtomicPacked(sketch) => sketch.max_count,
5258        }
5259    }
5260
5261    #[cfg(test)]
5262    fn bits(&self) -> u8 {
5263        match self {
5264            Self::Packed(sketch) => sketch.bits,
5265            Self::AtomicPacked(sketch) => sketch.bits,
5266        }
5267    }
5268
5269    #[cfg(test)]
5270    fn update_mode(&self) -> CountMinUpdateMode {
5271        match self {
5272            Self::Packed(sketch) => sketch.update_mode,
5273            Self::AtomicPacked(sketch) => sketch.update_mode,
5274        }
5275    }
5276
5277    fn layout_summary(
5278        &self,
5279        table: &'static str,
5280        prefilter_limit: Option<u64>,
5281    ) -> SketchLayoutSummary {
5282        match self {
5283            Self::Packed(sketch) => sketch.layout_summary(table, prefilter_limit),
5284            Self::AtomicPacked(sketch) => sketch.layout_summary(table, prefilter_limit),
5285        }
5286    }
5287}
5288
5289impl CountLookup for PrefilterCountMinSketch {
5290    fn depth(&self, key: &KmerKey) -> u64 {
5291        match self {
5292            Self::Packed(sketch) => sketch.depth(key),
5293            Self::AtomicPacked(sketch) => sketch.depth(key),
5294        }
5295    }
5296
5297    fn unique_kmers(&self) -> usize {
5298        match self {
5299            Self::Packed(sketch) => sketch.unique_kmers(),
5300            Self::AtomicPacked(sketch) => sketch.unique_kmers(),
5301        }
5302    }
5303
5304    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
5305        match self {
5306            Self::Packed(sketch) => sketch.unique_kmers_at_least(min_depth),
5307            Self::AtomicPacked(sketch) => sketch.unique_kmers_at_least(min_depth),
5308        }
5309    }
5310}
5311
impl AtomicCountMinSketch {
    /// Test-only convenience constructor using the BBTools default minimum
    /// array count.
    #[cfg(test)]
    fn new(cells: usize, hashes: usize) -> Result<Self> {
        Self::new_with_min_arrays(cells, hashes, BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS)
    }

    /// Test-only constructor: conservative updates with the first BBTools
    /// mask seed.
    #[cfg(test)]
    fn new_with_min_arrays(cells: usize, hashes: usize, min_arrays: usize) -> Result<Self> {
        Self::new_with_min_arrays_and_update_mode(
            cells,
            hashes,
            min_arrays,
            CountMinUpdateMode::Conservative,
            BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
        )
    }

    /// Allocates a zeroed atomic 32-bit count-min sketch.
    ///
    /// `cells` is raised to at least 1 and `hashes` clamped into
    /// `1..=BBTOOLS_KCOUNT_ARRAY_MAX_HASHES`. Counts saturate at `i32::MAX`.
    /// Errors only when allocating the cell array or lock table fails.
    fn new_with_min_arrays_and_update_mode(
        cells: usize,
        hashes: usize,
        min_arrays: usize,
        update_mode: CountMinUpdateMode,
        mask_seed: u64,
    ) -> Result<Self> {
        let cells = cells.max(1);
        let hashes = hashes.clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
        let layout =
            KCountArrayLayout::new_with_min_arrays_and_mask_seed(cells, 32, min_arrays, mask_seed);
        let cells_by_hash =
            zeroed_atomic_u32_vec(cells).context("allocating atomic count-min sketch")?;
        // Independent mode needs no locks; conservative mode gets a table.
        let locks = atomic_count_min_locks(update_mode)?;
        Ok(Self {
            cells,
            hashes,
            // Per-cell saturation ceiling (kept within i32 range).
            max_count: i32::MAX as u32,
            layout,
            update_mode,
            parallel_replay: false,
            cells_by_hash,
            locks,
            increments: AtomicU64::new(0),
            occupied_slots: AtomicUsize::new(0),
        })
    }

    /// Builder-style toggle for parallel replay in `add_key_counts`.
    fn with_parallel_replay(mut self, parallel_replay: bool) -> Self {
        self.parallel_replay = parallel_replay;
        self
    }

    /// Diagnostic summary of the table geometry and estimated memory use.
    /// Memory accounts for the cell array, the lock table, and the
    /// occupied-slot counter.
    fn layout_summary(
        &self,
        table: &'static str,
        prefilter_limit: Option<u64>,
    ) -> SketchLayoutSummary {
        SketchLayoutSummary {
            table,
            kind: "atomic",
            cells: self.cells,
            hashes: self.hashes,
            bits: 32,
            arrays: self.layout.array_count(),
            cells_per_array: self.layout.cells_per_array,
            mask_seed: self.layout.mask_seed,
            update_mode: self.update_mode.as_str(),
            max_count: u64::from(self.max_count),
            memory_bytes: self
                .cells_by_hash
                .len()
                .saturating_mul(std::mem::size_of::<AtomicU32>())
                .saturating_add(
                    self.locks
                        .len()
                        .saturating_mul(std::mem::size_of::<Mutex<()>>()),
                )
                .saturating_add(std::mem::size_of::<AtomicUsize>()),
            prefilter_limit,
        }
    }

    /// Adds a single observation of `key`.
    fn increment_key(&self, key: &KmerKey) {
        self.add_key_count(key, 1);
    }

    /// Adds `count` observations of `key`, updating the occupied-slot
    /// counter for any cells that go from zero to nonzero.
    fn add_key_count(&self, key: &KmerKey, count: u64) {
        let (_, newly_occupied) = self.increment_and_count_newly_occupied(key, count);
        self.add_occupied_slots(newly_occupied);
    }

    /// Test helper: increments and returns the minimum cell value observed
    /// *before* the increment was applied.
    #[cfg(test)]
    fn increment_and_return_unincremented(&self, key: &KmerKey, count: u64) -> u64 {
        let (previous_min, newly_occupied) = self.increment_and_count_newly_occupied(key, count);
        self.add_occupied_slots(newly_occupied);
        previous_min
    }

    /// Locked-path increment returning how many cells became newly occupied
    /// (occupied-slot counter is NOT updated here; caller aggregates).
    fn add_key_count_counting_newly_occupied(&self, key: &KmerKey, count: u64) -> usize {
        self.increment_and_count_newly_occupied(key, count).1
    }

    /// Lock-free increment used during sequential replay; conservative mode
    /// skips the per-key lock here, so callers must not race on the same key.
    fn add_key_count_unlocked_counting_newly_occupied(&self, key: &KmerKey, count: u64) -> usize {
        if self.update_mode == CountMinUpdateMode::Independent {
            self.increment_independent_and_count_newly_occupied(key, count)
                .1
        } else {
            self.increment_conservative_unlocked_and_count_newly_occupied(key, count)
                .1
        }
    }

    /// Core increment dispatch. Returns (min cell value before the update,
    /// number of cells that transitioned 0 -> nonzero). A zero `count` is a
    /// pure read. Conservative mode serializes per key-stripe via a lock.
    fn increment_and_count_newly_occupied(&self, key: &KmerKey, count: u64) -> (u64, usize) {
        if count == 0 {
            return (self.depth(key), 0);
        }
        if self.update_mode == CountMinUpdateMode::Independent {
            return self.increment_independent_and_count_newly_occupied(key, count);
        }
        let _guard = self.lock_for_key(key);
        self.increment_conservative_unlocked_and_count_newly_occupied(key, count)
    }

    /// Conservative count-min update: read the minimum across the key's
    /// cells, then raise every cell to at least `min + count` (capped at
    /// `max_count`). Cells already above the target are left untouched,
    /// which limits overestimation relative to independent increments.
    fn increment_conservative_unlocked_and_count_newly_occupied(
        &self,
        key: &KmerKey,
        count: u64,
    ) -> (u64, usize) {
        let target_increment = count.min(u64::from(self.max_count)) as u32;
        // Unrolled fast path for the common three-hash configuration.
        if self.hashes == 3 {
            return self.increment_conservative_three_unlocked_and_count_newly_occupied(
                key,
                target_increment,
            );
        }
        let mut slots = [0usize; 16];
        let mut min_depth = self.max_count;
        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
        for slot in slots.iter().take(self.hashes) {
            min_depth = min_depth.min(self.cells_by_hash[*slot].load(Ordering::Relaxed));
        }
        // Already saturated: nothing to raise.
        if min_depth >= self.max_count {
            return (u64::from(min_depth), 0);
        }
        let target = min_depth
            .saturating_add(target_increment)
            .min(self.max_count);
        let mut previous_min = self.max_count;
        let mut newly_occupied = 0usize;
        for slot in slots.iter().take(self.hashes) {
            let (previous, cell_newly_occupied) =
                raise_atomic_cell_to_at_least(&self.cells_by_hash[*slot], target);
            previous_min = previous_min.min(previous);
            newly_occupied += usize::from(cell_newly_occupied);
        }
        (u64::from(previous_min), newly_occupied)
    }

    /// Three-hash specialization of the conservative update; identical
    /// semantics with the loops unrolled.
    fn increment_conservative_three_unlocked_and_count_newly_occupied(
        &self,
        key: &KmerKey,
        target_increment: u32,
    ) -> (u64, usize) {
        let [first, second, third] = count_min_three_buckets(key, self.layout);
        let first_depth = self.cells_by_hash[first].load(Ordering::Relaxed);
        let second_depth = self.cells_by_hash[second].load(Ordering::Relaxed);
        let third_depth = self.cells_by_hash[third].load(Ordering::Relaxed);
        let min_depth = first_depth.min(second_depth).min(third_depth);
        if min_depth >= self.max_count {
            return (u64::from(min_depth), 0);
        }
        let target = min_depth
            .saturating_add(target_increment)
            .min(self.max_count);
        let (first_previous, first_new) =
            raise_atomic_cell_to_at_least(&self.cells_by_hash[first], target);
        let (second_previous, second_new) =
            raise_atomic_cell_to_at_least(&self.cells_by_hash[second], target);
        let (third_previous, third_new) =
            raise_atomic_cell_to_at_least(&self.cells_by_hash[third], target);
        (
            u64::from(first_previous.min(second_previous).min(third_previous)),
            usize::from(first_new) + usize::from(second_new) + usize::from(third_new),
        )
    }

    /// Acquires the lock stripe for `key`. A poisoned mutex is recovered
    /// rather than propagated — the guarded data is `()`, so no invariant
    /// can be broken by a panicked holder.
    fn lock_for_key(&self, key: &KmerKey) -> std::sync::MutexGuard<'_, ()> {
        let lock_index = kcount_array_lock_index(key);
        self.locks[lock_index]
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Independent count-min update: every cell for the key is incremented
    /// (saturating at `max_count`), no cross-cell coordination required.
    fn increment_independent_and_count_newly_occupied(
        &self,
        key: &KmerKey,
        count: u64,
    ) -> (u64, usize) {
        if count == 0 {
            return (self.depth(key), 0);
        }
        let increment = count.min(u64::from(self.max_count)) as u32;
        let mut previous_min = self.max_count;
        let mut newly_occupied = 0usize;
        let mut slots = [0usize; 16];
        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
        for slot in slots.iter().take(self.hashes) {
            let (previous, cell_newly_occupied) = increment_atomic_cell_saturating(
                &self.cells_by_hash[*slot],
                increment,
                self.max_count,
            );
            previous_min = previous_min.min(previous);
            newly_occupied += usize::from(cell_newly_occupied);
        }
        (u64::from(previous_min), newly_occupied)
    }

    /// Replays a batch of per-key counts into the sketch. Large batches use
    /// rayon with the locked path when `parallel_replay` is enabled; smaller
    /// batches (or disabled parallelism) run sequentially on the unlocked
    /// path. The occupied-slot counter is updated once at the end.
    fn add_key_counts(&self, counts: &CountMap) {
        let newly_occupied =
            if self.parallel_replay && counts.len() >= ATOMIC_SKETCH_PAR_REPLAY_MIN_KEYS {
                counts
                    .par_iter()
                    .map(|(key, count)| self.add_key_count_counting_newly_occupied(key, *count))
                    .sum()
            } else {
                counts
                    .iter()
                    .map(|(key, count)| {
                        self.add_key_count_unlocked_counting_newly_occupied(key, *count)
                    })
                    .sum()
            };
        self.add_occupied_slots(newly_occupied);
    }

    /// Records how many key increments have been fed into the sketch; used
    /// as a cap when estimating cardinality.
    fn add_key_increments(&self, key_increments: u64) {
        self.increments.fetch_add(key_increments, Ordering::Relaxed);
    }

    /// Bumps the cached occupied-slot counter (no-op for zero).
    fn add_occupied_slots(&self, newly_occupied: usize) {
        if newly_occupied > 0 {
            self.occupied_slots
                .fetch_add(newly_occupied, Ordering::Relaxed);
        }
    }

    /// Number of cells holding at least `min_depth`. Depths of 0/1 are
    /// answered from the cached counter; larger thresholds require a
    /// parallel scan of every cell.
    fn occupied_slots_at_least(&self, min_depth: u64) -> usize {
        if min_depth > u64::from(self.max_count) {
            return 0;
        }
        if min_depth <= 1 {
            return self.occupied_slots.load(Ordering::Relaxed);
        }
        let min_depth = min_depth.max(1) as u32;
        self.cells_by_hash
            .par_iter()
            .filter(|cell| cell.load(Ordering::Relaxed) >= min_depth)
            .count()
    }

    /// Test helper: dense histogram of per-cell depths, clamped to
    /// `hist_len - 1`; empty when `hist_len` is zero.
    #[cfg(test)]
    fn depth_hist(&self, hist_len: usize) -> Vec<u64> {
        let Some(last_index) = hist_len.checked_sub(1) else {
            return Vec::new();
        };
        let mut hist = self
            .cells_by_hash
            .par_iter()
            .fold(Vec::new, |mut local, cell| {
                add_depth_to_dynamic_hist(
                    &mut local,
                    u64::from(cell.load(Ordering::Relaxed)),
                    last_index,
                );
                local
            })
            .reduce(Vec::new, merge_dynamic_depth_hist);
        hist.resize(hist_len, 0);
        hist
    }

    /// Sparse histogram of per-cell depths, clamped to `hist_len - 1`;
    /// empty when `hist_len` is zero.
    fn sparse_depth_hist(&self, hist_len: usize) -> SparseHist {
        let Some(last_index) = hist_len.checked_sub(1) else {
            return SparseHist::default();
        };
        self.cells_by_hash
            .par_iter()
            .fold(SparseHist::default, |mut local, cell| {
                add_depth_to_sparse_hist(
                    &mut local,
                    u64::from(cell.load(Ordering::Relaxed)),
                    last_index,
                );
                local
            })
            .reduce(SparseHist::default, merge_sparse_depth_hist)
    }
}
5609
impl AtomicPackedCountMinSketch {
    /// Allocates a zeroed count-min sketch whose cells are bit-packed into
    /// `AtomicU64` words (`bits` cells-per-word = 64 / bits).
    ///
    /// `bits` must be a power of two no greater than 64 so cells never
    /// straddle a word boundary. `cells` is raised to at least 1 and
    /// `hashes` clamped into `1..=BBTOOLS_KCOUNT_ARRAY_MAX_HASHES`.
    fn new_with_min_arrays_and_update_mode(
        cells: usize,
        hashes: usize,
        bits: u8,
        min_arrays: usize,
        update_mode: CountMinUpdateMode,
        mask_seed: u64,
    ) -> Result<Self> {
        let cells = cells.max(1);
        let hashes = hashes.clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
        ensure!(
            bits.is_power_of_two() && bits <= 64,
            "atomic packed count-min sketches require power-of-two cell bits up to 64"
        );
        let layout = KCountArrayLayout::new_with_min_arrays_and_mask_seed(
            cells, bits, min_arrays, mask_seed,
        );
        // 64-bit cells get one word each; otherwise pack as many whole cells
        // per word as fit.
        let word_count = if bits == 64 {
            cells
        } else {
            let cells_per_word = 64 / bits as usize;
            cells.div_ceil(cells_per_word)
        };
        let words = zeroed_atomic_u64_vec(word_count)
            .context("allocating atomic packed count-min sketch")?;
        // Independent mode needs no locks; conservative mode gets a table.
        let locks = atomic_count_min_locks(update_mode)?;
        Ok(Self {
            cells,
            hashes,
            bits,
            max_count: count_min_max_count(bits),
            layout,
            update_mode,
            words,
            locks,
            increments: AtomicU64::new(0),
            occupied_slots: AtomicUsize::new(0),
        })
    }

    /// Diagnostic summary of the table geometry and estimated memory use.
    /// Memory accounts for the packed word array, the lock table, and the
    /// occupied-slot counter.
    fn layout_summary(
        &self,
        table: &'static str,
        prefilter_limit: Option<u64>,
    ) -> SketchLayoutSummary {
        SketchLayoutSummary {
            table,
            kind: "atomic_packed",
            cells: self.cells,
            hashes: self.hashes,
            bits: self.bits,
            arrays: self.layout.array_count(),
            cells_per_array: self.layout.cells_per_array,
            mask_seed: self.layout.mask_seed,
            update_mode: self.update_mode.as_str(),
            max_count: self.max_count,
            memory_bytes: self
                .words
                .len()
                .saturating_mul(std::mem::size_of::<AtomicU64>())
                .saturating_add(
                    self.locks
                        .len()
                        .saturating_mul(std::mem::size_of::<Mutex<()>>()),
                )
                .saturating_add(std::mem::size_of::<AtomicUsize>()),
            prefilter_limit,
        }
    }

    /// Test helper: adds `count` observations of `key` and updates the
    /// occupied-slot counter.
    #[cfg(test)]
    fn add_key_count(&self, key: &KmerKey, count: u64) {
        let (_, newly_occupied) = self.increment_and_count_newly_occupied(key, count);
        self.add_occupied_slots(newly_occupied);
    }

    /// Increment returning how many cells became newly occupied
    /// (occupied-slot counter is NOT updated here; caller aggregates).
    fn add_key_count_counting_newly_occupied(&self, key: &KmerKey, count: u64) -> usize {
        self.increment_and_count_newly_occupied(key, count).1
    }

    /// Core increment. Returns (min cell value before the update, number of
    /// cells that transitioned 0 -> nonzero). A zero `count` is a pure read.
    /// Conservative mode: under the key's lock stripe, read the minimum
    /// across the key's cells and raise each cell to at least `min + count`,
    /// capped at `max_count`.
    fn increment_and_count_newly_occupied(&self, key: &KmerKey, count: u64) -> (u64, usize) {
        if count == 0 {
            return (self.depth(key), 0);
        }
        if self.update_mode == CountMinUpdateMode::Independent {
            return self.increment_independent_and_count_newly_occupied(key, count);
        }
        let _guard = self.lock_for_key(key);
        let target_increment = count.min(self.max_count);
        let mut slots = [0usize; 16];
        let mut min_depth = self.max_count;
        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
        for slot in slots.iter().take(self.hashes) {
            min_depth = min_depth.min(self.cell(*slot));
        }
        // Already saturated: nothing to raise.
        if min_depth >= self.max_count {
            return (min_depth, 0);
        }
        let target = min_depth
            .saturating_add(target_increment)
            .min(self.max_count);
        let mut previous_min = self.max_count;
        let mut newly_occupied = 0usize;
        for slot in slots.iter().take(self.hashes) {
            let (previous, cell_newly_occupied) = self.raise_cell_to_at_least(*slot, target);
            previous_min = previous_min.min(previous);
            newly_occupied += usize::from(cell_newly_occupied);
        }
        (previous_min, newly_occupied)
    }

    /// Independent count-min update: each of the key's cells is incremented
    /// (saturating at `max_count`), no cross-cell coordination required.
    fn increment_independent_and_count_newly_occupied(
        &self,
        key: &KmerKey,
        count: u64,
    ) -> (u64, usize) {
        if count == 0 {
            return (self.depth(key), 0);
        }
        let increment = count.min(self.max_count);
        let mut previous_min = self.max_count;
        let mut newly_occupied = 0usize;
        let mut slots = [0usize; 16];
        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
        for slot in slots.iter().take(self.hashes) {
            let (previous, cell_newly_occupied) = self.increment_cell_saturating(*slot, increment);
            previous_min = previous_min.min(previous);
            newly_occupied += usize::from(cell_newly_occupied);
        }
        (previous_min, newly_occupied)
    }

    /// Records how many key increments have been fed into the sketch; used
    /// as a cap when estimating cardinality.
    fn add_key_increments(&self, key_increments: u64) {
        self.increments.fetch_add(key_increments, Ordering::Relaxed);
    }

    /// Bumps the cached occupied-slot counter (no-op for zero).
    fn add_occupied_slots(&self, newly_occupied: usize) {
        if newly_occupied > 0 {
            self.occupied_slots
                .fetch_add(newly_occupied, Ordering::Relaxed);
        }
    }

    /// Acquires the lock stripe for `key`. A poisoned mutex is recovered
    /// rather than propagated — the guarded data is `()`, so no invariant
    /// can be broken by a panicked holder.
    fn lock_for_key(&self, key: &KmerKey) -> std::sync::MutexGuard<'_, ()> {
        let lock_index = kcount_array_lock_index(key);
        self.locks[lock_index]
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Reads the packed cell at `slot`.
    fn cell(&self, slot: usize) -> u64 {
        let position = self.cell_position(slot);
        (self.words[position.word].load(Ordering::Relaxed) >> position.shift) & position.mask
    }

    /// CAS loop raising the packed cell at `slot` to at least `target`
    /// (capped at `max_count`). Returns the last value observed before the
    /// raise and whether the cell went 0 -> nonzero. The compare-exchange is
    /// on the whole word, so concurrent writes to sibling cells simply force
    /// a retry.
    fn raise_cell_to_at_least(&self, slot: usize, target: u64) -> (u64, bool) {
        let target = target.min(self.max_count);
        let position = self.cell_position(slot);
        let cell = &self.words[position.word];
        let mut current = cell.load(Ordering::Relaxed);
        loop {
            let previous = (current >> position.shift) & position.mask;
            if previous >= target {
                return (previous, false);
            }
            let next = replace_packed_cell(current, position, target);
            match cell.compare_exchange_weak(current, next, Ordering::Relaxed, Ordering::Relaxed) {
                Ok(_) => return (previous, previous == 0 && target > 0),
                Err(observed) => current = observed,
            }
        }
    }

    /// CAS loop adding `increment` to the packed cell at `slot` with
    /// saturation at `max_count`. Returns the pre-increment value and
    /// whether the cell went 0 -> nonzero.
    fn increment_cell_saturating(&self, slot: usize, increment: u64) -> (u64, bool) {
        let increment = increment.min(self.max_count);
        let position = self.cell_position(slot);
        let cell = &self.words[position.word];
        let mut current = cell.load(Ordering::Relaxed);
        loop {
            let previous = (current >> position.shift) & position.mask;
            if previous >= self.max_count {
                return (previous, false);
            }
            let next_value = previous.saturating_add(increment).min(self.max_count);
            let next = replace_packed_cell(current, position, next_value);
            match cell.compare_exchange_weak(current, next, Ordering::Relaxed, Ordering::Relaxed) {
                Ok(_) => return (previous, previous == 0 && next_value > 0),
                Err(observed) => current = observed,
            }
        }
    }

    /// Maps a cell index to its (word, bit shift, value mask) location.
    /// 64-bit cells occupy a whole word; note `1u64 << 64` would overflow,
    /// which is why that case is special-cased with `mask = u64::MAX`.
    fn cell_position(&self, slot: usize) -> PackedCellPosition {
        if self.bits == 64 {
            return PackedCellPosition {
                word: slot,
                shift: 0,
                mask: u64::MAX,
            };
        }
        let cells_per_word = 64 / self.bits as usize;
        let word = slot / cells_per_word;
        let shift = (slot % cells_per_word) * self.bits as usize;
        let mask = (1u64 << self.bits) - 1;
        PackedCellPosition { word, shift, mask }
    }

    /// Number of cells holding at least `min_depth`. Depths of 0/1 are
    /// answered from the cached counter; larger thresholds require a
    /// parallel scan of every cell.
    fn occupied_slots_at_least(&self, min_depth: u64) -> usize {
        if min_depth > self.max_count {
            return 0;
        }
        if min_depth <= 1 {
            return self.occupied_slots.load(Ordering::Relaxed);
        }
        let min_depth = min_depth.max(1);
        (0..self.cells)
            .into_par_iter()
            .filter(|&slot| self.cell(slot) >= min_depth)
            .count()
    }
}
5832
/// Location of one packed counter cell inside a `u64` word array.
#[derive(Debug, Clone, Copy)]
struct PackedCellPosition {
    // Index of the u64 word that holds the cell.
    word: usize,
    // Bit offset of the cell within that word.
    shift: usize,
    // Unshifted value mask selecting the cell's bits.
    mask: u64,
}
5839
5840fn replace_packed_cell(word: u64, position: PackedCellPosition, value: u64) -> u64 {
5841    let shifted_mask = position.mask << position.shift;
5842    (word & !shifted_mask) | ((value & position.mask) << position.shift)
5843}
5844
/// CAS-raises `cell` to at least `target`.
///
/// Returns the value observed immediately before the successful raise (or
/// the already-sufficient value) and whether the cell transitioned from
/// zero to nonzero ("newly occupied"). Lock-free; retries on contention.
fn raise_atomic_cell_to_at_least(cell: &AtomicU32, target: u32) -> (u32, bool) {
    let mut observed = cell.load(Ordering::Relaxed);
    while observed < target {
        match cell.compare_exchange_weak(observed, target, Ordering::Relaxed, Ordering::Relaxed) {
            Ok(_) => return (observed, observed == 0 && target > 0),
            Err(actual) => observed = actual,
        }
    }
    (observed, false)
}
5857
/// CAS-adds `increment` to `cell`, saturating at `max_count`.
///
/// Returns the pre-increment value and whether the cell transitioned from
/// zero to nonzero. A cell already at or above `max_count` is untouched.
fn increment_atomic_cell_saturating(
    cell: &AtomicU32,
    increment: u32,
    max_count: u32,
) -> (u32, bool) {
    let mut observed = cell.load(Ordering::Relaxed);
    while observed < max_count {
        let raised = observed.saturating_add(increment).min(max_count);
        match cell.compare_exchange_weak(observed, raised, Ordering::Relaxed, Ordering::Relaxed) {
            Ok(_) => return (observed, observed == 0 && raised > 0),
            Err(actual) => observed = actual,
        }
    }
    (observed, false)
}
5875
5876fn atomic_count_min_locks(update_mode: CountMinUpdateMode) -> Result<Vec<Mutex<()>>> {
5877    if update_mode == CountMinUpdateMode::Independent {
5878        return Ok(Vec::new());
5879    }
5880    let mut locks = Vec::new();
5881    locks
5882        .try_reserve_exact(BBTOOLS_KCOUNT_ARRAY_LOCKS)
5883        .context("allocating atomic count-min sketch locks")?;
5884    locks.resize_with(BBTOOLS_KCOUNT_ARRAY_LOCKS, || Mutex::new(()));
5885    Ok(locks)
5886}
5887
5888impl CountLookup for PackedCountMinSketch {
5889    fn depth(&self, key: &KmerKey) -> u64 {
5890        if self.bits == 16 && self.hashes == 3 {
5891            return self.depth_16bit_3hash(key);
5892        }
5893        let mut slots = [0usize; 16];
5894        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
5895        slots
5896            .iter()
5897            .take(self.hashes)
5898            .map(|&slot| self.cell(slot))
5899            .min()
5900            .unwrap_or(0)
5901    }
5902
5903    fn unique_kmers(&self) -> usize {
5904        self.unique_kmers_at_least(1)
5905    }
5906
5907    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
5908        let occupied = self.occupied_slots_at_least(min_depth);
5909        estimate_unique_kmers_from_occupied(self.cells, occupied, self.hashes, self.increments)
5910    }
5911}
5912
5913impl CountLookup for AtomicCountMinSketch {
5914    fn depth(&self, key: &KmerKey) -> u64 {
5915        let mut slots = [0usize; 16];
5916        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
5917        slots
5918            .iter()
5919            .take(self.hashes)
5920            .map(|&slot| u64::from(self.cells_by_hash[slot].load(Ordering::Relaxed)))
5921            .min()
5922            .unwrap_or(0)
5923    }
5924
5925    fn unique_kmers(&self) -> usize {
5926        self.unique_kmers_at_least(1)
5927    }
5928
5929    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
5930        let occupied = self.occupied_slots_at_least(min_depth);
5931        let increments = self.increments.load(Ordering::Relaxed);
5932        estimate_unique_kmers_from_occupied(self.cells, occupied, self.hashes, increments)
5933    }
5934}
5935
5936impl CountLookup for AtomicPackedCountMinSketch {
5937    fn depth(&self, key: &KmerKey) -> u64 {
5938        let mut slots = [0usize; 16];
5939        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
5940        slots
5941            .iter()
5942            .take(self.hashes)
5943            .map(|&slot| self.cell(slot))
5944            .min()
5945            .unwrap_or(0)
5946    }
5947
5948    fn unique_kmers(&self) -> usize {
5949        self.unique_kmers_at_least(1)
5950    }
5951
5952    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
5953        let occupied = self.occupied_slots_at_least(min_depth);
5954        let increments = self.increments.load(Ordering::Relaxed);
5955        estimate_unique_kmers_from_occupied(self.cells, occupied, self.hashes, increments)
5956    }
5957}
5958
/// Estimates the number of distinct keys from count-min occupancy.
///
/// Mirrors BBTools' KCountArray approach: the shared fill fraction of the
/// table is first converted to the fill a *single* hash row would produce,
/// then inverted through the occupancy formula `n = -m * ln(1 - f)`. The
/// result is clamped to at least 1 and never exceeds the number of
/// increments actually observed. A completely full or empty table returns
/// the cap / zero respectively.
fn estimate_unique_kmers_from_occupied(
    total_slots: usize,
    occupied_slots: usize,
    hashes: usize,
    increments: u64,
) -> usize {
    if total_slots == 0 || occupied_slots == 0 {
        return 0;
    }
    // The estimate can never exceed the number of insert operations.
    let increment_cap = usize::try_from(increments).unwrap_or(usize::MAX);
    if occupied_slots >= total_slots {
        return increment_cap;
    }
    let fill = occupied_slots as f64 / total_slots as f64;
    let rows = hashes.max(1) as f64;
    // Fill fraction a single hash row would have produced on its own.
    let single_row_fill = 1.0 - (1.0 - fill).powf(1.0 / rows);
    let raw = (-(total_slots as f64) * (1.0 - single_row_fill).ln()).round();
    (raw.max(1.0) as usize).min(increment_cap)
}
5981
/// Converts a `u64` to `usize`, clamping to `usize::MAX` on targets where
/// the value does not fit.
fn usize_from_u64_saturating(value: u64) -> usize {
    value.try_into().unwrap_or(usize::MAX)
}
5985
/// Maximum representable count for a cell of `bits` width.
///
/// Widths of 31 or more are capped at `i32::MAX`; a zero width is treated
/// as one bit (max count 1).
fn count_min_max_count(bits: u8) -> u64 {
    match bits {
        0..=30 => (1u64 << bits.max(1)) - 1,
        _ => i32::MAX as u64,
    }
}
5993
5994impl KCountArrayLayout {
5995    #[cfg(test)]
5996    fn new(cells: usize, bits: u8) -> Self {
5997        Self::new_with_min_arrays(cells, bits, BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS)
5998    }
5999
6000    #[cfg(test)]
6001    fn new_with_min_arrays(cells: usize, bits: u8, min_arrays: usize) -> Self {
6002        Self::new_with_min_arrays_and_mask_seed(
6003            cells,
6004            bits,
6005            min_arrays,
6006            BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
6007        )
6008    }
6009
6010    fn new_with_min_arrays_and_mask_seed(
6011        cells: usize,
6012        bits: u8,
6013        min_arrays: usize,
6014        mask_seed: u64,
6015    ) -> Self {
6016        let cells = cells.max(1);
6017        let arrays = kcount_array_count(cells, bits, min_arrays);
6018        let cells_per_array = (cells / arrays).max(1);
6019        Self {
6020            array_mask: arrays.saturating_sub(1) as u64,
6021            array_bits: arrays.trailing_zeros(),
6022            cells_per_array,
6023            mask_seed,
6024            masks: bbtools_hash_masks(mask_seed),
6025        }
6026    }
6027
6028    fn array_count(self) -> usize {
6029        self.array_mask.saturating_add(1) as usize
6030    }
6031
6032    fn bucket(self, hashed: u64) -> usize {
6033        if self.cells_per_array <= 1 && self.array_mask == 0 {
6034            return 0;
6035        }
6036        let array_num = (hashed & self.array_mask) as usize;
6037        let cell = ((hashed >> self.array_bits) % self.cells_per_array as u64) as usize;
6038        array_num * self.cells_per_array + cell
6039    }
6040}
6041
/// Test helper: bucket for `key`'s row-`hash_index` hash under a default
/// 32-bit layout with `cells` cells.
#[cfg(test)]
fn count_min_bucket(key: &KmerKey, hash_index: usize, cells: usize) -> usize {
    let layout = KCountArrayLayout::new(cells, 32);
    count_min_bucket_with_layout(key, hash_index, layout)
}
6046
/// Test helper: resolves the bucket for `key`'s row-`hash_index` chained
/// hash under the supplied layout.
#[cfg(test)]
fn count_min_bucket_with_layout(
    key: &KmerKey,
    hash_index: usize,
    layout: KCountArrayLayout,
) -> usize {
    let row_hash =
        bbtools_count_min_row_hash_with_masks(raw_kmer_key(key), hash_index, layout.masks);
    layout.bucket(row_hash)
}
6056
6057#[inline]
6058fn fill_count_min_buckets(
6059    key: &KmerKey,
6060    hashes: usize,
6061    layout: KCountArrayLayout,
6062    slots: &mut [usize; 16],
6063) {
6064    let hashes = hashes.min(slots.len());
6065    if hashes == 0 {
6066        return;
6067    }
6068    let mut hashed = bbtools_mask_hash_with_masks(raw_kmer_key(key), 0, layout.masks);
6069    slots[0] = layout.bucket(hashed);
6070    for (hash_index, slot) in slots.iter_mut().enumerate().take(hashes).skip(1) {
6071        hashed = hashed.rotate_right(BBTOOLS_HASH_BITS);
6072        hashed = bbtools_mask_hash_with_masks(hashed, hash_index, layout.masks);
6073        *slot = layout.bucket(hashed);
6074    }
6075}
6076
/// Buckets for the common three-hash configuration of `key`; thin wrapper
/// over the raw-key variant.
#[inline]
fn count_min_three_buckets(key: &KmerKey, layout: KCountArrayLayout) -> [usize; 3] {
    count_min_three_buckets_raw(raw_kmer_key(key), layout)
}
6081
6082#[inline]
6083fn count_min_three_buckets_raw(raw_key: u64, layout: KCountArrayLayout) -> [usize; 3] {
6084    let mut hashed = bbtools_mask_hash_with_masks(raw_key, 0, layout.masks);
6085    let first = layout.bucket(hashed);
6086    hashed = bbtools_mask_hash_with_masks(hashed.rotate_right(BBTOOLS_HASH_BITS), 1, layout.masks);
6087    let second = layout.bucket(hashed);
6088    hashed = bbtools_mask_hash_with_masks(hashed.rotate_right(BBTOOLS_HASH_BITS), 2, layout.masks);
6089    [first, second, layout.bucket(hashed)]
6090}
6091
#[inline]
// Specialized two-row variant of `fill_count_min_buckets`: row 0 hashes the
// raw key, row 1 chains off row 0's hash via rotate-then-mask.
fn count_min_two_buckets(key: &KmerKey, layout: KCountArrayLayout) -> [usize; 2] {
    let mut hashed = bbtools_mask_hash_with_masks(raw_kmer_key(key), 0, layout.masks);
    let first = layout.bucket(hashed);
    hashed = bbtools_mask_hash_with_masks(hashed.rotate_right(BBTOOLS_HASH_BITS), 1, layout.masks);
    [first, layout.bucket(hashed)]
}
6099
#[cfg(test)]
fn bbtools_count_min_row_hash_with_masks(
    raw_key: u64,
    hash_index: usize,
    masks: &BbtoolsHashMaskTable,
) -> u64 {
    // Test-only reference implementation: chain the per-row mask hash from
    // row 0 up to `hash_index`, rotating between rows exactly like the
    // unrolled production paths do.
    (1..=hash_index).fold(
        bbtools_mask_hash_with_masks(raw_key, 0, masks),
        |key, row| bbtools_mask_hash_with_masks(key.rotate_right(BBTOOLS_HASH_BITS), row, masks),
    )
}
6113
#[cfg(test)]
fn bbtools_mask_hash(key: u64, row: usize, mask_seed: u64) -> u64 {
    // Test-only convenience: look up (or lazily build) the mask table for
    // `mask_seed` and apply a single row of masking.
    bbtools_mask_hash_with_masks(key, row, bbtools_hash_masks(mask_seed))
}
6119
#[inline]
// One round of BBTools-style mask hashing: select a cell from the key, then
// xor in that cell's precomputed mask for the requested row. Only eight mask
// rows exist, so the row index wraps modulo 8. The bit operations below are
// presumably chosen to match the BBTools Java reference — confirm before
// changing anything here, since bucket placement depends on exact output.
fn bbtools_mask_hash_with_masks(mut key: u64, row: usize, masks: &BbtoolsHashMaskTable) -> u64 {
    let row = row & 7;
    // Cell selection: clear the sign bit, then reduce modulo (array len - 1).
    let mut cell =
        ((key & BBTOOLS_LONG_MAX_VALUE) % (BBTOOLS_HASH_ARRAY_LENGTH as u64 - 1)) as usize;

    if row == 0 {
        // Row 0 gets an extra pre-mix with row 4's masks, then re-derives the
        // cell from bits 5.. of the mixed key.
        key ^= masks[(row + 4) & 7][cell];
        cell = ((key >> 5) & BBTOOLS_HASH_CELL_MASK) as usize;
    }

    key ^ masks[row][cell]
}
6133
// Returns a process-lifetime reference to the mask table for `mask_seed`.
// The three seeds used in production each get a dedicated `OnceLock` slot;
// any other seed (tests, unusual configs) goes through a mutex-guarded cache
// whose entries are deliberately leaked via `Box::leak` so the returned
// reference outlives the lock guard.
fn bbtools_hash_masks(mask_seed: u64) -> BbtoolsHashMaskRef {
    static SEED0_MASKS: OnceLock<BbtoolsHashMaskTable> = OnceLock::new();
    static SEED7_MASKS: OnceLock<BbtoolsHashMaskTable> = OnceLock::new();
    static SEED14_MASKS: OnceLock<BbtoolsHashMaskTable> = OnceLock::new();
    static OTHER_MASKS: OnceLock<Mutex<BbtoolsHashMaskCache>> = OnceLock::new();
    match mask_seed {
        BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED => {
            SEED0_MASKS.get_or_init(|| make_bbtools_hash_masks(mask_seed))
        }
        BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED => {
            SEED7_MASKS.get_or_init(|| make_bbtools_hash_masks(mask_seed))
        }
        BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED => {
            SEED14_MASKS.get_or_init(|| make_bbtools_hash_masks(mask_seed))
        }
        _ => {
            let cache = OTHER_MASKS.get_or_init(|| Mutex::new(FxHashMap::default()));
            // Keep working even if another thread panicked while holding the lock.
            let mut cache = cache
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner());
            if let Some(&masks) = cache.get(&mask_seed) {
                return masks;
            }
            // Intentional leak: one table per distinct seed, kept for the
            // lifetime of the process.
            let masks = Box::leak(Box::new(make_bbtools_hash_masks(mask_seed)));
            cache.insert(mask_seed, masks);
            masks
        }
    }
}
6163
6164fn make_bbtools_hash_masks(mask_seed: u64) -> BbtoolsHashMaskTable {
6165    let mut masks = [[0u64; BBTOOLS_HASH_ARRAY_LENGTH]; 8];
6166    let mut rng = BbtoolsXoshiro::new(mask_seed);
6167    for row_masks in &mut masks {
6168        fill_bbtools_hash_mask_row(row_masks, &mut rng);
6169    }
6170    masks
6171}
6172
// Fills one row of the mask table with random values that are bit-balanced
// (exactly 16 set bits in each 32-bit half) and whose derived cell indices
// have not already been claimed by earlier entries of the same row. The RNG
// draw order is part of the output, so do not reorder the adjustments below.
fn fill_bbtools_hash_mask_row(
    row_masks: &mut [u64; BBTOOLS_HASH_ARRAY_LENGTH],
    rng: &mut BbtoolsXoshiro,
) {
    // Occupancy trackers for the two cell derivations checked below.
    let mut low_cells = [0u8; BBTOOLS_HASH_ARRAY_LENGTH];
    let mut rotated_cells = [0u8; BBTOOLS_HASH_ARRAY_LENGTH];

    for mask in row_masks {
        let (value, low_cell, rotated_cell) = loop {
            let mut value = rng.next_long();
            // Force exactly 16 set bits in the low 32 bits...
            while (value & 0xffff_ffff).count_ones() < 16 {
                value |= 1u64 << rng.next_power_of_two_int(32);
            }
            while (value & 0xffff_ffff).count_ones() > 16 {
                value &= !(1u64 << rng.next_power_of_two_int(32));
            }
            // ...and exactly 16 set bits in the high 32 bits.
            while (value & 0xffff_ffff_0000_0000).count_ones() < 16 {
                value |= 1u64 << (rng.next_power_of_two_int(32) + 32);
            }
            while (value & 0xffff_ffff_0000_0000).count_ones() > 16 {
                value &= !(1u64 << (rng.next_power_of_two_int(32) + 32));
            }

            // Cells this value maps to directly and after an arithmetic
            // (sign-extending, hence the i64 cast) shift by BBTOOLS_HASH_BITS.
            let low_cell = (value & BBTOOLS_HASH_CELL_MASK) as usize;
            let rotated_cell =
                (((value as i64) >> BBTOOLS_HASH_BITS) as u64 & BBTOOLS_HASH_CELL_MASK) as usize;
            // Accept only values whose cells are still unclaimed in this row.
            // NOTE(review): if both cell spaces ever fill up, this loop would
            // spin; presumably the cell space exceeds the row length — confirm.
            if low_cells[low_cell] == 0 && rotated_cells[rotated_cell] == 0 {
                // The stored mask has its sign bit cleared.
                break (value & BBTOOLS_LONG_MAX_VALUE, low_cell, rotated_cell);
            }
        };

        *mask = value;
        low_cells[low_cell] = low_cells[low_cell].saturating_add(1);
        rotated_cells[rotated_cell] = rotated_cells[rotated_cell].saturating_add(1);
    }
}
6209
/// Deterministic xoshiro256+-style generator used to build the hash mask
/// tables; same seed always yields the same stream.
struct BbtoolsXoshiro {
    s0: u64,
    s1: u64,
    s2: u64,
    s3: u64,
}

impl BbtoolsXoshiro {
    /// Expands `seed` into the four state words via repeated splitmix-style
    /// mixing, replaces an all-zero state (a fixed point xoshiro cannot
    /// escape) with fixed non-zero constants, then discards four outputs to
    /// warm the generator up.
    fn new(seed: u64) -> Self {
        let s1 = Self::mix_seed(seed);
        let s2 = Self::mix_seed(s1);
        let s3 = Self::mix_seed(s2);
        let mut rng = Self { s0: seed, s1, s2, s3 };
        if (rng.s0 | rng.s1 | rng.s2 | rng.s3) == 0 {
            rng.s0 = 0x5DEECE66D;
            rng.s1 = 0xB;
            rng.s2 = 0xCCA;
            rng.s3 = 0xF00;
        }
        for _ in 0..4 {
            rng.next_long();
        }
        rng
    }

    /// SplitMix64-style finalizer used to derive state words from the seed.
    fn mix_seed(seed: u64) -> u64 {
        let mut z = seed.wrapping_add(0x9E37_79B9_7F4A_7C15);
        z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
        z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
        z ^ (z >> 31)
    }

    /// Returns the next 64-bit output and advances the state (xoshiro256+:
    /// output is s0 + s3; the xor/rotate order below is part of the algorithm
    /// and must not be rearranged).
    fn next_long(&mut self) -> u64 {
        let out = self.s0.wrapping_add(self.s3);
        let shifted = self.s1 << 17;

        self.s2 ^= self.s0;
        self.s3 ^= self.s1;
        self.s1 ^= self.s2;
        self.s0 ^= self.s3;

        self.s2 ^= shifted;
        self.s3 = self.s3.rotate_left(45);

        out
    }

    /// Uniform draw in `0..bound`; `bound` must be a power of two so a simple
    /// bit-mask reduction is unbiased.
    fn next_power_of_two_int(&mut self, bound: u32) -> u32 {
        debug_assert!(bound.is_power_of_two());
        (self.next_long() as u32) & (bound - 1)
    }
}
6266
6267fn new_count_map(config: &Config) -> CountMap {
6268    let mut counts = CountMap::default();
6269    if let Some(capacity) = count_map_capacity_hint(config) {
6270        let _ = counts.try_reserve(capacity);
6271    }
6272    counts
6273}
6274
6275fn count_map_with_capacity(capacity: usize) -> CountMap {
6276    let mut counts = CountMap::default();
6277    if capacity > 0 {
6278        let _ = counts.try_reserve(capacity);
6279    }
6280    counts
6281}
6282
6283fn count_chunk_local_map(
6284    config: &Config,
6285    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
6286) -> CountMap {
6287    count_map_with_capacity(count_chunk_local_map_capacity(config, pairs))
6288}
6289
6290fn count_chunk_local_map_capacity(
6291    config: &Config,
6292    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
6293) -> usize {
6294    let total_windows: usize = pairs
6295        .iter()
6296        .map(|(r1, r2)| pair_kmer_window_capacity(config, r1, r2.as_ref()))
6297        .sum();
6298    if total_windows == 0 {
6299        return 0;
6300    }
6301    total_windows
6302        .div_ceil(rayon::current_num_threads().max(1))
6303        .clamp(64, COUNT_CHUNK_LOCAL_MAP_MAX_CAPACITY)
6304}
6305
6306fn count_map_capacity_hint(config: &Config) -> Option<usize> {
6307    let explicit = config.table_initial_size;
6308    let prealloc = preallocation_capacity_hint(config);
6309    explicit.max(prealloc)
6310}
6311
6312fn preallocation_capacity_hint(config: &Config) -> Option<usize> {
6313    let fraction = config.table_prealloc_fraction?;
6314    let reads = config.table_reads.or(config.max_reads)?;
6315    let reads = usize::try_from(reads).ok()?;
6316    if reads == 0 || fraction <= 0.0 {
6317        return None;
6318    }
6319    let mates = if config.in2.is_some() || config.interleaved {
6320        2usize
6321    } else {
6322        1usize
6323    };
6324    let kmers_per_read_hint = 100usize.saturating_sub(config.k).saturating_add(1).max(1);
6325    let raw = reads
6326        .saturating_mul(mates)
6327        .saturating_mul(kmers_per_read_hint);
6328    Some(((raw as f64) * fraction).ceil().max(1.0) as usize)
6329}
6330
// Exact-count pass over the primary input, accumulating into `counts`.
//
// With explicit input file lists, only the first file (or file pair) honors
// the `table_reads` budget; every additional listed file is counted unpaired
// and without a limit. Without lists, the configured primary readers are
// streamed and handed to `increment_counts_from_pair_chunk` in chunks of
// COUNT_PARALLEL_CHUNK_SIZE pairs.
fn count_primary(config: &Config, counts: &mut CountMap) -> Result<()> {
    if let Some(paths) = primary_input_lists(config) {
        if let Some(first) = paths.first.first() {
            if let Some(second) = paths.second.as_ref().and_then(|paths| paths.first()) {
                count_paired_files(config, first, second, counts, config.table_reads)?;
            } else {
                count_single_file(config, first, counts, config.table_reads)?;
            }
        }
        // Remaining listed files: unpaired, no read limit.
        for path in paths.first.iter().skip(1) {
            count_single_file(config, path, counts, None)?;
        }
        if let Some(second) = &paths.second {
            for path in second.iter().skip(1) {
                count_single_file(config, path, counts, None)?;
            }
        }
        return Ok(());
    }

    let mut readers = PrimaryReaders::open(config, config.table_reads)?;
    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
    while let Some((r1, r2)) = readers.next_pair()? {
        chunk.push((r1, r2));
        if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
            increment_counts_from_pair_chunk(config, counts, &chunk);
            chunk.clear();
        }
    }
    // Flush the final partial chunk.
    if !chunk.is_empty() {
        increment_counts_from_pair_chunk(config, counts, &chunk);
    }
    Ok(())
}
6365
// Sketch-based counterpart of `count_primary`: accumulates approximate counts
// into the bounded packed sketch, threading the optional `prefilter` gate
// through to each per-chunk increment. File-list handling mirrors
// `count_primary` (only the first file/pair honors `table_reads`).
fn count_primary_sketch(
    config: &Config,
    sketch: &mut PackedCountMinSketch,
    prefilter: Option<PrefilterGate<'_>>,
) -> Result<()> {
    if let Some(paths) = primary_input_lists(config) {
        if let Some(first) = paths.first.first() {
            if let Some(second) = paths.second.as_ref().and_then(|paths| paths.first()) {
                count_paired_files_sketch(
                    config,
                    first,
                    second,
                    sketch,
                    config.table_reads,
                    prefilter,
                )?;
            } else {
                count_single_file_sketch(config, first, sketch, config.table_reads, prefilter)?;
            }
        }
        // Remaining listed files: unpaired, no read limit.
        for path in paths.first.iter().skip(1) {
            count_single_file_sketch(config, path, sketch, None, prefilter)?;
        }
        if let Some(second) = &paths.second {
            for path in second.iter().skip(1) {
                count_single_file_sketch(config, path, sketch, None, prefilter)?;
            }
        }
        return Ok(());
    }

    let mut readers = PrimaryReaders::open(config, config.table_reads)?;
    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
    while let Some((r1, r2)) = readers.next_pair()? {
        chunk.push((r1, r2));
        if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
            increment_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
            chunk.clear();
        }
    }
    // Flush the final partial chunk.
    if !chunk.is_empty() {
        increment_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
    }
    Ok(())
}
6411
6412fn count_primary_prefilter_sketch(
6413    config: &Config,
6414    sketch: &mut PrefilterCountMinSketch,
6415) -> Result<()> {
6416    match sketch {
6417        PrefilterCountMinSketch::Packed(sketch) => count_primary_sketch(config, sketch, None),
6418        PrefilterCountMinSketch::AtomicPacked(sketch) => {
6419            count_primary_atomic_packed_sketch(config, sketch)
6420        }
6421    }
6422}
6423
// Primary counting pass into the shared atomic packed sketch. The sketch is
// taken by shared reference, so per-chunk updates need no `&mut`. File-list
// handling mirrors `count_primary` (only the first file/pair honors
// `table_reads`).
fn count_primary_atomic_packed_sketch(
    config: &Config,
    sketch: &AtomicPackedCountMinSketch,
) -> Result<()> {
    if let Some(paths) = primary_input_lists(config) {
        if let Some(first) = paths.first.first() {
            if let Some(second) = paths.second.as_ref().and_then(|paths| paths.first()) {
                count_paired_files_atomic_packed_sketch(
                    config,
                    first,
                    second,
                    sketch,
                    config.table_reads,
                )?;
            } else {
                count_single_file_atomic_packed_sketch(config, first, sketch, config.table_reads)?;
            }
        }
        // Remaining listed files: unpaired, no read limit.
        for path in paths.first.iter().skip(1) {
            count_single_file_atomic_packed_sketch(config, path, sketch, None)?;
        }
        if let Some(second) = &paths.second {
            for path in second.iter().skip(1) {
                count_single_file_atomic_packed_sketch(config, path, sketch, None)?;
            }
        }
        return Ok(());
    }

    let mut readers = PrimaryReaders::open(config, config.table_reads)?;
    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
    while let Some((r1, r2)) = readers.next_pair()? {
        chunk.push((r1, r2));
        if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
            increment_atomic_packed_sketch_from_pair_chunk(config, sketch, &chunk);
            chunk.clear();
        }
    }
    // Flush the final partial chunk.
    if !chunk.is_empty() {
        increment_atomic_packed_sketch_from_pair_chunk(config, sketch, &chunk);
    }
    Ok(())
}
6467
// Primary counting pass into the shared atomic sketch, with an optional
// `prefilter` gate threaded through to each per-chunk increment. The sketch
// is taken by shared reference, so per-chunk updates need no `&mut`.
// File-list handling mirrors `count_primary`.
fn count_primary_atomic_sketch(
    config: &Config,
    sketch: &AtomicCountMinSketch,
    prefilter: Option<PrefilterGate<'_>>,
) -> Result<()> {
    if let Some(paths) = primary_input_lists(config) {
        if let Some(first) = paths.first.first() {
            if let Some(second) = paths.second.as_ref().and_then(|paths| paths.first()) {
                count_paired_files_atomic_sketch(
                    config,
                    first,
                    second,
                    sketch,
                    config.table_reads,
                    prefilter,
                )?;
            } else {
                count_single_file_atomic_sketch(
                    config,
                    first,
                    sketch,
                    config.table_reads,
                    prefilter,
                )?;
            }
        }
        // Remaining listed files: unpaired, no read limit.
        for path in paths.first.iter().skip(1) {
            count_single_file_atomic_sketch(config, path, sketch, None, prefilter)?;
        }
        if let Some(second) = &paths.second {
            for path in second.iter().skip(1) {
                count_single_file_atomic_sketch(config, path, sketch, None, prefilter)?;
            }
        }
        return Ok(());
    }

    let mut readers = PrimaryReaders::open(config, config.table_reads)?;
    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
    while let Some((r1, r2)) = readers.next_pair()? {
        chunk.push((r1, r2));
        if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
            increment_atomic_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
            chunk.clear();
        }
    }
    // Flush the final partial chunk.
    if !chunk.is_empty() {
        increment_atomic_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
    }
    Ok(())
}
6519
6520fn count_primary_gpu_reduced_runs_sketch(
6521    config: &Config,
6522    sketch: &mut PackedCountMinSketch,
6523) -> Result<()> {
6524    for_each_gpu_reduced_chunk_run(config, |key, count| {
6525        sketch.add_key_count(&key, count);
6526        sketch.add_key_increments(count);
6527    })
6528}
6529
6530fn count_primary_gpu_reduced_runs_atomic_sketch(
6531    config: &Config,
6532    sketch: &AtomicCountMinSketch,
6533) -> Result<()> {
6534    for_each_gpu_reduced_chunk_run(config, |key, count| {
6535        sketch.add_key_count(&key, count);
6536        sketch.add_key_increments(count);
6537    })
6538}
6539
// Streams the primary input through an external CUDA helper that reduces each
// chunk's k-mers to (key, run-length) pairs, invoking `f` once per pair.
//
// Validates the helper binary and the supported configuration up front, then
// runs the whole pass inside a closure so the temp files are removed whether
// the pass succeeds or fails.
fn for_each_gpu_reduced_chunk_run<F>(config: &Config, mut f: F) -> Result<()>
where
    F: FnMut(KmerKey, u64),
{
    let helper = config
        .gpu_helper
        .as_ref()
        .context("gpucounting=t requires gpuhelper=<cuda_kmer_reduce_runs binary>")?;
    if !helper.exists() {
        bail!("gpuhelper does not exist: {}", helper.display());
    }
    // The helper protocol carries one u64 per k-mer, so only short keys fit.
    ensure!(
        config.k <= 31,
        "gpucounting=t currently supports short k-mers only (k<=31)"
    );
    ensure!(
        !use_prefilter_collision_estimates(config),
        "gpucounting=t currently supports the main bounded sketch without prefilter=t"
    );
    let temp_dir = config.temp_dir.clone().unwrap_or_else(std::env::temp_dir);
    fs::create_dir_all(&temp_dir)
        .with_context(|| format!("creating GPU counting temp dir {}", temp_dir.display()))?;
    // Unique-ish temp file token: pid plus wall-clock nanos.
    let token = format!(
        "{}_{}",
        std::process::id(),
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_nanos()
    );
    let kmers_path = temp_dir.join(format!("bbnorm-rs-gpu-kmers-{token}.u64"));
    let runs_path = temp_dir.join(format!("bbnorm-rs-gpu-runs-{token}.bin"));
    let result = (|| {
        let mut readers = PrimaryReaders::open(config, config.table_reads)?;
        // gpu_persistent=t keeps one helper process alive across chunks
        // instead of re-launching it per chunk with temp files.
        let mut persistent = config
            .gpu_persistent
            .then(|| PersistentGpuReducer::start(helper))
            .transpose()?;
        let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
        let mut persistent_keys = Vec::new();
        while let Some((r1, r2)) = readers.next_pair()? {
            chunk.push((r1, r2));
            if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
                if let Some(reducer) = &mut persistent {
                    reduce_gpu_pair_chunk_persistent(
                        config,
                        reducer,
                        &chunk,
                        &mut persistent_keys,
                        &mut f,
                    )?;
                } else {
                    reduce_gpu_pair_chunk(config, helper, &kmers_path, &runs_path, &chunk, &mut f)?;
                }
                chunk.clear();
            }
        }
        // Flush the final partial chunk.
        if !chunk.is_empty() {
            if let Some(reducer) = &mut persistent {
                reduce_gpu_pair_chunk_persistent(
                    config,
                    reducer,
                    &chunk,
                    &mut persistent_keys,
                    &mut f,
                )?;
            } else {
                reduce_gpu_pair_chunk(config, helper, &kmers_path, &runs_path, &chunk, &mut f)?;
            }
        }
        // Shut the persistent helper down cleanly and surface its exit status.
        if let Some(reducer) = persistent {
            reducer.finish()?;
        }
        Ok(())
    })();
    // Best-effort cleanup; the files may never have been created.
    let _ = fs::remove_file(&kmers_path);
    let _ = fs::remove_file(&runs_path);
    result
}
6619
// One-shot GPU reduction for a chunk: writes the chunk's short k-mers to
// `kmers_path`, runs the helper to produce reduced runs at `runs_path`, then
// replays those runs into `f`. Temp files are removed here on success; the
// caller also removes them after the whole pass.
fn reduce_gpu_pair_chunk<F>(
    config: &Config,
    helper: &Path,
    kmers_path: &Path,
    runs_path: &Path,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
    f: &mut F,
) -> Result<()>
where
    F: FnMut(KmerKey, u64),
{
    write_pair_chunk_short_kmers(config, pairs, kmers_path)?;
    // Nothing to reduce: skip launching the helper entirely.
    if fs::metadata(kmers_path)?.len() == 0 {
        return Ok(());
    }
    let status = Command::new(helper)
        .arg(kmers_path)
        .arg(runs_path)
        .status()
        .with_context(|| format!("running GPU helper {}", helper.display()))?;
    if !status.success() {
        bail!("GPU helper failed with status {status}");
    }
    replay_reduced_runs_file(runs_path, f)?;
    let _ = fs::remove_file(kmers_path);
    let _ = fs::remove_file(runs_path);
    Ok(())
}
6648
6649fn reduce_gpu_pair_chunk_persistent<F>(
6650    config: &Config,
6651    reducer: &mut PersistentGpuReducer,
6652    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
6653    keys: &mut Vec<u64>,
6654    f: &mut F,
6655) -> Result<()>
6656where
6657    F: FnMut(KmerKey, u64),
6658{
6659    collect_pair_chunk_short_kmers(config, pairs, keys)?;
6660    if keys.is_empty() {
6661        return Ok(());
6662    }
6663    reducer.reduce(keys, f)
6664}
6665
6666fn write_pair_chunk_short_kmers(
6667    config: &Config,
6668    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
6669    path: &Path,
6670) -> Result<()> {
6671    let mut writer = BufWriter::new(
6672        fs::File::create(path).with_context(|| format!("create {}", path.display()))?,
6673    );
6674    let mut keys = Vec::new();
6675    collect_pair_chunk_short_kmers(config, pairs, &mut keys)?;
6676    for raw in keys {
6677        writer.write_all(&raw.to_le_bytes())?;
6678    }
6679    writer.flush()?;
6680    Ok(())
6681}
6682
6683fn collect_pair_chunk_short_kmers(
6684    config: &Config,
6685    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
6686    out: &mut Vec<u64>,
6687) -> Result<()> {
6688    out.clear();
6689    let mut keys = Vec::new();
6690    for (r1, r2) in pairs {
6691        if config.remove_duplicate_kmers {
6692            fill_unique_pair_kmers(config, r1, r2.as_ref(), &mut keys);
6693            for key in &keys {
6694                out.push(short_kmer_raw(key)?);
6695            }
6696        } else {
6697            let mut write_error = None;
6698            for_each_kmer_for_record(r1, config, |key| match short_kmer_raw(&key) {
6699                Ok(raw) => out.push(raw),
6700                Err(err) => {
6701                    write_error = Some(err);
6702                }
6703            });
6704            if let Some(mate) = r2 {
6705                for_each_kmer_for_record(mate, config, |key| match short_kmer_raw(&key) {
6706                    Ok(raw) => out.push(raw),
6707                    Err(err) => {
6708                        write_error = Some(err);
6709                    }
6710                });
6711            }
6712            if let Some(err) = write_error {
6713                return Err(err);
6714            }
6715        }
6716    }
6717    Ok(())
6718}
6719
6720fn short_kmer_raw(key: &KmerKey) -> Result<u64> {
6721    let KmerKey::Short(raw) = key else {
6722        bail!("GPU counting helper only accepts short k-mer keys");
6723    };
6724    Ok(*raw)
6725}
6726
// Handle to a long-running GPU helper subprocess that reduces k-mer batches
// over a length-prefixed stdin/stdout protocol (see the impl below).
struct PersistentGpuReducer {
    // Kept so `finish` can wait on the process and check its exit status.
    child: Child,
    stdin: BufWriter<ChildStdin>,
    stdout: BufReader<ChildStdout>,
}
6732
impl PersistentGpuReducer {
    /// Spawns the helper with piped stdin/stdout (stderr passes straight
    /// through to ours) and wraps the pipes in buffered adapters.
    fn start(helper: &Path) -> Result<Self> {
        let mut child = Command::new(helper)
            .stdin(Stdio::piped())
            .stdout(Stdio::piped())
            .stderr(Stdio::inherit())
            .spawn()
            .with_context(|| format!("starting persistent GPU helper {}", helper.display()))?;
        let stdin = child
            .stdin
            .take()
            .context("persistent GPU helper stdin was not piped")?;
        let stdout = child
            .stdout
            .take()
            .context("persistent GPU helper stdout was not piped")?;
        Ok(Self {
            child,
            stdin: BufWriter::new(stdin),
            stdout: BufReader::new(stdout),
        })
    }

    /// Sends one batch and replays the reduced runs into `f`.
    ///
    /// Request framing: a little-endian u64 batch length, then each key as a
    /// little-endian u64. Response framing: a little-endian u64 unique count,
    /// then that many 12-byte records (LE u64 key + LE u32 run count).
    fn reduce<F>(&mut self, keys: &[u64], f: &mut F) -> Result<()>
    where
        F: FnMut(KmerKey, u64),
    {
        let count = keys.len() as u64;
        self.stdin.write_all(&count.to_le_bytes())?;
        for key in keys {
            self.stdin.write_all(&key.to_le_bytes())?;
        }
        // Flush before reading: the helper only responds to a complete batch.
        self.stdin.flush()?;

        let mut unique_buf = [0u8; 8];
        self.stdout
            .read_exact(&mut unique_buf)
            .context("reading persistent GPU helper unique count")?;
        let unique = u64::from_le_bytes(unique_buf);
        let mut record = [0u8; 12];
        for _ in 0..unique {
            self.stdout
                .read_exact(&mut record)
                .context("reading persistent GPU helper reduced run")?;
            let key = u64::from_le_bytes(record[0..8].try_into().unwrap());
            let count = u32::from_le_bytes(record[8..12].try_into().unwrap());
            f(KmerKey::Short(key), u64::from(count));
        }
        Ok(())
    }

    /// Sends the `u64::MAX` shutdown sentinel, closes stdin, and waits for
    /// the helper to exit, surfacing a non-zero exit status as an error.
    fn finish(mut self) -> Result<()> {
        self.stdin.write_all(&u64::MAX.to_le_bytes())?;
        self.stdin.flush()?;
        // Dropping stdin closes the pipe so the helper sees EOF after the sentinel.
        drop(self.stdin);
        let status = self
            .child
            .wait()
            .context("waiting for persistent GPU helper")?;
        if !status.success() {
            bail!("persistent GPU helper failed with status {status}");
        }
        Ok(())
    }
}
6798
6799fn replay_reduced_runs_file<F>(path: &Path, f: &mut F) -> Result<()>
6800where
6801    F: FnMut(KmerKey, u64),
6802{
6803    let mut reader =
6804        BufReader::new(fs::File::open(path).with_context(|| format!("open {}", path.display()))?);
6805    let mut record = [0u8; 12];
6806    loop {
6807        match reader.read_exact(&mut record) {
6808            Ok(()) => {
6809                let key = u64::from_le_bytes(record[0..8].try_into().unwrap());
6810                let count = u32::from_le_bytes(record[8..12].try_into().unwrap());
6811                f(KmerKey::Short(key), u64::from(count));
6812            }
6813            Err(err) if err.kind() == ErrorKind::UnexpectedEof => break,
6814            Err(err) => return Err(err).context("reading GPU reduced runs"),
6815        }
6816    }
6817    Ok(())
6818}
6819
6820fn count_single_file(
6821    config: &Config,
6822    path: &Path,
6823    counts: &mut CountMap,
6824    limit: Option<u64>,
6825) -> Result<()> {
6826    let mut reader = open_sequence_reader(config, path, sequence_settings(config))?;
6827    let mut reads_seen = 0u64;
6828    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
6829    while let Some(record) = reader.next_record()? {
6830        if limit_reached(limit, reads_seen) {
6831            break;
6832        }
6833        chunk.push((record, None));
6834        if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
6835            increment_counts_from_pair_chunk(config, counts, &chunk);
6836            chunk.clear();
6837        }
6838        reads_seen += 1;
6839    }
6840    if !chunk.is_empty() {
6841        increment_counts_from_pair_chunk(config, counts, &chunk);
6842    }
6843    Ok(())
6844}
6845
6846fn count_single_file_sketch(
6847    config: &Config,
6848    path: &Path,
6849    sketch: &mut PackedCountMinSketch,
6850    limit: Option<u64>,
6851    prefilter: Option<PrefilterGate<'_>>,
6852) -> Result<()> {
6853    let mut reader = open_sequence_reader(config, path, sequence_settings(config))?;
6854    let mut reads_seen = 0u64;
6855    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
6856    while let Some(record) = reader.next_record()? {
6857        if limit_reached(limit, reads_seen) {
6858            break;
6859        }
6860        chunk.push((record, None));
6861        if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
6862            increment_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
6863            chunk.clear();
6864        }
6865        reads_seen += 1;
6866    }
6867    if !chunk.is_empty() {
6868        increment_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
6869    }
6870    Ok(())
6871}
6872
6873fn count_single_file_prefilter_sketch(
6874    config: &Config,
6875    path: &Path,
6876    sketch: &mut PrefilterCountMinSketch,
6877    limit: Option<u64>,
6878) -> Result<()> {
6879    match sketch {
6880        PrefilterCountMinSketch::Packed(sketch) => {
6881            count_single_file_sketch(config, path, sketch, limit, None)
6882        }
6883        PrefilterCountMinSketch::AtomicPacked(sketch) => {
6884            count_single_file_atomic_packed_sketch(config, path, sketch, limit)
6885        }
6886    }
6887}
6888
6889fn count_single_file_atomic_packed_sketch(
6890    config: &Config,
6891    path: &Path,
6892    sketch: &AtomicPackedCountMinSketch,
6893    limit: Option<u64>,
6894) -> Result<()> {
6895    let mut reader = open_sequence_reader(config, path, sequence_settings(config))?;
6896    let mut reads_seen = 0u64;
6897    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
6898    while let Some(record) = reader.next_record()? {
6899        if limit_reached(limit, reads_seen) {
6900            break;
6901        }
6902        chunk.push((record, None));
6903        if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
6904            increment_atomic_packed_sketch_from_pair_chunk(config, sketch, &chunk);
6905            chunk.clear();
6906        }
6907        reads_seen += 1;
6908    }
6909    if !chunk.is_empty() {
6910        increment_atomic_packed_sketch_from_pair_chunk(config, sketch, &chunk);
6911    }
6912    Ok(())
6913}
6914
6915fn count_single_file_atomic_sketch(
6916    config: &Config,
6917    path: &Path,
6918    sketch: &AtomicCountMinSketch,
6919    limit: Option<u64>,
6920    prefilter: Option<PrefilterGate<'_>>,
6921) -> Result<()> {
6922    let mut reader = open_sequence_reader(config, path, sequence_settings(config))?;
6923    let mut reads_seen = 0u64;
6924    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
6925    while let Some(record) = reader.next_record()? {
6926        if limit_reached(limit, reads_seen) {
6927            break;
6928        }
6929        chunk.push((record, None));
6930        if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
6931            increment_atomic_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
6932            chunk.clear();
6933        }
6934        reads_seen += 1;
6935    }
6936    if !chunk.is_empty() {
6937        increment_atomic_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
6938    }
6939    Ok(())
6940}
6941
// Exact-counts two files in lockstep as read pairs, honoring an optional pair
// `limit`. Both files must use the same FASTA/FASTQ format and contain the
// same number of records; a length mismatch is reported naming the shorter
// file.
fn count_paired_files(
    config: &Config,
    path1: &Path,
    path2: &Path,
    counts: &mut CountMap,
    limit: Option<u64>,
) -> Result<()> {
    let settings = sequence_settings(config);
    let (mut reader1, mut reader2) = open_paired_sequence_readers(config, path1, path2, settings)?;
    if reader1.format() != reader2.format() {
        bail!("paired inputs must use the same FASTA/FASTQ format");
    }

    let mut pairs_seen = 0u64;
    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
    loop {
        if limit_reached(limit, pairs_seen) {
            break;
        }
        // Advance both readers together; any asymmetry is a hard error.
        match (reader1.next_record()?, reader2.next_record()?) {
            (None, None) => break,
            (Some(read1), Some(read2)) => {
                chunk.push((read1, Some(read2)));
                if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
                    increment_counts_from_pair_chunk(config, counts, &chunk);
                    chunk.clear();
                }
                pairs_seen += 1;
            }
            (Some(_), None) => bail!(
                "{} has fewer records than {}",
                path2.display(),
                path1.display()
            ),
            (None, Some(_)) => bail!(
                "{} has fewer records than {}",
                path1.display(),
                path2.display()
            ),
        }
    }
    // Flush the final partial chunk.
    if !chunk.is_empty() {
        increment_counts_from_pair_chunk(config, counts, &chunk);
    }
    Ok(())
}
6988
6989fn count_paired_files_sketch(
6990    config: &Config,
6991    path1: &Path,
6992    path2: &Path,
6993    sketch: &mut PackedCountMinSketch,
6994    limit: Option<u64>,
6995    prefilter: Option<PrefilterGate<'_>>,
6996) -> Result<()> {
6997    let settings = sequence_settings(config);
6998    let (mut reader1, mut reader2) = open_paired_sequence_readers(config, path1, path2, settings)?;
6999    if reader1.format() != reader2.format() {
7000        bail!("paired inputs must use the same FASTA/FASTQ format");
7001    }
7002
7003    let mut pairs_seen = 0u64;
7004    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
7005    loop {
7006        if limit_reached(limit, pairs_seen) {
7007            break;
7008        }
7009        match (reader1.next_record()?, reader2.next_record()?) {
7010            (None, None) => break,
7011            (Some(read1), Some(read2)) => {
7012                chunk.push((read1, Some(read2)));
7013                if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
7014                    increment_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
7015                    chunk.clear();
7016                }
7017                pairs_seen += 1;
7018            }
7019            (Some(_), None) => bail!(
7020                "{} has fewer records than {}",
7021                path2.display(),
7022                path1.display()
7023            ),
7024            (None, Some(_)) => bail!(
7025                "{} has fewer records than {}",
7026                path1.display(),
7027                path2.display()
7028            ),
7029        }
7030    }
7031    if !chunk.is_empty() {
7032        increment_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
7033    }
7034    Ok(())
7035}
7036
7037fn count_paired_files_atomic_packed_sketch(
7038    config: &Config,
7039    path1: &Path,
7040    path2: &Path,
7041    sketch: &AtomicPackedCountMinSketch,
7042    limit: Option<u64>,
7043) -> Result<()> {
7044    let settings = sequence_settings(config);
7045    let (mut reader1, mut reader2) = open_paired_sequence_readers(config, path1, path2, settings)?;
7046    if reader1.format() != reader2.format() {
7047        bail!("paired inputs must use the same FASTA/FASTQ format");
7048    }
7049
7050    let mut pairs_seen = 0u64;
7051    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
7052    loop {
7053        if limit_reached(limit, pairs_seen) {
7054            break;
7055        }
7056        match (reader1.next_record()?, reader2.next_record()?) {
7057            (None, None) => break,
7058            (Some(read1), Some(read2)) => {
7059                chunk.push((read1, Some(read2)));
7060                if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
7061                    increment_atomic_packed_sketch_from_pair_chunk(config, sketch, &chunk);
7062                    chunk.clear();
7063                }
7064                pairs_seen += 1;
7065            }
7066            (Some(_), None) => bail!(
7067                "{} has fewer records than {}",
7068                path2.display(),
7069                path1.display()
7070            ),
7071            (None, Some(_)) => bail!(
7072                "{} has fewer records than {}",
7073                path1.display(),
7074                path2.display()
7075            ),
7076        }
7077    }
7078    if !chunk.is_empty() {
7079        increment_atomic_packed_sketch_from_pair_chunk(config, sketch, &chunk);
7080    }
7081    Ok(())
7082}
7083
7084fn count_paired_files_atomic_sketch(
7085    config: &Config,
7086    path1: &Path,
7087    path2: &Path,
7088    sketch: &AtomicCountMinSketch,
7089    limit: Option<u64>,
7090    prefilter: Option<PrefilterGate<'_>>,
7091) -> Result<()> {
7092    let settings = sequence_settings(config);
7093    let (mut reader1, mut reader2) = open_paired_sequence_readers(config, path1, path2, settings)?;
7094    if reader1.format() != reader2.format() {
7095        bail!("paired inputs must use the same FASTA/FASTQ format");
7096    }
7097
7098    let mut pairs_seen = 0u64;
7099    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
7100    loop {
7101        if limit_reached(limit, pairs_seen) {
7102            break;
7103        }
7104        match (reader1.next_record()?, reader2.next_record()?) {
7105            (None, None) => break,
7106            (Some(read1), Some(read2)) => {
7107                chunk.push((read1, Some(read2)));
7108                if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
7109                    increment_atomic_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
7110                    chunk.clear();
7111                }
7112                pairs_seen += 1;
7113            }
7114            (Some(_), None) => bail!(
7115                "{} has fewer records than {}",
7116                path2.display(),
7117                path1.display()
7118            ),
7119            (None, Some(_)) => bail!(
7120                "{} has fewer records than {}",
7121                path1.display(),
7122                path2.display()
7123            ),
7124        }
7125    }
7126    if !chunk.is_empty() {
7127        increment_atomic_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
7128    }
7129    Ok(())
7130}
7131
7132fn normalize_primary(
7133    config: &Config,
7134    input_counts: &dyn CountLookup,
7135    mut output_counts: Option<&mut OutputCounts>,
7136    mut output_cardinality: Option<&mut KmerCardinalityEstimator>,
7137    cardinality_config: &Config,
7138    random_seed: u64,
7139    mut input_hist: InputHistSinks<'_>,
7140) -> Result<RunSummary> {
7141    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
7142    let format1 = readers.format1();
7143    let format2 = readers.format2();
7144    let mut writers = OptionalWriters::open(config, format1, format2)?;
7145    let mut summary = RunSummary::default();
7146    let mut rng = JavaXoshiro::new(random_seed);
7147    let mut chunk = Vec::with_capacity(NORMALIZE_PARALLEL_CHUNK_SIZE);
7148
7149    while let Some((r1, r2)) = readers.next_pair()? {
7150        chunk.push((readers.input_list_index(), r1, r2, rng.next_double()));
7151        if chunk.len() >= NORMALIZE_PARALLEL_CHUNK_SIZE {
7152            let pairs = normalize_pair_chunk(config, input_counts, &chunk);
7153            write_normalized_pairs(
7154                config,
7155                &mut writers,
7156                &mut output_counts,
7157                &mut output_cardinality,
7158                cardinality_config,
7159                &mut summary,
7160                &pairs,
7161                &mut input_hist,
7162            )?;
7163            chunk.clear();
7164        }
7165    }
7166    if !chunk.is_empty() {
7167        let pairs = normalize_pair_chunk(config, input_counts, &chunk);
7168        write_normalized_pairs(
7169            config,
7170            &mut writers,
7171            &mut output_counts,
7172            &mut output_cardinality,
7173            cardinality_config,
7174            &mut summary,
7175            &pairs,
7176            &mut input_hist,
7177        )?;
7178    }
7179
7180    writers.flush()?;
7181    Ok(summary)
7182}
7183
7184fn normalize_pair_chunk(
7185    config: &Config,
7186    input_counts: &dyn CountLookup,
7187    pairs: &[NormalizationInput],
7188) -> Vec<NormalizedPair> {
7189    pairs
7190        .par_iter()
7191        .map(|(input_list_index, r1, r2, rand)| {
7192            let mut r1 = r1.clone();
7193            let mut r2 = r2.clone();
7194            if !config.trim_after_marking {
7195                trim_pair(config, &mut r1, r2.as_mut());
7196            }
7197            let decision = decide_pair(config, input_counts, &r1, r2.as_ref(), Some(*rand));
7198            let mut correction = CorrectionResult::default();
7199            if config.error_correct && !decision.toss {
7200                correction =
7201                    correct_pair_errors_with_rollback(config, input_counts, &mut r1, r2.as_mut());
7202            }
7203            if config.trim_after_marking && config.error_correct {
7204                trim_pair(config, &mut r1, r2.as_mut());
7205            }
7206            let (out_r1, out_r2) = maybe_rename_pair(config, &r1, r2.as_ref(), &decision.analysis);
7207            let read_count = 1 + u64::from(r2.is_some());
7208            let base_count = r1.len() as u64 + r2.as_ref().map(|r| r.len() as u64).unwrap_or(0);
7209            NormalizedPair {
7210                input_list_index: *input_list_index,
7211                r1,
7212                r2,
7213                out_r1,
7214                out_r2,
7215                decision,
7216                uncorrectable: correction.uncorrectable,
7217                read_count,
7218                base_count,
7219            }
7220        })
7221        .collect()
7222}
7223
/// Drains one normalized batch: updates run totals and input-side
/// histograms, writes kept/tossed reads to their destinations, and feeds
/// kept reads into the optional output-count / output-cardinality
/// accumulators.
///
/// Pairs are written in batch order; `sync_to_input_list_index` keeps the
/// writers aligned with whichever input file each pair came from.
#[allow(clippy::too_many_arguments)]
fn write_normalized_pairs(
    config: &Config,
    writers: &mut OptionalWriters,
    output_counts: &mut Option<&mut OutputCounts>,
    output_cardinality: &mut Option<&mut KmerCardinalityEstimator>,
    cardinality_config: &Config,
    summary: &mut RunSummary,
    pairs: &[NormalizedPair],
    input_hist: &mut InputHistSinks<'_>,
) -> Result<()> {
    for pair in pairs {
        // Keep output writers pointed at the file this pair was read from.
        writers.sync_to_input_list_index(config, pair.input_list_index)?;
        summary.reads_in += pair.read_count;
        summary.bases_in += pair.base_count;

        // Optional input-side depth histogram (both mates when present).
        if let Some(hist) = input_hist.depth.as_deref_mut() {
            increment_sparse_hist_from_analysis(
                hist,
                &pair.decision.analysis.read1,
                config.hist_len,
            );
            if let Some(read2) = &pair.decision.analysis.read2 {
                increment_sparse_hist_from_analysis(hist, read2, config.hist_len);
            }
        }
        // Optional input-side per-read depth histogram.
        if let Some(read_hist) = input_hist.read.as_deref_mut() {
            increment_sparse_read_hist(
                read_hist,
                &pair.decision.analysis.read1,
                pair.r1.len(),
                config.hist_len,
            );
            if let (Some(read2_analysis), Some(read2)) =
                (&pair.decision.analysis.read2, pair.r2.as_ref())
            {
                increment_sparse_read_hist(read_hist, read2_analysis, read2.len(), config.hist_len);
            }
        }

        // Tally the pair on exactly one side of the keep/toss ledger.
        if pair.decision.toss {
            summary.reads_tossed += pair.read_count;
            summary.bases_tossed += pair.base_count;
        } else {
            summary.reads_kept += pair.read_count;
            summary.bases_kept += pair.base_count;
        }

        writers.write_pair(pair.decision.toss, &pair.out_r1, pair.out_r2.as_ref())?;
        // Reads the corrector gave up on also go to the uncorrected output.
        if pair.uncorrectable {
            writers.write_uncorrected(&pair.r1, pair.r2.as_ref())?;
        }
        if depth_bin_outputs_enabled(config) {
            writers.write_depth_bin(
                config,
                &pair.decision.analysis,
                &pair.out_r1,
                pair.out_r2.as_ref(),
            )?;
        }
    }
    // Only kept (non-tossed) reads feed the output-side counters below.
    if let Some(counts) = output_counts.as_mut() {
        increment_output_counts_from_normalized_chunk(config, counts, pairs);
    }
    if let Some(estimator) = output_cardinality.as_mut() {
        for pair in pairs.iter().filter(|pair| !pair.decision.toss) {
            estimator.observe_pair(cardinality_config, &pair.r1, pair.r2.as_ref());
        }
    }
    Ok(())
}
7295
7296fn increment_output_counts_from_normalized_chunk(
7297    config: &Config,
7298    counts: &mut OutputCounts,
7299    pairs: &[NormalizedPair],
7300) {
7301    match counts {
7302        OutputCounts::Exact(counts) => {
7303            let chunk_counts = pairs
7304                .par_iter()
7305                .filter(|pair| !pair.decision.toss)
7306                .fold(CountMap::default, |mut local_counts, pair| {
7307                    increment_pair_counts(config, &mut local_counts, &pair.r1, pair.r2.as_ref());
7308                    local_counts
7309                })
7310                .reduce(CountMap::default, |mut left, right| {
7311                    merge_count_maps(&mut left, right);
7312                    left
7313                });
7314            merge_count_maps(counts, chunk_counts);
7315        }
7316        OutputCounts::Sketch(sketch) => {
7317            increment_sketch_from_normalized_chunk(config, sketch, pairs);
7318        }
7319        OutputCounts::AtomicSketch(sketch) => {
7320            increment_atomic_sketch_from_normalized_chunk(config, sketch, pairs);
7321        }
7322    }
7323}
7324
7325fn increment_atomic_sketch_from_normalized_chunk(
7326    config: &Config,
7327    sketch: &AtomicCountMinSketch,
7328    pairs: &[NormalizedPair],
7329) {
7330    if !config.deterministic {
7331        let (key_increments, newly_occupied) = pairs
7332            .par_iter()
7333            .filter(|pair| !pair.decision.toss)
7334            .map(|pair| {
7335                increment_pair_atomic_sketch_direct(
7336                    config,
7337                    sketch,
7338                    &pair.r1,
7339                    pair.r2.as_ref(),
7340                    None,
7341                )
7342            })
7343            .reduce(
7344                || (0u64, 0usize),
7345                |left, right| {
7346                    (
7347                        left.0.saturating_add(right.0),
7348                        left.1.saturating_add(right.1),
7349                    )
7350                },
7351            );
7352        sketch.add_key_increments(key_increments);
7353        sketch.add_occupied_slots(newly_occupied);
7354        return;
7355    }
7356
7357    let chunk_counts = pairs
7358        .par_iter()
7359        .filter(|pair| !pair.decision.toss)
7360        .fold(CountMap::default, |mut local_counts, pair| {
7361            increment_pair_counts(config, &mut local_counts, &pair.r1, pair.r2.as_ref());
7362            local_counts
7363        })
7364        .reduce(CountMap::default, |mut left, right| {
7365            merge_count_maps(&mut left, right);
7366            left
7367        });
7368    let key_increments = chunk_counts.values().copied().sum();
7369    sketch.add_key_counts(&chunk_counts);
7370    sketch.add_key_increments(key_increments);
7371}
7372
7373fn increment_sketch_from_normalized_chunk(
7374    config: &Config,
7375    sketch: &mut PackedCountMinSketch,
7376    pairs: &[NormalizedPair],
7377) {
7378    let chunk_counts = pairs
7379        .par_iter()
7380        .filter(|pair| !pair.decision.toss)
7381        .fold(CountMap::default, |mut local_counts, pair| {
7382            increment_pair_counts(config, &mut local_counts, &pair.r1, pair.r2.as_ref());
7383            local_counts
7384        })
7385        .reduce(CountMap::default, |mut left, right| {
7386            merge_count_maps(&mut left, right);
7387            left
7388        });
7389    let key_increments = chunk_counts.values().copied().sum();
7390    sketch.add_key_counts(&chunk_counts);
7391    sketch.add_key_increments(key_increments);
7392}
7393
/// Test-only helper: builds a dense k-mer depth histogram over the primary
/// inputs, optionally simulating the keep filter with `random_seed`.
#[cfg(test)]
fn collect_primary_hist(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    random_seed: u64,
) -> Result<Vec<u64>> {
    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
    let mut hist = vec![0u64; config.hist_len];
    let mut rng = JavaXoshiro::new(random_seed);
    let mut batch = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);

    loop {
        let Some((mut r1, mut r2)) = readers.next_pair()? else {
            break;
        };
        trim_pair(config, &mut r1, r2.as_mut());
        // Draw randomness only when a keep filter is active, matching RNG
        // consumption in the normalization pass.
        let draw = keep_filter_counts.map(|_| rng.next_double());
        batch.push((r1, r2, draw));
        if batch.len() >= HIST_PARALLEL_CHUNK_SIZE {
            increment_hist_from_pair_chunk(
                config,
                hist_counts,
                keep_filter_counts,
                &mut hist,
                &batch,
            );
            batch.clear();
        }
    }
    // Flush the final partial batch.
    if !batch.is_empty() {
        increment_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, &mut hist, &batch);
    }

    Ok(hist)
}
7427
7428fn collect_primary_sparse_hist(
7429    config: &Config,
7430    hist_counts: &dyn CountLookup,
7431    keep_filter_counts: Option<&dyn CountLookup>,
7432    random_seed: u64,
7433) -> Result<SparseHist> {
7434    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
7435    let mut hist = SparseHist::default();
7436    let mut rng = JavaXoshiro::new(random_seed);
7437    let mut chunk = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);
7438
7439    while let Some((mut r1, mut r2)) = readers.next_pair()? {
7440        trim_pair(config, &mut r1, r2.as_mut());
7441        let rand = keep_filter_counts.map(|_| rng.next_double());
7442        chunk.push((r1, r2, rand));
7443        if chunk.len() >= HIST_PARALLEL_CHUNK_SIZE {
7444            let chunk_hist =
7445                sparse_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, &chunk);
7446            merge_sparse_hist(&mut hist, chunk_hist);
7447            chunk.clear();
7448        }
7449    }
7450    if !chunk.is_empty() {
7451        let chunk_hist =
7452            sparse_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, &chunk);
7453        merge_sparse_hist(&mut hist, chunk_hist);
7454    }
7455
7456    Ok(hist)
7457}
7458
/// Test-only helper: builds a per-read depth histogram over the primary
/// inputs, optionally simulating the keep filter with `random_seed`.
#[cfg(test)]
fn collect_primary_read_hist(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    random_seed: u64,
) -> Result<ReadDepthHistogram> {
    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
    let mut hist = ReadDepthHistogram::new(config.hist_len);
    let mut rng = JavaXoshiro::new(random_seed);
    let mut batch = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);

    loop {
        let Some((mut r1, mut r2)) = readers.next_pair()? else {
            break;
        };
        trim_pair(config, &mut r1, r2.as_mut());
        // Draw randomness only when a keep filter is active, matching RNG
        // consumption in the normalization pass.
        let draw = keep_filter_counts.map(|_| rng.next_double());
        batch.push((r1, r2, draw));
        if batch.len() >= HIST_PARALLEL_CHUNK_SIZE {
            increment_read_hist_from_pair_chunk(
                config,
                hist_counts,
                keep_filter_counts,
                &mut hist,
                &batch,
            );
            batch.clear();
        }
    }
    // Flush the final partial batch.
    if !batch.is_empty() {
        increment_read_hist_from_pair_chunk(
            config,
            hist_counts,
            keep_filter_counts,
            &mut hist,
            &batch,
        );
    }

    Ok(hist)
}
7498
7499fn collect_primary_sparse_read_hist(
7500    config: &Config,
7501    hist_counts: &dyn CountLookup,
7502    keep_filter_counts: Option<&dyn CountLookup>,
7503    random_seed: u64,
7504) -> Result<SparseReadDepthHist> {
7505    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
7506    let mut hist = SparseReadDepthHist::default();
7507    let mut rng = JavaXoshiro::new(random_seed);
7508    let mut chunk = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);
7509
7510    while let Some((mut r1, mut r2)) = readers.next_pair()? {
7511        trim_pair(config, &mut r1, r2.as_mut());
7512        let rand = keep_filter_counts.map(|_| rng.next_double());
7513        chunk.push((r1, r2, rand));
7514        if chunk.len() >= HIST_PARALLEL_CHUNK_SIZE {
7515            let chunk_hist =
7516                sparse_read_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, &chunk);
7517            merge_sparse_read_depth_hist(&mut hist, chunk_hist);
7518            chunk.clear();
7519        }
7520    }
7521    if !chunk.is_empty() {
7522        let chunk_hist =
7523            sparse_read_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, &chunk);
7524        merge_sparse_read_depth_hist(&mut hist, chunk_hist);
7525    }
7526
7527    Ok(hist)
7528}
7529
/// Test-only helper: builds the dense k-mer depth histogram and the
/// per-read depth histogram in one pass over the primary inputs.
#[cfg(test)]
fn collect_primary_hist_and_read_hist(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    random_seed: u64,
) -> Result<(Vec<u64>, ReadDepthHistogram)> {
    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
    let mut depth_hist = vec![0u64; config.hist_len];
    let mut read_hist = ReadDepthHistogram::new(config.hist_len);
    let mut rng = JavaXoshiro::new(random_seed);
    let mut batch = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);

    loop {
        let Some((mut r1, mut r2)) = readers.next_pair()? else {
            break;
        };
        trim_pair(config, &mut r1, r2.as_mut());
        // Draw randomness only when a keep filter is active, matching RNG
        // consumption in the normalization pass.
        let draw = keep_filter_counts.map(|_| rng.next_double());
        batch.push((r1, r2, draw));
        if batch.len() >= HIST_PARALLEL_CHUNK_SIZE {
            increment_hist_and_read_hist_from_pair_chunk(
                config,
                hist_counts,
                keep_filter_counts,
                &mut depth_hist,
                &mut read_hist,
                &batch,
            );
            batch.clear();
        }
    }
    // Flush the final partial batch.
    if !batch.is_empty() {
        increment_hist_and_read_hist_from_pair_chunk(
            config,
            hist_counts,
            keep_filter_counts,
            &mut depth_hist,
            &mut read_hist,
            &batch,
        );
    }

    Ok((depth_hist, read_hist))
}
7572
7573fn collect_primary_sparse_hist_and_read_hist(
7574    config: &Config,
7575    hist_counts: &dyn CountLookup,
7576    keep_filter_counts: Option<&dyn CountLookup>,
7577    random_seed: u64,
7578) -> Result<(SparseHist, SparseReadDepthHist)> {
7579    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
7580    let mut depth_hist = SparseHist::default();
7581    let mut read_hist = SparseReadDepthHist::default();
7582    let mut rng = JavaXoshiro::new(random_seed);
7583    let mut chunk = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);
7584
7585    while let Some((mut r1, mut r2)) = readers.next_pair()? {
7586        trim_pair(config, &mut r1, r2.as_mut());
7587        let rand = keep_filter_counts.map(|_| rng.next_double());
7588        chunk.push((r1, r2, rand));
7589        if chunk.len() >= HIST_PARALLEL_CHUNK_SIZE {
7590            let (chunk_depth_hist, chunk_read_hist) = sparse_hist_and_read_hist_from_pair_chunk(
7591                config,
7592                hist_counts,
7593                keep_filter_counts,
7594                &chunk,
7595            );
7596            merge_sparse_hist(&mut depth_hist, chunk_depth_hist);
7597            merge_sparse_read_depth_hist(&mut read_hist, chunk_read_hist);
7598            chunk.clear();
7599        }
7600    }
7601    if !chunk.is_empty() {
7602        let (chunk_depth_hist, chunk_read_hist) = sparse_hist_and_read_hist_from_pair_chunk(
7603            config,
7604            hist_counts,
7605            keep_filter_counts,
7606            &chunk,
7607        );
7608        merge_sparse_hist(&mut depth_hist, chunk_depth_hist);
7609        merge_sparse_read_depth_hist(&mut read_hist, chunk_read_hist);
7610    }
7611
7612    Ok((depth_hist, read_hist))
7613}
7614
7615fn emit_read_local_side_outputs(config: &Config) -> Result<()> {
7616    if !read_local_side_outputs_enabled(config) {
7617        return Ok(());
7618    }
7619
7620    let mut hist = collect_read_local_side_hists(config)?;
7621    if let Some(quality) = hist.quality.take() {
7622        emit_quality_side_outputs(config, &quality)?;
7623    }
7624    if let (Some(path), Some(length)) = (&config.length_hist_out, hist.length.as_ref()) {
7625        write_length_hist(path, length, config)?;
7626    }
7627    if let (Some(path), Some(gc)) = (&config.gc_hist_out, hist.gc.as_ref()) {
7628        write_gc_hist(path, gc, config)?;
7629    }
7630    if let (Some(path), Some(base)) = (&config.base_hist_out, hist.base.as_ref()) {
7631        write_base_content_hist(path, base, config)?;
7632    }
7633    if let (Some(path), Some(entropy)) = (&config.entropy_hist_out, hist.entropy.as_ref()) {
7634        write_entropy_hist(path, entropy, config)?;
7635    }
7636    if let (Some(path), Some(identity)) = (&config.identity_hist_out, hist.identity.as_ref()) {
7637        write_identity_hist(path, identity, config)?;
7638    }
7639    if let Some(alignment) = hist.alignment.as_ref() {
7640        emit_alignment_fallback_side_outputs(config, alignment)?;
7641    }
7642    if let (Some(path), Some(barcodes)) = (&config.barcode_stats_out, hist.barcodes.as_ref()) {
7643        write_barcode_stats(path, barcodes, config)?;
7644    }
7645    Ok(())
7646}
7647
7648fn read_local_side_outputs_enabled(config: &Config) -> bool {
7649    config.quality_hist_out.is_some()
7650        || config.base_quality_hist_out.is_some()
7651        || config.quality_count_hist_out.is_some()
7652        || config.average_quality_hist_out.is_some()
7653        || config.overall_base_quality_hist_out.is_some()
7654        || config.length_hist_out.is_some()
7655        || config.gc_hist_out.is_some()
7656        || config.base_hist_out.is_some()
7657        || config.entropy_hist_out.is_some()
7658        || config.identity_hist_out.is_some()
7659        || config.barcode_stats_out.is_some()
7660        || alignment_fallback_side_outputs_enabled(config)
7661}
7662
7663fn quality_side_outputs_enabled(config: &Config) -> bool {
7664    config.quality_hist_out.is_some()
7665        || config.base_quality_hist_out.is_some()
7666        || config.quality_count_hist_out.is_some()
7667        || config.average_quality_hist_out.is_some()
7668        || config.overall_base_quality_hist_out.is_some()
7669}
7670
7671fn alignment_fallback_side_outputs_enabled(config: &Config) -> bool {
7672    config.match_hist_out.is_some()
7673        || config.insert_hist_out.is_some()
7674        || config.quality_accuracy_hist_out.is_some()
7675        || config.indel_hist_out.is_some()
7676        || config.error_hist_out.is_some()
7677}
7678
7679fn emit_quality_side_outputs(config: &Config, hist: &QualitySideHistograms) -> Result<()> {
7680    if let Some(path) = &config.quality_hist_out {
7681        write_quality_hist(path, &hist.overall, config)?;
7682    }
7683    if let Some(path) = &config.quality_count_hist_out {
7684        write_quality_count_hist(
7685            path,
7686            &hist.first_counts,
7687            &hist.second_counts,
7688            hist.paired,
7689            config,
7690        )?;
7691    }
7692    if let Some(path) = &config.average_quality_hist_out {
7693        write_average_quality_hist(path, &hist.first_avg, &hist.second_avg, hist.paired, config)?;
7694    }
7695    if let Some(path) = &config.overall_base_quality_hist_out {
7696        write_overall_base_quality_hist(path, &hist.overall, config)?;
7697    }
7698    if let Some(path) = &config.base_quality_hist_out {
7699        write_base_quality_hist(path, hist, config)?;
7700    }
7701    Ok(())
7702}
7703
/// Streams the primary inputs once and accumulates every enabled
/// read-local side histogram (quality, length, GC, base composition,
/// entropy, identity, alignment fallback, barcodes).
///
/// Each histogram is allocated only when its output path (or family of
/// paths) is requested in `config`; disabled histograms stay `None`.
fn collect_read_local_side_hists(config: &Config) -> Result<ReadLocalSideHistograms> {
    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
    // NOTE(review): both lengths come from side_hist_len; the separate
    // names suggest quality may once have had its own length — confirm.
    let quality_len = side_hist_len(config);
    let side_len = side_hist_len(config);
    let mut hist = ReadLocalSideHistograms {
        quality: quality_side_outputs_enabled(config).then(|| QualitySideHistograms {
            overall: vec![0; quality_len],
            first_counts: vec![0; quality_len],
            second_counts: vec![0; quality_len],
            first_avg: vec![0; quality_len],
            second_avg: vec![0; quality_len],
            // Outer index presumably position, inner quality bin — verify.
            first_by_pos: vec![vec![0; quality_len]; side_len],
            second_by_pos: vec![vec![0; quality_len]; side_len],
            paired: false,
        }),
        length: config
            .length_hist_out
            .is_some()
            .then(|| ReadDepthHistogram::new(side_len)),
        gc: config
            .gc_hist_out
            .is_some()
            .then(|| ReadDepthHistogram::new(gc_hist_len(config))),
        base: config
            .base_hist_out
            .is_some()
            .then(|| BaseContentHistogram {
                first: vec![BaseCounts::default(); side_len],
                second: vec![BaseCounts::default(); side_len],
            }),
        // +1 bin so the top of the range gets its own slot; min size 1.
        entropy: config
            .entropy_hist_out
            .is_some()
            .then(|| vec![0u64; config.entropy_bins.saturating_add(1).max(1)]),
        identity: config
            .identity_hist_out
            .is_some()
            .then(|| ReadDepthHistogram::new(config.identity_bins.saturating_add(1).max(1))),
        alignment: alignment_fallback_side_outputs_enabled(config).then(|| {
            AlignmentFallbackHistograms {
                first_match: vec![MatchCounts::default(); side_len],
                second_match: vec![MatchCounts::default(); side_len],
                quality_match: vec![0; quality_len],
                ..AlignmentFallbackHistograms::default()
            }
        }),
        barcodes: config.barcode_stats_out.is_some().then(BTreeMap::new),
    };

    while let Some((mut r1, mut r2)) = readers.next_pair()? {
        // Apply the configured trimming before measuring anything.
        trim_pair(config, &mut r1, r2.as_mut());
        // Barcode stats are recorded once per pair, from read 1.
        if let Some(barcodes) = hist.barcodes.as_mut() {
            increment_barcode_stats(barcodes, &r1, r2.is_some());
        }
        increment_read_local_side_hists(config, &mut hist, &r1, false);
        if let Some(mate) = r2.as_ref() {
            increment_read_local_side_hists(config, &mut hist, mate, true);
        }
    }

    Ok(hist)
}
7766
7767fn side_hist_len(config: &Config) -> usize {
7768    config.side_hist_len.unwrap_or(config.hist_len).max(1)
7769}
7770
7771fn gc_hist_len(config: &Config) -> usize {
7772    config.gc_bins.unwrap_or(101).max(1)
7773}
7774
7775fn increment_length_hist(hist: &mut ReadDepthHistogram, read_len: usize) {
7776    let idx = read_len.min(hist.reads.len().saturating_sub(1));
7777    hist.reads[idx] += 1;
7778    hist.bases[idx] += read_len as u64;
7779}
7780
7781fn increment_read_local_side_hists(
7782    config: &Config,
7783    hist: &mut ReadLocalSideHistograms,
7784    record: &SequenceRecord,
7785    second: bool,
7786) {
7787    if let Some(quality) = hist.quality.as_mut() {
7788        if second {
7789            quality.paired = true;
7790        }
7791        increment_quality_side_hists(config, quality, record, second);
7792    }
7793    if let Some(length) = hist.length.as_mut() {
7794        increment_length_hist(length, record.len());
7795    }
7796    if let Some(gc) = hist.gc.as_mut() {
7797        increment_gc_hist(gc, record);
7798    }
7799    if let Some(base) = hist.base.as_mut() {
7800        if second {
7801            increment_base_content_hist(&mut base.second, record);
7802        } else {
7803            increment_base_content_hist(&mut base.first, record);
7804        }
7805    }
7806    if let Some(entropy) = hist.entropy.as_mut() {
7807        increment_entropy_hist(config, entropy, record);
7808    }
7809    if let Some(identity) = hist.identity.as_mut() {
7810        increment_sequence_identity_hist(identity, record);
7811    }
7812    if let Some(alignment) = hist.alignment.as_mut() {
7813        increment_alignment_fallback_hists(config, alignment, record, second);
7814    }
7815}
7816
7817fn increment_gc_hist(hist: &mut ReadDepthHistogram, record: &SequenceRecord) {
7818    let mut gc = 0usize;
7819    let mut acgt = 0usize;
7820    for base in &record.bases {
7821        match *base {
7822            b'G' | b'C' | b'g' | b'c' => {
7823                gc += 1;
7824                acgt += 1;
7825            }
7826            b'A' | b'T' | b'U' | b'a' | b't' | b'u' => acgt += 1,
7827            _ => {}
7828        }
7829    }
7830    let idx = if acgt == 0 {
7831        0
7832    } else {
7833        ((gc * hist.reads.len()) / acgt).min(hist.reads.len().saturating_sub(1))
7834    };
7835    hist.reads[idx] += 1;
7836    hist.bases[idx] += record.len() as u64;
7837}
7838
7839fn increment_quality_side_hists(
7840    config: &Config,
7841    hist: &mut QualitySideHistograms,
7842    record: &SequenceRecord,
7843    second: bool,
7844) {
7845    if record.is_empty() {
7846        return;
7847    }
7848
7849    let quality_len = hist.overall.len();
7850    let last_quality_idx = quality_len.saturating_sub(1);
7851    let (counts, avg_counts, by_pos) = if second {
7852        (
7853            &mut hist.second_counts,
7854            &mut hist.second_avg,
7855            &mut hist.second_by_pos,
7856        )
7857    } else {
7858        (
7859            &mut hist.first_counts,
7860            &mut hist.first_avg,
7861            &mut hist.first_by_pos,
7862        )
7863    };
7864
7865    let mut sum = 0usize;
7866    for idx in 0..record.len() {
7867        let quality = record_quality_at(config, record, idx).min(last_quality_idx);
7868        hist.overall[quality] += 1;
7869        counts[quality] += 1;
7870        sum += quality;
7871        if idx < by_pos.len() {
7872            by_pos[idx][quality] += 1;
7873        }
7874    }
7875
7876    let avg = ((sum as f64) / (record.len() as f64)).round() as usize;
7877    avg_counts[avg.min(last_quality_idx)] += 1;
7878}
7879
7880fn record_quality_at(config: &Config, record: &SequenceRecord, idx: usize) -> usize {
7881    record
7882        .qualities
7883        .as_ref()
7884        .and_then(|qualities| qualities.get(idx))
7885        .map_or(config.fake_quality as usize, |quality| {
7886            quality.saturating_sub(33) as usize
7887        })
7888}
7889
7890fn increment_base_content_hist(hist: &mut [BaseCounts], record: &SequenceRecord) {
7891    for (idx, base) in record.bases.iter().copied().enumerate().take(hist.len()) {
7892        let counts = &mut hist[idx];
7893        match base {
7894            b'A' | b'a' => counts.a += 1,
7895            b'C' | b'c' => counts.c += 1,
7896            b'G' | b'g' => counts.g += 1,
7897            b'T' | b't' | b'U' | b'u' => counts.t += 1,
7898            _ => counts.n += 1,
7899        }
7900    }
7901}
7902
7903fn increment_entropy_hist(config: &Config, hist: &mut [u64], record: &SequenceRecord) {
7904    if record.is_empty() {
7905        return;
7906    }
7907    if let Some(entropy) = read_entropy(config, &record.bases) {
7908        let bins = hist.len().saturating_sub(1);
7909        let idx = ((entropy * hist.len() as f64) as usize).min(bins);
7910        hist[idx] += 1;
7911    }
7912}
7913
7914fn increment_sequence_identity_hist(hist: &mut ReadDepthHistogram, record: &SequenceRecord) {
7915    let idx = hist.reads.len().saturating_sub(1);
7916    hist.reads[idx] += 1;
7917    hist.bases[idx] += record.len() as u64;
7918}
7919
7920fn increment_barcode_stats(
7921    barcodes: &mut BTreeMap<String, u64>,
7922    record: &SequenceRecord,
7923    paired: bool,
7924) {
7925    let barcode = header_to_barcode(&record.id).unwrap_or("NONE");
7926    let count = if paired { 2 } else { 1 };
7927    *barcodes.entry(barcode.to_string()).or_insert(0) += count;
7928}
7929
/// Extract the barcode from an Illumina-style read header: the text after the
/// last ':', but only when that colon comes after any ' ' or '/' in the id
/// and something actually follows it. The barcode ends at the first space or
/// tab after the colon.
fn header_to_barcode(id: &str) -> Option<&str> {
    let colon = id.rfind(':')?;
    // Rightmost of: first space, first slash (-1 when neither exists).
    let delimiter = [' ', '/']
        .iter()
        .filter_map(|&c| id.find(c))
        .map(|idx| idx as isize)
        .max()
        .unwrap_or(-1);
    if (colon as isize) <= delimiter || colon >= id.len().saturating_sub(1) {
        return None;
    }
    let start = colon + 1;
    let end = id[start..]
        .find([' ', '\t'])
        .map(|offset| start + offset)
        .unwrap_or(id.len());
    Some(&id[start..end])
}
7946
7947fn increment_alignment_fallback_hists(
7948    config: &Config,
7949    hist: &mut AlignmentFallbackHistograms,
7950    record: &SequenceRecord,
7951    second: bool,
7952) {
7953    hist.read_count += 1;
7954    hist.base_count += record.len() as u64;
7955    if second {
7956        hist.paired = true;
7957        hist.pair_count += 1;
7958    }
7959
7960    let match_hist = if second {
7961        &mut hist.second_match
7962    } else {
7963        &mut hist.first_match
7964    };
7965    for (idx, base) in record
7966        .bases
7967        .iter()
7968        .copied()
7969        .enumerate()
7970        .take(match_hist.len())
7971    {
7972        if is_acgt(base) {
7973            match_hist[idx].matches += 1;
7974        } else {
7975            match_hist[idx].n += 1;
7976        }
7977    }
7978
7979    for idx in 0..record.len() {
7980        let quality =
7981            record_quality_at(config, record, idx).min(hist.quality_match.len().saturating_sub(1));
7982        hist.quality_match[quality] += 1;
7983    }
7984}
7985
7986fn read_entropy(config: &Config, bases: &[u8]) -> Option<f64> {
7987    let k = config.entropy_k.clamp(1, 15);
7988    if bases.len() < k {
7989        return base_entropy(config, bases);
7990    }
7991
7992    let window = config.entropy_window.max(k).min(bases.len());
7993    let mut sum = 0.0;
7994    let mut count = 0usize;
7995    for start in 0..=bases.len() - window {
7996        if let Some(entropy) = window_kmer_entropy(config, &bases[start..start + window], k) {
7997            sum += entropy;
7998            count += 1;
7999        }
8000    }
8001
8002    if count == 0 {
8003        None
8004    } else {
8005        Some((sum / count as f64).clamp(0.0, 1.0))
8006    }
8007}
8008
8009fn window_kmer_entropy(config: &Config, window: &[u8], k: usize) -> Option<f64> {
8010    if window.len() < k {
8011        return base_entropy(config, window);
8012    }
8013
8014    let mut counts: FxHashMap<Vec<u8>, u64> = FxHashMap::default();
8015    let mut total = 0u64;
8016    for kmer in window.windows(k) {
8017        if !config.allow_entropy_ns && kmer.iter().any(|base| !is_acgt(*base)) {
8018            continue;
8019        }
8020        let key: Vec<u8> = kmer
8021            .iter()
8022            .copied()
8023            .map(|base| match base.to_ascii_uppercase() {
8024                b'A' | b'C' | b'G' | b'T' => base.to_ascii_uppercase(),
8025                _ => b'N',
8026            })
8027            .collect();
8028        *counts.entry(key).or_insert(0) += 1;
8029        total += 1;
8030    }
8031
8032    if total == 0 {
8033        return None;
8034    }
8035    let entropy = shannon_entropy(counts.values().copied(), total);
8036    let max_entropy = (total as f64).ln();
8037    Some(if max_entropy > 0.0 {
8038        entropy / max_entropy
8039    } else {
8040        0.0
8041    })
8042}
8043
8044fn base_entropy(config: &Config, bases: &[u8]) -> Option<f64> {
8045    let mut counts = [0u64; 5];
8046    let mut total = 0u64;
8047    for base in bases {
8048        let idx = match base.to_ascii_uppercase() {
8049            b'A' => Some(0),
8050            b'C' => Some(1),
8051            b'G' => Some(2),
8052            b'T' | b'U' => Some(3),
8053            _ if config.allow_entropy_ns => Some(4),
8054            _ => None,
8055        };
8056        if let Some(idx) = idx {
8057            counts[idx] += 1;
8058            total += 1;
8059        }
8060    }
8061    if total == 0 {
8062        return None;
8063    }
8064    let entropy = shannon_entropy(counts, total);
8065    let nonzero = counts.into_iter().filter(|count| *count > 0).count();
8066    let max_entropy = (nonzero.max(1) as f64).ln();
8067    Some(if max_entropy > 0.0 {
8068        entropy / max_entropy
8069    } else {
8070        0.0
8071    })
8072}
8073
/// Shannon entropy (natural log) of a discrete distribution given as raw
/// counts alongside their precomputed total; zero counts contribute nothing.
fn shannon_entropy(counts: impl IntoIterator<Item = u64>, total: u64) -> f64 {
    let denominator = total as f64;
    let mut entropy = 0.0;
    for count in counts {
        if count == 0 {
            continue;
        }
        let p = count as f64 / denominator;
        entropy -= p * p.ln();
    }
    entropy
}
8085
/// True for an unambiguous nucleotide: upper- or lowercase A, C, G, or T.
fn is_acgt(base: u8) -> bool {
    matches!(base.to_ascii_uppercase(), b'A' | b'C' | b'G' | b'T')
}
8089
8090fn analyze_pair(
8091    config: &Config,
8092    counts: &dyn CountLookup,
8093    r1: &SequenceRecord,
8094    r2: Option<&SequenceRecord>,
8095) -> PairAnalysis {
8096    let (read1, read2) = match r2 {
8097        Some(record) if r1.len() + record.len() >= PAIRED_ANALYSIS_JOIN_MIN_BASES => {
8098            let (read1, read2) = rayon::join(
8099                || analyze_read(config, counts, r1),
8100                || analyze_read(config, counts, record),
8101            );
8102            (read1, Some(read2))
8103        }
8104        Some(record) => (
8105            analyze_read(config, counts, r1),
8106            Some(analyze_read(config, counts, record)),
8107        ),
8108        None => (analyze_read(config, counts, r1), None),
8109    };
8110    pair_analysis_from_reads(config, read1, read2)
8111}
8112
8113fn analyze_pair_for_two_configs(
8114    config: &Config,
8115    other_config: &Config,
8116    counts: &dyn CountLookup,
8117    r1: &SequenceRecord,
8118    r2: Option<&SequenceRecord>,
8119) -> (PairAnalysis, PairAnalysis) {
8120    if !can_share_read_coverage(config, other_config) {
8121        return (
8122            analyze_pair(config, counts, r1, r2),
8123            analyze_pair(other_config, counts, r1, r2),
8124        );
8125    }
8126
8127    let ((read1, other_read1), read2_pair) = match r2 {
8128        Some(record) if r1.len() + record.len() >= PAIRED_ANALYSIS_JOIN_MIN_BASES => {
8129            let (first, second) = rayon::join(
8130                || analyze_read_for_two_configs(config, other_config, counts, r1),
8131                || analyze_read_for_two_configs(config, other_config, counts, record),
8132            );
8133            (first, Some(second))
8134        }
8135        Some(record) => (
8136            analyze_read_for_two_configs(config, other_config, counts, r1),
8137            Some(analyze_read_for_two_configs(
8138                config,
8139                other_config,
8140                counts,
8141                record,
8142            )),
8143        ),
8144        None => (
8145            analyze_read_for_two_configs(config, other_config, counts, r1),
8146            None,
8147        ),
8148    };
8149    let (read2, other_read2) = read2_pair
8150        .map(|(read, other_read)| (Some(read), Some(other_read)))
8151        .unwrap_or((None, None));
8152    (
8153        pair_analysis_from_reads(config, read1, read2),
8154        pair_analysis_from_reads(other_config, other_read1, other_read2),
8155    )
8156}
8157
8158fn can_share_read_coverage(config: &Config, other_config: &Config) -> bool {
8159    config.k == other_config.k
8160        && (config.canonical || config.k <= 31) == (other_config.canonical || other_config.k <= 31)
8161        && config.fix_spikes == other_config.fix_spikes
8162}
8163
8164fn pair_analysis_from_reads(
8165    config: &Config,
8166    read1: ReadAnalysis,
8167    read2: Option<ReadAnalysis>,
8168) -> PairAnalysis {
8169    let depth_proxy_al = match (&read2, config.use_lower_depth) {
8170        (Some(read2), true) => min_option(read1.depth_al, read2.depth_al),
8171        (Some(read2), false) => max_option(read1.depth_al, read2.depth_al),
8172        (None, _) => read1.depth_al,
8173    };
8174    let max_true_depth = match &read2 {
8175        Some(read2) => max_option(read1.true_depth, read2.true_depth),
8176        None => read1.true_depth,
8177    };
8178    let low_kmer_count =
8179        read1.low_kmer_count + read2.as_ref().map(|read| read.low_kmer_count).unwrap_or(0);
8180    let total_kmer_count = read1.total_kmer_count
8181        + read2
8182            .as_ref()
8183            .map(|read| read.total_kmer_count)
8184            .unwrap_or(0);
8185    PairAnalysis {
8186        error1: read1.error,
8187        error2: read2.as_ref().is_some_and(|read| read.error),
8188        read1,
8189        read2,
8190        depth_proxy_al,
8191        max_true_depth,
8192        low_kmer_count,
8193        total_kmer_count,
8194    }
8195}
8196
8197fn analyze_read(
8198    config: &Config,
8199    counts: &dyn CountLookup,
8200    record: &SequenceRecord,
8201) -> ReadAnalysis {
8202    let coverage = read_coverage_desc(config, counts, record);
8203    analyze_read_from_coverage(config, coverage.coverage_desc, coverage.had_kmer_windows)
8204}
8205
8206fn analyze_read_for_two_configs(
8207    config: &Config,
8208    other_config: &Config,
8209    counts: &dyn CountLookup,
8210    record: &SequenceRecord,
8211) -> (ReadAnalysis, ReadAnalysis) {
8212    let coverage = read_coverage_desc(config, counts, record);
8213    let other_coverage = coverage.coverage_desc.clone();
8214    (
8215        analyze_read_from_coverage(config, coverage.coverage_desc, coverage.had_kmer_windows),
8216        analyze_read_from_coverage(other_config, other_coverage, coverage.had_kmer_windows),
8217    )
8218}
8219
/// Per-read k-mer coverage profile consumed by the read analyzers.
struct ReadCoverageDesc {
    // Per-window depths sorted in descending order; windows containing
    // invalid bases are recorded as -1 (and thus sort to the end).
    coverage_desc: Vec<i64>,
    // True when the read was long enough (>= k) to produce k-mer windows.
    had_kmer_windows: bool,
}
8224
8225fn read_coverage_desc(
8226    config: &Config,
8227    counts: &dyn CountLookup,
8228    record: &SequenceRecord,
8229) -> ReadCoverageDesc {
8230    let windows = unfiltered_kmer_windows_for_record(record, config);
8231    let mut coverage: Vec<i64> = windows
8232        .iter()
8233        .map(|window| match window {
8234            Some(kmer) => u64_to_i64_saturating(counts.depth(kmer)),
8235            None => -1,
8236        })
8237        .collect();
8238    if coverage.is_empty() {
8239        return ReadCoverageDesc {
8240            coverage_desc: coverage,
8241            had_kmer_windows: record.len() >= config.k,
8242        };
8243    }
8244    if config.fix_spikes {
8245        fix_spikes(&mut coverage, &windows, counts, config.k);
8246    }
8247    if coverage.len() >= COVERAGE_PAR_SORT_MIN_WINDOWS {
8248        coverage.par_sort_unstable_by(|a, b| b.cmp(a));
8249    } else {
8250        coverage.sort_unstable_by(|a, b| b.cmp(a));
8251    }
8252    ReadCoverageDesc {
8253        coverage_desc: coverage,
8254        had_kmer_windows: true,
8255    }
8256}
8257
/// Summarize a descending-sorted per-window coverage profile into a
/// `ReadAnalysis`: depth estimates, the error flag, and low-k-mer counts.
///
/// `coverage` must be sorted high-to-low (see `read_coverage_desc`); invalid
/// windows are represented as -1 and therefore sit at the end.
fn analyze_read_from_coverage(
    config: &Config,
    coverage: Vec<i64>,
    had_kmer_windows: bool,
) -> ReadAnalysis {
    if coverage.is_empty() {
        // No windows: everything defaults except the "was long enough" flag.
        return ReadAnalysis {
            had_kmer_windows,
            ..ReadAnalysis::default()
        };
    }
    let cov_last = coverage.len() - 1;
    // Percentile depths over the sorted profile (index 0 = deepest window).
    let high = coverage[percentile_index(cov_last, config.high_percentile)];
    let low = coverage[percentile_index(cov_last, config.low_percentile)];
    let true_depth = coverage[percentile_index(cov_last, config.depth_percentile)];
    let min_true_depth = low;
    // A window must be at least this deep to count toward the depth proxy:
    // the configured floor, or the high depth scaled down by the error ratio.
    let min_depth = u64_to_i64_saturating(config.min_depth)
        .max(high / u64_to_i64_saturating(config.error_detect_ratio));

    // Scan backwards past the shallow tail; `above_limit` ends at the last
    // index whose depth reaches `min_depth` (-1 when none does).
    let mut above_limit = cov_last as isize;
    while above_limit >= 0 && coverage[above_limit as usize] < min_depth {
        above_limit -= 1;
    }

    // Depth proxy: a percentile taken only over the windows above the limit,
    // provided enough windows qualify (or the requirement exceeds the number
    // of windows the read could ever have produced).
    let depth_al = if above_limit >= 0
        && ((above_limit as usize + 1) >= config.min_kmers_over_min_depth
            || config.min_kmers_over_min_depth > coverage.len())
    {
        let idx = ((above_limit as f64) * (1.0 - config.depth_percentile)) as usize;
        non_negative_depth(coverage[idx])
    } else {
        None
    };

    let low_thresh = u64_to_i64_saturating(config.low_thresh);
    let high_thresh = u64_to_i64_saturating(config.high_thresh);
    let error_detect_ratio = u64_to_i64_saturating(config.error_detect_ratio);
    // A read is flagged as erroneous when its profile is uniformly shallow,
    // spans from very deep to very shallow, or is internally inconsistent by
    // more than the error-detect ratio.
    let error = high <= low_thresh
        || (high >= high_thresh && low <= low_thresh)
        || high >= low.saturating_mul(error_detect_ratio);
    let low_kmer_count =
        low_kmer_count(&coverage, low_thresh, high_thresh, high, error_detect_ratio);

    ReadAnalysis {
        depth_al,
        true_depth: non_negative_depth(true_depth),
        min_true_depth: non_negative_depth(min_true_depth),
        low_kmer_count,
        total_kmer_count: coverage.len(),
        error,
        had_kmer_windows: true,
        coverage_desc: coverage,
    }
}
8312
/// Count "low depth" k-mers in a descending-sorted coverage profile.
///
/// Returns the full length when even the deepest window is at or below
/// `low_thresh`, and 0 unless the read's high depth reaches `high_thresh`.
/// Otherwise counts the trailing run of depths at or below
/// min(low_thresh, high_depth / error_detect_ratio).
fn low_kmer_count(
    coverage_desc: &[i64],
    low_thresh: i64,
    high_thresh: i64,
    high_depth: i64,
    error_detect_ratio: i64,
) -> usize {
    let Some(&deepest) = coverage_desc.first() else {
        return 0;
    };
    if deepest <= low_thresh {
        return coverage_desc.len();
    }
    if high_depth < high_thresh {
        return 0;
    }
    // max(1) guards against a zero ratio causing division by zero.
    let limit = low_thresh.min(high_depth / error_detect_ratio.max(1));
    let mut low = 0usize;
    for &depth in coverage_desc.iter().rev() {
        if depth > limit {
            break;
        }
        low += 1;
    }
    low
}
8336
8337fn correct_pair_errors(
8338    config: &Config,
8339    counts: &dyn CountLookup,
8340    r1: &mut SequenceRecord,
8341    r2: Option<&mut SequenceRecord>,
8342) -> CorrectionResult {
8343    let mut result = CorrectionResult::default();
8344    let mut r2 = r2;
8345    if config.overlap_error_correct
8346        && !config.mark_errors_only
8347        && let Some(mate) = r2.as_deref_mut()
8348    {
8349        let overlap = correct_pair_by_overlap(config, r1, mate);
8350        result.corrected += overlap.corrected;
8351        result.marked += overlap.marked;
8352        result.uncorrectable |= overlap.uncorrectable;
8353    }
8354
8355    let read_result = correct_read_errors(config, counts, r1);
8356    result.corrected += read_result.corrected;
8357    result.marked += read_result.marked;
8358    result.uncorrectable |= read_result.uncorrectable;
8359    if let Some(mate) = r2 {
8360        let mate_result = correct_read_errors(config, counts, mate);
8361        result.corrected += mate_result.corrected;
8362        result.marked += mate_result.marked;
8363        result.uncorrectable |= mate_result.uncorrectable;
8364    }
8365    result
8366}
8367
8368fn correct_pair_errors_with_rollback(
8369    config: &Config,
8370    counts: &dyn CountLookup,
8371    r1: &mut SequenceRecord,
8372    mut r2: Option<&mut SequenceRecord>,
8373) -> CorrectionResult {
8374    let rollback =
8375        (!config.mark_uncorrectable_errors).then(|| (r1.clone(), r2.as_deref().cloned()));
8376    let correction = correct_pair_errors(config, counts, r1, r2.as_deref_mut());
8377    if correction.uncorrectable
8378        && let Some((original_r1, original_r2)) = rollback
8379    {
8380        *r1 = original_r1;
8381        if let (Some(mate), Some(original)) = (r2, original_r2) {
8382            *mate = original;
8383        }
8384    }
8385    correction
8386}
8387
8388fn correct_pair_by_overlap(
8389    config: &Config,
8390    r1: &mut SequenceRecord,
8391    r2: &mut SequenceRecord,
8392) -> CorrectionResult {
8393    let Some(overlap) = best_pair_overlap(r1, r2) else {
8394        return CorrectionResult::default();
8395    };
8396    if overlap_expected_mismatch_rejects(r1, r2, &overlap) {
8397        return CorrectionResult::default();
8398    }
8399    if overlap_probability_rejects(r1, r2, &overlap) {
8400        return CorrectionResult::default();
8401    }
8402    let mut corrected = 0usize;
8403
8404    for pair in overlap.pairs {
8405        let b1 = r1.bases[pair.r1_index].to_ascii_uppercase();
8406        let b2 = complement_base(r2.bases[pair.r2_index]).to_ascii_uppercase();
8407        let q1 = base_quality(r1, pair.r1_index);
8408        let q2 = base_quality(r2, pair.r2_index);
8409        let Some((merged_base, merged_quality)) =
8410            overlap_consensus_base_and_quality(config, b1, b2, q1, q2)
8411        else {
8412            continue;
8413        };
8414
8415        let merged_r2_base = complement_base(merged_base);
8416        if r1.bases[pair.r1_index] != merged_base || r2.bases[pair.r2_index] != merged_r2_base {
8417            corrected += 1;
8418        }
8419
8420        r1.bases[pair.r1_index] = merged_base;
8421        r2.bases[pair.r2_index] = merged_r2_base;
8422
8423        if config.change_quality
8424            && let (Some(r1_qualities), Some(r2_qualities)) =
8425                (r1.qualities.as_mut(), r2.qualities.as_mut())
8426        {
8427            let merged_ascii = merged_quality.saturating_add(33);
8428            r1_qualities[pair.r1_index] = merged_ascii;
8429            r2_qualities[pair.r2_index] = merged_ascii;
8430        }
8431    }
8432
8433    CorrectionResult {
8434        corrected,
8435        ..CorrectionResult::default()
8436    }
8437}
8438
8439fn overlap_expected_mismatch_rejects(
8440    r1: &SequenceRecord,
8441    r2: &SequenceRecord,
8442    overlap: &PairOverlap,
8443) -> bool {
8444    let (Some(q1), Some(q2)) = (r1.qualities.as_ref(), r2.qualities.as_ref()) else {
8445        return false;
8446    };
8447
8448    let mut expected = 0.0f64;
8449    for pair in &overlap.pairs {
8450        let b1 = r1.bases[pair.r1_index].to_ascii_uppercase();
8451        let b2 = complement_base(r2.bases[pair.r2_index]).to_ascii_uppercase();
8452        if !is_defined_base(b1) || !is_defined_base(b2) {
8453            continue;
8454        }
8455        let p1 = 1.0 - phred_error_probability(q1[pair.r1_index].saturating_sub(33));
8456        let p2 = 1.0 - phred_error_probability(q2[pair.r2_index].saturating_sub(33));
8457        expected += 1.0 - (p1 * p2);
8458    }
8459
8460    (expected + 0.05) * 4.0 < overlap.mismatches as f64
8461}
8462
8463fn overlap_probability_rejects(
8464    r1: &SequenceRecord,
8465    r2: &SequenceRecord,
8466    overlap: &PairOverlap,
8467) -> bool {
8468    const MIN_PROBABILITY: f64 = 0.0008;
8469
8470    let (Some(q1), Some(q2)) = (r1.qualities.as_ref(), r2.qualities.as_ref()) else {
8471        return false;
8472    };
8473
8474    let mut ln_actual = 0.0f64;
8475    let mut ln_common = 0.0f64;
8476    let mut measured = 0usize;
8477
8478    for pair in &overlap.pairs {
8479        let b1 = r1.bases[pair.r1_index].to_ascii_uppercase();
8480        let b2 = complement_base(r2.bases[pair.r2_index]).to_ascii_uppercase();
8481        if !is_defined_base(b1) || !is_defined_base(b2) {
8482            continue;
8483        }
8484
8485        let prob_correct = overlap_correctness_probability_v4(q1[pair.r1_index])
8486            * overlap_correctness_probability_v4(q2[pair.r2_index]);
8487        let prob_match = prob_correct + (1.0 - prob_correct) * 0.25;
8488        let prob_error = 1.0 - prob_match;
8489
8490        ln_common += prob_match.max(prob_error).ln();
8491        ln_actual += if b1 == b2 { prob_match } else { prob_error }.ln();
8492        measured += 1;
8493    }
8494
8495    if measured == 0 {
8496        return false;
8497    }
8498
8499    0.5 * (ln_actual - ln_common) < MIN_PROBABILITY.ln()
8500}
8501
8502fn overlap_consensus_base_and_quality(
8503    config: &Config,
8504    r1_base: u8,
8505    r2_base: u8,
8506    q1: u8,
8507    q2: u8,
8508) -> Option<(u8, u8)> {
8509    const MAX_MERGE_QUALITY: u8 = 50;
8510
8511    if !is_defined_base(r1_base) && !is_defined_base(r2_base) {
8512        return None;
8513    }
8514    if !is_defined_base(r1_base) {
8515        return Some((r2_base, q2));
8516    }
8517    if !is_defined_base(r2_base) {
8518        return Some((r1_base, q1));
8519    }
8520    if r1_base == r2_base {
8521        let merged_quality = q1
8522            .max(q2)
8523            .saturating_add(q1.min(q2) / 4)
8524            .min(MAX_MERGE_QUALITY);
8525        return Some((r1_base, merged_quality));
8526    }
8527    if q1 == q2 {
8528        return Some((b'N', 0));
8529    }
8530    if q1 > q2 {
8531        if q2 > config.max_quality_to_correct {
8532            return None;
8533        }
8534        return Some((r1_base, q1.saturating_sub(q2)));
8535    }
8536    if q1 > config.max_quality_to_correct {
8537        return None;
8538    }
8539    Some((r2_base, q2.saturating_sub(q1)))
8540}
8541
8542fn overlap_entropy_min_overlap(bases: &[u8]) -> usize {
8543    overlap_entropy_min_overlap_side(bases.iter().copied()).max(overlap_entropy_min_overlap_side(
8544        bases.iter().rev().copied(),
8545    ))
8546}
8547
/// Minimum overlap demanded when scanning a read from one end, based on
/// 3-mer complexity: walk bases until the accumulated diversity score
/// (4 per distinct 3-mer, plus 1 once a 3-mer is seen a second time) reaches
/// MIN_SCORE, then return how many bases were consumed before that point.
/// Low-complexity sequence therefore demands a longer overlap; if the score
/// is never reached the result exceeds the scanned length, which disables
/// overlapping for that read.
fn overlap_entropy_min_overlap_side(bases: impl IntoIterator<Item = u8>) -> usize {
    const K: usize = 3;
    const MASK: usize = (1 << (2 * K)) - 1;
    const MIN_SCORE: usize = 42;

    // counts: occurrences per 2-bit-encoded 3-mer; kmer: rolling code;
    // len: valid bases since the last ambiguous one; ones/twos: number of
    // 3-mers seen at least once / at least twice; seen: bases consumed.
    let mut counts = [0u16; 1 << (2 * K)];
    let mut kmer = 0usize;
    let mut len = 0usize;
    let mut ones = 0usize;
    let mut twos = 0usize;
    let mut seen = 0usize;

    for base in bases {
        let Some(bits) = base_to_two_bit(base) else {
            // Ambiguous base: restart the rolling k-mer (counts are kept).
            len = 0;
            kmer = 0;
            seen += 1;
            continue;
        };
        len += 1;
        kmer = ((kmer << 2) | bits) & MASK;
        if len >= K {
            counts[kmer] = counts[kmer].saturating_add(1);
            if counts[kmer] == 1 {
                ones += 1;
            } else if counts[kmer] == 2 {
                twos += 1;
            }
            if ones * 4 + twos >= MIN_SCORE {
                return seen;
            }
        }
        seen += 1;
    }

    // Score never reached: demand more bases than the sequence has.
    seen + 1
}
8585
/// 2-bit encoding of an unambiguous nucleotide (A=0, C=1, G=2, T=3),
/// case-insensitive; `None` for anything else.
fn base_to_two_bit(base: u8) -> Option<usize> {
    const CODES: &[(u8, usize)] = &[(b'A', 0), (b'C', 1), (b'G', 2), (b'T', 3)];
    let upper = base.to_ascii_uppercase();
    CODES
        .iter()
        .find(|&&(letter, _)| letter == upper)
        .map(|&(_, code)| code)
}
8595
/// One aligned position within a pair overlap, indexing into each read's own
/// (unreversed) base vector.
#[derive(Debug, Clone, Copy)]
struct OverlapBasePair {
    // Index into r1's bases.
    r1_index: usize,
    // Index into r2's bases; r2 is compared via its reverse complement.
    r2_index: usize,
}
8601
/// The best overlap found between a read pair.
#[derive(Debug, Clone)]
struct PairOverlap {
    // Aligned base positions across the overlap region.
    pairs: Vec<OverlapBasePair>,
    // Number of mismatching positions in the chosen overlap.
    mismatches: usize,
}
8607
/// Hard cap on the mismatch ratio an overlap may have and still be used.
const OVERLAP_MAX_RATIO: f64 = 0.075;
/// A runner-up candidate with a ratio below this makes the best overlap
/// ambiguous.
const OVERLAP_MIN_SECOND_RATIO: f64 = 0.12;
/// Multiplicative margin used when comparing competing overlap candidates.
const OVERLAP_RATIO_MARGIN: f64 = 7.5;
/// Constant added to the mismatch weight before dividing by overlap length,
/// so short overlaps are penalized relative to long ones.
const OVERLAP_RATIO_OFFSET: f64 = 0.55;
/// Probability that a base call is correct, indexed by Phred quality score;
/// qualities past the end of the table reuse the final entry.
const OVERLAP_PROB_CORRECT4: &[f64] = &[
    0.0000, 0.2501, 0.3690, 0.4988, 0.6019, 0.6838, 0.7488, 0.8005, 0.8415, 0.8741, 0.9000, 0.9206,
    0.9369, 0.9499, 0.9602, 0.9684, 0.9749, 0.9800, 0.9842, 0.9874, 0.9900, 0.9921, 0.9937, 0.9950,
    0.9960, 0.9968, 0.9975, 0.9980, 0.9984, 0.9987, 0.9990, 0.9992, 0.9994, 0.9995, 0.9996, 0.9997,
    0.9997, 0.9998, 0.9998, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999,
    0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999,
];
8619
8620fn best_pair_overlap(r1: &SequenceRecord, r2: &SequenceRecord) -> Option<PairOverlap> {
8621    best_pair_overlap_without_qualities(&r1.bases, &r2.bases)
8622}
8623
8624fn overlap_correctness_probability_v4(quality_ascii: u8) -> f64 {
8625    let phred = quality_ascii.saturating_sub(33) as usize;
8626    OVERLAP_PROB_CORRECT4[phred.min(OVERLAP_PROB_CORRECT4.len() - 1)]
8627}
8628
/// Exhaustively test candidate insert sizes to find the best (lowest
/// mismatch-ratio) overlap between a pair, without using base qualities.
///
/// Returns `None` when either read is too short (< 35 bases) or too
/// low-complexity, when no candidate is good enough, or when competing
/// candidates make the choice ambiguous.
fn best_pair_overlap_without_qualities(r1: &[u8], r2: &[u8]) -> Option<PairOverlap> {
    if r1.is_empty() || r2.is_empty() {
        return None;
    }
    if r1.len().min(r2.len()) < 35 {
        return None;
    }
    // Low-complexity ends demand a longer minimum overlap (floor of 11).
    let min_overlap = 11usize
        .max(overlap_entropy_min_overlap(r1))
        .max(overlap_entropy_min_overlap(r2));
    let min_length = r1.len().min(r2.len());
    if min_overlap > min_length {
        return None;
    }

    // First pass: cheap scan for the best achievable mismatch ratio, used to
    // tighten the abandon-early limits of the main scan below.
    let best_ratio_cap = find_best_overlap_ratio_without_qualities(r1, r2, min_overlap);
    if best_ratio_cap > OVERLAP_MAX_RATIO {
        return None;
    }
    let max_ratio = best_ratio_cap.min(OVERLAP_MAX_RATIO);
    let margin2 = (OVERLAP_RATIO_MARGIN + OVERLAP_RATIO_OFFSET) / min_length as f64;
    let mut best_insert: Option<usize> = None;
    let mut best_overlap = 0usize;
    let mut best_bad = min_length as f64;
    let mut best_good = 0.0f64;
    let mut best_ratio = 1.0f64;
    let mut best_mismatches = 0usize;
    let mut second_best_ratio = 1.0f64;
    let mut ambig = false;

    // Scan candidate insert sizes from longest to shortest.
    let largest_insert_to_test = r1.len() + r2.len() - 5;
    for insert in (25..=largest_insert_to_test).rev() {
        // Offsets of the overlap region: start1 into r1, start2 into the
        // reverse-complement coordinates of r2, for this insert size.
        let start1 = if insert <= r2.len() {
            0
        } else {
            insert - r2.len()
        };
        let start2 = if insert >= r2.len() {
            0
        } else {
            r2.len() - insert
        };
        let overlap = (r1.len() - start1).min(r2.len() - start2).min(insert);
        if overlap < 5 {
            continue;
        }

        // Abandon a candidate once its mismatch weight exceeds what the
        // current best ratio could tolerate.
        let bad_limit =
            1.2 * best_ratio.min(max_ratio) * OVERLAP_RATIO_MARGIN * overlap as f64 + 1.0;
        let mut good = 0.0f64;
        let mut bad = 0.0f64;
        let mut mismatches = 0usize;

        for step in 0..overlap {
            let r1_index = start1 + step;
            let r2_rc_index = start2 + step;
            // Map the reverse-complement coordinate back into r2.
            let r2_index = r2.len() - 1 - r2_rc_index;
            let b1 = r1[r1_index].to_ascii_uppercase();
            let b2 = complement_base(r2[r2_index]).to_ascii_uppercase();
            if b1 == b2 {
                // N-N agreements carry no weight.
                if b1 != b'N' {
                    good += 0.95;
                }
            } else {
                bad += 0.95;
                mismatches += 1;
                if bad > bad_limit {
                    break;
                }
            }
        }

        if bad > bad_limit {
            continue;
        }
        // A short perfect overlap below the complexity-derived minimum is
        // suspicious: give up rather than risk a spurious join.
        if bad == 0.0 && good > 5.0 && good < min_overlap as f64 {
            return None;
        }

        let ratio = (bad + OVERLAP_RATIO_OFFSET) / overlap as f64;
        if ratio < best_ratio * OVERLAP_RATIO_MARGIN {
            // A candidate this close to the best marks the result ambiguous
            // unless the winner is clearly separated.
            ambig = ratio * OVERLAP_RATIO_MARGIN >= best_ratio || good < min_overlap as f64;

            if ratio < best_ratio {
                second_best_ratio = best_ratio;
                best_insert = Some(insert);
                best_overlap = overlap;
                best_bad = bad;
                best_good = good;
                best_ratio = ratio;
                best_mismatches = mismatches;
            } else if ratio < second_best_ratio {
                second_best_ratio = ratio;
            }

            if (ambig && best_ratio < margin2) || second_best_ratio < OVERLAP_MIN_SECOND_RATIO {
                return None;
            }
        }
    }

    if second_best_ratio < OVERLAP_MIN_SECOND_RATIO {
        ambig = true;
    }
    if !ambig && best_ratio > max_ratio {
        return None;
    }

    let insert = best_insert?;
    // Recompute the winning overlap's coordinates and materialize the
    // aligned index pairs.
    let start1 = if insert <= r2.len() {
        0
    } else {
        insert - r2.len()
    };
    let start2 = if insert >= r2.len() {
        0
    } else {
        r2.len() - insert
    };
    let mut pairs = Vec::with_capacity(best_overlap);
    for step in 0..best_overlap {
        let r1_index = start1 + step;
        let r2_rc_index = start2 + step;
        let r2_index = r2.len() - 1 - r2_rc_index;
        pairs.push(OverlapBasePair { r1_index, r2_index });
    }

    // best_bad/best_good are tracked alongside the ratio scan but not
    // otherwise consumed.
    let _ = (best_bad, best_good);
    Some(PairOverlap {
        pairs,
        mismatches: best_mismatches,
    })
}
8762
8763fn find_best_overlap_ratio_without_qualities(r1: &[u8], r2: &[u8], min_overlap: usize) -> f64 {
8764    let mut best_ratio = OVERLAP_MAX_RATIO + 0.0001;
8765    let largest_insert_to_test = r1.len() + r2.len() - min_overlap;
8766
8767    for insert in (35..=largest_insert_to_test).rev() {
8768        let start1 = if insert <= r2.len() {
8769            0
8770        } else {
8771            insert - r2.len()
8772        };
8773        let start2 = if insert >= r2.len() {
8774            0
8775        } else {
8776            r2.len() - insert
8777        };
8778        let overlap = (r1.len() - start1).min(r2.len() - start2).min(insert);
8779        if overlap < min_overlap {
8780            continue;
8781        }
8782
8783        let mut good = 0.0f64;
8784        let mut bad = 0.0f64;
8785        let bad_limit = best_ratio * overlap as f64 + 1.0;
8786
8787        for step in 0..overlap {
8788            let r1_index = start1 + step;
8789            let r2_rc_index = start2 + step;
8790            let r2_index = r2.len() - 1 - r2_rc_index;
8791            let b1 = r1[r1_index].to_ascii_uppercase();
8792            let b2 = complement_base(r2[r2_index]).to_ascii_uppercase();
8793            if b1 == b2 {
8794                if b1 != b'N' {
8795                    good += 0.95;
8796                }
8797            } else {
8798                bad += 0.95;
8799                if bad > bad_limit {
8800                    break;
8801                }
8802            }
8803        }
8804
8805        if bad > bad_limit {
8806            continue;
8807        }
8808        if bad == 0.0 && good > 5.0 && good < min_overlap as f64 {
8809            return 100.0;
8810        }
8811        let ratio = (bad + OVERLAP_RATIO_OFFSET) / overlap as f64;
8812        if ratio < best_ratio {
8813            best_ratio = ratio;
8814            if good >= min_overlap as f64 && ratio < OVERLAP_MAX_RATIO * 0.5 {
8815                return best_ratio;
8816            }
8817        }
8818    }
8819
8820    best_ratio
8821}
8822
/// Attempts k-mer-depth-based error correction on `record`.
///
/// Returns a `CorrectionResult` with the number of corrected/marked bases and
/// an `uncorrectable` flag. When a correction pass fails, the read is rolled
/// back to its state before that pass (optionally with error positions
/// marked).
fn correct_read_errors(
    config: &Config,
    counts: &dyn CountLookup,
    record: &mut SequenceRecord,
) -> CorrectionResult {
    // Correction only applies to short-k counting (k <= 31) and reads that
    // contain at least one full k-mer.
    if config.max_errors_to_correct == 0 || record.len() < config.k || config.k > 31 {
        return CorrectionResult::default();
    }
    let mut coverage = coverage_windows_for_record(config, counts, record);
    // Need more windows than the detection prefix to compare against.
    if coverage.len() <= config.prefix_len.max(1) {
        return CorrectionResult::default();
    }
    // Fast path: no coverage discontinuity means nothing to correct.
    if !has_error_discontinuity(config, &coverage) {
        return CorrectionResult::default();
    }

    if config.mark_errors_only {
        return mark_read_errors(config, record, &coverage);
    }

    // Snapshot so a failed left pass can be fully undone.
    let original_bases = record.bases.clone();
    let original_qualities = record.qualities.clone();
    let mut result = CorrectionResult::default();
    let mut remaining = config.max_errors_to_correct;

    if config.correct_from_left {
        let left = correct_errors_from_left(config, counts, record, &mut coverage, remaining);
        if left.uncorrectable {
            // Roll back every edit made by the failed pass.
            record.bases = original_bases;
            record.qualities = original_qualities;
            if config.mark_uncorrectable_errors {
                result.marked += mark_read_errors(config, record, &coverage).marked;
            }
            result.uncorrectable = true;
            return result;
        }
        remaining = remaining.saturating_sub(left.corrected);
        result.corrected += left.corrected;
    }

    if config.correct_from_right && remaining > 0 {
        // Checkpoint after the left pass: a failed right pass keeps the
        // successful left-pass corrections.
        let checkpoint_bases = record.bases.clone();
        let checkpoint_qualities = record.qualities.clone();
        let right = correct_errors_from_right(config, counts, record, &mut coverage, remaining);
        if right.uncorrectable {
            record.bases = checkpoint_bases;
            record.qualities = checkpoint_qualities;
            if config.mark_uncorrectable_errors {
                result.marked += mark_read_errors(config, record, &coverage).marked;
            }
            result.uncorrectable = true;
            return result;
        }
        result.corrected += right.corrected;
    }

    result
}
8881
/// Left-to-right correction pass: scans coverage windows for sharp drops
/// relative to the preceding prefix run and tries to substitute the
/// implicated base.
///
/// Returns the number of corrections made; `uncorrectable` is set when the
/// error budget is exceeded, a high-quality base is implicated, or a
/// substitution attempt fails.
fn correct_errors_from_left(
    config: &Config,
    counts: &dyn CountLookup,
    record: &mut SequenceRecord,
    coverage: &mut Vec<i64>,
    max_to_correct: usize,
) -> CorrectionResult {
    let mut found = 0usize;
    let mut corrected = 0usize;
    let low = u64_to_i64_saturating(config.error_correct_low_thresh);
    let high = u64_to_i64_saturating(config.error_correct_high_thresh);
    let mult = u64_to_i64_saturating(config.error_correct_ratio);

    for i in config.prefix_len..coverage.len() {
        // Compare this window against the weakest window of the prefix run.
        let a = min_coverage(&coverage[i - config.prefix_len..i]);
        let b = coverage[i];
        if !is_correction_discontinuity(a, b, low, high, mult) {
            continue;
        }
        found += 1;
        // The suspect base sits at the last position of window i.
        let loc = i + config.k - 1;
        // Give up on reads with too many errors, or when a confidently-called
        // base is implicated.
        if found > max_to_correct || base_quality(record, loc) > config.max_quality_to_correct {
            return CorrectionResult {
                corrected,
                uncorrectable: true,
                ..CorrectionResult::default()
            };
        }
        // The replacement's support must land near the pre-drop depth `a`.
        let target_lower = high.max(a / 2);
        let target_upper = a.saturating_mul(2);
        let target = CorrectionTarget {
            low,
            lower_bound: target_lower,
            upper_bound: target_upper,
            mult,
        };
        if try_correct_base(config, counts, record, loc, target) {
            corrected += 1;
            // Recompute depths so later windows see the corrected sequence.
            *coverage = coverage_windows_for_record(config, counts, record);
        } else {
            return CorrectionResult {
                corrected,
                uncorrectable: true,
                ..CorrectionResult::default()
            };
        }
    }

    CorrectionResult {
        corrected,
        ..CorrectionResult::default()
    }
}
8935
/// Right-to-left counterpart of `correct_errors_from_left`: compares each
/// window against the minimum of the suffix run that follows it and corrects
/// the window's first base when a discontinuity is found.
fn correct_errors_from_right(
    config: &Config,
    counts: &dyn CountLookup,
    record: &mut SequenceRecord,
    coverage: &mut Vec<i64>,
    max_to_correct: usize,
) -> CorrectionResult {
    // Not enough windows to form a comparison suffix.
    if coverage.len() <= config.prefix_len {
        return CorrectionResult::default();
    }
    let mut found = 0usize;
    let mut corrected = 0usize;
    let low = u64_to_i64_saturating(config.error_correct_low_thresh);
    let high = u64_to_i64_saturating(config.error_correct_high_thresh);
    let mult = u64_to_i64_saturating(config.error_correct_ratio);
    let start = coverage.len() - config.prefix_len - 1;

    for i in (0..=start).rev() {
        // Compare window i against the weakest window of the following run.
        let a = min_coverage(&coverage[i + 1..=i + config.prefix_len]);
        let b = coverage[i];
        if !is_correction_discontinuity(a, b, low, high, mult) {
            continue;
        }
        found += 1;
        // Scanning right-to-left, the suspect base is the window's first
        // position, i.e. index i itself.
        let loc = i;
        if found > max_to_correct || base_quality(record, loc) > config.max_quality_to_correct {
            return CorrectionResult {
                corrected,
                uncorrectable: true,
                ..CorrectionResult::default()
            };
        }
        // The replacement's support must land near the pre-drop depth `a`.
        let target_lower = high.max(a / 2);
        let target_upper = a.saturating_mul(2);
        let target = CorrectionTarget {
            low,
            lower_bound: target_lower,
            upper_bound: target_upper,
            mult,
        };
        if try_correct_base(config, counts, record, loc, target) {
            corrected += 1;
            // Refresh depths so earlier (leftward) windows see the fix.
            *coverage = coverage_windows_for_record(config, counts, record);
        } else {
            return CorrectionResult {
                corrected,
                uncorrectable: true,
                ..CorrectionResult::default()
            };
        }
    }

    CorrectionResult {
        corrected,
        ..CorrectionResult::default()
    }
}
8993
8994fn try_correct_base(
8995    config: &Config,
8996    counts: &dyn CountLookup,
8997    record: &mut SequenceRecord,
8998    loc: usize,
8999    target: CorrectionTarget,
9000) -> bool {
9001    let original = record.bases[loc];
9002    let mut candidates = [(b'A', 0i64), (b'C', 0), (b'G', 0), (b'T', 0)];
9003    for (base, support) in &mut candidates {
9004        *support = substitution_support(config, counts, record, loc, *base);
9005    }
9006    candidates.sort_by(|left, right| right.1.cmp(&left.1));
9007    let (best_base, best_support) = candidates[0];
9008    let second_best = candidates[1].1;
9009    if best_base == original.to_ascii_uppercase() {
9010        return false;
9011    }
9012    if best_support < target.lower_bound || best_support > target.upper_bound {
9013        return false;
9014    }
9015    if !(second_best <= target.low || second_best.saturating_mul(target.mult) <= best_support) {
9016        return false;
9017    }
9018
9019    record.bases[loc] = best_base;
9020    if !is_defined_base(original)
9021        && let Some(qualities) = record.qualities.as_mut()
9022    {
9023        qualities[loc] = 20u8.saturating_add(33);
9024    }
9025    true
9026}
9027
9028fn substitution_support(
9029    config: &Config,
9030    counts: &dyn CountLookup,
9031    record: &SequenceRecord,
9032    loc: usize,
9033    base: u8,
9034) -> i64 {
9035    let mut candidate = record.clone();
9036    candidate.bases[loc] = base;
9037    let windows = unfiltered_kmer_windows_for_record(&candidate, config);
9038    if windows.is_empty() {
9039        return 0;
9040    }
9041    let first = (loc + 1).saturating_sub(config.k);
9042    let last = loc.min(windows.len() - 1);
9043    let mut support = i64::MAX;
9044    for window in windows.iter().take(last + 1).skip(first) {
9045        let depth = window
9046            .as_ref()
9047            .map(|kmer| u64_to_i64_saturating(counts.depth(kmer)))
9048            .unwrap_or(0);
9049        support = support.min(depth);
9050    }
9051    if support == i64::MAX { 0 } else { support }
9052}
9053
/// Marks (rather than corrects) suspected error positions: lowers their
/// quality scores, or replaces the base with 'N' when the record carries no
/// qualities.
///
/// Returns a `CorrectionResult` whose `marked` field counts altered positions.
fn mark_read_errors(
    config: &Config,
    record: &mut SequenceRecord,
    coverage: &[i64],
) -> CorrectionResult {
    let low = u64_to_i64_saturating(config.error_correct_low_thresh);
    let high = u64_to_i64_saturating(config.error_correct_high_thresh);
    let mult = u64_to_i64_saturating(config.error_correct_ratio);
    let mut marked = 0usize;
    let mut marks = Vec::new();

    // Forward scan mirrors correct_errors_from_left's detection.
    if config.correct_from_left {
        for i in config.prefix_len..coverage.len() {
            let a = min_coverage(&coverage[i - config.prefix_len..i]);
            let b = coverage[i];
            if is_correction_discontinuity(a, b, low, high, mult) {
                marks.push(i + config.k - 1);
            }
        }
    }
    // Backward scan mirrors correct_errors_from_right's detection.
    if config.correct_from_right && coverage.len() > config.prefix_len {
        let start = coverage.len() - config.prefix_len - 1;
        for i in (0..=start).rev() {
            let a = min_coverage(&coverage[i + 1..=i + config.prefix_len]);
            let b = coverage[i];
            if is_correction_discontinuity(a, b, low, high, mult) {
                marks.push(i);
            }
        }
    }

    // A position flagged by both scans is marked only once.
    marks.sort_unstable();
    marks.dedup();
    for loc in marks {
        if let Some(qualities) = record.qualities.as_mut() {
            let phred = qualities[loc].saturating_sub(33);
            // Already at the quality floor: nothing to mark.
            if phred == 0 {
                continue;
            }
            let new_phred = if config.mark_with_one {
                1
            } else {
                // Halve and penalize, but never drop below phred 1.
                (phred / 2).saturating_sub(3).max(1)
            };
            qualities[loc] = new_phred.saturating_add(33);
        } else {
            // No quality string: fall back to masking the base itself.
            record.bases[loc] = b'N';
        }
        marked += 1;
    }

    CorrectionResult {
        marked,
        ..CorrectionResult::default()
    }
}
9110
9111fn coverage_windows_for_record(
9112    config: &Config,
9113    counts: &dyn CountLookup,
9114    record: &SequenceRecord,
9115) -> Vec<i64> {
9116    unfiltered_kmer_windows_for_record(record, config)
9117        .iter()
9118        .map(|window| {
9119            window
9120                .as_ref()
9121                .map(|kmer| u64_to_i64_saturating(counts.depth(kmer)))
9122                .unwrap_or(0)
9123        })
9124        .collect()
9125}
9126
9127fn has_error_discontinuity(config: &Config, coverage: &[i64]) -> bool {
9128    let low = u64_to_i64_saturating(config.error_correct_low_thresh);
9129    let high = u64_to_i64_saturating(config.error_correct_high_thresh);
9130    let mult = u64_to_i64_saturating(config.error_correct_ratio);
9131    if coverage.len() <= config.prefix_len {
9132        return false;
9133    }
9134    for i in config.prefix_len..coverage.len() {
9135        if is_correction_discontinuity(
9136            min_coverage(&coverage[i - config.prefix_len..i]),
9137            coverage[i],
9138            low,
9139            high,
9140            mult,
9141        ) {
9142            return true;
9143        }
9144    }
9145    let start = coverage.len() - config.prefix_len - 1;
9146    for i in (0..=start).rev() {
9147        if is_correction_discontinuity(
9148            min_coverage(&coverage[i + 1..=i + config.prefix_len]),
9149            coverage[i],
9150            low,
9151            high,
9152            mult,
9153        ) {
9154            return true;
9155        }
9156    }
9157    false
9158}
9159
/// True when coverage drops from `a` to `b` sharply enough to indicate an
/// error: the preceding coverage must reach `high`, and the new coverage must
/// either fall to `low` or be at least `mult` times smaller.
fn is_correction_discontinuity(a: i64, b: i64, low: i64, high: i64, mult: i64) -> bool {
    if a < high {
        return false;
    }
    b <= low || a >= b.saturating_mul(mult)
}
9163
/// Smallest value in `values`, or 0 for an empty slice.
fn min_coverage(values: &[i64]) -> i64 {
    let mut smallest: Option<i64> = None;
    for &value in values {
        smallest = Some(match smallest {
            Some(current) if current <= value => current,
            _ => value,
        });
    }
    smallest.unwrap_or(0)
}
9167
9168fn base_quality(record: &SequenceRecord, loc: usize) -> u8 {
9169    record
9170        .qualities
9171        .as_ref()
9172        .and_then(|qualities| qualities.get(loc))
9173        .copied()
9174        .map(|quality| quality.saturating_sub(33))
9175        .unwrap_or(10)
9176}
9177
/// True for the four unambiguous nucleotide codes, in either case.
fn is_defined_base(base: u8) -> bool {
    matches!(
        base,
        b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't'
    )
}
9181
/// Watson-Crick complement of a nucleotide; any byte that is not an
/// unambiguous A/C/G/T (either case) maps to 'N'.
fn complement_base(base: u8) -> u8 {
    match base {
        b'A' | b'a' => b'T',
        b'C' | b'c' => b'G',
        b'G' | b'g' => b'C',
        b'T' | b't' => b'A',
        _ => b'N',
    }
}
9191
/// Smooths single-window coverage spikes in place.
///
/// A lone window whose depth jumps above both neighbors is re-estimated from
/// the depths of its neighboring k-mer families, since an isolated spike is
/// unlikely to reflect true coverage.
fn fix_spikes(
    coverage: &mut [i64],
    windows: &[Option<KmerKey>],
    counts: &dyn CountLookup,
    k: usize,
) {
    if k == 0 || coverage.len() < 3 {
        return;
    }
    // Re-estimate an edge window that jumps above its only neighbor.
    if coverage[1] - coverage[0] > 1 {
        coverage[0] = precise_kmer_count(windows[0].as_ref(), counts, k);
    }

    let last = coverage.len() - 1;
    if coverage[last] - coverage[last - 1] > 1 {
        coverage[last] = precise_kmer_count(windows[last].as_ref(), counts, k);
    }

    // Interior windows: a strict local maximum is suspect unless the value is
    // both small (< 6) and within 1 of each neighbor.
    for i in 1..last {
        let b = coverage[i];
        if b <= 1 {
            continue;
        }
        let a = coverage[i - 1].max(1);
        let c = coverage[i + 1].max(1);
        if b > a && b > c && (b < 6 || b > a + 1 || b > c + 1) {
            coverage[i] = precise_min_kmer_count(windows[i].as_ref(), counts, k);
        }
    }
}
9222
9223fn precise_kmer_count(window: Option<&KmerKey>, counts: &dyn CountLookup, k: usize) -> i64 {
9224    let Some(window) = window else {
9225        return 0;
9226    };
9227    let key = raw_kmer_key(window);
9228    let b = kmer_count(window, key, counts, k);
9229    if b < 1 {
9230        return b;
9231    }
9232    let a = left_kmer_count(window, key, counts, k);
9233    if a >= b {
9234        return b;
9235    }
9236    let c = right_kmer_count(window, key, counts, k);
9237    if c >= b {
9238        return b;
9239    }
9240    (a + c) / 2
9241}
9242
9243fn precise_min_kmer_count(window: Option<&KmerKey>, counts: &dyn CountLookup, k: usize) -> i64 {
9244    let Some(window) = window else {
9245        return 0;
9246    };
9247    let key = raw_kmer_key(window);
9248    let b = kmer_count(window, key, counts, k);
9249    if b < 1 {
9250        return b;
9251    }
9252    let a = left_kmer_count(window, key, counts, k);
9253    if a < 1 {
9254        return a;
9255    }
9256    let c = right_kmer_count(window, key, counts, k);
9257    a.min(b).min(c)
9258}
9259
9260fn raw_kmer_key(window: &KmerKey) -> u64 {
9261    match window {
9262        KmerKey::Short(key) | KmerKey::LongHash(key) => *key,
9263    }
9264}
9265
9266fn kmer_count(template: &KmerKey, raw_key: u64, counts: &dyn CountLookup, k: usize) -> i64 {
9267    let key = match template {
9268        KmerKey::Short(_) => KmerKey::Short(canonical_short_code(raw_key, k)),
9269        KmerKey::LongHash(_) => KmerKey::LongHash(java_canonical_long_key(raw_key, k)),
9270    };
9271    u64_to_i64_saturating(counts.depth(&key))
9272}
9273
9274fn left_kmer_count(template: &KmerKey, key: u64, counts: &dyn CountLookup, k: usize) -> i64 {
9275    let key2 = key >> 2;
9276    let shift = ((2 * (k - 1)) & 63) as u32;
9277    (0..4)
9278        .map(|base| kmer_count(template, key2 | (base << shift), counts, k))
9279        .fold(0i64, i64::saturating_add)
9280}
9281
9282fn right_kmer_count(template: &KmerKey, key: u64, counts: &dyn CountLookup, k: usize) -> i64 {
9283    let mask = if k >= 32 {
9284        u64::MAX
9285    } else {
9286        (1u64 << (2 * k)) - 1
9287    };
9288    let key2 = (key << 2) & mask;
9289    (0..4)
9290        .map(|base| kmer_count(template, key2 | base, counts, k))
9291        .fold(0i64, i64::saturating_add)
9292}
9293
9294fn java_canonical_long_key(key: u64, k: usize) -> u64 {
9295    let reverse = java_reverse_complement_binary_fast(key, k);
9296    key.max(reverse)
9297}
9298
/// Reverse complement of a 2-bit-encoded k-mer packed into the low 2*k bits.
fn java_reverse_complement_binary_fast(key: u64, k: usize) -> u64 {
    // In 2-bit encoding, complementing every base is a bitwise NOT.
    let mut rc = !key;
    // Reverse the order of the 2-bit base codes: swap 2-bit pairs within each
    // nibble, nibbles within each byte, then reverse the whole byte order.
    rc = ((rc & 0x3333_3333_3333_3333) << 2) | ((rc & 0xCCCC_CCCC_CCCC_CCCC) >> 2);
    rc = ((rc & 0x0F0F_0F0F_0F0F_0F0F) << 4) | ((rc & 0xF0F0_F0F0_F0F0_F0F0) >> 4);
    rc = rc.swap_bytes();
    // Shift the reversed bases down so the k-mer occupies the low 2*k bits
    // (`& 63` keeps the shift defined for k == 0 and k == 32).
    rc >> ((2usize.wrapping_mul(32usize.wrapping_sub(k)) & 63) as u32)
}
9310
/// Analyzes a read pair's k-mer depths and decides whether to keep or toss it.
///
/// `rand` optionally supplies a pre-drawn uniform value so the downsampling
/// coin-flip can be made deterministic by the caller.
fn decide_pair(
    config: &Config,
    input_counts: &dyn CountLookup,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
    rand: Option<f64>,
) -> PairDecision {
    let analysis = analyze_pair(config, input_counts, r1, r2);
    decide_pair_from_analysis(config, r1, r2, analysis, rand)
}
9321
/// Applies the normalization policy to an already-computed `PairAnalysis` and
/// returns the keep/toss decision together with the analysis.
fn decide_pair_from_analysis(
    config: &Config,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
    analysis: PairAnalysis,
    rand: Option<f64>,
) -> PairDecision {
    let (target_depth, max_depth) = dynamic_depth_limits(config, &analysis);
    let mut toss = false;

    match analysis.depth_proxy_al {
        Some(depth) => {
            // Over-covered pairs are downsampled probabilistically; with
            // discard_bad_only set, only error-bearing pairs are eligible.
            if depth > max_depth && (analysis.error1 || analysis.error2 || !config.discard_bad_only)
            {
                let coin = deterministic_coin(rand, depth);
                if coin > target_depth {
                    toss = true;
                }
            }
        }
        // No usable depth estimate: discard the pair.
        None => toss = true,
    }

    // Either mate below the minimum length tosses the whole pair.
    if r1.len() < config.min_length || r2.is_some_and(|mate| mate.len() < config.min_length) {
        toss = true;
    }

    if config.toss_error_reads && (analysis.error1 || analysis.error2) {
        // Rare-but-covered reads may be spared despite apparent errors.
        let save_rare = config.save_rare_reads
            && analysis
                .depth_proxy_al
                .is_some_and(|depth| depth <= target_depth && depth >= config.high_thresh);
        if !save_rare
            && (!config.require_both_bad || r2.is_none() || (analysis.error1 && analysis.error2))
        {
            toss = true;
        }
    }

    if config.toss_by_low_true_depth && !config.save_rare_reads {
        let low_enough = analysis
            .max_true_depth
            .is_some_and(|depth| depth < config.min_depth);
        // With require_both_bad, each mate must individually be shallow.
        let required_bad = !config.require_both_bad
            || r2.is_none()
            || (depth_below_min(analysis.read1.min_true_depth, config.min_depth)
                && analysis
                    .read2
                    .as_ref()
                    .is_some_and(|read| depth_below_min(read.min_true_depth, config.min_depth)));
        if low_enough && required_bad {
            toss = true;
        }
    }

    // keep_all overrides every toss reason.
    if config.keep_all {
        toss = false;
    }

    PairDecision { toss, analysis }
}
9383
/// Derives (target, max) depth limits for a pair, scaled down when the pair
/// carries a high fraction of low-count ("bad") k-mers.
///
/// NOTE(review): `config.max_depth` only takes effect in the branch where the
/// pair has no low-count k-mers; in the adjusted branch both limits collapse
/// to the same value — confirm this mirrors the reference tool's behavior.
fn dynamic_depth_limits(config: &Config, analysis: &PairAnalysis) -> (u64, u64) {
    let default_max_depth = config.max_depth.unwrap_or(config.target_depth);
    if analysis.low_kmer_count == 0 || analysis.total_kmer_count == 0 {
        return (config.target_depth, default_max_depth);
    }

    // Interpolate between the low and high bad-percent targets using the
    // squared fraction of good k-mers, clamped into [1, target_depth].
    let low_target = ((config.target_depth as f64) * config.target_bad_percent_low)
        .round()
        .max(1.0);
    let high_target = ((config.target_depth as f64) * config.target_bad_percent_high)
        .round()
        .max(low_target)
        .min(config.target_depth as f64);
    let fraction_good = (analysis.total_kmer_count - analysis.low_kmer_count) as f64
        / analysis.total_kmer_count as f64;
    let adjusted = low_target + (high_target - low_target) * (fraction_good * fraction_good);
    let target = adjusted as u64;
    (target.max(1), target.max(1))
}
9403
9404fn maybe_rename_pair(
9405    config: &Config,
9406    r1: &SequenceRecord,
9407    r2: Option<&SequenceRecord>,
9408    analysis: &PairAnalysis,
9409) -> (SequenceRecord, Option<SequenceRecord>) {
9410    if !config.rename_reads {
9411        return (r1.clone(), r2.cloned());
9412    }
9413    let d1 = depth_label(analysis.read1.depth_al);
9414    let out1 = match r2 {
9415        Some(_) => {
9416            let mut id = format!(
9417                "id={},d1={},d2={}",
9418                r1.numeric_id,
9419                d1,
9420                depth_label(analysis.read2.as_ref().and_then(|a| a.depth_al))
9421            );
9422            if config.error_correct {
9423                id.push_str(",e1=0,e2=0");
9424            }
9425            id.push_str(" /1");
9426            r1.renamed(id)
9427        }
9428        None => {
9429            let mut id = format!("id={},d1={}", r1.numeric_id, d1);
9430            if config.error_correct {
9431                id.push_str(",e1=0");
9432            }
9433            r1.renamed(id)
9434        }
9435    };
9436    let out2 = r2.map(|mate| {
9437        let mut id = format!(
9438            "id={},d1={},d2={}",
9439            r1.numeric_id,
9440            d1,
9441            depth_label(analysis.read2.as_ref().and_then(|a| a.depth_al))
9442        );
9443        if config.error_correct {
9444            id.push_str(",e1=0,e2=0");
9445        }
9446        id.push_str(" /2");
9447        mate.renamed(id)
9448    });
9449    (out1, out2)
9450}
9451
/// Renders an optional depth for read renaming; absent depths print as "-1".
fn depth_label(depth: Option<u64>) -> String {
    match depth {
        Some(value) => value.to_string(),
        None => "-1".to_string(),
    }
}
9457
/// Counts every k-mer of a read pair into `counts` with no prefilter gate.
fn increment_pair_counts(
    config: &Config,
    counts: &mut CountMap,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
) {
    increment_pair_counts_with_prefilter(config, counts, r1, r2, None);
}
9466
9467fn increment_pair_counts_with_prefilter(
9468    config: &Config,
9469    counts: &mut CountMap,
9470    r1: &SequenceRecord,
9471    r2: Option<&SequenceRecord>,
9472    prefilter: Option<PrefilterGate<'_>>,
9473) {
9474    if config.remove_duplicate_kmers && config.k <= 31 {
9475        for kmer in unique_pair_kmers(config, r1, r2) {
9476            if prefilter.is_none_or(|gate| gate.should_count_in_main(&kmer)) {
9477                *counts.entry(kmer).or_insert(0) += 1;
9478            }
9479        }
9480    } else {
9481        for_each_kmer_for_record(r1, config, |kmer| {
9482            if prefilter.is_none_or(|gate| gate.should_count_in_main(&kmer)) {
9483                *counts.entry(kmer).or_insert(0) += 1;
9484            }
9485        });
9486        if let Some(mate) = r2 {
9487            for_each_kmer_for_record(mate, config, |kmer| {
9488                if prefilter.is_none_or(|gate| gate.should_count_in_main(&kmer)) {
9489                    *counts.entry(kmer).or_insert(0) += 1;
9490                }
9491            });
9492        }
9493    }
9494}
9495
/// Counts all k-mers of a chunk of pairs in parallel and merges the result
/// into `counts`.
///
/// Each rayon worker accumulates into a thread-local map (pre-sized via
/// `count_chunk_local_map`), so the shared map is only touched once at the
/// end.
fn increment_counts_from_pair_chunk(
    config: &Config,
    counts: &mut CountMap,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
) {
    let chunk_counts = pairs
        .par_iter()
        .fold(
            || count_chunk_local_map(config, pairs),
            |mut local_counts, (r1, r2)| {
                increment_pair_counts(config, &mut local_counts, r1, r2.as_ref());
                local_counts
            },
        )
        .reduce(CountMap::default, |mut left, right| {
            merge_count_maps(&mut left, right);
            left
        });
    merge_count_maps(counts, chunk_counts);
}
9516
/// Adds all k-mer counts from a chunk of pairs to a packed count-min sketch.
///
/// In deterministic mode with conservative updates, the order of sketch
/// insertions affects the result, so a sorted single-threaded replay is used
/// instead of direct parallel accumulation.
fn increment_sketch_from_pair_chunk(
    config: &Config,
    sketch: &mut PackedCountMinSketch,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
    prefilter: Option<PrefilterGate<'_>>,
) {
    if config.deterministic && sketch.update_mode == CountMinUpdateMode::Conservative {
        increment_sketch_from_pair_chunk_sorted_replay(config, sketch, pairs, prefilter);
        return;
    }
    // Parallel per-thread maps, merged once, then bulk-applied to the sketch.
    let chunk_counts = pairs
        .par_iter()
        .fold(
            || count_chunk_local_map(config, pairs),
            |mut local_counts, (r1, r2)| {
                increment_pair_counts_with_prefilter(
                    config,
                    &mut local_counts,
                    r1,
                    r2.as_ref(),
                    prefilter,
                );
                local_counts
            },
        )
        .reduce(CountMap::default, |mut left, right| {
            merge_count_maps(&mut left, right);
            left
        });
    let key_increments = chunk_counts.values().copied().sum();
    sketch.add_key_counts(&chunk_counts);
    sketch.add_key_increments(key_increments);
}
9550
/// Deterministic variant of `increment_sketch_from_pair_chunk`: gathers all
/// (key, count) contributions, sorts them by key, coalesces duplicates, and
/// applies them to the sketch in a fixed order so conservative updates are
/// reproducible regardless of thread scheduling.
fn increment_sketch_from_pair_chunk_sorted_replay(
    config: &Config,
    sketch: &mut PackedCountMinSketch,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
    prefilter: Option<PrefilterGate<'_>>,
) {
    let mut entries = pairs
        .par_iter()
        .fold(
            || count_chunk_local_map(config, pairs),
            |mut local_counts, (r1, r2)| {
                increment_pair_counts_with_prefilter(
                    config,
                    &mut local_counts,
                    r1,
                    r2.as_ref(),
                    prefilter,
                );
                local_counts
            },
        )
        .map(|counts| counts.into_iter().collect::<Vec<_>>())
        .reduce(Vec::new, |mut left, mut right| {
            left.append(&mut right);
            left
        });
    // Fixed key order makes the replay independent of rayon scheduling.
    entries.par_sort_unstable_by(|(left, _), (right, _)| left.cmp(right));

    let mut key_increments = 0u64;
    let mut iter = entries.into_iter();
    // Empty chunk: nothing to apply.
    let Some((mut current_key, mut current_count)) = iter.next() else {
        return;
    };
    // Coalesce runs of equal keys into a single sketch update each.
    for (key, count) in iter {
        if key == current_key {
            current_count = current_count.saturating_add(count);
        } else {
            key_increments = key_increments.saturating_add(current_count);
            sketch.add_key_count(&current_key, current_count);
            current_key = key;
            current_count = count;
        }
    }
    // Flush the final run.
    key_increments = key_increments.saturating_add(current_count);
    sketch.add_key_count(&current_key, current_count);
    sketch.add_key_increments(key_increments);
}
9598
/// Streams a chunk of pairs into a shared atomic packed sketch in parallel,
/// then records the aggregate increment and slot-occupancy statistics.
fn increment_atomic_packed_sketch_from_pair_chunk(
    config: &Config,
    sketch: &AtomicPackedCountMinSketch,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
) {
    let (key_increments, newly_occupied) = pairs
        .par_iter()
        .map(|(r1, r2)| increment_pair_atomic_packed_sketch(config, sketch, r1, r2.as_ref()))
        .reduce(
            || (0u64, 0usize),
            |left, right| {
                (
                    left.0.saturating_add(right.0),
                    left.1.saturating_add(right.1),
                )
            },
        );
    sketch.add_key_increments(key_increments);
    sketch.add_occupied_slots(newly_occupied);
}
9619
9620fn increment_pair_atomic_packed_sketch(
9621    config: &Config,
9622    sketch: &AtomicPackedCountMinSketch,
9623    r1: &SequenceRecord,
9624    r2: Option<&SequenceRecord>,
9625) -> (u64, usize) {
9626    if config.remove_duplicate_kmers && config.k <= 31 {
9627        let keys = unique_pair_kmers(config, r1, r2);
9628        let mut newly_occupied = 0usize;
9629        for key in &keys {
9630            newly_occupied += sketch.add_key_count_counting_newly_occupied(key, 1);
9631        }
9632        return (keys.len() as u64, newly_occupied);
9633    }
9634    let mut key_increments = 0u64;
9635    let mut newly_occupied = 0usize;
9636    for_each_kmer_for_record(r1, config, |kmer| {
9637        newly_occupied += sketch.add_key_count_counting_newly_occupied(&kmer, 1);
9638        key_increments += 1;
9639    });
9640    if let Some(mate) = r2 {
9641        for_each_kmer_for_record(mate, config, |kmer| {
9642            newly_occupied += sketch.add_key_count_counting_newly_occupied(&kmer, 1);
9643            key_increments += 1;
9644        });
9645    }
9646    (key_increments, newly_occupied)
9647}
9648
/// Counts all k-mers from a chunk of read pairs into the shared atomic
/// count-min sketch. Two strategies, selected by `config.deterministic`:
/// * direct: every pair updates the sketch concurrently;
/// * deterministic: per-thread local count maps are built first, flattened,
///   sorted by key, and applied as one aggregated increment per key, so
///   repeated runs apply updates in the same order regardless of thread
///   scheduling.
fn increment_atomic_sketch_from_pair_chunk(
    config: &Config,
    sketch: &AtomicCountMinSketch,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
    prefilter: Option<PrefilterGate<'_>>,
) {
    if !config.deterministic {
        // Direct path: each pair reports (key increments, newly occupied
        // slots); totals are folded and recorded once for the whole chunk.
        let (key_increments, newly_occupied) = pairs
            .par_iter()
            .map(|(r1, r2)| {
                increment_pair_atomic_sketch_direct(config, sketch, r1, r2.as_ref(), prefilter)
            })
            .reduce(
                || (0u64, 0usize),
                |left, right| {
                    (
                        left.0.saturating_add(right.0),
                        left.1.saturating_add(right.1),
                    )
                },
            );
        sketch.add_key_increments(key_increments);
        sketch.add_occupied_slots(newly_occupied);
        return;
    }

    // Deterministic path: accumulate per-thread count maps, then apply them
    // to the sketch in key order.
    let mut entries = pairs
        .par_iter()
        .fold(
            // Fresh per-thread map for this chunk (see count_chunk_local_map).
            || count_chunk_local_map(config, pairs),
            |mut local_counts, (r1, r2)| {
                increment_pair_counts_with_prefilter(
                    config,
                    &mut local_counts,
                    r1,
                    r2.as_ref(),
                    prefilter,
                );
                local_counts
            },
        )
        .map(|counts| counts.into_iter().collect::<Vec<_>>())
        .reduce(Vec::new, |mut left, mut right| {
            left.append(&mut right);
            left
        });
    // Sorting gives a stable application order; equal keys contributed by
    // different threads become adjacent runs that are summed below.
    entries.par_sort_unstable_by(|(left, _), (right, _)| left.cmp(right));

    let mut key_increments = 0u64;
    let mut iter = entries.into_iter();
    let Some((mut current_key, mut current_count)) = iter.next() else {
        return; // no k-mers survived (empty chunk or prefilter rejected all)
    };
    // Run-length merge: sum adjacent equal keys, flush on key change.
    for (key, count) in iter {
        if key == current_key {
            current_count = current_count.saturating_add(count);
        } else {
            key_increments = key_increments.saturating_add(current_count);
            sketch.add_key_count(&current_key, current_count);
            current_key = key;
            current_count = count;
        }
    }
    // Flush the final run.
    key_increments = key_increments.saturating_add(current_count);
    sketch.add_key_count(&current_key, current_count);
    sketch.add_key_increments(key_increments);
    // NOTE(review): unlike the direct path, no add_occupied_slots() call is
    // made here — presumably add_key_count tracks occupancy itself; confirm.
}
9716
9717fn increment_pair_atomic_sketch_direct(
9718    config: &Config,
9719    sketch: &AtomicCountMinSketch,
9720    r1: &SequenceRecord,
9721    r2: Option<&SequenceRecord>,
9722    prefilter: Option<PrefilterGate<'_>>,
9723) -> (u64, usize) {
9724    if config.remove_duplicate_kmers && config.k <= 31 {
9725        let keys = unique_pair_kmers(config, r1, r2);
9726        let mut key_increments = 0u64;
9727        let mut newly_occupied = 0usize;
9728        for key in &keys {
9729            if prefilter.is_none_or(|gate| gate.should_count_in_main(key)) {
9730                newly_occupied += sketch.add_key_count_counting_newly_occupied(key, 1);
9731                key_increments += 1;
9732            }
9733        }
9734        return (key_increments, newly_occupied);
9735    }
9736
9737    let mut key_increments = 0u64;
9738    let mut newly_occupied = 0usize;
9739    for_each_kmer_for_record(r1, config, |kmer| {
9740        if prefilter.is_none_or(|gate| gate.should_count_in_main(&kmer)) {
9741            newly_occupied += sketch.add_key_count_counting_newly_occupied(&kmer, 1);
9742            key_increments += 1;
9743        }
9744    });
9745    if let Some(mate) = r2 {
9746        for_each_kmer_for_record(mate, config, |kmer| {
9747            if prefilter.is_none_or(|gate| gate.should_count_in_main(&kmer)) {
9748                newly_occupied += sketch.add_key_count_counting_newly_occupied(&kmer, 1);
9749                key_increments += 1;
9750            }
9751        });
9752    }
9753    (key_increments, newly_occupied)
9754}
9755
#[cfg(test)]
fn retain_prefilter_saturated_counts(counts: &mut CountMap, prefilter: Option<PrefilterGate<'_>>) {
    // Without a prefilter, every key is kept untouched.
    if let Some(gate) = prefilter {
        counts.retain(|key, _| gate.should_count_in_main(key));
    }
}
9763
9764fn merge_count_maps(counts: &mut CountMap, source: CountMap) {
9765    for (kmer, count) in source {
9766        *counts.entry(kmer).or_insert(0) += count;
9767    }
9768}
9769
9770fn trim_pair(config: &Config, r1: &mut SequenceRecord, r2: Option<&mut SequenceRecord>) {
9771    if !config.trim_left && !config.trim_right {
9772        return;
9773    }
9774    trim_record(config, r1);
9775    if let Some(mate) = r2 {
9776        trim_record(config, mate);
9777    }
9778}
9779
9780fn trim_record(config: &Config, record: &mut SequenceRecord) {
9781    if record.is_empty() {
9782        return;
9783    }
9784    let (left0, right0) = if config.trim_optimal {
9785        optimal_trim_amounts(record, config)
9786    } else if config.trim_window {
9787        (0, window_trim_right_amount(record, config))
9788    } else {
9789        simple_trim_amounts(record, config)
9790    };
9791    let left = if config.trim_left { left0 } else { 0 };
9792    let right = if config.trim_right { right0 } else { 0 };
9793    trim_by_amount(record, left, right, 1);
9794}
9795
/// Chooses `(left, right)` trim amounts that keep the maximum-scoring run of
/// bases, where each base scores `avg_error_rate - its_error_probability`
/// (Kadane-style maximum-subarray scan). Bases outside the best run are
/// trimmed.
fn optimal_trim_amounts(record: &SequenceRecord, config: &Config) -> (usize, usize) {
    // Target error rate: explicit bias if configured, else derived from trimq.
    let avg_error_rate = config
        .trim_optimal_bias
        .unwrap_or_else(|| phred_to_prob_error(config.trim_quality));
    if let Some(qualities) = record.qualities.as_deref() {
        // Error probability charged to N bases and zero-quality bases.
        let nprob = (avg_error_rate * 1.1).clamp(0.75, 1.0);
        let mut max_score = 0.0f64;
        let mut score = 0.0f64;
        let mut max_loc = 0usize; // end index (inclusive) of the best run
        let mut max_count = 0usize; // length of the best run
        let mut count = 0usize; // length of the current run

        for (idx, (&base, &quality)) in record.bases.iter().zip(qualities).enumerate() {
            let phred = quality.saturating_sub(33);
            let prob_error = if base == b'N' || phred < 1 {
                nprob
            } else {
                phred_to_prob_error(f64::from(phred))
            };
            // Positive contribution when the base beats the target rate.
            score += avg_error_rate - prob_error;
            if score > 0.0 {
                count += 1;
                // On a score tie, prefer the longer run.
                if score > max_score || (score == max_score && count > max_count) {
                    max_score = score;
                    max_count = count;
                    max_loc = idx;
                }
            } else {
                // Run went non-positive: restart, Kadane-style.
                score = 0.0;
                count = 0;
            }
        }

        if max_score > 0.0 {
            // Keep [max_loc + 1 - max_count, max_loc]; trim everything else.
            (max_loc + 1 - max_count, record.len() - max_loc - 1)
        } else {
            // No positive-scoring run: request trimming the whole read.
            (0, record.len())
        }
    } else if avg_error_rate >= 1.0 {
        // No qualities and an unattainable target: leave the read untouched.
        (0, 0)
    } else {
        // No qualities: fall back to trimming leading/trailing N runs.
        (
            test_left_n(&record.bases, config.trim_min_good_interval),
            test_right_n(&record.bases, config.trim_min_good_interval),
        )
    }
}
9843
9844fn simple_trim_amounts(record: &SequenceRecord, config: &Config) -> (usize, usize) {
9845    let trimq = config.trim_quality as u8;
9846    if let Some(qualities) = record.qualities.as_deref() {
9847        (
9848            test_left_quality(qualities, trimq, config.trim_min_good_interval),
9849            test_right_quality(qualities, trimq, config.trim_min_good_interval),
9850        )
9851    } else {
9852        (
9853            test_left_n(&record.bases, config.trim_min_good_interval),
9854            test_right_n(&record.bases, config.trim_min_good_interval),
9855        )
9856    }
9857}
9858
9859fn window_trim_right_amount(record: &SequenceRecord, config: &Config) -> usize {
9860    let trimq = config.trim_quality as i32;
9861    let Some(qualities) = record.qualities.as_deref() else {
9862        return if trimq > 0 {
9863            0
9864        } else {
9865            test_right_n(&record.bases, config.trim_min_good_interval)
9866        };
9867    };
9868    if qualities.len() < config.trim_window_length {
9869        return if trimq > 0 {
9870            0
9871        } else {
9872            test_right_n(&record.bases, config.trim_min_good_interval)
9873        };
9874    }
9875
9876    let Ok(window) = isize::try_from(config.trim_window_length) else {
9877        return 0;
9878    };
9879    let threshold = (config.trim_window_length as i32 * trimq).max(1);
9880    let mut sum = 0i32;
9881    for (idx, &quality) in qualities.iter().enumerate() {
9882        let Ok(idx) = isize::try_from(idx) else {
9883            return 0;
9884        };
9885        let j = idx - window;
9886        sum += i32::from(quality.saturating_sub(33));
9887        if j >= -1 {
9888            if j >= 0 {
9889                sum -= i32::from(qualities[j as usize].saturating_sub(33));
9890            }
9891            if sum < threshold {
9892                return qualities.len() - j as usize - 1;
9893            }
9894        }
9895    }
9896    0
9897}
9898
/// Number of leading bases to trim so the kept read begins with a run of
/// `min_good_interval` bases whose Phred quality (ASCII offset 33) strictly
/// exceeds `trimq`. Returns 0 when the read already starts well.
fn test_left_quality(qualities: &[u8], trimq: u8, min_good_interval: usize) -> usize {
    let mut run = 0usize; // length of the current good run
    let mut trim_to = 0usize; // one past the last bad base seen so far
    for (idx, &q) in qualities.iter().enumerate() {
        if run >= min_good_interval {
            break;
        }
        if q.saturating_sub(33) > trimq {
            run += 1;
        } else {
            run = 0;
            trim_to = idx + 1;
        }
    }
    trim_to
}
9915
/// Number of trailing bases to trim, mirroring `test_left_quality` from the
/// right-hand end of the read.
fn test_right_quality(qualities: &[u8], trimq: u8, min_good_interval: usize) -> usize {
    let mut run = 0usize; // length of the current good run (scanning leftwards)
    let mut cut_from = qualities.len(); // index of the rightmost bad base seen
    for (idx, &q) in qualities.iter().enumerate().rev() {
        if run >= min_good_interval {
            break;
        }
        if q.saturating_sub(33) > trimq {
            run += 1;
        } else {
            run = 0;
            cut_from = idx;
        }
    }
    qualities.len() - cut_from
}
9932
/// Number of leading bases to trim so the kept read begins with a run of
/// `min_good_interval` non-`N` bases.
fn test_left_n(bases: &[u8], min_good_interval: usize) -> usize {
    let mut run = 0usize; // length of the current non-N run
    let mut trim_to = 0usize; // one past the last N seen so far
    for (idx, &base) in bases.iter().enumerate() {
        if run >= min_good_interval {
            break;
        }
        if base == b'N' {
            run = 0;
            trim_to = idx + 1;
        } else {
            run += 1;
        }
    }
    trim_to
}
9949
/// Number of trailing bases to trim, mirroring `test_left_n` from the
/// right-hand end of the read.
fn test_right_n(bases: &[u8], min_good_interval: usize) -> usize {
    let mut run = 0usize; // length of the current non-N run (scanning leftwards)
    let mut cut_from = bases.len(); // index of the rightmost N seen
    for (idx, &base) in bases.iter().enumerate().rev() {
        if run >= min_good_interval {
            break;
        }
        if base == b'N' {
            run = 0;
            cut_from = idx;
        } else {
            run += 1;
        }
    }
    bases.len() - cut_from
}
9966
9967fn trim_by_amount(
9968    record: &mut SequenceRecord,
9969    mut left_trim: usize,
9970    mut right_trim: usize,
9971    min_resulting_length: usize,
9972) -> usize {
9973    let len = record.len();
9974    if len == 0 {
9975        return 0;
9976    }
9977    let min_resulting_length = min_resulting_length.min(len);
9978    if left_trim + right_trim + min_resulting_length > len {
9979        right_trim = 1usize.max(len.saturating_sub(min_resulting_length));
9980        left_trim = 0;
9981    }
9982    let total = left_trim + right_trim;
9983    if total > 0 {
9984        record.bases = record.bases[left_trim..len - right_trim].to_vec();
9985        if let Some(qualities) = record.qualities.take() {
9986            let qlen = qualities.len();
9987            record.qualities = if total >= qlen {
9988                None
9989            } else {
9990                Some(qualities[left_trim..qlen - right_trim].to_vec())
9991            };
9992        }
9993    }
9994    total
9995}
9996
/// Converts a Phred quality score into an approximate per-base error
/// probability: a 0.75 floor for q <= 0, a linear ramp on (0, 1], and the
/// standard `10^(-q/10)` formula capped at 0.7 above that.
fn phred_to_prob_error(q: f64) -> f64 {
    match q {
        q if q <= 0.0 => 0.75,
        // Linear ramp between the q = 0 floor and the standard formula.
        q if q <= 1.0 => 0.75 - q * 0.05,
        q => 10_f64.powf(-0.1 * q).min(0.7),
    }
}
10006
10007fn increment_sparse_hist_from_analysis(
10008    hist: &mut SparseHist,
10009    analysis: &ReadAnalysis,
10010    hist_len: usize,
10011) {
10012    for depth in &analysis.coverage_desc {
10013        if *depth < 0 {
10014            continue;
10015        }
10016        let idx = (*depth as usize).min(hist_len - 1);
10017        *hist.entry(idx).or_insert(0) += 1;
10018    }
10019}
10020
#[cfg(test)]
fn increment_hist_from_pair_chunk(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    hist: &mut [u64],
    pairs: &[AnalysisPair],
) {
    // Build the chunk histogram sparsely, then fold it into the dense hist.
    merge_sparse_hist_into_dense(
        hist,
        sparse_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, pairs),
    );
}
10032
10033fn sparse_hist_from_pair_chunk(
10034    config: &Config,
10035    hist_counts: &dyn CountLookup,
10036    keep_filter_counts: Option<&dyn CountLookup>,
10037    pairs: &[AnalysisPair],
10038) -> SparseHist {
10039    pairs
10040        .par_iter()
10041        .fold(SparseHist::default, |mut local_hist, (r1, r2, rand)| {
10042            if let Some(input_counts) = keep_filter_counts {
10043                let decision = decide_pair(config, input_counts, r1, r2.as_ref(), *rand);
10044                if decision.toss {
10045                    return local_hist;
10046                }
10047            }
10048
10049            let analysis = analyze_pair(config, hist_counts, r1, r2.as_ref());
10050            increment_sparse_hist_from_analysis(&mut local_hist, &analysis.read1, config.hist_len);
10051            if let Some(read2) = &analysis.read2 {
10052                increment_sparse_hist_from_analysis(&mut local_hist, read2, config.hist_len);
10053            }
10054            local_hist
10055        })
10056        .reduce(SparseHist::default, |mut left, right| {
10057            merge_sparse_hist(&mut left, right);
10058            left
10059        })
10060}
10061
10062fn merge_sparse_hist(target: &mut SparseHist, source: SparseHist) {
10063    for (idx, count) in source {
10064        *target.entry(idx).or_insert(0) += count;
10065    }
10066}
10067
#[cfg(test)]
fn merge_sparse_hist_into_dense(target: &mut [u64], source: SparseHist) {
    // Test helper: sparse bins are assumed to be in range for `target`.
    source.into_iter().for_each(|(idx, count)| target[idx] += count);
}
10074
10075fn increment_sparse_read_hist(
10076    hist: &mut SparseReadDepthHist,
10077    analysis: &ReadAnalysis,
10078    read_len: usize,
10079    hist_len: usize,
10080) {
10081    if !analysis.had_kmer_windows {
10082        return;
10083    }
10084    let depth = analysis.depth_al.or(analysis.true_depth).unwrap_or(0);
10085    let idx = (depth as usize).min(hist_len - 1);
10086    let entry = hist.entry(idx).or_insert((0, 0));
10087    entry.0 += 1;
10088    entry.1 += read_len as u64;
10089}
10090
#[cfg(test)]
fn increment_read_hist_from_pair_chunk(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    hist: &mut ReadDepthHistogram,
    pairs: &[AnalysisPair],
) {
    // Build the chunk's read-depth histogram sparsely, then densify it.
    merge_sparse_read_depth_hist_into_dense(
        hist,
        sparse_read_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, pairs),
    );
}
10103
10104fn sparse_read_hist_from_pair_chunk(
10105    config: &Config,
10106    hist_counts: &dyn CountLookup,
10107    keep_filter_counts: Option<&dyn CountLookup>,
10108    pairs: &[AnalysisPair],
10109) -> SparseReadDepthHist {
10110    pairs
10111        .par_iter()
10112        .fold(
10113            SparseReadDepthHist::default,
10114            |mut local_hist, (r1, r2, rand)| {
10115                if let Some(input_counts) = keep_filter_counts {
10116                    let decision = decide_pair(config, input_counts, r1, r2.as_ref(), *rand);
10117                    if decision.toss {
10118                        return local_hist;
10119                    }
10120                }
10121
10122                let analysis = analyze_pair(config, hist_counts, r1, r2.as_ref());
10123                increment_sparse_read_hist(
10124                    &mut local_hist,
10125                    &analysis.read1,
10126                    r1.len(),
10127                    config.hist_len,
10128                );
10129                if let (Some(read2_analysis), Some(read2)) = (&analysis.read2, r2.as_ref()) {
10130                    increment_sparse_read_hist(
10131                        &mut local_hist,
10132                        read2_analysis,
10133                        read2.len(),
10134                        config.hist_len,
10135                    );
10136                }
10137                local_hist
10138            },
10139        )
10140        .reduce(SparseReadDepthHist::default, |mut left, right| {
10141            merge_sparse_read_depth_hist(&mut left, right);
10142            left
10143        })
10144}
10145
#[cfg(test)]
fn increment_hist_and_read_hist_from_pair_chunk(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    depth_hist: &mut [u64],
    read_hist: &mut ReadDepthHistogram,
    pairs: &[AnalysisPair],
) {
    // A single pass yields both sparse histograms; densify each afterwards.
    let (sparse_depth, sparse_reads) =
        sparse_hist_and_read_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, pairs);
    merge_sparse_hist_into_dense(depth_hist, sparse_depth);
    merge_sparse_read_depth_hist_into_dense(read_hist, sparse_reads);
}
10160
10161fn sparse_hist_and_read_hist_from_pair_chunk(
10162    config: &Config,
10163    hist_counts: &dyn CountLookup,
10164    keep_filter_counts: Option<&dyn CountLookup>,
10165    pairs: &[AnalysisPair],
10166) -> (SparseHist, SparseReadDepthHist) {
10167    pairs
10168        .par_iter()
10169        .fold(
10170            || (SparseHist::default(), SparseReadDepthHist::default()),
10171            |mut local, (r1, r2, rand)| {
10172                if let Some(input_counts) = keep_filter_counts {
10173                    let decision = decide_pair(config, input_counts, r1, r2.as_ref(), *rand);
10174                    if decision.toss {
10175                        return local;
10176                    }
10177                }
10178
10179                let analysis = analyze_pair(config, hist_counts, r1, r2.as_ref());
10180                increment_sparse_hist_from_analysis(&mut local.0, &analysis.read1, config.hist_len);
10181                increment_sparse_read_hist(
10182                    &mut local.1,
10183                    &analysis.read1,
10184                    r1.len(),
10185                    config.hist_len,
10186                );
10187                if let Some(read2_analysis) = &analysis.read2 {
10188                    increment_sparse_hist_from_analysis(
10189                        &mut local.0,
10190                        read2_analysis,
10191                        config.hist_len,
10192                    );
10193                    if let Some(read2) = r2.as_ref() {
10194                        increment_sparse_read_hist(
10195                            &mut local.1,
10196                            read2_analysis,
10197                            read2.len(),
10198                            config.hist_len,
10199                        );
10200                    }
10201                }
10202                local
10203            },
10204        )
10205        .reduce(
10206            || (SparseHist::default(), SparseReadDepthHist::default()),
10207            |mut left, right| {
10208                merge_sparse_hist(&mut left.0, right.0);
10209                merge_sparse_read_depth_hist(&mut left.1, right.1);
10210                left
10211            },
10212        )
10213}
10214
10215fn merge_sparse_read_depth_hist(target: &mut SparseReadDepthHist, source: SparseReadDepthHist) {
10216    for (idx, (reads, bases)) in source {
10217        let entry = target.entry(idx).or_insert((0, 0));
10218        entry.0 += reads;
10219        entry.1 += bases;
10220    }
10221}
10222
#[cfg(test)]
fn merge_sparse_read_depth_hist_into_dense(
    target: &mut ReadDepthHistogram,
    source: SparseReadDepthHist,
) {
    // Test helper: bins are assumed in range for the dense histogram vectors.
    source.into_iter().for_each(|(idx, (reads, bases))| {
        target.reads[idx] += reads;
        target.bases[idx] += bases;
    });
}
10233
#[cfg(test)]
fn write_depth_hist(path: &Path, raw_hist: &[u64], config: &Config) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating histogram {}", path.display()))?;
    // NOTE(review): the one-column header "#tUnique_Kmers" looks like a
    // mangled "#Unique_Kmers" (or "#\tUnique_Kmers") — confirm against the
    // reference tool's output before changing it.
    match config.hist_columns {
        1 => writeln!(writer, "#tUnique_Kmers")?,
        2 => writeln!(writer, "#Depth\tUnique_Kmers")?,
        3 => writeln!(writer, "#Depth\tRaw_Count\tUnique_Kmers")?,
        _ => unreachable!("validated hist column count"),
    }

    let grand_total = raw_hist.iter().copied().fold(0u64, u64::saturating_add);
    let last_bin = raw_hist.len().saturating_sub(1);
    let mut emitted = 0u64;
    for depth in 0..last_bin {
        let raw = adjusted_depth_hist_raw(raw_hist, config.zero_bin, depth);
        emitted = emitted.saturating_add(raw);
        let unique = unique_from_raw(depth, raw);
        // One-column output prints every row unconditionally.
        if config.print_zero_coverage || unique > 0 || config.hist_columns == 1 {
            write_hist_row(&mut writer, config.hist_columns, depth, raw, unique)?;
        }
        if emitted >= grand_total {
            break; // all counts accounted for; remaining bins are empty
        }
    }

    // Collapse the final bin plus everything past it into one overflow row.
    let overflow: u64 = (last_bin..raw_hist.len())
        .map(|depth| adjusted_depth_hist_raw(raw_hist, config.zero_bin, depth))
        .fold(0u64, u64::saturating_add);
    if overflow > 0 {
        write_hist_row(
            &mut writer,
            config.hist_columns,
            last_bin,
            overflow,
            unique_from_raw(last_bin, overflow),
        )?;
    }
    writer.flush()?;
    Ok(())
}
10275
/// Writes the k-mer depth histogram from its sparse representation.
///
/// Rows cover depths `0..hist_len - 1`; counts at or beyond the final bin
/// are collapsed into a single overflow row. With `zero_bin` disabled,
/// depth-0 counts are folded into depth 1 (see
/// `adjusted_sparse_depth_hist_raw`).
fn write_sparse_depth_hist(
    path: &Path,
    raw_hist: &SparseHist,
    hist_len: usize,
    config: &Config,
) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating histogram {}", path.display()))?;
    // NOTE(review): the one-column header "#tUnique_Kmers" looks like a
    // mangled "#Unique_Kmers" (or "#\tUnique_Kmers") — confirm against the
    // reference tool's output before changing it.
    match config.hist_columns {
        1 => writeln!(writer, "#tUnique_Kmers")?,
        2 => writeln!(writer, "#Depth\tUnique_Kmers")?,
        3 => writeln!(writer, "#Depth\tRaw_Count\tUnique_Kmers")?,
        _ => unreachable!("validated hist column count"),
    }

    let hist_len = hist_len.max(1);
    let lim = hist_len.saturating_sub(1);
    let total_raw = raw_hist.values().copied().fold(0u64, u64::saturating_add);
    let mut seen_raw = 0u64;

    if config.print_zero_coverage || config.hist_columns == 1 {
        // Dense emission: one row per depth (one-column output needs every
        // row so line position implies the depth), stopping early once every
        // count has been printed.
        for depth in 0..lim {
            let raw = adjusted_sparse_depth_hist_raw(raw_hist, hist_len, config.zero_bin, depth);
            seen_raw = seen_raw.saturating_add(raw);
            write_hist_row(
                &mut writer,
                config.hist_columns,
                depth,
                raw,
                unique_from_raw(depth, raw),
            )?;
            if seen_raw >= total_raw {
                break;
            }
        }
    } else {
        // Sparse emission: visit only depths that actually hold counts,
        // remapping depth 0 to 1 when the zero bin is disabled, in sorted,
        // deduplicated order.
        let mut depths: Vec<usize> = raw_hist
            .iter()
            .filter_map(|(&depth, &raw)| {
                let mapped_depth = if !config.zero_bin && hist_len > 1 && depth == 0 {
                    1
                } else {
                    depth
                };
                (mapped_depth < lim && raw > 0).then_some(mapped_depth)
            })
            .collect();
        depths.sort_unstable();
        depths.dedup();
        for depth in depths {
            let raw = adjusted_sparse_depth_hist_raw(raw_hist, hist_len, config.zero_bin, depth);
            seen_raw = seen_raw.saturating_add(raw);
            let unique = unique_from_raw(depth, raw);
            if unique > 0 {
                write_hist_row(&mut writer, config.hist_columns, depth, raw, unique)?;
            }
            if seen_raw >= total_raw {
                break;
            }
        }
    }

    // Everything at or beyond the final bin is summed into one overflow row,
    // using the same zero-bin remapping and deduplication as above.
    let mut overflow_depths: Vec<usize> = raw_hist
        .keys()
        .copied()
        .filter_map(|depth| {
            let mapped_depth = if !config.zero_bin && hist_len > 1 && depth == 0 {
                1
            } else {
                depth
            };
            (mapped_depth >= lim).then_some(mapped_depth)
        })
        .collect();
    overflow_depths.sort_unstable();
    overflow_depths.dedup();
    let overflow_raw = overflow_depths.into_iter().fold(0u64, |sum, depth| {
        sum.saturating_add(adjusted_sparse_depth_hist_raw(
            raw_hist,
            hist_len,
            config.zero_bin,
            depth,
        ))
    });
    if overflow_raw > 0 {
        write_hist_row(
            &mut writer,
            config.hist_columns,
            lim,
            overflow_raw,
            unique_from_raw(lim, overflow_raw),
        )?;
    }
    writer.flush()?;
    Ok(())
}
10372
/// Raw count for `depth`, with the zero bin folded into depth 1 when
/// `zero_bin` is disabled (depth 0 then reports 0). Out-of-range depths
/// report 0.
#[cfg(test)]
fn adjusted_depth_hist_raw(raw_hist: &[u64], zero_bin: bool, depth: usize) -> u64 {
    let raw = raw_hist.get(depth).copied().unwrap_or(0);
    if zero_bin || raw_hist.len() <= 1 {
        raw
    } else if depth == 0 {
        0
    } else if depth == 1 {
        raw.saturating_add(raw_hist[0])
    } else {
        raw
    }
}
10385
10386fn adjusted_sparse_depth_hist_raw(
10387    raw_hist: &SparseHist,
10388    hist_len: usize,
10389    zero_bin: bool,
10390    depth: usize,
10391) -> u64 {
10392    let raw = raw_hist.get(&depth).copied().unwrap_or(0);
10393    if zero_bin || hist_len <= 1 {
10394        return raw;
10395    }
10396    match depth {
10397        0 => 0,
10398        1 => raw.saturating_add(raw_hist.get(&0).copied().unwrap_or(0)),
10399        _ => raw,
10400    }
10401}
10402
#[cfg(test)]
fn sparse_hist_to_dense(raw_hist: &SparseHist, hist_len: usize) -> Vec<u64> {
    let mut dense = vec![0u64; hist_len.max(1)];
    let cap = dense.len() - 1;
    for (&depth, &raw) in raw_hist {
        // Out-of-range depths accumulate into the last bin.
        let bin = depth.min(cap);
        dense[bin] = dense[bin].saturating_add(raw);
    }
    dense
}
10412
10413fn sparse_hist_to_peak_dense(raw_hist: &SparseHist, hist_len: usize) -> Vec<u64> {
10414    let hist_len = hist_len.max(1);
10415    let last_index = hist_len - 1;
10416    let last_nonzero = raw_hist
10417        .iter()
10418        .filter_map(|(&depth, &raw)| (raw > 0).then_some(depth.min(last_index)))
10419        .max()
10420        .unwrap_or(0);
10421    let dense_len = hist_len.min(
10422        last_nonzero
10423            .saturating_add(PEAK_COMPACT_ZERO_TAIL)
10424            .saturating_add(1),
10425    );
10426    let mut dense = vec![0u64; dense_len.max(1)];
10427    for (&depth, &raw) in raw_hist {
10428        if raw == 0 {
10429            continue;
10430        }
10431        let idx = depth.min(last_index);
10432        if idx < dense.len() {
10433            dense[idx] = dense[idx].saturating_add(raw);
10434        } else {
10435            dense.resize(idx + 1, 0);
10436            dense[idx] = dense[idx].saturating_add(raw);
10437        }
10438    }
10439    dense
10440}
10441
10442fn write_hist_row(
10443    writer: &mut Box<dyn Write>,
10444    columns: u8,
10445    depth: usize,
10446    raw: u64,
10447    unique: u64,
10448) -> Result<()> {
10449    match columns {
10450        1 => writeln!(writer, "{unique}")?,
10451        2 => writeln!(writer, "{depth}\t{unique}")?,
10452        3 => writeln!(writer, "{depth}\t{raw}\t{unique}")?,
10453        _ => unreachable!("validated hist column count"),
10454    }
10455    Ok(())
10456}
10457
#[cfg(test)]
fn write_read_depth_hist(path: &Path, hist: &ReadDepthHistogram, config: &Config) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating read histogram {}", path.display()))?;
    writeln!(writer, "#Depth\tReads\tBases")?;

    let grand_total: u64 = hist.reads.iter().sum();
    let last_bin = hist.reads.len().saturating_sub(1);
    let mut emitted = 0u64;
    for depth in 0..last_bin {
        let (reads, bases) = (hist.reads[depth], hist.bases[depth]);
        emitted += reads;
        // Rows with no bases are suppressed unless zero-printing is on.
        if config.print_zero_coverage || bases > 0 {
            writeln!(writer, "{depth}\t{reads}\t{bases}")?;
        }
        if emitted >= grand_total {
            break; // remaining bins hold no reads
        }
    }

    // Fold the last bin and anything beyond it into one overflow row.
    let overflow_reads: u64 = hist.reads.iter().skip(last_bin).sum();
    let overflow_bases: u64 = hist.bases.iter().skip(last_bin).sum();
    if overflow_reads > 0 || overflow_bases > 0 {
        writeln!(writer, "{last_bin}\t{overflow_reads}\t{overflow_bases}")?;
    }
    writer.flush()?;
    Ok(())
}
10488
/// Writes the per-read depth histogram (`#Depth\tReads\tBases` rows) from
/// its sparse form. Depths below `hist_len - 1` get individual rows; the
/// final bin aggregates everything at or beyond it as an overflow row.
fn write_sparse_read_depth_hist(
    path: &Path,
    hist: &SparseReadDepthHist,
    hist_len: usize,
    config: &Config,
) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating read histogram {}", path.display()))?;
    writeln!(writer, "#Depth\tReads\tBases")?;

    let hist_len = hist_len.max(1);
    let lim = hist_len.saturating_sub(1);
    let total_reads = hist
        .values()
        .map(|(reads, _)| *reads)
        .fold(0u64, u64::saturating_add);
    let mut seen_reads = 0u64;

    if config.print_zero_coverage {
        // Dense emission: one row per depth, stopping early once every read
        // has been printed.
        for depth in 0..lim {
            let (reads, bases) = hist.get(&depth).copied().unwrap_or_default();
            seen_reads = seen_reads.saturating_add(reads);
            writeln!(writer, "{depth}\t{reads}\t{bases}")?;
            if seen_reads >= total_reads {
                break;
            }
        }
    } else {
        // Sparse emission: only depths actually present, in sorted order;
        // rows with zero bases are still counted but not printed.
        let mut depths: Vec<usize> = hist.keys().copied().filter(|depth| *depth < lim).collect();
        depths.sort_unstable();
        for depth in depths {
            let (reads, bases) = hist.get(&depth).copied().unwrap_or_default();
            seen_reads = seen_reads.saturating_add(reads);
            if bases > 0 {
                writeln!(writer, "{depth}\t{reads}\t{bases}")?;
            }
            if seen_reads >= total_reads {
                break;
            }
        }
    }

    // Sum all bins at or beyond the final one into a single overflow row.
    let (overflow_reads, overflow_bases) = hist.iter().filter(|(depth, _)| **depth >= lim).fold(
        (0u64, 0u64),
        |(read_sum, base_sum), (_, (reads, bases))| {
            (
                read_sum.saturating_add(*reads),
                base_sum.saturating_add(*bases),
            )
        },
    );
    if overflow_reads > 0 || overflow_bases > 0 {
        writeln!(writer, "{lim}\t{overflow_reads}\t{overflow_bases}")?;
    }
    writer.flush()?;
    Ok(())
}
10546
10547fn write_quality_hist(path: &Path, hist: &[u64], config: &Config) -> Result<()> {
10548    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
10549        .with_context(|| format!("creating quality histogram {}", path.display()))?;
10550    writeln!(writer, "#Quality\tBases")?;
10551
10552    let total_bases: u64 = hist.iter().sum();
10553    let mut seen_bases = 0u64;
10554    let lim = hist.len().saturating_sub(1);
10555
10556    for (quality, bases) in hist.iter().copied().enumerate().take(lim) {
10557        seen_bases += bases;
10558        if config.print_zero_coverage || bases > 0 {
10559            writeln!(writer, "{quality}\t{bases}")?;
10560        }
10561        if seen_bases >= total_bases {
10562            break;
10563        }
10564    }
10565
10566    let overflow_bases: u64 = hist.iter().skip(lim).sum();
10567    if overflow_bases > 0 {
10568        writeln!(writer, "{lim}\t{overflow_bases}")?;
10569    }
10570    writer.flush()?;
10571    Ok(())
10572}
10573
10574fn write_quality_count_hist(
10575    path: &Path,
10576    first: &[u64],
10577    second: &[u64],
10578    paired: bool,
10579    config: &Config,
10580) -> Result<()> {
10581    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
10582        .with_context(|| format!("creating quality-count histogram {}", path.display()))?;
10583    writeln!(
10584        writer,
10585        "#Quality\tcount1\tfraction1{}",
10586        if paired { "\tcount2\tfraction2" } else { "" }
10587    )?;
10588    write_paired_quality_count_rows(&mut writer, first, second, paired, config)?;
10589    writer.flush()?;
10590    Ok(())
10591}
10592
10593fn write_average_quality_hist(
10594    path: &Path,
10595    first: &[u64],
10596    second: &[u64],
10597    paired: bool,
10598    config: &Config,
10599) -> Result<()> {
10600    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
10601        .with_context(|| format!("creating average-quality histogram {}", path.display()))?;
10602    writeln!(
10603        writer,
10604        "#Quality\tcount1\tfraction1{}",
10605        if paired { "\tcount2\tfraction2" } else { "" }
10606    )?;
10607    write_paired_quality_count_rows(&mut writer, first, second, paired, config)?;
10608    writer.flush()?;
10609    Ok(())
10610}
10611
10612fn write_paired_quality_count_rows(
10613    writer: &mut Box<dyn Write>,
10614    first: &[u64],
10615    second: &[u64],
10616    paired: bool,
10617    config: &Config,
10618) -> Result<()> {
10619    let total1: u64 = first.iter().sum();
10620    let total2: u64 = second.iter().sum();
10621    let mut remaining = total1 + if paired { total2 } else { 0 };
10622    let denom1 = total1.max(1) as f64;
10623    let denom2 = total2.max(1) as f64;
10624
10625    for (quality, count1) in first.iter().copied().enumerate() {
10626        let count2 = second.get(quality).copied().unwrap_or(0);
10627        if count1 > 0 || (paired && count2 > 0) || config.print_zero_coverage {
10628            write!(writer, "{quality}\t{count1}\t{:.5}", count1 as f64 / denom1)?;
10629            if paired {
10630                write!(writer, "\t{count2}\t{:.5}", count2 as f64 / denom2)?;
10631            }
10632            writeln!(writer)?;
10633        }
10634        remaining = remaining.saturating_sub(count1 + if paired { count2 } else { 0 });
10635        if remaining == 0 && !config.print_zero_coverage {
10636            break;
10637        }
10638    }
10639    Ok(())
10640}
10641
/// Writes the whole-input base-quality histogram: summary header lines
/// (median, mean, stdev, and the same restricted to qualities >= 30) followed
/// by one `quality\tbases\tfraction` row per populated bin.
fn write_overall_base_quality_hist(path: &Path, hist: &[u64], config: &Config) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating overall base-quality histogram {}", path.display()))?;
    let median = percentile_histogram(hist, 0.5);
    let mean = average_histogram(hist);
    let stdev = stdev_histogram(hist, mean, 0);
    // The *_30 statistics only consider bases with quality 30 or higher.
    let mean30 = average_histogram_min(hist, 30);
    let stdev30 = stdev_histogram(hist, mean30, 30);
    writeln!(writer, "#Median\t{median}")?;
    writeln!(writer, "#Mean\t{mean:.3}")?;
    writeln!(writer, "#STDev\t{stdev:.3}")?;
    writeln!(writer, "#Mean_30\t{mean30:.3}")?;
    writeln!(writer, "#STDev_30\t{stdev30:.3}")?;
    writeln!(writer, "#Quality\tbases\tfraction")?;

    let total: u64 = hist.iter().sum();
    // max(1) avoids division by zero when the histogram is empty.
    let denom = total.max(1) as f64;
    let mut remaining = total;
    for (quality, bases) in hist.iter().copied().enumerate() {
        if bases > 0 || config.print_zero_coverage {
            writeln!(writer, "{quality}\t{bases}\t{:.5}", bases as f64 / denom)?;
        }
        remaining = remaining.saturating_sub(bases);
        // Stop after the last populated bin unless zero rows are requested.
        if remaining == 0 && !config.print_zero_coverage {
            break;
        }
    }
    writer.flush()?;
    Ok(())
}
10672
/// Writes a per-base-position quality box-plot table: one row per read
/// position with count/min/max/mean/quartiles/whiskers for read 1, and a
/// second column group for read 2 when the input is paired.
fn write_base_quality_hist(
    path: &Path,
    hist: &QualitySideHistograms,
    config: &Config,
) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating base-quality histogram {}", path.display()))?;
    write!(
        writer,
        "#BaseNum\tcount_1\tmin_1\tmax_1\tmean_1\tQ1_1\tmed_1\tQ3_1\tLW_1\tRW_1"
    )?;
    if hist.paired {
        write!(
            writer,
            "\tcount_2\tmin_2\tmax_2\tmean_2\tQ1_2\tmed_2\tQ3_2\tLW_2\tRW_2"
        )?;
    }
    writeln!(writer)?;

    for pos in 0..hist.first_by_pos.len() {
        let sum1: u64 = hist.first_by_pos[pos].iter().sum();
        let sum2: u64 = hist.second_by_pos[pos].iter().sum();
        // Positions past the longest read on both sides are empty; stop there
        // unless zero-coverage rows were requested.
        if sum1 == 0 && sum2 == 0 && !config.print_zero_coverage {
            break;
        }
        write!(writer, "{pos}")?;
        write_base_quality_summary(&mut writer, &hist.first_by_pos[pos])?;
        if hist.paired {
            write_base_quality_summary(&mut writer, &hist.second_by_pos[pos])?;
        }
        writeln!(writer)?;
    }
    writer.flush()?;
    Ok(())
}
10708
10709fn write_base_quality_summary(writer: &mut Box<dyn Write>, hist: &[u64]) -> Result<()> {
10710    let count: u64 = hist.iter().sum();
10711    let min = min_histogram(hist);
10712    let max = max_histogram(hist);
10713    let mean = average_histogram(hist);
10714    let q1 = percentile_histogram(hist, 0.25);
10715    let med = percentile_histogram(hist, 0.5);
10716    let q3 = percentile_histogram(hist, 0.75);
10717    let left_whisker = percentile_histogram(hist, 0.02);
10718    let right_whisker = percentile_histogram(hist, 0.98);
10719    write!(
10720        writer,
10721        "\t{count}\t{min}\t{max}\t{mean:.2}\t{q1}\t{med}\t{q3}\t{left_whisker}\t{right_whisker}"
10722    )?;
10723    Ok(())
10724}
10725
/// Index of the lowest non-empty bin; 0 when every bin is empty.
fn min_histogram(hist: &[u64]) -> usize {
    for (idx, &count) in hist.iter().enumerate() {
        if count > 0 {
            return idx;
        }
    }
    0
}
10729
/// Index of the highest non-empty bin; 0 when every bin is empty.
fn max_histogram(hist: &[u64]) -> usize {
    let mut highest = 0;
    for (idx, &count) in hist.iter().enumerate() {
        if count > 0 {
            highest = idx;
        }
    }
    highest
}
10735
/// Index of the fullest bin; on ties the highest index wins (same tie rule as
/// `Iterator::max_by_key`). Returns 0 for an empty histogram.
fn mode_histogram(hist: &[u64]) -> usize {
    let mut best = (0usize, 0u64);
    for (idx, &count) in hist.iter().enumerate() {
        // `>=` keeps the last maximum, matching max_by_key's tie-breaking.
        if count >= best.1 {
            best = (idx, count);
        }
    }
    best.0
}
10743
/// Smallest bin index at which the cumulative count reaches the requested
/// percentile of the total mass. Returns 0 for an empty histogram and the
/// last bin when the threshold is never reached.
fn percentile_histogram(hist: &[u64], percentile: f64) -> usize {
    let total: u64 = hist.iter().sum();
    if total == 0 {
        return 0;
    }
    // At least one observation must be covered, even for percentile 0.
    let threshold = ((total as f64) * percentile).ceil().max(1.0) as u64;
    let mut cumulative = 0u64;
    let mut answer = hist.len().saturating_sub(1);
    for (idx, &count) in hist.iter().enumerate() {
        cumulative += count;
        if cumulative >= threshold {
            answer = idx;
            break;
        }
    }
    answer
}
10759
/// Weighted mean of bin indices over the whole histogram (no minimum bin).
fn average_histogram(hist: &[u64]) -> f64 {
    average_histogram_min(hist, 0)
}
10763
/// Weighted mean of bin indices, restricted to bins at or above
/// `min_quality`; 0.0 when those bins hold no mass.
fn average_histogram_min(hist: &[u64], min_quality: usize) -> f64 {
    let (total, weighted) = hist
        .iter()
        .enumerate()
        .skip(min_quality)
        .fold((0u64, 0u64), |(total, weighted), (quality, &bases)| {
            (total + bases, weighted + quality as u64 * bases)
        });
    if total == 0 {
        0.0
    } else {
        weighted as f64 / total as f64
    }
}
10777
/// Population standard deviation of bin indices around `mean`, over bins at
/// or above `min_quality`; 0.0 when those bins hold no mass.
fn stdev_histogram(hist: &[u64], mean: f64, min_quality: usize) -> f64 {
    let mut total = 0u64;
    let mut squared = 0.0;
    for (quality, &bases) in hist.iter().enumerate().skip(min_quality) {
        let delta = quality as f64 - mean;
        squared += delta * delta * bases as f64;
        total += bases;
    }
    if total == 0 {
        0.0
    } else {
        (squared / total as f64).sqrt()
    }
}
10792
10793fn write_length_hist(path: &Path, hist: &ReadDepthHistogram, config: &Config) -> Result<()> {
10794    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
10795        .with_context(|| format!("creating length histogram {}", path.display()))?;
10796    writeln!(writer, "#Length\tReads\tBases")?;
10797
10798    let total_reads: u64 = hist.reads.iter().sum();
10799    let mut seen_reads = 0u64;
10800    let lim = hist.reads.len().saturating_sub(1);
10801
10802    for len in 0..lim {
10803        let reads = hist.reads[len];
10804        let bases = hist.bases[len];
10805        seen_reads += reads;
10806        if config.print_zero_coverage || reads > 0 {
10807            writeln!(writer, "{len}\t{reads}\t{bases}")?;
10808        }
10809        if seen_reads >= total_reads {
10810            break;
10811        }
10812    }
10813
10814    let overflow_reads: u64 = hist.reads.iter().skip(lim).sum();
10815    let overflow_bases: u64 = hist.bases.iter().skip(lim).sum();
10816    if overflow_reads > 0 || overflow_bases > 0 {
10817        writeln!(writer, "{lim}\t{overflow_reads}\t{overflow_bases}")?;
10818    }
10819    writer.flush()?;
10820    Ok(())
10821}
10822
10823fn write_gc_hist(path: &Path, hist: &ReadDepthHistogram, config: &Config) -> Result<()> {
10824    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
10825        .with_context(|| format!("creating GC histogram {}", path.display()))?;
10826    writeln!(writer, "#GC_Bin\tReads\tBases")?;
10827
10828    let total_reads: u64 = hist.reads.iter().sum();
10829    let mut seen_reads = 0u64;
10830    for (bin, reads) in hist.reads.iter().copied().enumerate() {
10831        let bases = hist.bases[bin];
10832        seen_reads += reads;
10833        if config.print_zero_coverage || reads > 0 {
10834            writeln!(writer, "{bin}\t{reads}\t{bases}")?;
10835        }
10836        if seen_reads >= total_reads {
10837            break;
10838        }
10839    }
10840    writer.flush()?;
10841    Ok(())
10842}
10843
/// Writes per-position A/C/G/T/N fractions. Read-1 rows come first; read-2
/// rows continue the position numbering where read 1 ended (via the returned
/// row count used as the offset).
fn write_base_content_hist(
    path: &Path,
    hist: &BaseContentHistogram,
    config: &Config,
) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating base-content histogram {}", path.display()))?;
    writeln!(writer, "#Pos\tA\tC\tG\tT\tN")?;
    let first_rows = write_base_content_rows(&mut writer, &hist.first, 0, config)?;
    write_base_content_rows(&mut writer, &hist.second, first_rows, config)?;
    writer.flush()?;
    Ok(())
}
10857
10858fn write_base_content_rows(
10859    writer: &mut Box<dyn Write>,
10860    hist: &[BaseCounts],
10861    offset: usize,
10862    config: &Config,
10863) -> Result<usize> {
10864    let rows = if config.print_zero_coverage {
10865        hist.len()
10866    } else {
10867        hist.iter()
10868            .rposition(|counts| counts.total() > 0)
10869            .map_or(0, |idx| idx + 1)
10870    };
10871
10872    for (pos, counts) in hist.iter().copied().enumerate().take(rows) {
10873        let total = counts.total() as f64;
10874        let fraction = |value: u64| {
10875            if total == 0.0 {
10876                0.0
10877            } else {
10878                value as f64 / total
10879            }
10880        };
10881        writeln!(
10882            writer,
10883            "{}\t{:.5}\t{:.5}\t{:.5}\t{:.5}\t{:.5}",
10884            pos + offset,
10885            fraction(counts.a),
10886            fraction(counts.c),
10887            fraction(counts.g),
10888            fraction(counts.t),
10889            fraction(counts.n)
10890        )?;
10891    }
10892    Ok(rows)
10893}
10894
10895fn write_entropy_hist(path: &Path, hist: &[u64], config: &Config) -> Result<()> {
10896    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
10897        .with_context(|| format!("creating entropy histogram {}", path.display()))?;
10898    let bins = hist.len().saturating_sub(1).max(1);
10899    let mult = 1.0 / bins as f64;
10900    let mean = average_histogram(hist) * mult;
10901    let median = percentile_histogram(hist, 0.5) as f64 * mult;
10902    let mode = mode_histogram(hist) as f64 * mult;
10903    let stdev = stdev_histogram(hist, average_histogram(hist), 0) * mult;
10904
10905    writeln!(writer, "#Mean\t{mean:.6}")?;
10906    writeln!(writer, "#Median\t{median:.6}")?;
10907    writeln!(writer, "#Mode\t{mode:.6}")?;
10908    writeln!(writer, "#STDev\t{stdev:.6}")?;
10909    writeln!(writer, "#Value\tCount")?;
10910
10911    for (idx, count) in hist.iter().copied().enumerate() {
10912        if config.print_zero_coverage || count > 0 {
10913            writeln!(writer, "{:.4}\t{count}", idx as f64 * mult)?;
10914        }
10915    }
10916    writer.flush()?;
10917    Ok(())
10918}
10919
10920fn write_identity_hist(path: &Path, hist: &ReadDepthHistogram, config: &Config) -> Result<()> {
10921    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
10922        .with_context(|| format!("creating identity histogram {}", path.display()))?;
10923    let bins = hist.reads.len().saturating_sub(1).max(1);
10924    let mult = 100.0 / bins as f64;
10925    let mean_reads = average_histogram(&hist.reads) * mult;
10926    let mean_bases = average_histogram(&hist.bases) * mult;
10927    let median_reads = percentile_histogram(&hist.reads, 0.5) as f64 * mult;
10928    let median_bases = percentile_histogram(&hist.bases, 0.5) as f64 * mult;
10929    let mode_reads = mode_histogram(&hist.reads) as f64 * mult;
10930    let mode_bases = mode_histogram(&hist.bases) as f64 * mult;
10931    let stdev_reads = stdev_histogram(&hist.reads, average_histogram(&hist.reads), 0) * mult;
10932    let stdev_bases = stdev_histogram(&hist.bases, average_histogram(&hist.bases), 0) * mult;
10933
10934    writeln!(writer, "#Mean_reads\t{mean_reads:.3}")?;
10935    writeln!(writer, "#Mean_bases\t{mean_bases:.3}")?;
10936    writeln!(writer, "#Median_reads\t{median_reads:.0}")?;
10937    writeln!(writer, "#Median_bases\t{median_bases:.0}")?;
10938    writeln!(writer, "#Mode_reads\t{mode_reads:.0}")?;
10939    writeln!(writer, "#Mode_bases\t{mode_bases:.0}")?;
10940    writeln!(writer, "#STDev_reads\t{stdev_reads:.3}")?;
10941    writeln!(writer, "#STDev_bases\t{stdev_bases:.3}")?;
10942    writeln!(writer, "#Identity\tReads\tBases")?;
10943
10944    for (idx, reads) in hist.reads.iter().copied().enumerate() {
10945        let bases = hist.bases[idx];
10946        if config.print_zero_coverage || reads > 0 || bases > 0 {
10947            writeln!(writer, "{:.1}\t{reads}\t{bases}", idx as f64 * mult)?;
10948        }
10949    }
10950    writer.flush()?;
10951    Ok(())
10952}
10953
/// Emits the alignment-dependent side outputs (match, insert-size,
/// quality-accuracy, indel, and error histograms) from fallback statistics
/// gathered without a real alignment step. Each file is written only when its
/// output path is configured.
fn emit_alignment_fallback_side_outputs(
    config: &Config,
    hist: &AlignmentFallbackHistograms,
) -> Result<()> {
    if let Some(path) = &config.match_hist_out {
        write_match_fallback_hist(path, hist, config)?;
    }
    if let Some(path) = &config.insert_hist_out {
        write_insert_fallback_hist(path, hist, config)?;
    }
    if let Some(path) = &config.quality_accuracy_hist_out {
        write_quality_accuracy_fallback_hist(path, hist, config)?;
    }
    if let Some(path) = &config.indel_hist_out {
        write_indel_fallback_hist(path, config)?;
    }
    if let Some(path) = &config.error_hist_out {
        write_error_fallback_hist(path, hist, config)?;
    }
    Ok(())
}
10975
/// Writes the per-position match histogram in fallback (alignment-free) mode:
/// one row per base position with match/N fractions for read 1 and, when
/// paired, read 2. Sub/Del/Ins/Other columns are fixed at zero.
fn write_match_fallback_hist(
    path: &Path,
    hist: &AlignmentFallbackHistograms,
    config: &Config,
) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating match histogram {}", path.display()))?;
    if hist.paired {
        writeln!(
            writer,
            "#BaseNum\tMatch1\tSub1\tDel1\tIns1\tN1\tOther1\tMatch2\tSub2\tDel2\tIns2\tN2\tOther2"
        )?;
    } else {
        writeln!(writer, "#BaseNum\tMatch1\tSub1\tDel1\tIns1\tN1\tOther1")?;
    }

    for pos in 0..hist.first_match.len() {
        let first = hist.first_match[pos];
        let second = hist.second_match[pos];
        // Positions past the longest read on both sides carry no counts; stop
        // there unless zero-coverage rows were requested.
        if first.matches + first.n + second.matches + second.n == 0 && !config.print_zero_coverage {
            break;
        }
        // Positions are reported 1-based in this output.
        write!(writer, "{}", pos + 1)?;
        write_match_fallback_columns(&mut writer, first)?;
        if hist.paired {
            write_match_fallback_columns(&mut writer, second)?;
        }
        writeln!(writer)?;
    }
    writer.flush()?;
    Ok(())
}
11008
11009fn write_match_fallback_columns(writer: &mut Box<dyn Write>, counts: MatchCounts) -> Result<()> {
11010    let total = (counts.matches + counts.n).max(1) as f64;
11011    write!(
11012        writer,
11013        "\t{:.5}\t0.00000\t0.00000\t0.00000\t{:.5}\t0.00000",
11014        counts.matches as f64 / total,
11015        counts.n as f64 / total
11016    )?;
11017    Ok(())
11018}
11019
11020fn write_insert_fallback_hist(
11021    path: &Path,
11022    hist: &AlignmentFallbackHistograms,
11023    config: &Config,
11024) -> Result<()> {
11025    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
11026        .with_context(|| format!("creating insert-size histogram {}", path.display()))?;
11027    let percent = if hist.read_count == 0 {
11028        0.0
11029    } else {
11030        (hist.pair_count * 2) as f64 * 100.0 / hist.read_count as f64
11031    };
11032    writeln!(writer, "#Mean\t0.000")?;
11033    writeln!(writer, "#Median\t0")?;
11034    writeln!(writer, "#Mode\t0")?;
11035    writeln!(writer, "#STDev\t0.000")?;
11036    writeln!(writer, "#PercentOfPairs\t{percent:.3}")?;
11037    writeln!(writer, "#InsertSize\tCount")?;
11038    writer.flush()?;
11039    Ok(())
11040}
11041
11042fn write_quality_accuracy_fallback_hist(
11043    path: &Path,
11044    hist: &AlignmentFallbackHistograms,
11045    config: &Config,
11046) -> Result<()> {
11047    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
11048        .with_context(|| format!("creating quality-accuracy histogram {}", path.display()))?;
11049    writeln!(writer, "#Deviation\t0.000")?;
11050    writeln!(writer, "#DeviationSub\t0.000")?;
11051    writeln!(writer, "#Avg_STDev\t0.000")?;
11052    writeln!(writer, "#Diversity\t0.000")?;
11053    writeln!(writer, "#Entropy\t0.000")?;
11054    writeln!(
11055        writer,
11056        "#Quality\tMatch\tSub\tIns\tDel\tTrueQuality\tTrueQualitySub"
11057    )?;
11058
11059    let mut remaining: u64 = hist.quality_match.iter().sum();
11060    for (quality, matches) in hist.quality_match.iter().copied().enumerate() {
11061        if matches > 0 || config.print_zero_coverage {
11062            writeln!(writer, "{quality}\t{matches}\t0\t0\t0\t\t")?;
11063        }
11064        remaining = remaining.saturating_sub(matches);
11065        if remaining == 0 && !config.print_zero_coverage {
11066            break;
11067        }
11068    }
11069    writer.flush()?;
11070    Ok(())
11071}
11072
/// Writes a placeholder indel histogram: fallback mode tracks no indels, so
/// only the header (plus one all-zero row when zero-coverage printing is on)
/// is emitted.
fn write_indel_fallback_hist(path: &Path, config: &Config) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating indel histogram {}", path.display()))?;
    writeln!(writer, "#Length\tDeletions\tInsertions")?;
    if config.print_zero_coverage {
        writeln!(writer, "0\t0\t0")?;
    }
    writer.flush()?;
    Ok(())
}
11083
/// Writes a placeholder error-count histogram: without alignments every read
/// is reported in the zero-errors bin.
fn write_error_fallback_hist(
    path: &Path,
    hist: &AlignmentFallbackHistograms,
    config: &Config,
) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating error histogram {}", path.display()))?;
    writeln!(writer, "#Errors\tCount")?;
    if hist.read_count > 0 || config.print_zero_coverage {
        writeln!(writer, "0\t{}", hist.read_count)?;
    }
    writer.flush()?;
    Ok(())
}
11098
11099fn write_barcode_stats(
11100    path: &Path,
11101    barcodes: &BTreeMap<String, u64>,
11102    config: &Config,
11103) -> Result<()> {
11104    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
11105        .with_context(|| format!("creating barcode stats {}", path.display()))?;
11106    let total: u64 = barcodes.values().copied().sum();
11107    writeln!(writer, "#Reads\t{total}")?;
11108    writeln!(writer, "#Barcodes\t{}", barcodes.len())?;
11109
11110    let mut sorted: Vec<_> = barcodes.iter().collect();
11111    sorted.sort_by(|(left_name, left_count), (right_name, right_count)| {
11112        right_count
11113            .cmp(left_count)
11114            .then_with(|| left_name.cmp(right_name))
11115    });
11116    for (barcode, count) in sorted {
11117        writeln!(writer, "{barcode}\t{count}")?;
11118    }
11119    writer.flush()?;
11120    Ok(())
11121}
11122
/// Converts a raw k-mer count into an estimated unique-k-mer count at `depth`
/// by rounding the division to the nearest integer; depth 0 passes the raw
/// count through unchanged.
fn unique_from_raw(depth: usize, raw: u64) -> u64 {
    match depth as u64 {
        0 => raw,
        depth => (raw + depth / 2) / depth,
    }
}
11130
/// Index measured from the top of the coverage range: the upper `percentile`
/// fraction of `cov_last` starts this many bins from zero.
fn percentile_index(cov_last: usize, percentile: f64) -> usize {
    let fraction_above = 1.0 - percentile;
    (cov_last as f64 * fraction_above) as usize
}
11134
/// Maps a uniform draw in [0, 1) onto the range 1..=depth; a missing draw is
/// treated as 0.0 and therefore always yields 1.
fn deterministic_coin(rand: Option<f64>, depth: u64) -> u64 {
    debug_assert!(depth > 0);
    let draw = rand.unwrap_or_default();
    let bucket = (draw * depth as f64) as u64 + 1;
    bucket.min(depth)
}
11139
/// Converts a signed depth to unsigned; negative depths become `None`.
fn non_negative_depth(depth: i64) -> Option<u64> {
    if depth < 0 { None } else { Some(depth as u64) }
}
11143
/// True when `depth` falls below `min_depth`; a missing depth is treated as
/// below any minimum.
fn depth_below_min(depth: Option<u64>, min_depth: u64) -> bool {
    match depth {
        Some(depth) => depth < min_depth,
        None => true,
    }
}
11147
/// Narrows a u64 to i64, clamping values beyond `i64::MAX` instead of
/// wrapping.
fn u64_to_i64_saturating(value: u64) -> i64 {
    if value > i64::MAX as u64 {
        i64::MAX
    } else {
        value as i64
    }
}
11151
/// Minimum of two optional values; a lone `Some` wins over `None`.
fn min_option(a: Option<u64>, b: Option<u64>) -> Option<u64> {
    match (a, b) {
        (Some(x), Some(y)) => Some(x.min(y)),
        _ => a.or(b),
    }
}
11160
/// Maximum of two optional values; a lone `Some` wins over `None`.
fn max_option(a: Option<u64>, b: Option<u64>) -> Option<u64> {
    match (a, b) {
        (Some(x), Some(y)) => Some(x.max(y)),
        _ => a.or(b),
    }
}
11169
/// True when a configured read limit has been met; no limit means the limit
/// can never be reached.
fn limit_reached(limit: Option<u64>, reads_seen: u64) -> bool {
    match limit {
        Some(limit) => reads_seen >= limit,
        None => false,
    }
}
11173
/// Detects a comma-separated list of input files in `in1` (and optionally
/// `in2`) and splits it into per-file paths. Returns `None` when the input is
/// interleaved, unset, an existing path (a real file may legitimately contain
/// a comma in its name), or not actually a multi-entry list.
fn primary_input_lists(config: &Config) -> Option<InputLists> {
    if config.interleaved {
        return None;
    }
    let input = config.in1.as_ref()?;
    // An existing path takes precedence over list interpretation.
    if input.exists() {
        return None;
    }
    let text = input.to_string_lossy();
    if !text.contains(',') {
        return None;
    }
    let first = split_path_list(&text);
    if first.len() <= 1 {
        return None;
    }
    // in2 is split unconditionally once in1 proved to be a list.
    let second = config.in2.as_ref().map(|path| {
        let text = path.to_string_lossy();
        split_path_list(&text)
    });
    Some(InputLists { first, second })
}
11196
/// Splits a comma-separated path list into trimmed, non-empty `PathBuf`s.
fn split_path_list(value: &str) -> Vec<PathBuf> {
    let mut paths = Vec::new();
    for piece in value.split(',') {
        let piece = piece.trim();
        if !piece.is_empty() {
            paths.push(PathBuf::from(piece));
        }
    }
    paths
}
11206
/// Collects the base- and quality-normalization options from `config` into
/// the `SequenceSettings` consumed by the sequence I/O layer.
fn sequence_settings(config: &Config) -> SequenceSettings {
    SequenceSettings {
        bases: BaseSettings {
            u_to_t: config.u_to_t,
            to_upper_case: config.to_upper_case,
            lower_case_to_n: config.lower_case_to_n,
            dot_dash_x_to_n: config.dot_dash_x_to_n,
            iupac_to_n: config.iupac_to_n,
            fix_junk_and_iupac: config.fix_junk_and_iupac,
            junk_mode: config.junk_mode,
        },
        qualities: QualitySettings {
            input_offset: config.quality_in_offset,
            min_called: config.min_called_quality,
            max_called: config.max_called_quality,
            change_quality: config.change_quality,
        },
    }
}
11226
11227fn open_sequence_writer(
11228    path: Option<&Path>,
11229    overwrite: bool,
11230    append: bool,
11231    quality_out_offset: u8,
11232    fake_quality: u8,
11233    fasta_wrap: usize,
11234    gzip_threads: Option<usize>,
11235) -> Result<Option<SequenceWriter>> {
11236    path.map(|path| {
11237        SequenceWriter::from_path_with_append_and_gzip_threads(
11238            path,
11239            overwrite,
11240            append,
11241            quality_out_offset,
11242            fake_quality,
11243            fasta_wrap,
11244            gzip_threads,
11245        )
11246    })
11247    .transpose()
11248}
11249
11250#[cfg(test)]
11251mod tests {
11252    use super::*;
11253    use crate::kmer::kmers_for_record;
11254    use crate::seqio::SequenceRecord;
11255    use std::fs;
11256
11257    fn record(id: &str, bases: &[u8]) -> SequenceRecord {
11258        SequenceRecord {
11259            id: id.to_string(),
11260            numeric_id: 0,
11261            bases: bases.to_vec(),
11262            qualities: Some(vec![b'I'; bases.len()]),
11263        }
11264    }
11265
11266    fn quality_record(id: &str, bases: &[u8], qualities: &[u8]) -> SequenceRecord {
11267        SequenceRecord {
11268            id: id.to_string(),
11269            numeric_id: 0,
11270            bases: bases.to_vec(),
11271            qualities: Some(qualities.to_vec()),
11272        }
11273    }
11274
    #[test]
    fn gzip_threads_are_split_across_concurrent_gzip_streams() {
        // Per-stream budget: None propagates, and an explicit budget is
        // divided across the number of concurrent gzip streams (floor, min 1).
        assert_eq!(gzip_threads_for_streams(None, 2), None);
        assert_eq!(gzip_threads_for_streams(Some(1), 2), Some(1));
        assert_eq!(gzip_threads_for_streams(Some(8), 0), Some(8));
        assert_eq!(gzip_threads_for_streams(Some(8), 1), Some(8));
        assert_eq!(gzip_threads_for_streams(Some(8), 2), Some(4));
        assert_eq!(gzip_threads_for_streams(Some(8), 3), Some(2));
        assert_eq!(gzip_threads_for_streams(Some(2), 4), Some(1));

        // Path-based variant: only paths that are actually gzip outputs count
        // as streams when splitting the budget.
        assert_eq!(
            gzip_threads_for_paths(
                Some(8),
                [
                    Some(Path::new("reads_R1.fq.gz")),
                    Some(Path::new("reads_R2.fq.gz")),
                ],
            ),
            Some(4)
        );
        assert_eq!(
            gzip_threads_for_paths(
                Some(8),
                [
                    Some(Path::new("reads_R1.fq")),
                    Some(Path::new("reads_R2.fq.gz")),
                ],
            ),
            Some(8)
        );
    }
11306
    #[test]
    fn write_depth_hist_folds_zero_bin_without_cloning_input_hist() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("hist.tsv");
        let hist = vec![5, 7, 4];
        let config = Config {
            overwrite: true,
            ..Config::default()
        };

        write_depth_hist(&path, &hist, &config).unwrap();

        // The input histogram must be untouched (no clone-and-mutate), and
        // the depth-0 bin must be folded into depth 1 in the written output.
        assert_eq!(hist, vec![5, 7, 4]);
        assert_eq!(
            fs::read_to_string(path).unwrap(),
            "#Depth\tRaw_Count\tUnique_Kmers\n1\t12\t12\n2\t4\t2\n"
        );
    }
11325
    #[test]
    fn write_depth_hist_preserves_zero_bin_when_requested() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("hist.tsv");
        let hist = vec![5, 7, 4];
        // With `zero_bin` set, depth 0 keeps its own row instead of being
        // folded into depth 1.
        let config = Config {
            overwrite: true,
            zero_bin: true,
            ..Config::default()
        };

        write_depth_hist(&path, &hist, &config).unwrap();

        assert_eq!(
            fs::read_to_string(path).unwrap(),
            "#Depth\tRaw_Count\tUnique_Kmers\n0\t5\t5\n1\t7\t7\n2\t4\t2\n"
        );
    }
11344
    #[test]
    fn write_sparse_depth_hist_matches_dense_output() {
        // The sparse (map-backed) writer must be byte-identical to the dense
        // writer for the same logical histogram.
        let dir = tempfile::tempdir().unwrap();
        let dense_path = dir.path().join("dense.hist.tsv");
        let sparse_path = dir.path().join("sparse.hist.tsv");
        let hist = vec![5, 7, 4];
        let sparse = SparseHist::from_iter([(0, 5), (1, 7), (2, 4)]);
        let config = Config {
            overwrite: true,
            ..Config::default()
        };

        write_depth_hist(&dense_path, &hist, &config).unwrap();
        write_sparse_depth_hist(&sparse_path, &sparse, hist.len(), &config).unwrap();

        assert_eq!(
            fs::read_to_string(sparse_path).unwrap(),
            fs::read_to_string(dense_path).unwrap()
        );
    }
11365
    #[test]
    fn write_sparse_depth_hist_matches_dense_zero_coverage_columns_one() {
        // Sparse/dense equivalence must also hold when zero-coverage rows are
        // printed (gaps at depths 0, 1, and 3) with single-column output.
        let dir = tempfile::tempdir().unwrap();
        let dense_path = dir.path().join("dense.hist.tsv");
        let sparse_path = dir.path().join("sparse.hist.tsv");
        let hist = vec![0, 0, 6, 0, 4];
        let sparse = SparseHist::from_iter([(2, 6), (4, 4)]);
        let config = Config {
            overwrite: true,
            hist_columns: 1,
            print_zero_coverage: true,
            ..Config::default()
        };

        write_depth_hist(&dense_path, &hist, &config).unwrap();
        write_sparse_depth_hist(&sparse_path, &sparse, hist.len(), &config).unwrap();

        assert_eq!(
            fs::read_to_string(sparse_path).unwrap(),
            fs::read_to_string(dense_path).unwrap()
        );
    }
11388
11389    #[test]
11390    fn output_counts_sparse_depth_hist_matches_dense_hist() {
11391        let hist_len = 5;
11392        let mut exact = CountMap::default();
11393        exact.insert(KmerKey::Short(1), 1);
11394        exact.insert(KmerKey::Short(2), 3);
11395        exact.insert(KmerKey::Short(3), 9);
11396        let exact = OutputCounts::Exact(exact);
11397        assert_eq!(
11398            sparse_hist_to_dense(&exact.sparse_depth_hist(hist_len), hist_len),
11399            exact.depth_hist(hist_len)
11400        );
11401
11402        let mut packed = PackedCountMinSketch::new(8, 1, 4).unwrap();
11403        packed.set_cell(0, 1);
11404        packed.set_cell(1, 2);
11405        packed.set_cell(2, 9);
11406        let packed = OutputCounts::Sketch(packed);
11407        assert_eq!(
11408            sparse_hist_to_dense(&packed.sparse_depth_hist(hist_len), hist_len),
11409            packed.depth_hist(hist_len)
11410        );
11411
11412        let atomic = AtomicCountMinSketch::new(64, 1).unwrap();
11413        atomic.add_key_count(&KmerKey::Short(7), 2);
11414        atomic.add_key_count(&KmerKey::Short(11), 4);
11415        atomic.add_key_count(&KmerKey::Short(13), 9);
11416        let atomic = OutputCounts::AtomicSketch(atomic);
11417        assert_eq!(
11418            sparse_hist_to_dense(&atomic.sparse_depth_hist(hist_len), hist_len),
11419            atomic.depth_hist(hist_len)
11420        );
11421    }
11422
    #[test]
    fn sparse_peak_dense_trims_trailing_zero_histlen_without_changing_peaks() {
        // Densifying a sparse histogram for peak calling may trim the all-zero
        // tail; the peak writer must still produce byte-identical output.
        let dir = tempfile::tempdir().unwrap();
        let dense_path = dir.path().join("dense.peaks.tsv");
        let compact_path = dir.path().join("compact.peaks.tsv");
        let hist_len = 10_000;
        // One synthetic, symmetric peak centered at depth 20; everything else
        // in the 10_000-bin histogram stays zero.
        let mut dense = vec![0u64; hist_len];
        dense[18] = 180;
        dense[19] = 380;
        dense[20] = 720;
        dense[21] = 380;
        dense[22] = 180;
        // The sparse view holds only the non-zero bins.
        let sparse = SparseHist::from_iter(
            dense
                .iter()
                .copied()
                .enumerate()
                .filter_map(|(depth, raw)| (raw > 0).then_some((depth, raw))),
        );
        let compact = sparse_hist_to_peak_dense(&sparse, hist_len);
        // Permissive peak thresholds so the single peak is always reported.
        let config = Config {
            overwrite: true,
            k: 5,
            peak_min_height: 1,
            peak_min_volume: 1,
            peak_min_width: 1,
            peak_min_peak: 1,
            peak_max_peak: 100,
            peak_max_count: 8,
            ..Config::default()
        };

        // The compact form must actually have trimmed the zero tail.
        assert!(compact.len() < 128);
        write_peaks(&dense_path, &dense, &config).unwrap();
        write_peaks(&compact_path, &compact, &config).unwrap();

        assert_eq!(
            fs::read_to_string(compact_path).unwrap(),
            fs::read_to_string(dense_path).unwrap()
        );
    }
11464
11465    #[test]
11466    fn write_sparse_read_depth_hist_matches_dense_output() {
11467        let dir = tempfile::tempdir().unwrap();
11468        let dense_path = dir.path().join("dense.rhist.tsv");
11469        let sparse_path = dir.path().join("sparse.rhist.tsv");
11470        let mut dense = ReadDepthHistogram::new(4);
11471        dense.reads[0] = 5;
11472        dense.bases[0] = 500;
11473        dense.reads[1] = 7;
11474        dense.bases[1] = 700;
11475        dense.reads[3] = 4;
11476        dense.bases[3] = 400;
11477        let mut sparse = SparseReadDepthHist::default();
11478        sparse.insert(0, (5, 500));
11479        sparse.insert(1, (7, 700));
11480        sparse.insert(3, (4, 400));
11481        let config = Config {
11482            overwrite: true,
11483            ..Config::default()
11484        };
11485
11486        write_read_depth_hist(&dense_path, &dense, &config).unwrap();
11487        write_sparse_read_depth_hist(&sparse_path, &sparse, 4, &config).unwrap();
11488
11489        assert_eq!(
11490            fs::read_to_string(sparse_path).unwrap(),
11491            fs::read_to_string(dense_path).unwrap()
11492        );
11493    }
11494
11495    #[test]
11496    fn write_sparse_read_depth_hist_streams_zero_coverage_without_dense_histogram() {
11497        let dir = tempfile::tempdir().unwrap();
11498        let path = dir.path().join("sparse.rhist.tsv");
11499        let mut sparse = SparseReadDepthHist::default();
11500        sparse.insert(2, (1, 8));
11501        let config = Config {
11502            overwrite: true,
11503            print_zero_coverage: true,
11504            ..Config::default()
11505        };
11506
11507        write_sparse_read_depth_hist(&path, &sparse, 8, &config).unwrap();
11508
11509        assert_eq!(
11510            fs::read_to_string(path).unwrap(),
11511            "#Depth\tReads\tBases\n0\t0\t0\n1\t0\t0\n2\t1\t8\n"
11512        );
11513    }
11514
11515    #[test]
11516    fn output_gzip_threads_are_split_across_all_active_output_streams() {
11517        fn plan(first: Option<&str>, second: Option<&str>) -> OutputPathPlan {
11518            OutputPathPlan {
11519                pairs: vec![OutputPathPair {
11520                    first: first.map(PathBuf::from),
11521                    second: second.map(PathBuf::from),
11522                }],
11523                fanout: false,
11524            }
11525        }
11526
11527        let keep = plan(Some("keep1.fq.gz"), Some("keep2.fq.gz"));
11528        let toss = plan(Some("toss1.fq.gz"), Some("toss2.fq.gz"));
11529        let low = plan(Some("low.fq.gz"), None);
11530        let mid = plan(Some("mid.fq"), None);
11531        let high = plan(None, None);
11532        let uncorrected = plan(Some("uncorrected1.fq.gz"), Some("uncorrected2.fq.gz"));
11533
11534        assert_eq!(
11535            output_gzip_threads_for_plans(
11536                Some(8),
11537                [&keep, &toss, &low, &mid, &high, &uncorrected],
11538                0
11539            )
11540            .unwrap(),
11541            Some(1)
11542        );
11543
11544        assert_eq!(
11545            output_gzip_threads_for_plans(Some(8), [&keep, &toss], 0).unwrap(),
11546            Some(2)
11547        );
11548    }
11549
11550    fn write_fastq(path: &Path, records: &[(&str, &[u8], &[u8])]) {
11551        let mut text = Vec::new();
11552        for (id, bases, qualities) in records {
11553            text.extend_from_slice(b"@");
11554            text.extend_from_slice(id.as_bytes());
11555            text.extend_from_slice(b"\n");
11556            text.extend_from_slice(bases);
11557            text.extend_from_slice(b"\n+\n");
11558            text.extend_from_slice(qualities);
11559            text.extend_from_slice(b"\n");
11560        }
11561        fs::write(path, text).unwrap();
11562    }
11563
11564    fn write_repeated_fastq(
11565        path: &Path,
11566        prefix: &str,
11567        bases: &[u8],
11568        qualities: &[u8],
11569        count: usize,
11570    ) {
11571        let mut text = Vec::new();
11572        for index in 1..=count {
11573            text.extend_from_slice(b"@");
11574            text.extend_from_slice(format!("{prefix}{index}").as_bytes());
11575            text.extend_from_slice(b"\n");
11576            text.extend_from_slice(bases);
11577            text.extend_from_slice(b"\n+\n");
11578            text.extend_from_slice(qualities);
11579            text.extend_from_slice(b"\n");
11580        }
11581        fs::write(path, text).unwrap();
11582    }
11583
11584    #[test]
11585    fn exact_counts_remove_duplicate_kmers_per_read() {
11586        let config = Config {
11587            k: 3,
11588            min_quality: 0,
11589            min_prob: 0.0,
11590            ..Config::default()
11591        };
11592        let mut counts = CountMap::default();
11593        increment_pair_counts(&config, &mut counts, &record("r1", b"AAAAAA"), None);
11594        assert_eq!(counts.values().copied().sum::<u64>(), 1);
11595    }
11596
11597    #[test]
11598    fn exact_counts_keep_duplicate_long_kmers_like_java_bbnorm() {
11599        let config = Config {
11600            k: 40,
11601            min_quality: 0,
11602            min_prob: 0.0,
11603            ..Config::default()
11604        };
11605        let mut counts = CountMap::default();
11606        let record = record("r1", b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA");
11607        let kmers = kmers_for_record(&record, &config);
11608        assert!(kmers.len() > 1);
11609        assert!(kmers.windows(2).all(|pair| pair[0] == pair[1]));
11610
11611        increment_pair_counts(&config, &mut counts, &record, None);
11612
11613        assert_eq!(counts.len(), 1);
11614        assert_eq!(counts.values().copied().sum::<u64>(), kmers.len() as u64);
11615    }
11616
11617    #[test]
11618    fn constrained_count_min_inflates_colliding_counts() {
11619        let config = Config {
11620            count_min: crate::cli::CountMinSettings {
11621                cells: Some(1),
11622                hashes: Some(2),
11623                bits: Some(8),
11624                memory_bytes: None,
11625            },
11626            ..Config::default()
11627        };
11628        let mut counts = CountMap::default();
11629        counts.insert(KmerKey::Short(7), 2);
11630        counts.insert(KmerKey::Short(11), 5);
11631
11632        apply_count_min_collision_estimates(&config, &mut counts);
11633
11634        assert_eq!(counts.get(&KmerKey::Short(7)), Some(&7));
11635        assert_eq!(counts.get(&KmerKey::Short(11)), Some(&7));
11636    }
11637
11638    #[test]
11639    fn constrained_count_min_honors_cell_bit_saturation() {
11640        let config = Config {
11641            count_min: crate::cli::CountMinSettings {
11642                cells: Some(1),
11643                hashes: Some(1),
11644                bits: Some(2),
11645                memory_bytes: None,
11646            },
11647            ..Config::default()
11648        };
11649        let mut counts = CountMap::default();
11650        counts.insert(KmerKey::Short(7), 2);
11651        counts.insert(KmerKey::Short(11), 5);
11652
11653        apply_count_min_collision_estimates(&config, &mut counts);
11654
11655        assert_eq!(counts.get(&KmerKey::Short(7)), Some(&3));
11656        assert_eq!(counts.get(&KmerKey::Short(11)), Some(&3));
11657    }
11658
11659    #[test]
11660    fn constrained_count_min_caps_wide_cells_like_kcountarray() {
11661        let config = Config {
11662            count_min: crate::cli::CountMinSettings {
11663                cells: Some(1),
11664                hashes: Some(1),
11665                bits: Some(32),
11666                memory_bytes: None,
11667            },
11668            ..Config::default()
11669        };
11670        let mut counts = CountMap::default();
11671        counts.insert(KmerKey::Short(7), i32::MAX as u64 + 10);
11672        counts.insert(KmerKey::Short(11), 1);
11673
11674        apply_count_min_collision_estimates(&config, &mut counts);
11675
11676        assert_eq!(counts.get(&KmerKey::Short(7)), Some(&(i32::MAX as u64)));
11677        assert_eq!(counts.get(&KmerKey::Short(11)), Some(&(i32::MAX as u64)));
11678        assert_eq!(count_min_max_count(31), i32::MAX as u64);
11679        assert_eq!(count_min_max_count(32), i32::MAX as u64);
11680        assert_eq!(count_min_max_count(64), i32::MAX as u64);
11681    }
11682
11683    #[test]
11684    fn count_min_budget_guard_rejects_tables_above_safe_memory() {
11685        let available = 1_000_000usize;
11686        let safe_budget = safe_explicit_count_min_bytes(available);
11687        let fitting_cells = safe_budget / 4;
11688        assert!(
11689            ensure_count_min_budget_fits_ceiling("main", fitting_cells, 32, safe_budget).is_ok()
11690        );
11691
11692        let oversized_cells = safe_budget.div_ceil(4) + 1;
11693        let err = ensure_count_min_budget_fits_ceiling("main", oversized_cells, 32, safe_budget)
11694            .unwrap_err()
11695            .to_string();
11696        assert!(
11697            err.contains("above safe memory budget"),
11698            "unexpected error: {err}"
11699        );
11700    }
11701
11702    #[test]
11703    fn count_min_budget_guard_respects_configured_memory_below_available_ram() {
11704        let configured = 1_000_000usize;
11705        let available = 10_000_000usize;
11706        let safe_budget = count_min_safe_budget_bytes(Some(configured), Some(available)).unwrap();
11707        assert_eq!(safe_budget, configured);
11708
11709        assert!(ensure_count_min_budget_fits_ceiling("main", 250_000, 32, safe_budget).is_ok());
11710
11711        let cells_that_fit_available_but_not_configured = 250_001usize;
11712        let err = ensure_count_min_budget_fits_ceiling(
11713            "main",
11714            cells_that_fit_available_but_not_configured,
11715            32,
11716            safe_budget,
11717        )
11718        .unwrap_err()
11719        .to_string();
11720        assert!(
11721            err.contains("above safe memory budget"),
11722            "unexpected configured-budget error: {err}"
11723        );
11724    }
11725
11726    #[test]
11727    fn count_min_budget_guard_rejects_size_overflow_before_prime_sizing() {
11728        let err = count_min_total_bytes(usize::MAX, 32)
11729            .unwrap_err()
11730            .to_string();
11731        assert!(
11732            err.contains("overflowed"),
11733            "unexpected overflow error: {err}"
11734        );
11735    }
11736
    #[test]
    fn count_min_hash_uses_bbtools_row_rotation_masks() {
        // Pins the bucket-hashing scheme to BBTools' KCountArray: each later
        // row hashes the previous row's hash rotated right by
        // BBTOOLS_HASH_BITS, using the first-seed mask table.
        let key = KmerKey::Short(0x1234_5678_9abc_def0);
        let first = count_min_bucket(&key, 0, 1024);
        let second = count_min_bucket(&key, 1, 1024);
        let third = count_min_bucket(&key, 2, 1024);

        // Each row stays inside the table and, for this key, rows disagree.
        assert!(first < 1024);
        assert!(second < 1024);
        assert!(third < 1024);
        assert_ne!(first, second);
        assert_ne!(second, third);

        // Replaying the chained rotation by hand reproduces row 1's bucket.
        let row0 = bbtools_mask_hash(raw_kmer_key(&key), 0, BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED);
        let row1 = bbtools_mask_hash(
            row0.rotate_right(BBTOOLS_HASH_BITS),
            1,
            BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
        );
        assert_eq!(
            count_min_bucket(&key, 1, 1024),
            KCountArrayLayout::new(1024, 32).bucket(row1)
        );

        // Golden values: the first eight masks of the first-seed table.
        let expected = [
            0x575a_4571_d954_c5e8,
            0x12bb_293c_ca33_0af3,
            0x0287_fcd8_b8b4_e1c9,
            0x2b62_7d06_2179_52bb,
            0x6bc1_463c_9db3_e422,
            0x710a_bca5_aeb9_5819,
            0x2487_597d_41ef_8ea1,
            0x653b_8694_aa03_bbf0,
        ];
        assert_eq!(
            &bbtools_hash_masks(BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED)[0][..8],
            expected.as_slice()
        );

        // Structural invariants of every mask: exactly 16 bits set in the low
        // 32 bits, 15-16 in the upper bits, and the top (sign) bit clear.
        for row in bbtools_hash_masks(BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED) {
            for &mask in row {
                assert_eq!((mask & 0xffff_ffff).count_ones(), 16);
                assert!((15..=16).contains(&(mask >> 32).count_ones()));
                assert_eq!(mask >> 63, 0);
            }
        }
    }
11784
    #[test]
    fn prefilter_and_main_sketches_use_independent_kcountarray_mask_seeds() {
        // The prefilter and main sketches must hash with different mask seeds
        // so a collision in one table is unlikely to repeat in the other.
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(512),
                hashes: Some(2),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                ..Default::default()
            },
            ..Config::default()
        };

        let prefilter = new_prefilter_count_min_sketch(&config).unwrap();
        let main = new_atomic_count_min_sketch_with_mask_seed(
            &config,
            BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED,
        )
        .unwrap();
        let key = KmerKey::Short(0x1234_5678_9abc_def0);

        // Prefilter carries the first seed; the main sketch the second.
        assert_eq!(
            prefilter.layout.mask_seed,
            BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED
        );
        assert_eq!(main.layout.mask_seed, BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED);
        // Same key, same row index: distinct seeds give distinct buckets.
        assert_ne!(
            count_min_bucket_with_layout(&key, 0, prefilter.layout),
            count_min_bucket_with_layout(&key, 0, main.layout)
        );
    }
11820
    #[test]
    fn nondeterministic_input_prefilter_uses_atomic_packed_sketch() {
        // With determinism off, the input prefilter is built as the atomic
        // packed variant using conservative updates.
        let config = Config {
            deterministic: false,
            count_min: crate::cli::CountMinSettings {
                cells: Some(512),
                hashes: Some(3),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                cells: Some(256),
                hashes: Some(2),
                bits: Some(2),
                memory_bytes: None,
                memory_fraction_micros: None,
            },
            ..Config::default()
        };

        let prefilter = new_input_prefilter_count_min_sketch(&config).unwrap();
        let layout = prefilter.layout_summary("input_prefilter", Some(prefilter.max_count()));

        assert!(matches!(
            prefilter,
            PrefilterCountMinSketch::AtomicPacked(_)
        ));
        // The summary reflects the configured prefilter geometry (2 bits,
        // 2 hashes) rather than the main count-min settings.
        assert_eq!(layout.kind, "atomic_packed");
        assert_eq!(layout.bits, 2);
        assert_eq!(layout.hashes, 2);
        assert_eq!(layout.update_mode, "conservative");
    }
11855
11856    #[test]
11857    fn nondefault_kcountarray_mask_seeds_are_cached() {
11858        let seed = BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED + BBTOOLS_KCOUNT_ARRAY_MASK_SEED_STEP * 2;
11859        let first = bbtools_hash_masks(seed);
11860        let second = bbtools_hash_masks(seed);
11861        let third = bbtools_hash_masks(seed + BBTOOLS_KCOUNT_ARRAY_MASK_SEED_STEP);
11862
11863        assert!(std::ptr::eq(first, second));
11864        assert!(!std::ptr::eq(first, third));
11865        assert_ne!(first[0][0], third[0][0]);
11866    }
11867
11868    #[test]
11869    fn countup_prefilter_mask_seed_uses_dedicated_hot_cache() {
11870        let config = Config {
11871            count_up: true,
11872            prefilter: crate::cli::PrefilterSettings {
11873                enabled: true,
11874                force_disabled: false,
11875                ..Default::default()
11876            },
11877            count_min: crate::cli::CountMinSettings {
11878                cells: Some(10_000),
11879                bits: Some(32),
11880                ..Default::default()
11881            },
11882            ..Config::default()
11883        };
11884
11885        let seed = countup_output_mask_seed(&config);
11886        assert_eq!(seed, BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED);
11887        assert!(std::ptr::eq(
11888            bbtools_hash_masks(seed),
11889            bbtools_hash_masks(BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED)
11890        ));
11891    }
11892
11893    #[test]
11894    fn kcount_layout_carries_resolved_mask_table_for_bucket_fills() {
11895        let layout = KCountArrayLayout::new_with_min_arrays_and_mask_seed(
11896            4096,
11897            32,
11898            BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS,
11899            BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED,
11900        );
11901
11902        assert!(std::ptr::eq(
11903            layout.masks,
11904            bbtools_hash_masks(BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED)
11905        ));
11906        assert_eq!(layout.mask_seed, BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED);
11907    }
11908
11909    #[test]
11910    fn incremental_count_min_buckets_match_row_hash_replay() {
11911        let layout = KCountArrayLayout::new_with_min_arrays_and_mask_seed(
11912            4096,
11913            32,
11914            BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS,
11915            BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED,
11916        );
11917        for raw in [0, 1, 7, 31, 63, 255, 0x1234_5678_9abc_def0] {
11918            let key = KmerKey::Short(raw);
11919            let mut slots = [usize::MAX; 16];
11920            fill_count_min_buckets(&key, 8, layout, &mut slots);
11921
11922            for (hash_index, slot) in slots.iter().enumerate().take(8) {
11923                assert_eq!(
11924                    *slot,
11925                    count_min_bucket_with_layout(&key, hash_index, layout)
11926                );
11927            }
11928        }
11929    }
11930
    /// Scans raw short kmers for a pair that lands in the same row-0 bucket
    /// but in different row-1 buckets under a layout of `cells` x `bits`.
    ///
    /// Returns `(earlier_key, later_key, shared_row0_bucket,
    /// earlier_row1_bucket, later_row1_bucket)`. Panics if no such pair is
    /// found among the first 100_000 raw values.
    fn find_partial_row_collision(
        cells: usize,
        bits: u8,
    ) -> (KmerKey, KmerKey, usize, usize, usize) {
        let layout = KCountArrayLayout::new(cells, bits);
        // seen[row0] remembers the first key seen there plus its row-1 bucket.
        let mut seen: Vec<Option<(KmerKey, usize)>> = vec![None; cells];
        for raw in 0..100_000u64 {
            let key = KmerKey::Short(raw);
            let row0 = count_min_bucket_with_layout(&key, 0, layout);
            let row1 = count_min_bucket_with_layout(&key, 1, layout);
            if let Some((previous, previous_row1)) = &seen[row0] {
                // Same row-0 bucket; only report when row 1 differs, i.e. a
                // partial (single-row) collision.
                if *previous_row1 != row1 {
                    return (previous.clone(), key, row0, *previous_row1, row1);
                }
            } else {
                seen[row0] = Some((key, row1));
            }
        }
        panic!("expected to find a partial row collision for {cells} cells");
    }
11951
11952    fn find_two_sided_partial_collisions(cells: usize, bits: u8) -> (KmerKey, KmerKey, KmerKey) {
11953        let layout = KCountArrayLayout::new(cells, bits);
11954        let base = KmerKey::Short(0);
11955        let base_row0 = count_min_bucket_with_layout(&base, 0, layout);
11956        let base_row1 = count_min_bucket_with_layout(&base, 1, layout);
11957        let mut row0_match = None;
11958        let mut row1_match = None;
11959        for raw in 1..200_000u64 {
11960            let key = KmerKey::Short(raw);
11961            let row0 = count_min_bucket_with_layout(&key, 0, layout);
11962            let row1 = count_min_bucket_with_layout(&key, 1, layout);
11963            if row0 == base_row0 && row1 != base_row1 && row0_match.is_none() {
11964                row0_match = Some(key.clone());
11965            }
11966            if row1 == base_row1 && row0 != base_row0 && row1_match.is_none() {
11967                row1_match = Some(key);
11968            }
11969            if let (Some(row0_match), Some(row1_match)) = (row0_match.clone(), row1_match.clone()) {
11970                return (base, row0_match, row1_match);
11971            }
11972        }
11973        panic!("expected to find two-sided partial row collisions for {cells} cells");
11974    }
11975
11976    #[test]
11977    fn prefilter_sketch_defaults_to_kcountarray_locked_updates() {
11978        let config = Config {
11979            prefilter: crate::cli::PrefilterSettings {
11980                enabled: true,
11981                force_disabled: false,
11982                cells: Some(128),
11983                hashes: Some(2),
11984                bits: Some(2),
11985                memory_bytes: None,
11986                memory_fraction_micros: None,
11987            },
11988            threads: Some(2),
11989            ..Config::default()
11990        };
11991        let mut prefilter = new_prefilter_count_min_sketch(&config).unwrap();
11992        assert_eq!(prefilter.update_mode, CountMinUpdateMode::Conservative);
11993        let (left, right, row0, _, _) = find_partial_row_collision(prefilter.cells, prefilter.bits);
11994
11995        prefilter.add_key_count(&left, 2);
11996        prefilter.add_key_count(&right, 1);
11997
11998        assert_eq!(prefilter.cell(row0), 2);
11999    }
12000
    #[test]
    fn lockedincrement_false_uses_independent_row_increments() {
        // With locked_increment=false each hash row is incremented on its
        // own, so a partial collision inflates the shared row-0 cell; the
        // default conservative (locked) sketch keeps it at the minimum.
        let config = Config {
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                cells: Some(128),
                hashes: Some(2),
                bits: Some(2),
                memory_bytes: None,
                memory_fraction_micros: None,
            },
            locked_increment: Some(false),
            threads: Some(2),
            ..Config::default()
        };
        let mut unlocked = new_prefilter_count_min_sketch(&config).unwrap();
        assert_eq!(unlocked.update_mode, CountMinUpdateMode::Independent);
        let (left, right, row0, row1_left, row1_right) =
            find_partial_row_collision(unlocked.cells, unlocked.bits);

        // Reference sketch using the default (conservative) update policy.
        let mut locked =
            PackedCountMinSketch::new(unlocked.cells, unlocked.hashes, unlocked.bits).unwrap();
        locked.add_key_count(&left, 2);
        locked.add_key_count(&right, 1);
        unlocked.add_key_count(&left, 2);
        unlocked.add_key_count(&right, 1);

        // Conservative: shared cell stays at 2; independent: counts add to 3,
        // while the non-shared row-1 cells keep their per-key counts.
        assert_eq!(locked.cell(row0), 2);
        assert_eq!(unlocked.cell(row0), 3);
        assert_eq!(unlocked.cell(row1_left), 2);
        assert_eq!(unlocked.cell(row1_right), 1);
    }
12034
    #[test]
    fn atomic_count_min_honors_unlocked_independent_updates() {
        // The atomic sketch must show the same locked/unlocked contrast as the
        // packed sketch: independent updates let a partial collision add up in
        // the shared cell, conservative updates do not.
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(128),
                hashes: Some(2),
                bits: Some(32),
                memory_bytes: None,
            },
            locked_increment: Some(false),
            threads: Some(2),
            ..Config::default()
        };
        let unlocked = new_atomic_count_min_sketch(&config).unwrap();
        assert_eq!(unlocked.update_mode, CountMinUpdateMode::Independent);
        let (left, right, row0, row1_left, row1_right) =
            find_partial_row_collision(unlocked.cells, 32);

        // Reference sketch with the default (conservative) update policy.
        let locked = AtomicCountMinSketch::new(unlocked.cells, unlocked.hashes).unwrap();
        locked.add_key_count(&left, 2);
        locked.add_key_count(&right, 1);
        unlocked.add_key_count(&left, 2);
        unlocked.add_key_count(&right, 1);

        // Shared row-0 cell: 2 under conservative, 2 + 1 under independent.
        assert_eq!(locked.cells_by_hash[row0].load(Ordering::Relaxed), 2);
        assert_eq!(unlocked.cells_by_hash[row0].load(Ordering::Relaxed), 3);
        assert_eq!(unlocked.cells_by_hash[row1_left].load(Ordering::Relaxed), 2);
        assert_eq!(
            unlocked.cells_by_hash[row1_right].load(Ordering::Relaxed),
            1
        );
    }
12067
12068    #[test]
12069    fn atomic_count_min_allocates_locks_only_for_conservative_updates() {
12070        let conservative = new_atomic_count_min_sketch(&Config {
12071            count_min: crate::cli::CountMinSettings {
12072                cells: Some(128),
12073                hashes: Some(2),
12074                bits: Some(32),
12075                memory_bytes: None,
12076            },
12077            ..Config::default()
12078        })
12079        .unwrap();
12080        let independent = new_atomic_count_min_sketch(&Config {
12081            count_min: crate::cli::CountMinSettings {
12082                cells: Some(128),
12083                hashes: Some(2),
12084                bits: Some(32),
12085                memory_bytes: None,
12086            },
12087            locked_increment: Some(false),
12088            ..Config::default()
12089        })
12090        .unwrap();
12091
12092        assert_eq!(conservative.locks.len(), BBTOOLS_KCOUNT_ARRAY_LOCKS);
12093        assert!(independent.locks.is_empty());
12094    }
12095
12096    #[test]
12097    fn atomic_count_min_parallel_replay_requires_nondeterministic_mode() {
12098        let deterministic = new_atomic_count_min_sketch(&Config {
12099            count_min: crate::cli::CountMinSettings {
12100                cells: Some(128),
12101                hashes: Some(2),
12102                bits: Some(32),
12103                memory_bytes: None,
12104            },
12105            deterministic: true,
12106            ..Config::default()
12107        })
12108        .unwrap();
12109        let nondeterministic = new_atomic_count_min_sketch(&Config {
12110            count_min: crate::cli::CountMinSettings {
12111                cells: Some(128),
12112                hashes: Some(2),
12113                bits: Some(32),
12114                memory_bytes: None,
12115            },
12116            deterministic: false,
12117            ..Config::default()
12118        })
12119        .unwrap();
12120
12121        assert!(!deterministic.parallel_replay);
12122        assert!(nondeterministic.parallel_replay);
12123    }
12124
12125    #[test]
12126    fn packed_count_min_increment_returns_previous_min_like_kcountarray() {
12127        let key = KmerKey::Short(7);
12128        let mut sketch = PackedCountMinSketch::new(128, 2, 4).unwrap();
12129
12130        assert_eq!(sketch.increment_and_return_unincremented(&key, 1), 0);
12131        assert_eq!(sketch.depth(&key), 1);
12132        assert_eq!(sketch.increment_and_return_unincremented(&key, 3), 1);
12133        assert_eq!(sketch.depth(&key), 4);
12134    }
12135
12136    #[test]
12137    fn packed_count_min_increment_return_saturates_at_cell_max() {
12138        let key = KmerKey::Short(11);
12139        let mut sketch = PackedCountMinSketch::new(1, 2, 2).unwrap();
12140
12141        assert_eq!(sketch.increment_and_return_unincremented(&key, 10), 0);
12142        assert_eq!(sketch.depth(&key), 3);
12143        assert_eq!(sketch.increment_and_return_unincremented(&key, 1), 3);
12144        assert_eq!(sketch.depth(&key), 3);
12145    }
12146
12147    #[test]
12148    fn atomic_count_min_increment_returns_previous_min_like_kcountarray() {
12149        let key = KmerKey::Short(13);
12150        let sketch = AtomicCountMinSketch::new(128, 2).unwrap();
12151
12152        assert_eq!(sketch.increment_and_return_unincremented(&key, 1), 0);
12153        assert_eq!(sketch.depth(&key), 1);
12154        assert_eq!(sketch.increment_and_return_unincremented(&key, 3), 1);
12155        assert_eq!(sketch.depth(&key), 4);
12156    }
12157
    #[test]
    fn atomic_packed_count_min_matches_packed_sequential_updates() {
        // A sequential replay into the atomic packed sketch must leave every
        // cell (and the derived statistics) identical to the single-threaded
        // packed sketch, given the same geometry and mask seed.
        let keys = [
            (KmerKey::Short(13), 1),
            (KmerKey::Short(29), 2),
            (KmerKey::Short(13), 1),
            (KmerKey::Short(47), 3),
        ];
        let mut packed = PackedCountMinSketch::new_with_min_arrays_and_mask_seed(
            4099,
            3,
            2,
            BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS,
            BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
        )
        .unwrap();
        let atomic = AtomicPackedCountMinSketch::new_with_min_arrays_and_update_mode(
            4099,
            3,
            2,
            BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS,
            CountMinUpdateMode::Conservative,
            BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
        )
        .unwrap();

        // Apply the identical update stream to both sketches.
        for (key, count) in &keys {
            packed.add_key_count(key, *count);
            atomic.add_key_count(key, *count);
        }
        // Mirror the same bookkeeping increments on both sketches as well.
        let key_increments = keys.iter().map(|(_, count)| *count).sum();
        packed.add_key_increments(key_increments);
        atomic.add_key_increments(key_increments);

        // Cell-for-cell equality, plus matching occupancy and unique-kmer
        // estimates derived from those cells.
        for slot in 0..packed.cells {
            assert_eq!(atomic.cell(slot), packed.cell(slot));
        }
        let occupied = (0..packed.cells)
            .filter(|&slot| packed.cell(slot) > 0)
            .count();
        assert_eq!(atomic.occupied_slots_at_least(1), occupied);
        assert_eq!(atomic.unique_kmers(), packed.unique_kmers());
    }
12201
    #[test]
    fn atomic_count_min_conservative_updates_are_key_locked_like_kcountarray() {
        // 10k concurrent single increments of one key must be lossless:
        // updates for a given key are serialized so no increment is dropped
        // under contention (matching BBTools' locked KCountArray behavior).
        let key = KmerKey::Short(13);
        let pool = rayon::ThreadPoolBuilder::new()
            .num_threads(4)
            .build()
            .unwrap();

        pool.install(|| {
            let sketch = AtomicCountMinSketch::new(128, 3).unwrap();

            (0..10_000u64)
                .into_par_iter()
                .for_each(|_| sketch.add_key_count(&key, 1));

            // Exactly 10_000 — any lost update would show up here.
            assert_eq!(sketch.depth(&key), 10_000);
        });
    }
12220
12221    #[test]
12222    fn atomic_count_min_bulk_replay_matches_locked_sequential_updates() {
12223        let mut counts = CountMap::default();
12224        counts.insert(KmerKey::Short(13), 17);
12225        counts.insert(KmerKey::Short(29), 3);
12226        counts.insert(KmerKey::Short(31), 9);
12227        let locked = AtomicCountMinSketch::new(128, 3).unwrap();
12228        let bulk = AtomicCountMinSketch::new(128, 3).unwrap();
12229
12230        for (key, count) in &counts {
12231            locked.add_key_count(key, *count);
12232        }
12233        bulk.add_key_counts(&counts);
12234
12235        for slot in 0..locked.cells {
12236            assert_eq!(
12237                locked.cells_by_hash[slot].load(Ordering::Relaxed),
12238                bulk.cells_by_hash[slot].load(Ordering::Relaxed)
12239            );
12240        }
12241    }
12242
    #[test]
    fn packed_count_min_reduced_sorted_replay_matches_individual_kmer_updates() {
        // A sorted, run-length-reduced replay (one add per distinct key, with
        // its multiplicity) must reproduce exactly the state built by
        // per-occurrence increments.
        let keys = [
            KmerKey::Short(13),
            KmerKey::Short(29),
            KmerKey::Short(13),
            KmerKey::Short(31),
            KmerKey::Short(29),
            KmerKey::Short(29),
            KmerKey::Short(47),
        ];
        let mut individual = PackedCountMinSketch::new(4099, 3, 16).unwrap();
        let mut reduced = PackedCountMinSketch::new(4099, 3, 16).unwrap();

        for key in &keys {
            individual.increment(key);
        }
        // Replay each distinct key once with its occurrence count.
        for (key, count) in sorted_reduced_test_runs(keys) {
            reduced.add_key_count(&key, count);
            reduced.add_key_increments(count);
        }

        // Statistics and the raw packed words must agree exactly.
        assert_eq!(reduced.increments, individual.increments);
        assert_eq!(reduced.occupied_slots, individual.occupied_slots);
        assert_eq!(reduced.words, individual.words);
    }
12269
    #[test]
    #[ignore = "microbenchmark for packed 16-bit/3-hash sketch kernel"]
    fn bench_packed_count_min_16bit_3hash_short_kernel() {
        // Timing harness (run with --ignored): hammers the specialized
        // 16-bit/3-hash conservative-update kernel with one million
        // pre-scrambled keys and prints throughput on stderr. black_box
        // prevents the optimizer from eliding the work.
        let mut sketch = PackedCountMinSketch::new_with_min_arrays_and_mask_seed(
            67_108_859,
            3,
            16,
            BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS,
            BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
        )
        .unwrap();
        // Multiplying by a large odd constant spreads keys across the table.
        let keys = (0..1_000_000u64)
            .map(|i| KmerKey::Short(i.wrapping_mul(0x9e37_79b9_7f4a_7c15)))
            .collect::<Vec<_>>();

        let start = Instant::now();
        let mut checksum = 0u64;
        for key in &keys {
            checksum ^= std::hint::black_box(
                sketch.increment_16bit_3hash_conservative_and_return_unincremented(key, 1),
            );
        }
        let elapsed = start.elapsed();
        eprintln!(
            "packed_16bit_3hash_short_kernel\tupdates={}\telapsed_seconds={:.6}\tchecksum={}",
            keys.len(),
            elapsed.as_secs_f64(),
            checksum
        );
        std::hint::black_box(sketch);
    }
12301
    #[test]
    fn atomic_count_min_reduced_sorted_replay_matches_individual_kmer_updates() {
        // Run-length-reduced sorted replay must leave the atomic table in the
        // same state as per-occurrence updates, including the increment and
        // occupancy counters.
        let keys = [
            KmerKey::Short(13),
            KmerKey::Short(29),
            KmerKey::Short(13),
            KmerKey::Short(31),
            KmerKey::Short(29),
            KmerKey::Short(29),
            KmerKey::Short(47),
        ];
        let individual = AtomicCountMinSketch::new(4099, 3).unwrap();
        let reduced = AtomicCountMinSketch::new(4099, 3).unwrap();

        for key in &keys {
            individual.increment_key(key);
            individual.add_key_increments(1);
        }
        // One add per distinct key, carrying its multiplicity.
        for (key, count) in sorted_reduced_test_runs(keys) {
            reduced.add_key_count(&key, count);
            reduced.add_key_increments(count);
        }

        assert_eq!(
            reduced.increments.load(Ordering::Relaxed),
            individual.increments.load(Ordering::Relaxed)
        );
        assert_eq!(
            reduced.occupied_slots.load(Ordering::Relaxed),
            individual.occupied_slots.load(Ordering::Relaxed)
        );
        // Every cell must agree exactly.
        for slot in 0..individual.cells {
            assert_eq!(
                reduced.cells_by_hash[slot].load(Ordering::Relaxed),
                individual.cells_by_hash[slot].load(Ordering::Relaxed)
            );
        }
    }
12340
12341    fn sorted_reduced_test_runs<const N: usize>(keys: [KmerKey; N]) -> Vec<(KmerKey, u64)> {
12342        let mut keys = keys;
12343        keys.sort_unstable();
12344        let mut runs = Vec::new();
12345        for key in keys {
12346            if let Some((last_key, count)) = runs.last_mut()
12347                && last_key == &key
12348            {
12349                *count += 1;
12350                continue;
12351            }
12352            runs.push((key, 1));
12353        }
12354        runs
12355    }
12356
    #[test]
    fn exact_collision_estimates_follow_lockedincrement_mode() {
        // Collision adjustment of exact counts must mirror the count-min
        // update mode: locked increments keep the conservative estimate (2)
        // for the colliding key, unlocked per-row increments inflate it to 3.
        let mut config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(128),
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: None,
            },
            threads: Some(2),
            ..Config::default()
        };
        let cells = count_min_table_cells_from_total_bits(128, 8);
        // `left` shares one hash row with right0 and the other with right1.
        let (left, right0, right1) = find_two_sided_partial_collisions(cells, 8);
        let mut locked = CountMap::default();
        locked.insert(left.clone(), 2);
        locked.insert(right0, 1);
        locked.insert(right1, 1);
        let mut unlocked = locked.clone();

        apply_count_min_collision_estimates(&config, &mut locked);
        // Flip to unlocked mode before the second application.
        config.locked_increment = Some(false);
        apply_count_min_collision_estimates(&config, &mut unlocked);

        assert_eq!(locked.get(&left), Some(&2));
        assert_eq!(unlocked.get(&left), Some(&3));
    }
12384
    #[test]
    fn prefilter_exact_estimates_follow_lockedincrement_mode() {
        // Prefilter collision adjustment follows the same rule as the primary
        // count-min path: locked mode keeps the conservative estimate (2),
        // unlocked per-row increments inflate the colliding key to 3.
        let mut config = Config {
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                cells: Some(128),
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: None,
                memory_fraction_micros: None,
            },
            threads: Some(2),
            ..Config::default()
        };
        let cells = count_min_table_cells_from_total_bits(128, 8);
        // `left` shares one hash row with right0 and the other with right1.
        let (left, right0, right1) = find_two_sided_partial_collisions(cells, 8);
        let mut locked = CountMap::default();
        locked.insert(left.clone(), 2);
        locked.insert(right0, 1);
        locked.insert(right1, 1);
        let mut unlocked = locked.clone();

        apply_prefilter_collision_estimates(&config, &mut locked);
        // Flip to unlocked mode before the second application.
        config.locked_increment = Some(false);
        apply_prefilter_collision_estimates(&config, &mut unlocked);

        assert_eq!(locked.get(&left), Some(&2));
        assert_eq!(unlocked.get(&left), Some(&3));
    }
12415
    #[test]
    fn prefilter_sketch_saturates_with_independent_row_increments_when_unlocked() {
        // With locked_increment=false every hash row is incremented
        // independently, so a row shared by two keys accumulates both counts
        // (3) where a conservative update would store only the minimum (2).
        let config = Config {
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                cells: Some(128),
                hashes: Some(2),
                bits: Some(2),
                memory_bytes: None,
                memory_fraction_micros: None,
            },
            locked_increment: Some(false),
            threads: Some(2),
            ..Config::default()
        };
        let mut prefilter = new_prefilter_count_min_sketch(&config).unwrap();
        // row0 is shared by left and right; row1_left/row1_right are their
        // respective private rows.
        let (left, right, row0, row1_left, row1_right) =
            find_partial_row_collision(prefilter.cells, prefilter.bits);

        // Conservative reference sketch with identical geometry.
        let mut conservative =
            PackedCountMinSketch::new(prefilter.cells, prefilter.hashes, prefilter.bits).unwrap();
        conservative.add_key_count(&left, 2);
        conservative.add_key_count(&right, 1);
        prefilter.add_key_count(&left, 2);
        prefilter.add_key_count(&right, 1);

        assert_eq!(conservative.cell(row0), 2);
        assert_eq!(prefilter.cell(row0), 3);
        assert_eq!(prefilter.cell(row1_left), 2);
        assert_eq!(prefilter.cell(row1_right), 1);
    }
12448
12449    #[test]
12450    fn packed_count_min_sketch_uses_fixed_saturating_cells() {
12451        let mut sketch = PackedCountMinSketch::new(1, 2, 3).unwrap();
12452        for _ in 0..10 {
12453            sketch.increment(&KmerKey::Short(7));
12454        }
12455
12456        assert_eq!(sketch.words.len(), 1);
12457        assert_eq!(sketch.depth(&KmerKey::Short(7)), 7);
12458        assert_eq!(sketch.depth(&KmerKey::Short(11)), 7);
12459        assert_eq!(sketch.unique_kmers(), 10);
12460    }
12461
12462    #[test]
12463    fn packed_count_min_depth_hist_uses_raw_depth_counts() {
12464        let mut sketch = PackedCountMinSketch::new(8, 2, 4).unwrap();
12465        sketch.set_cell(0, 1);
12466        sketch.set_cell(1, 2);
12467        sketch.set_cell(2, 2);
12468        sketch.set_cell(3, 5);
12469
12470        assert_eq!(sketch.occupied_slots_at_least(1), 4);
12471        assert_eq!(sketch.tracked_slots.as_ref().unwrap().len(), 4);
12472        assert_eq!(sketch.depth_hist(4), vec![0, 1, 4, 5]);
12473    }
12474
12475    #[test]
12476    fn packed_count_min_tracks_occupied_slots_without_duplicates() {
12477        let key = KmerKey::Short(17);
12478        let mut sketch = PackedCountMinSketch::new(128, 1, 4).unwrap();
12479
12480        sketch.add_key_count(&key, 1);
12481        sketch.add_key_count(&key, 2);
12482
12483        assert_eq!(sketch.occupied_slots_at_least(1), 1);
12484        assert_eq!(sketch.occupied_slots_at_least(3), 1);
12485        assert_eq!(sketch.tracked_slots.as_ref().unwrap().len(), 1);
12486        assert_eq!(sketch.depth_hist(5), vec![0, 0, 0, 3, 0]);
12487    }
12488
12489    #[test]
12490    fn packed_count_min_disables_slot_tracking_for_large_tables() {
12491        let sketch = PackedCountMinSketch::new(PACKED_SKETCH_TRACKED_SLOT_LIMIT + 1, 1, 1).unwrap();
12492
12493        assert!(sketch.tracked_slots.is_none());
12494        assert_eq!(sketch.tracked_slot_memory_bytes(), 0);
12495        assert_eq!(
12496            sketch.layout_summary("large", None).memory_bytes,
12497            sketch.words.len() * std::mem::size_of::<u64>()
12498        );
12499    }
12500
12501    #[test]
12502    fn packed_count_min_layout_reports_tracked_slot_memory() {
12503        let key = KmerKey::Short(17);
12504        let mut sketch = PackedCountMinSketch::new(128, 1, 4).unwrap();
12505
12506        sketch.add_key_count(&key, 1);
12507
12508        let backing_bytes = sketch.words.len() * std::mem::size_of::<u64>();
12509        assert!(sketch.tracked_slot_memory_bytes() >= std::mem::size_of::<usize>());
12510        assert_eq!(
12511            sketch.layout_summary("small", None).memory_bytes,
12512            backing_bytes + sketch.tracked_slot_memory_bytes()
12513        );
12514    }
12515
12516    #[test]
12517    fn packed_count_min_depth_hist_uses_compact_cell_bound_but_returns_requested_len() {
12518        let mut sketch = PackedCountMinSketch::new(16, 1, 4).unwrap();
12519        sketch.set_cell(0, 1);
12520        sketch.set_cell(1, 15);
12521
12522        let hist = sketch.depth_hist(1024);
12523
12524        assert_eq!(hist.len(), 1024);
12525        assert_eq!(hist[1], 1);
12526        assert_eq!(hist[15], 15);
12527        assert!(hist[16..].iter().all(|&value| value == 0));
12528    }
12529
12530    #[test]
12531    fn packed_count_min_untracked_depth_hist_uses_compact_reducers() {
12532        let mut sketch = PackedCountMinSketch::new(16, 1, 4).unwrap();
12533        sketch.tracked_slots = None;
12534        sketch.set_cell(0, 1);
12535        sketch.set_cell(1, 15);
12536
12537        let hist = sketch.depth_hist(1024);
12538
12539        assert_eq!(hist.len(), 1024);
12540        assert_eq!(hist[1], 1);
12541        assert_eq!(hist[15], 15);
12542        assert!(hist[16..].iter().all(|&value| value == 0));
12543    }
12544
12545    #[test]
12546    fn packed_count_min_depth_hist_uses_dynamic_reducers_for_wide_cells() {
12547        let mut sketch = PackedCountMinSketch::new(16, 1, 32).unwrap();
12548        sketch.set_cell(0, 1);
12549        sketch.set_cell(1, 4096);
12550
12551        let hist = sketch.depth_hist(8192);
12552
12553        assert_eq!(hist.len(), 8192);
12554        assert_eq!(hist[1], 1);
12555        assert_eq!(hist[4096], 4096);
12556        assert!(hist[4097..].iter().all(|&value| value == 0));
12557    }
12558
12559    #[test]
12560    fn packed_count_min_untracked_depth_hist_uses_dynamic_reducers_for_wide_cells() {
12561        let mut sketch = PackedCountMinSketch::new(16, 1, 32).unwrap();
12562        sketch.tracked_slots = None;
12563        sketch.set_cell(0, 2);
12564        sketch.set_cell(1, 4096);
12565
12566        let hist = sketch.depth_hist(8192);
12567
12568        assert_eq!(hist.len(), 8192);
12569        assert_eq!(hist[2], 2);
12570        assert_eq!(hist[4096], 4096);
12571        assert!(hist[4097..].iter().all(|&value| value == 0));
12572    }
12573
12574    #[test]
12575    fn atomic_count_min_depth_hist_uses_raw_depth_counts() {
12576        let sketch = AtomicCountMinSketch::new(8, 2).unwrap();
12577        sketch.cells_by_hash[0].store(1, Ordering::Relaxed);
12578        sketch.cells_by_hash[1].store(2, Ordering::Relaxed);
12579        sketch.cells_by_hash[2].store(2, Ordering::Relaxed);
12580        sketch.cells_by_hash[3].store(5, Ordering::Relaxed);
12581
12582        assert_eq!(sketch.depth_hist(4), vec![0, 1, 4, 5]);
12583    }
12584
12585    #[test]
12586    fn atomic_count_min_depth_hist_uses_compact_dynamic_reducers() {
12587        let sketch = AtomicCountMinSketch::new(16, 2).unwrap();
12588        sketch.cells_by_hash[0].store(1, Ordering::Relaxed);
12589        sketch.cells_by_hash[1].store(7, Ordering::Relaxed);
12590
12591        let hist = sketch.depth_hist(8192);
12592
12593        assert_eq!(hist.len(), 8192);
12594        assert_eq!(hist[1], 1);
12595        assert_eq!(hist[7], 7);
12596        assert!(hist[8..].iter().all(|&value| value == 0));
12597    }
12598
    #[test]
    fn combined_primary_histograms_match_separate_collectors() {
        // Every histogram collector variant (dense, sparse, combined, plus
        // the read-depth versions) must agree for the same counted input with
        // no keep filter applied.
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("reads.fq");
        write_fastq(
            &path,
            &[
                ("r1", b"ACGTACGT", b"IIIIIIII"),
                ("r2", b"ACGTTCGT", b"IIIIIIII"),
                ("r3", b"TTTTACGT", b"IIIIIIII"),
            ],
        );
        let config = Config {
            in1: Some(path.clone()),
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            ..Config::default()
        };
        let mut counts = CountMap::default();
        count_single_file(&config, &path, &mut counts, None).unwrap();

        // Collect through each independent code path.
        let separate_hist = collect_primary_hist(&config, &counts, None, 0).unwrap();
        let sparse_hist = collect_primary_sparse_hist(&config, &counts, None, 0).unwrap();
        let separate_rhist = collect_primary_read_hist(&config, &counts, None, 0).unwrap();
        let sparse_rhist = collect_primary_sparse_read_hist(&config, &counts, None, 0).unwrap();
        let (sparse_combined_hist, sparse_combined_rhist) =
            collect_primary_sparse_hist_and_read_hist(&config, &counts, None, 0).unwrap();
        let (combined_hist, combined_rhist) =
            collect_primary_hist_and_read_hist(&config, &counts, None, 0).unwrap();

        // Sparse results are densified before comparison.
        assert_eq!(
            sparse_hist_to_dense(&sparse_hist, config.hist_len),
            separate_hist
        );
        assert_eq!(
            sparse_hist_to_dense(&sparse_combined_hist, config.hist_len),
            separate_hist
        );
        assert_eq!(combined_hist, separate_hist);
        assert_eq!(combined_rhist.reads, separate_rhist.reads);
        assert_eq!(combined_rhist.bases, separate_rhist.bases);
        let mut dense_sparse_rhist = ReadDepthHistogram::new(config.hist_len);
        merge_sparse_read_depth_hist_into_dense(&mut dense_sparse_rhist, sparse_rhist);
        assert_eq!(dense_sparse_rhist.reads, separate_rhist.reads);
        assert_eq!(dense_sparse_rhist.bases, separate_rhist.bases);
        let mut dense_sparse_combined_rhist = ReadDepthHistogram::new(config.hist_len);
        merge_sparse_read_depth_hist_into_dense(
            &mut dense_sparse_combined_rhist,
            sparse_combined_rhist,
        );
        assert_eq!(dense_sparse_combined_rhist.reads, separate_rhist.reads);
        assert_eq!(dense_sparse_combined_rhist.bases, separate_rhist.bases);
    }
12653
    #[test]
    fn countup_work_source_collects_input_histograms_like_separate_collectors() {
        // In count_up mode the work-source builder gathers input histograms
        // inline; those must match the standalone collectors' results.
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("reads.fq");
        write_fastq(
            &path,
            &[
                ("r1", b"ACGTACGT", b"IIIIIIII"),
                ("r2", b"ACGTTCGT", b"IIIIIIII"),
                ("r3", b"TTTTACGT", b"IIIIIIII"),
            ],
        );
        let config = Config {
            in1: Some(path.clone()),
            count_up: true,
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            hist_len: 64,
            ..Config::default()
        };
        let mut counts = CountMap::default();
        count_single_file(&config, &path, &mut counts, None).unwrap();

        let separate_hist = collect_primary_hist(&config, &counts, None, 0).unwrap();
        let separate_rhist = collect_primary_read_hist(&config, &counts, None, 0).unwrap();
        let build = collect_countup_work_source(&config, &counts, 0, true, true).unwrap();

        assert_eq!(build.format1, SeqFormat::Fastq);
        assert_eq!(build.format2, None);
        assert_eq!(
            sparse_hist_to_dense(&build.input_hist.unwrap(), config.hist_len),
            separate_hist
        );
        // The build carries a sparse read-depth histogram; densify to compare.
        let mut combined_rhist = ReadDepthHistogram::new(config.hist_len);
        merge_sparse_read_depth_hist_into_dense(
            &mut combined_rhist,
            build.input_read_hist.unwrap(),
        );
        assert_eq!(combined_rhist.reads, separate_rhist.reads);
        assert_eq!(combined_rhist.bases, separate_rhist.bases);
    }
12696
    #[test]
    fn combined_primary_histograms_with_keep_filter_match_separate_collectors() {
        // When a kept-counts map and a nonzero keep threshold (17) are
        // supplied, all histogram collector variants (dense, sparse,
        // combined, and read-depth) must still agree with one another.
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("reads.fq");
        write_fastq(
            &path,
            &[
                ("r1", b"ACGTACGT", b"IIIIIIII"),
                ("r2", b"ACGTACGT", b"IIIIIIII"),
                ("r3", b"TTTTACGT", b"IIIIIIII"),
            ],
        );
        let config = Config {
            in1: Some(path.clone()),
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            ..Config::default()
        };
        let mut input_counts = CountMap::default();
        count_single_file(&config, &path, &mut input_counts, None).unwrap();
        // Build a kept-counts map from a single surviving read.
        let mut kept_counts = CountMap::default();
        increment_pair_counts(
            &config,
            &mut kept_counts,
            &record("kept", b"ACGTACGT"),
            None,
        );

        // Collect through each independent code path.
        let separate_hist =
            collect_primary_hist(&config, &kept_counts, Some(&input_counts), 17).unwrap();
        let sparse_hist =
            collect_primary_sparse_hist(&config, &kept_counts, Some(&input_counts), 17).unwrap();
        let separate_rhist =
            collect_primary_read_hist(&config, &kept_counts, Some(&input_counts), 17).unwrap();
        let sparse_rhist =
            collect_primary_sparse_read_hist(&config, &kept_counts, Some(&input_counts), 17)
                .unwrap();
        let (sparse_combined_hist, sparse_combined_rhist) =
            collect_primary_sparse_hist_and_read_hist(
                &config,
                &kept_counts,
                Some(&input_counts),
                17,
            )
            .unwrap();
        let (combined_hist, combined_rhist) =
            collect_primary_hist_and_read_hist(&config, &kept_counts, Some(&input_counts), 17)
                .unwrap();

        // Sparse results are densified before comparison.
        assert_eq!(
            sparse_hist_to_dense(&sparse_hist, config.hist_len),
            separate_hist
        );
        assert_eq!(
            sparse_hist_to_dense(&sparse_combined_hist, config.hist_len),
            separate_hist
        );
        assert_eq!(combined_hist, separate_hist);
        assert_eq!(combined_rhist.reads, separate_rhist.reads);
        assert_eq!(combined_rhist.bases, separate_rhist.bases);
        let mut dense_sparse_rhist = ReadDepthHistogram::new(config.hist_len);
        merge_sparse_read_depth_hist_into_dense(&mut dense_sparse_rhist, sparse_rhist);
        assert_eq!(dense_sparse_rhist.reads, separate_rhist.reads);
        assert_eq!(dense_sparse_rhist.bases, separate_rhist.bases);
        let mut dense_sparse_combined_rhist = ReadDepthHistogram::new(config.hist_len);
        merge_sparse_read_depth_hist_into_dense(
            &mut dense_sparse_combined_rhist,
            sparse_combined_rhist,
        );
        assert_eq!(dense_sparse_combined_rhist.reads, separate_rhist.reads);
        assert_eq!(dense_sparse_combined_rhist.bases, separate_rhist.bases);
    }
12770
12771    #[test]
12772    fn packed_count_min_unique_kmers_uses_bbtools_hash_adjusted_estimate() {
12773        let mut sketch = PackedCountMinSketch::new(1024, 4, 8).unwrap();
12774        for bucket in 0..256 {
12775            sketch.set_cell(bucket, 1);
12776        }
12777        sketch.increments = 1_000;
12778
12779        let estimated = sketch.unique_kmers();
12780        assert!(
12781            (70..=80).contains(&estimated),
12782            "BBTools-style hash-adjusted estimate was {estimated}"
12783        );
12784    }
12785
12786    #[test]
12787    fn packed_count_min_unique_kmers_honors_min_depth_threshold() {
12788        let mut sketch = PackedCountMinSketch::new(1024, 4, 8).unwrap();
12789        for bucket in 0..256 {
12790            let depth = if bucket < 128 { 3 } else { 1 };
12791            sketch.set_cell(bucket, depth);
12792        }
12793        sketch.increments = 1_000;
12794
12795        let total_estimated = sketch.unique_kmers();
12796        let high_depth_estimated = sketch.unique_kmers_at_least(2);
12797
12798        assert!(
12799            (70..=80).contains(&total_estimated),
12800            "all-depth estimate was {total_estimated}"
12801        );
12802        assert!(
12803            (30..=40).contains(&high_depth_estimated),
12804            "thresholded estimate was {high_depth_estimated}"
12805        );
12806        assert_eq!(sketch.unique_kmers_at_least(9), 0);
12807    }
12808
12809    #[test]
12810    fn atomic_count_min_unique_kmers_honors_min_depth_threshold() {
12811        let sketch = AtomicCountMinSketch::new(1024, 4).unwrap();
12812        for bucket in 0..256 {
12813            let depth = if bucket < 128 { 3 } else { 1 };
12814            sketch.cells_by_hash[bucket].store(depth, Ordering::Relaxed);
12815        }
12816        sketch.occupied_slots.store(256, Ordering::Relaxed);
12817        sketch.add_key_increments(1_000);
12818
12819        let total_estimated = sketch.unique_kmers();
12820        let high_depth_estimated = sketch.unique_kmers_at_least(2);
12821
12822        assert!(
12823            (70..=80).contains(&total_estimated),
12824            "all-depth estimate was {total_estimated}"
12825        );
12826        assert!(
12827            (30..=40).contains(&high_depth_estimated),
12828            "thresholded estimate was {high_depth_estimated}"
12829        );
12830        assert_eq!(sketch.occupied_slots_at_least(1), 256);
12831    }
12832
12833    #[test]
12834    fn cardinality_estimator_tracks_unique_keys_with_fixed_register_memory() {
12835        let config = Config {
12836            k: 31,
12837            cardinality: crate::cli::CardinalitySettings {
12838                input: true,
12839                buckets: 2048,
12840                seed: 42,
12841                ..Default::default()
12842            },
12843            ..Default::default()
12844        };
12845        let mut estimator = KmerCardinalityEstimator::from_config(&config);
12846        for key in 0..1_000 {
12847            estimator.observe_key(&KmerKey::Short(key));
12848            estimator.observe_key(&KmerKey::Short(key));
12849        }
12850
12851        let estimate = estimator.estimate();
12852        assert_eq!(estimate.k, 31);
12853        assert_eq!(estimate.buckets, 2048);
12854        assert!(
12855            (900..=1_100).contains(&estimate.estimated_unique_kmers),
12856            "cardinality estimate was {}",
12857            estimate.estimated_unique_kmers
12858        );
12859        assert_eq!(estimator.registers.len(), 2048);
12860    }
12861
12862    #[test]
12863    fn packed_count_min_sketch_packs_cells_across_word_boundaries() {
12864        let mut sketch = PackedCountMinSketch::new(17, 1, 5).unwrap();
12865        for slot in 0..17 {
12866            sketch.set_cell(slot, slot as u64);
12867        }
12868
12869        for slot in 0..17 {
12870            assert_eq!(sketch.cell(slot), slot as u64);
12871        }
12872    }
12873
    #[test]
    fn bounded_input_counts_builds_direct_sketch_when_cells_are_constrained() {
        // An explicit cells= setting must route input counting into the
        // bounded packed count-min sketch; a probe k-mer taken from the input
        // data must then report a nonzero depth.
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("reads.fq");
        write_fastq(
            &path,
            &[
                ("r1", b"ACGTACGT", b"IIIIIIII"),
                ("r2", b"ACGTTCGT", b"IIIIIIII"),
            ],
        );
        let config = Config {
            in1: Some(path),
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            count_min: crate::cli::CountMinSettings {
                cells: Some(4),
                hashes: Some(2),
                bits: Some(4),
                memory_bytes: None,
            },
            ..Config::default()
        };
        // Any k-mer from the first read serves as a presence probe.
        let probe = kmers_for_record(&record("probe", b"ACGTACGT"), &config)
            .into_iter()
            .next()
            .unwrap();

        let counts = build_input_counts(&config).unwrap();

        let InputCounts::Sketch(sketch) = counts else {
            panic!("cells= should build a bounded packed count-min sketch");
        };
        // 4 cells of 4 bits fit in a single packed word.
        assert_eq!(sketch.words.len(), 1);
        assert!(sketch.depth(&probe) > 0);
    }
12911
    #[test]
    fn auto_count_min_uses_sketch_when_input_metadata_exceeds_threshold() {
        // With a 1-byte auto threshold any input triggers the automatic
        // sketch path; whichever bounded sketch variant is selected must have
        // been populated (nonzero cells and increments). Only the exact-map
        // outcome is a failure.
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("reads.fq");
        write_fastq(
            &path,
            &[
                ("r1", b"ACGTACGT", b"IIIIIIII"),
                ("r2", b"ACGTTCGT", b"IIIIIIII"),
            ],
        );
        let config = Config {
            in1: Some(path),
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            auto_count_min_input_bytes: 1,
            auto_count_min_memory_bytes: Some(4096),
            ..Config::default()
        };

        let counts = build_input_counts(&config).unwrap();

        match counts {
            InputCounts::AtomicSketch(sketch) => {
                assert!(sketch.cells > 0);
                assert!(sketch.increments.load(Ordering::Relaxed) > 0);
            }
            InputCounts::AtomicPackedSketch(sketch) => {
                assert!(sketch.cells > 0);
                assert!(sketch.increments.load(Ordering::Relaxed) > 0);
            }
            InputCounts::Sketch(sketch) => {
                assert!(sketch.cells > 0);
                assert!(sketch.increments > 0);
            }
            InputCounts::PrefilteredSketch { .. } => {}
            InputCounts::Exact(_) => {
                panic!("large-input auto count-min should build a bounded sketch");
            }
        }
    }
12954
12955    #[test]
12956    fn force_exact_counts_overrides_auto_and_explicit_sketch_settings() {
12957        let dir = tempfile::tempdir().unwrap();
12958        let path = dir.path().join("reads.fq");
12959        write_fastq(
12960            &path,
12961            &[
12962                ("r1", b"ACGTACGT", b"IIIIIIII"),
12963                ("r2", b"ACGTTCGT", b"IIIIIIII"),
12964            ],
12965        );
12966        let config = Config {
12967            in1: Some(path),
12968            k: 3,
12969            min_quality: 0,
12970            min_prob: 0.0,
12971            force_exact_counts: true,
12972            auto_count_min_input_bytes: 1,
12973            count_min: crate::cli::CountMinSettings {
12974                cells: Some(1),
12975                hashes: Some(2),
12976                bits: Some(4),
12977                memory_bytes: Some(1024),
12978            },
12979            ..Config::default()
12980        };
12981
12982        let counts = build_input_counts(&config).unwrap();
12983
12984        let InputCounts::Exact(counts) = counts else {
12985            panic!("force_exact_counts should override automatic and explicit sketch settings");
12986        };
12987        assert!(counts.len() > 1);
12988    }
12989
12990    #[test]
12991    fn bounded_sketch_chunked_parallel_is_deterministic_and_conservative() {
12992        let config = Config {
12993            k: 3,
12994            min_quality: 0,
12995            min_prob: 0.0,
12996            count_min: crate::cli::CountMinSettings {
12997                cells: Some(32),
12998                hashes: Some(3),
12999                bits: Some(8),
13000                memory_bytes: None,
13001            },
13002            ..Config::default()
13003        };
13004        let pairs = vec![
13005            (
13006                record("r1/1", b"ACGTACGT"),
13007                Some(record("r1/2", b"TCGTACGA")),
13008            ),
13009            (record("r2/1", b"AAAAACCC"), None),
13010            (
13011                record("r3/1", b"GGGGTTTT"),
13012                Some(record("r3/2", b"CCCCAAAA")),
13013            ),
13014        ];
13015        let mut exact = CountMap::default();
13016        for (r1, r2) in &pairs {
13017            increment_pair_counts(&config, &mut exact, r1, r2.as_ref());
13018        }
13019        let mut chunked_a = new_bounded_count_min_sketch(&config).unwrap();
13020        let mut chunked_b = new_bounded_count_min_sketch(&config).unwrap();
13021
13022        increment_sketch_from_pair_chunk(&config, &mut chunked_a, &pairs, None);
13023        increment_sketch_from_pair_chunk(&config, &mut chunked_b, &pairs, None);
13024
13025        assert_eq!(chunked_a.words, chunked_b.words);
13026        assert_eq!(chunked_a.increments, exact.values().copied().sum::<u64>());
13027        for (key, exact_depth) in exact {
13028            assert!(chunked_a.depth(&key) >= exact_depth.min(chunked_a.max_count));
13029        }
13030    }
13031
    // Deterministic chunked updates of the 32-bit atomic count-min sketch must
    // reproduce, cell for cell, a sequential reference built by merging per-pair
    // exact counts and applying them in sorted-key order.
    #[test]
    fn atomic_count_min_chunked_parallel_matches_sequential_conservative_bits32() {
        let config = Config {
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            count_min: crate::cli::CountMinSettings {
                cells: Some(64),
                hashes: Some(3),
                bits: Some(32),
                memory_bytes: None,
            },
            ..Config::default()
        };
        let pairs = vec![
            (
                record("r1/1", b"ACGTACGT"),
                Some(record("r1/2", b"TCGTACGA")),
            ),
            (record("r2/1", b"AAAAACCC"), None),
            (
                record("r3/1", b"GGGGTTTT"),
                Some(record("r3/2", b"CCCCAAAA")),
            ),
        ];
        // Build the sequential reference: exact per-pair counts merged into one map.
        let sequential = new_atomic_count_min_sketch(&config).unwrap();
        let mut merged_counts = CountMap::default();
        for (r1, r2) in &pairs {
            let mut pair_counts = CountMap::default();
            increment_pair_counts(&config, &mut pair_counts, r1, r2.as_ref());
            merge_count_maps(&mut merged_counts, pair_counts);
        }
        // Sorted-key application order makes the reference sketch deterministic.
        let mut entries = merged_counts.into_iter().collect::<Vec<_>>();
        entries.sort_unstable_by(|(left, _), (right, _)| left.cmp(right));
        let key_increments = entries.iter().map(|(_, count)| *count).sum();
        for (key, count) in entries {
            sequential.add_key_count(&key, count);
        }
        sequential.add_key_increments(key_increments);
        let chunked = new_atomic_count_min_sketch(&config).unwrap();

        increment_atomic_sketch_from_pair_chunk(&config, &chunked, &pairs, None);

        // The chunked path must match the reference exactly: same increment
        // total, same occupancy, and identical value in every cell.
        assert_eq!(
            chunked.increments.load(Ordering::Relaxed),
            sequential.increments.load(Ordering::Relaxed)
        );
        assert_eq!(
            chunked.occupied_slots.load(Ordering::Relaxed),
            sequential.occupied_slots.load(Ordering::Relaxed)
        );
        for slot in 0..sequential.cells {
            assert_eq!(
                u64::from(chunked.cells_by_hash[slot].load(Ordering::Relaxed)),
                u64::from(sequential.cells_by_hash[slot].load(Ordering::Relaxed))
            );
        }
    }
13090
    // With deterministic=false, the direct atomic-update path should produce the
    // same sketch state as the deterministic sequential reference, given a sketch
    // large and sparse enough (8192 cells, 1 hash) that collisions are unlikely.
    #[test]
    fn nondeterministic_atomic_count_min_direct_path_matches_sequential_without_collisions() {
        let config = Config {
            k: 5,
            min_quality: 0,
            min_prob: 0.0,
            deterministic: false,
            count_min: crate::cli::CountMinSettings {
                cells: Some(8192),
                hashes: Some(1),
                bits: Some(32),
                memory_bytes: None,
            },
            ..Config::default()
        };
        let pairs = vec![
            (
                record("r1/1", b"ACGTACGTAC"),
                Some(record("r1/2", b"TCGTACGAAA")),
            ),
            (record("r2/1", b"AAAAACCCCC"), None),
            (
                record("r3/1", b"GGGGTTTTAA"),
                Some(record("r3/2", b"CCCCAAAAGG")),
            ),
        ];
        // The reference sketch is built under deterministic=true with otherwise
        // identical settings, fed from a merged exact count map.
        let sequential = new_atomic_count_min_sketch(&Config {
            deterministic: true,
            ..config.clone()
        })
        .unwrap();
        let mut merged_counts = CountMap::default();
        for (r1, r2) in &pairs {
            increment_pair_counts(&config, &mut merged_counts, r1, r2.as_ref());
        }
        let key_increments = merged_counts.values().copied().sum();
        sequential.add_key_counts(&merged_counts);
        sequential.add_key_increments(key_increments);

        let direct = new_atomic_count_min_sketch(&config).unwrap();
        increment_atomic_sketch_from_pair_chunk(&config, &direct, &pairs, None);

        // Both paths must agree on the increment total, the number of occupied
        // slots, and the value stored in every cell.
        assert_eq!(
            direct.increments.load(Ordering::Relaxed),
            sequential.increments.load(Ordering::Relaxed)
        );
        assert_eq!(
            direct.occupied_slots.load(Ordering::Relaxed),
            sequential.occupied_slots.load(Ordering::Relaxed)
        );
        for slot in 0..sequential.cells {
            assert_eq!(
                u64::from(direct.cells_by_hash[slot].load(Ordering::Relaxed)),
                u64::from(sequential.cells_by_hash[slot].load(Ordering::Relaxed))
            );
        }
    }
13148
13149    #[test]
13150    fn atomic_count_min_conservative_update_reduces_collision_inflation() {
13151        let config = Config {
13152            k: 3,
13153            min_quality: 0,
13154            min_prob: 0.0,
13155            count_min: crate::cli::CountMinSettings {
13156                cells: Some(1),
13157                hashes: Some(3),
13158                bits: Some(32),
13159                memory_bytes: None,
13160            },
13161            ..Config::default()
13162        };
13163        let key_a = KmerKey::Short(1);
13164        let key_b = KmerKey::Short(2);
13165        let sketch = new_atomic_count_min_sketch(&config).unwrap();
13166
13167        sketch.add_key_count(&key_a, 5);
13168        sketch.add_key_count(&key_b, 1);
13169
13170        assert_eq!(sketch.depth(&key_a), 6);
13171        assert_eq!(sketch.depth(&key_b), 6);
13172    }
13173
    // When cells= is tightly constrained, the output-side (kept-kmer) counts
    // should also be a bounded packed sketch rather than an exact map.
    #[test]
    fn bounded_output_counts_uses_sketch_for_kept_kmers_when_cells_are_constrained() {
        let config = Config {
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            count_min: crate::cli::CountMinSettings {
                cells: Some(4),
                hashes: Some(2),
                bits: Some(4),
                memory_bytes: None,
            },
            ..Config::default()
        };
        let r1 = record("r1", b"ACGTACGT");
        // Any kmer from the kept read serves as a presence probe afterwards.
        let probe = kmers_for_record(&r1, &config).into_iter().next().unwrap();
        let pair = NormalizedPair {
            input_list_index: 0,
            r1: r1.clone(),
            r2: None,
            out_r1: r1,
            out_r2: None,
            decision: PairDecision::default(),
            uncorrectable: false,
            read_count: 1,
            base_count: 8,
        };
        let mut counts = new_output_counts(&config).unwrap();

        increment_output_counts_from_normalized_chunk(&config, &mut counts, &[pair]);

        let OutputCounts::Sketch(sketch) = counts else {
            panic!("cells= should use a bounded output sketch for kept-kmer side counts");
        };
        // The tiny cell budget fits in a single backing word, and the probe kmer
        // from the kept read must be visible in the sketch.
        assert_eq!(sketch.words.len(), 1);
        assert!(sketch.depth(&probe) > 0);
    }
13211
    // Output-count analogue of the direct-path test above: with deterministic=false
    // and a sparse sketch (8192 cells, 1 hash), the direct atomic output-count path
    // must match the deterministic sequential path cell for cell. Includes a tossed
    // pair to exercise the kept/tossed split.
    #[test]
    fn nondeterministic_atomic_output_counts_direct_path_matches_sequential_without_collisions() {
        let config = Config {
            k: 5,
            min_quality: 0,
            min_prob: 0.0,
            deterministic: false,
            count_min: crate::cli::CountMinSettings {
                cells: Some(8192),
                hashes: Some(1),
                bits: Some(32),
                memory_bytes: None,
            },
            ..Config::default()
        };
        let kept_a = record("r1", b"ACGTACGTAC");
        let kept_b = record("r2", b"TTTTCCCCAA");
        let tossed = record("r3", b"GGGGAAAACC");
        let pairs = vec![
            NormalizedPair {
                input_list_index: 0,
                r1: kept_a.clone(),
                r2: None,
                out_r1: kept_a,
                out_r2: None,
                decision: PairDecision::default(),
                uncorrectable: false,
                read_count: 1,
                base_count: 10,
            },
            NormalizedPair {
                input_list_index: 0,
                r1: kept_b.clone(),
                r2: None,
                out_r1: kept_b,
                out_r2: None,
                decision: PairDecision::default(),
                uncorrectable: false,
                read_count: 1,
                base_count: 10,
            },
            // This pair is marked tossed; it should not contribute kept-kmer counts.
            NormalizedPair {
                input_list_index: 0,
                r1: tossed.clone(),
                r2: None,
                out_r1: tossed,
                out_r2: None,
                decision: PairDecision {
                    toss: true,
                    ..PairDecision::default()
                },
                uncorrectable: false,
                read_count: 1,
                base_count: 10,
            },
        ];
        // Reference path: identical settings but deterministic=true.
        let sequential_config = Config {
            deterministic: true,
            ..config.clone()
        };
        let mut sequential = new_output_counts(&sequential_config).unwrap();
        let mut direct = new_output_counts(&config).unwrap();

        increment_output_counts_from_normalized_chunk(&sequential_config, &mut sequential, &pairs);
        increment_output_counts_from_normalized_chunk(&config, &mut direct, &pairs);

        let (OutputCounts::AtomicSketch(sequential), OutputCounts::AtomicSketch(direct)) =
            (sequential, direct)
        else {
            panic!("bits=32 output counts should use atomic sketches");
        };
        // Both paths must agree on totals, occupancy, and every cell value.
        assert_eq!(
            direct.increments.load(Ordering::Relaxed),
            sequential.increments.load(Ordering::Relaxed)
        );
        assert_eq!(
            direct.occupied_slots.load(Ordering::Relaxed),
            sequential.occupied_slots.load(Ordering::Relaxed)
        );
        for slot in 0..sequential.cells {
            assert_eq!(
                u64::from(direct.cells_by_hash[slot].load(Ordering::Relaxed)),
                u64::from(sequential.cells_by_hash[slot].load(Ordering::Relaxed))
            );
        }
    }
13298
    // With no explicit cells=, the 1000-byte memory budget at 8 bits per cell
    // should derive the table size: 1000 budgeted cells, adjusted down to 998
    // (matching count_min_table_cells_from_total(1000, 2)), backed by 125 words.
    #[test]
    fn bounded_sketch_memory_budget_derives_cell_count() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: None,
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: Some(1000),
            },
            threads: Some(2),
            ..Config::default()
        };

        let sketch = new_bounded_count_min_sketch(&config).unwrap();

        assert_eq!(sketch.cells, 998);
        assert_eq!(sketch.words.len(), 125);
    }
13317
13318    #[test]
13319    fn count_min_table_sizing_prime_adjusts_like_kcountarray() {
13320        assert_eq!(count_min_table_cells_from_total(1, 3), 1);
13321        assert_eq!(count_min_table_cells_from_total(9, 3), 7);
13322        assert_eq!(count_min_table_cells_from_total(64, 3), 62);
13323        assert_eq!(count_min_table_cells_from_total(1000, 2), 998);
13324    }
13325
    // Without a prefilter, a short-k sketch is capped to the kmer space itself:
    // k=3 admits only 4^3 = 64 distinct kmers, so requesting 10,000 cells still
    // yields at most 64, matching BBNorm.
    #[test]
    fn non_prefiltered_short_kmer_sketch_caps_cells_to_kmer_space_like_bbnorm() {
        let config = Config {
            k: 3,
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: None,
            },
            ..Config::default()
        };

        assert_eq!(short_kmer_space_cells(3), Some(64));
        assert_eq!(main_count_min_total_cells(&config, 8), 64);

        let sketch = new_bounded_count_min_sketch(&config).unwrap();
        assert!(sketch.cells <= 64);
    }
13345
    // With a prefilter configured, the short-k kmer-space cap is NOT applied and
    // the requested 10,000-cell budget is preserved, matching BBNorm.
    #[test]
    fn prefiltered_short_kmer_sketch_preserves_requested_cells_like_bbnorm() {
        let config = Config {
            k: 3,
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                cells: Some(128),
                hashes: Some(2),
                bits: Some(2),
                ..Default::default()
            },
            ..Config::default()
        };

        assert_eq!(main_count_min_total_cells(&config, 8), 10_000);
    }
13367
13368    #[test]
13369    fn kcount_array_min_arrays_rounds_threads_like_bbtools() {
13370        assert_eq!(kcount_array_min_arrays_for_threads(1), 2);
13371        assert_eq!(kcount_array_min_arrays_for_threads(2), 2);
13372        assert_eq!(kcount_array_min_arrays_for_threads(3), 4);
13373        assert_eq!(kcount_array_min_arrays_for_threads(8), 8);
13374        assert_eq!(kcount_array_min_arrays_for_threads(9), 16);
13375    }
13376
    // threads=8 should split the bounded sketch into 8 kcount arrays
    // (array_mask 7, array_bits 3): the 1000-cell budget becomes 113 cells per
    // array, 904 cells total.
    #[test]
    fn bounded_sketch_sizing_uses_configured_threads_for_kcount_arrays() {
        let config = Config {
            threads: Some(8),
            count_min: crate::cli::CountMinSettings {
                cells: Some(1000),
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: None,
            },
            ..Config::default()
        };

        let sketch = new_bounded_count_min_sketch(&config).unwrap();

        assert_eq!(sketch.cells, 904);
        assert_eq!(sketch.words.len(), 113);
        assert_eq!(sketch.layout.array_mask, 7);
        assert_eq!(sketch.layout.array_bits, 3);
        assert_eq!(sketch.layout.cells_per_array, 113);
    }
13398
    // With threads=None, sketch sizing should read the active rayon pool:
    // building inside a 3-thread pool rounds up to 4 kcount arrays
    // (array_mask 3, array_bits 2) with 241 cells per array, 964 total.
    #[test]
    fn bounded_sketch_sizing_uses_active_rayon_threads_for_auto_threads() {
        let pool = rayon::ThreadPoolBuilder::new()
            .num_threads(3)
            .build()
            .unwrap();
        // install() makes this pool the "current" pool inside the closure.
        pool.install(|| {
            let config = Config {
                threads: None,
                count_min: crate::cli::CountMinSettings {
                    cells: Some(1000),
                    hashes: Some(2),
                    bits: Some(8),
                    memory_bytes: None,
                },
                ..Config::default()
            };

            let sketch = new_bounded_count_min_sketch(&config).unwrap();

            assert_eq!(kcount_array_min_arrays(&config), 4);
            assert_eq!(sketch.cells, 964);
            assert_eq!(sketch.words.len(), 121);
            assert_eq!(sketch.layout.array_mask, 3);
            assert_eq!(sketch.layout.array_bits, 2);
            assert_eq!(sketch.layout.cells_per_array, 241);
        });
    }
13427
    // An explicit cells=9 is a TOTAL budget across all hash tables (BBTools
    // semantics), adjusted to 7 usable cells — the same for both the packed
    // sketch and its 32-bit atomic counterpart.
    #[test]
    fn explicit_count_min_cells_are_total_budget_like_bbtools() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(9),
                hashes: Some(3),
                bits: Some(8),
                memory_bytes: None,
            },
            ..Config::default()
        };

        let packed = new_bounded_count_min_sketch(&config).unwrap();
        // Same settings but bits=32 selects the atomic sketch implementation.
        let atomic = new_atomic_count_min_sketch(&Config {
            count_min: crate::cli::CountMinSettings {
                bits: Some(32),
                ..config.count_min
            },
            ..Config::default()
        })
        .unwrap();

        assert_eq!(packed.cells, 7);
        assert_eq!(packed.words.len(), 1);
        assert_eq!(atomic.cells, 7);
        assert_eq!(atomic.cells_by_hash.len(), 7);
    }
13455
    // Pins the BBTools-parity formula for usable table memory: given a 1 GB
    // budget with histogram output, 3 threads, and 2 build passes, the usable
    // table bytes must come out to exactly 329,944,000.
    #[test]
    fn automatic_memory_budget_uses_bbtools_sizing_formula() {
        let config = Config {
            hist_in: Some(PathBuf::from("hist.tsv")),
            hist_len: 1000,
            threads: Some(3),
            build_passes: 2,
            ..Config::default()
        };

        let usable = bbtools_usable_table_memory_bytes(&config, 1_000_000_000);

        assert_eq!(usable, 329_944_000);
    }
13470
    // count_up mode receives exactly half of the automatic count-min filter
    // byte budget (329,960,000 vs 659,920,000 here), matching BBNorm.
    #[test]
    fn countup_auto_memory_budget_halves_filter_bytes_like_bbnorm() {
        let config = Config {
            auto_count_min_memory_bytes: Some(1_000_000_000),
            table_reads: Some(1_000_000),
            ..Config::default()
        };
        // Identical settings except count_up=true.
        let countup_config = Config {
            count_up: true,
            ..config.clone()
        };

        assert_eq!(automatic_count_min_memory_bytes(&config), Some(659_920_000));
        assert_eq!(
            automatic_count_min_memory_bytes(&countup_config),
            Some(329_960_000)
        );
    }
13489
    // Automatic output-side counts receive a quarter of the main automatic
    // budget (164,980,000 vs 659,920,000 here) and use BBTools' second mask
    // seed so they hash differently from the main input sketch.
    #[test]
    fn automatic_output_counts_use_side_budget_and_next_mask_seed() {
        let config = Config {
            auto_count_min_memory_bytes: Some(1_000_000_000),
            table_reads: Some(1_000_000),
            threads: Some(8),
            deterministic: false,
            ..Config::default()
        };

        assert_eq!(automatic_count_min_memory_bytes(&config), Some(659_920_000));
        assert_eq!(
            output_count_min_memory_bytes(&config, 32),
            Some(164_980_000)
        );

        let main = new_atomic_count_min_sketch(&config).unwrap();
        let output = new_output_counts(&config).unwrap();
        let OutputCounts::AtomicSketch(output) = output else {
            panic!("automatic bits=32 output counts should use atomic sketches");
        };
        let main_layout = main.layout_summary("input_main", None);
        let output_layout = output.layout_summary("output_kept", None);

        // The output sketch uses the second mask seed, stays below half the main
        // sketch's memory, and never drops under the enforced floor.
        assert_eq!(
            output_layout.mask_seed,
            BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED
        );
        assert!(output_layout.memory_bytes < main_layout.memory_bytes / 2);
        assert!(output_layout.memory_bytes >= OUTPUT_COUNT_MIN_AUTO_MIN_MEMORY_BYTES);
    }
13521
    // An explicit memory_bytes= budget is passed through unchanged to the
    // output-side sketch, so main and output sketches end up the same size;
    // the output sketch still hashes with the second mask seed.
    #[test]
    fn explicit_output_count_memory_preserves_requested_budget() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: None,
                hashes: Some(3),
                bits: Some(32),
                memory_bytes: Some(128 * 1024 * 1024),
            },
            threads: Some(4),
            ..Config::default()
        };

        assert_eq!(
            output_count_min_memory_bytes(&config, 32),
            Some(128 * 1024 * 1024)
        );
        let main = new_atomic_count_min_sketch(&config).unwrap();
        let output = new_output_counts(&config).unwrap();
        let OutputCounts::AtomicSketch(output) = output else {
            panic!("explicit bits=32 output counts should use atomic sketches");
        };

        assert_eq!(output.cells, main.cells);
        assert_eq!(
            output.layout.mask_seed,
            BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED
        );
    }
13551
13552    #[test]
13553    fn constrained_prefilter_inflates_unsaturated_colliding_counts() {
13554        let config = Config {
13555            prefilter: crate::cli::PrefilterSettings {
13556                enabled: false,
13557                force_disabled: false,
13558                cells: Some(1),
13559                hashes: Some(2),
13560                bits: Some(8),
13561                memory_bytes: None,
13562                memory_fraction_micros: None,
13563            },
13564            ..Config::default()
13565        };
13566        let mut counts = CountMap::default();
13567        counts.insert(KmerKey::Short(7), 2);
13568        counts.insert(KmerKey::Short(11), 5);
13569
13570        apply_prefilter_collision_estimates(&config, &mut counts);
13571
13572        assert_eq!(counts.get(&KmerKey::Short(7)), Some(&7));
13573        assert_eq!(counts.get(&KmerKey::Short(11)), Some(&7));
13574    }
13575
13576    #[test]
13577    fn constrained_prefilter_keeps_exact_counts_after_saturation() {
13578        let config = Config {
13579            prefilter: crate::cli::PrefilterSettings {
13580                enabled: false,
13581                force_disabled: false,
13582                cells: Some(1),
13583                hashes: Some(1),
13584                bits: Some(2),
13585                memory_bytes: None,
13586                memory_fraction_micros: None,
13587            },
13588            ..Config::default()
13589        };
13590        let mut counts = CountMap::default();
13591        counts.insert(KmerKey::Short(7), 2);
13592        counts.insert(KmerKey::Short(11), 5);
13593
13594        apply_prefilter_collision_estimates(&config, &mut counts);
13595
13596        assert_eq!(counts.get(&KmerKey::Short(7)), Some(&2));
13597        assert_eq!(counts.get(&KmerKey::Short(11)), Some(&5));
13598    }
13599
    // A 1000-byte prefilter memory budget at 8 bits/cell yields 1000 total
    // cells, table-adjusted to 998 — large enough here that neither count
    // collides and both stay exact.
    #[test]
    fn prefilter_memory_budget_derives_prime_table_cells() {
        let config = Config {
            prefilter: crate::cli::PrefilterSettings {
                enabled: false,
                force_disabled: false,
                cells: None,
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: Some(1000),
                memory_fraction_micros: None,
            },
            ..Config::default()
        };
        let mut counts = CountMap::default();
        counts.insert(KmerKey::Short(7), 2);
        counts.insert(KmerKey::Short(11), 5);

        // Verify the derived sizing before exercising the estimates themselves.
        let bits = config.prefilter.bits.unwrap();
        let total_cells = count_min_cells_from_memory(config.prefilter.memory_bytes, bits);
        let table_cells = count_min_table_cells_from_total_bits(total_cells, bits);

        assert_eq!(total_cells, 1000);
        assert_eq!(table_cells, 998);

        apply_prefilter_collision_estimates(&config, &mut counts);

        assert_eq!(counts.get(&KmerKey::Short(7)), Some(&2));
        assert_eq!(counts.get(&KmerKey::Short(11)), Some(&5));
    }
13630
    // memory_fraction_micros=350_000 (0.35) of the 10,000-byte automatic budget
    // gives the prefilter 3500 total cells, table-adjusted to 3494 — again big
    // enough that both counts survive unchanged.
    #[test]
    fn prefilter_fraction_derives_memory_from_table_budget() {
        let config = Config {
            auto_count_min_memory_bytes: Some(10_000),
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                cells: None,
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: None,
                memory_fraction_micros: Some(350_000),
            },
            ..Config::default()
        };
        let mut counts = CountMap::default();
        counts.insert(KmerKey::Short(7), 2);
        counts.insert(KmerKey::Short(11), 5);

        let total_cells = prefilter_total_cells(&config, config.prefilter.bits.unwrap());
        let table_cells =
            count_min_table_cells_from_total_bits(total_cells, config.prefilter.bits.unwrap());

        assert_eq!(total_cells, 3500);
        assert_eq!(table_cells, 3494);

        apply_prefilter_collision_estimates(&config, &mut counts);

        assert_eq!(counts.get(&KmerKey::Short(7)), Some(&2));
        assert_eq!(counts.get(&KmerKey::Short(11)), Some(&5));
    }
13662
    // memory_fraction_micros=250_000 (0.25) carves the prefilter's share out of
    // the main 1000-cell budget: the main sketch keeps 750 cells while the
    // prefilter receives 4000 two-bit cells (the remaining quarter of the bit
    // budget re-expressed at 2 bits/cell). max_count 3 is the 2-bit ceiling.
    #[test]
    fn prefilter_fraction_partitions_main_cell_budget() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(1000),
                hashes: Some(1),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                cells: None,
                hashes: Some(1),
                bits: Some(2),
                memory_bytes: None,
                memory_fraction_micros: Some(250_000),
            },
            threads: Some(2),
            ..Config::default()
        };

        assert_eq!(main_count_min_total_cells(&config, 32), 750);
        assert_eq!(prefilter_total_cells(&config, 2), 4000);

        let main = new_atomic_count_min_sketch(&config).unwrap();
        let prefilter = new_prefilter_count_min_sketch(&config).unwrap();

        // Constructed sketches must match the table-adjusted versions of the
        // partitioned totals.
        assert_eq!(main.cells, count_min_table_cells_from_total_bits(750, 32));
        assert_eq!(
            prefilter.cells,
            count_min_table_cells_from_total_bits(4000, 2)
        );
        assert_eq!(prefilter.max_count, 3);
    }
13698
    // Enabling the prefilter without an explicit fraction uses BBTools' default
    // split: the main sketch keeps 6500 of the 10,000-cell budget and the
    // prefilter receives 56,000 two-bit cells.
    #[test]
    fn prefilter_flag_uses_bbtools_default_fraction_on_bounded_count_min_paths() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(2),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                cells: None,
                hashes: Some(2),
                bits: Some(2),
                memory_bytes: None,
                memory_fraction_micros: None,
            },
            ..Config::default()
        };

        assert!(use_prefilter_collision_estimates(&config));
        assert_eq!(main_count_min_total_cells(&config, 32), 6500);
        assert_eq!(prefilter_total_cells(&config, 2), 56_000);
    }
13724
    // memory_fraction_micros=Some(0) must not be treated as "prefilter on":
    // collision estimates stay disabled and the main cell budget is untouched.
    #[test]
    fn zero_prefilter_fraction_does_not_force_prefilter_sketch() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                bits: Some(32),
                ..Default::default()
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: false,
                force_disabled: false,
                memory_fraction_micros: Some(0),
                ..Default::default()
            },
            ..Config::default()
        };

        assert!(!use_prefilter_collision_estimates(&config));
        assert_eq!(main_count_min_total_cells(&config, 32), 10_000);
    }
13745
    // force_disabled (prefilter=f) wins over any lingering prefilter settings —
    // explicit cells, hashes, bits, and even a default fraction: no collision
    // estimates, no fraction, and the full main budget, matching BBNorm.
    #[test]
    fn forced_off_prefilter_ignores_lingering_controls_like_bbnorm() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(3),
                bits: Some(32),
                ..Default::default()
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: false,
                force_disabled: true,
                cells: Some(1_000),
                hashes: Some(1),
                bits: Some(2),
                memory_bytes: None,
                memory_fraction_micros: Some(DEFAULT_PREFILTER_FRACTION_MICROS),
            },
            ..Config::default()
        };

        assert!(!use_prefilter_collision_estimates(&config));
        assert_eq!(prefilter_memory_fraction_micros(&config), None);
        assert_eq!(main_count_min_total_cells(&config, 32), 10_000);
    }
13771
    /// The prefilter's default hash count is derived from the main table's
    /// hash count (8 -> 4 here, matching BBNorm), while an explicitly
    /// configured prefilter hash count overrides that default.
    #[test]
    fn prefilter_default_hashes_track_main_hashes_like_bbnorm() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(8),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                bits: Some(2),
                ..Default::default()
            },
            ..Config::default()
        };

        let prefilter = new_prefilter_count_min_sketch(&config).unwrap();

        // Derived default: main hashes=8 yields prefilter hashes=4.
        assert_eq!(default_prefilter_hashes(&config), 4);
        assert_eq!(prefilter.hashes, 4);

        // An explicit prefilter hash setting takes precedence over the default.
        let explicit = Config {
            prefilter: crate::cli::PrefilterSettings {
                hashes: Some(1),
                ..config.prefilter
            },
            ..config
        };
        let prefilter = new_prefilter_count_min_sketch(&explicit).unwrap();
        assert_eq!(prefilter.hashes, 1);
    }
13805
13806    #[test]
13807    fn explicit_prefilter_hashes_enable_default_partition_like_bbnorm() {
13808        let config = Config {
13809            count_min: crate::cli::CountMinSettings {
13810                cells: Some(10_000),
13811                hashes: Some(3),
13812                bits: Some(32),
13813                memory_bytes: None,
13814            },
13815            prefilter: crate::cli::PrefilterSettings {
13816                enabled: true,
13817                force_disabled: false,
13818                hashes: Some(1),
13819                bits: Some(2),
13820                ..Default::default()
13821            },
13822            ..Config::default()
13823        };
13824
13825        assert_eq!(
13826            prefilter_memory_fraction_micros(&config),
13827            Some(DEFAULT_PREFILTER_FRACTION_MICROS)
13828        );
13829        assert_eq!(main_count_min_total_cells(&config, 32), 6500);
13830        assert_eq!(prefilter_total_cells(&config, 2), 56_000);
13831    }
13832
13833    #[test]
13834    fn prefilter_flag_alone_keeps_small_exact_inputs_on_exact_path() {
13835        let dir = tempfile::tempdir().unwrap();
13836        let path = dir.path().join("reads.fq");
13837        write_fastq(&path, &[("r1", b"ACGTACGT", b"IIIIIIII")]);
13838        let config = Config {
13839            in1: Some(path),
13840            k: 3,
13841            min_quality: 0,
13842            min_prob: 0.0,
13843            prefilter: crate::cli::PrefilterSettings {
13844                enabled: true,
13845                force_disabled: false,
13846                ..Default::default()
13847            },
13848            ..Config::default()
13849        };
13850
13851        let counts = build_input_counts(&config).unwrap();
13852        assert!(matches!(counts, InputCounts::Exact(_)));
13853    }
13854
    /// With the prefilter enabled AND a bounded main count-min table, input
    /// counting becomes a two-stage prefiltered sketch; the exact sizes below
    /// pin the budget split between the packed prefilter and the atomic main.
    #[test]
    fn prefilter_flag_builds_two_stage_sketch_when_count_min_is_bounded() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("reads.fq");
        // Three identical reads so at least some k-mers repeat.
        write_fastq(
            &path,
            &[
                ("r1", b"ACGTACGT", b"IIIIIIII"),
                ("r2", b"ACGTACGT", b"IIIIIIII"),
                ("r3", b"ACGTACGT", b"IIIIIIII"),
            ],
        );
        let config = Config {
            in1: Some(path),
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            count_min: crate::cli::CountMinSettings {
                cells: Some(512),
                hashes: Some(2),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                ..Default::default()
            },
            ..Config::default()
        };

        let counts = build_input_counts(&config).unwrap();
        let InputCounts::PrefilteredSketch {
            prefilter,
            limit,
            main,
        } = counts
        else {
            panic!("prefilter=t plus bounded count-min should build a two-stage sketch");
        };
        // Prefilter uses the default narrow cell width and saturates at its
        // packed maximum, which doubles as the fall-through limit.
        assert_eq!(prefilter.bits(), DEFAULT_PREFILTER_BITS);
        assert_eq!(limit, prefilter.max_count());
        // Exact cell totals pin the prefilter/main budget partition.
        assert_eq!(prefilter_total_cells(&config, DEFAULT_PREFILTER_BITS), 2867);
        assert_eq!(main_count_min_total_cells(&config, 32), 332);
        assert!(matches!(*main, InputCounts::AtomicSketch(_)));
    }
13901
13902    #[test]
13903    fn explicit_prefilter_memory_does_not_shrink_main_table_budget() {
13904        let config = Config {
13905            count_min: crate::cli::CountMinSettings {
13906                cells: Some(1000),
13907                hashes: Some(1),
13908                bits: Some(32),
13909                memory_bytes: None,
13910            },
13911            prefilter: crate::cli::PrefilterSettings {
13912                enabled: true,
13913                force_disabled: false,
13914                cells: None,
13915                hashes: Some(1),
13916                bits: Some(2),
13917                memory_bytes: Some(100),
13918                memory_fraction_micros: Some(250_000),
13919            },
13920            ..Config::default()
13921        };
13922
13923        assert_eq!(main_count_min_total_cells(&config, 32), 1000);
13924        assert_eq!(prefilter_total_cells(&config, 2), 400);
13925    }
13926
13927    #[test]
13928    fn prefiltered_input_counts_use_prefilter_until_saturation() {
13929        let low = KmerKey::Short(1);
13930        let high = KmerKey::Short(2);
13931        let mut prefilter = PackedCountMinSketch::new(4099, 2, 2).unwrap();
13932        prefilter.add_key_count(&low, 2);
13933        prefilter.add_key_count(&high, 3);
13934
13935        let main = AtomicCountMinSketch::new(4099, 2).unwrap();
13936        main.add_key_count(&low, 99);
13937        main.add_key_count(&high, 5);
13938
13939        let counts = InputCounts::PrefilteredSketch {
13940            limit: prefilter.max_count,
13941            prefilter: PrefilterCountMinSketch::Packed(prefilter),
13942            main: Box::new(InputCounts::AtomicSketch(main)),
13943        };
13944
13945        assert_eq!(counts.depth(&low), 2);
13946        assert_eq!(counts.depth(&high), 5);
13947    }
13948
13949    #[test]
13950    fn prefiltered_input_counts_honor_explicit_lower_prefilter_limit() {
13951        let key = KmerKey::Short(7);
13952        let mut prefilter = PackedCountMinSketch::new(4099, 2, 2).unwrap();
13953        prefilter.add_key_count(&key, 2);
13954
13955        let main = AtomicCountMinSketch::new(4099, 2).unwrap();
13956        main.add_key_count(&key, 11);
13957
13958        let counts = InputCounts::PrefilteredSketch {
13959            limit: 2,
13960            prefilter: PrefilterCountMinSketch::Packed(prefilter),
13961            main: Box::new(InputCounts::AtomicSketch(main)),
13962        };
13963
13964        assert_eq!(counts.depth(&key), 11);
13965    }
13966
    /// `sketch_layouts()` on a two-stage input must describe both tables in
    /// order: the packed prefilter first (with its saturation limit), then the
    /// atomic main table — each with its exact bits/hashes/seed/mode metadata.
    #[test]
    fn input_count_layout_summary_reports_prefilter_and_main_tables() {
        // Packed prefilter: 4099 cells, 2 hashes, 2-bit cells, >=4 arrays, seed 0.
        let prefilter =
            PackedCountMinSketch::new_with_min_arrays_and_mask_seed(4099, 2, 2, 4, 0).unwrap();
        // Atomic main: 8191 cells, 3 hashes, >=4 arrays, conservative updates, seed 7.
        let main = AtomicCountMinSketch::new_with_min_arrays_and_update_mode(
            8191,
            3,
            4,
            CountMinUpdateMode::Conservative,
            7,
        )
        .unwrap();
        let counts = InputCounts::PrefilteredSketch {
            limit: prefilter.max_count,
            prefilter: PrefilterCountMinSketch::Packed(prefilter),
            main: Box::new(InputCounts::AtomicSketch(main)),
        };

        let layouts = counts.sketch_layouts();

        assert_eq!(layouts.len(), 2);
        // First entry: the prefilter table, reported as packed with its limit.
        assert_eq!(layouts[0].table, "input_prefilter");
        assert_eq!(layouts[0].kind, "packed");
        assert_eq!(layouts[0].bits, 2);
        assert_eq!(layouts[0].hashes, 2);
        assert_eq!(layouts[0].mask_seed, 0);
        assert_eq!(layouts[0].update_mode, "conservative");
        assert_eq!(layouts[0].prefilter_limit, Some(3));
        assert!(layouts[0].memory_bytes > 0);
        // Second entry: the main table, with 32-bit atomic cells and no limit.
        assert_eq!(layouts[1].table, "input_main");
        assert_eq!(layouts[1].kind, "atomic");
        assert_eq!(layouts[1].bits, 32);
        assert_eq!(layouts[1].hashes, 3);
        assert_eq!(layouts[1].mask_seed, 7);
        assert_eq!(layouts[1].prefilter_limit, None);
        assert!(layouts[1].arrays >= 4);
        assert!(layouts[1].memory_bytes >= layouts[1].cells * std::mem::size_of::<AtomicU32>());
    }
14005
14006    #[test]
14007    fn prefilter_gate_uses_explicit_limit_for_main_counts() {
14008        let below = KmerKey::Short(1);
14009        let at_limit = KmerKey::Short(2);
14010        let above = KmerKey::Short(3);
14011        let mut prefilter = PackedCountMinSketch::new(4099, 2, 2).unwrap();
14012        prefilter.add_key_count(&below, 1);
14013        prefilter.add_key_count(&at_limit, 2);
14014        prefilter.add_key_count(&above, 3);
14015
14016        let mut counts = CountMap::default();
14017        counts.insert(below.clone(), 10);
14018        counts.insert(at_limit.clone(), 20);
14019        counts.insert(above.clone(), 30);
14020
14021        let prefilter = PrefilterCountMinSketch::Packed(prefilter);
14022        retain_prefilter_saturated_counts(&mut counts, Some(PrefilterGate::new(&prefilter, 2)));
14023
14024        assert!(!counts.contains_key(&below));
14025        assert_eq!(counts.get(&at_limit), Some(&20));
14026        assert_eq!(counts.get(&above), Some(&30));
14027    }
14028
    /// Applying the prefilter gate while collecting counts must produce
    /// exactly the same map as counting everything first and then retaining
    /// only gate-passing keys — in both duplicate-kmer modes.
    #[test]
    fn prefilter_gate_during_collection_matches_post_retain() {
        let r1 = record("r1", b"ACGTACGTACGT");
        let r2 = record("r2", b"TGCATGCATGCA");

        for remove_duplicate_kmers in [false, true] {
            let config = Config {
                k: 3,
                min_quality: 0,
                min_prob: 0.0,
                remove_duplicate_kmers,
                ..Config::default()
            };
            let mut prefilter = PackedCountMinSketch::new(4099, 2, 2).unwrap();
            // Saturate every other k-mer in the prefilter so the gate accepts
            // some keys and rejects others.
            let keys = unique_pair_kmers(&config, &r1, Some(&r2));
            for key in keys.iter().step_by(2) {
                prefilter.add_key_count(key, prefilter.max_count);
            }
            let prefilter = PrefilterCountMinSketch::Packed(prefilter);
            let gate = PrefilterGate::new(&prefilter, prefilter.max_count());
            // Sanity check: the fixture must actually exercise rejection.
            assert!(
                keys.iter().any(|key| !gate.should_count_in_main(key)),
                "fixture should include at least one prefilter-rejected k-mer"
            );

            // Reference path: count everything, then retain gate-passing keys.
            let mut post_retain = CountMap::default();
            increment_pair_counts(&config, &mut post_retain, &r1, Some(&r2));
            retain_prefilter_saturated_counts(&mut post_retain, Some(gate));

            // Path under test: gate applied during collection.
            let mut during_collection = CountMap::default();
            increment_pair_counts_with_prefilter(
                &config,
                &mut during_collection,
                &r1,
                Some(&r2),
                Some(gate),
            );

            assert_eq!(during_collection, post_retain);
        }
    }
14070
    /// Unique-k-mer estimates on a two-stage input: thresholds at or below the
    /// prefilter ceiling are answered from the prefilter's occupancy, while
    /// thresholds above it come from the main sketch; the split report must
    /// agree with both.
    #[test]
    fn prefiltered_input_counts_use_thresholded_main_unique_estimates_above_prefilter_max() {
        let mut prefilter = PackedCountMinSketch::new(1024, 4, 2).unwrap();
        // Seed cells directly: 128 buckets saturated, 128 at depth 1.
        for bucket in 0..256 {
            let depth = if bucket < 128 { prefilter.max_count } else { 1 };
            prefilter.set_cell(bucket, depth);
        }
        prefilter.increments = 1_000;

        // Main sketch mirrors the 128 saturated buckets at depth 4.
        let main = AtomicCountMinSketch::new(1024, 4).unwrap();
        for bucket in 0..128 {
            main.cells_by_hash[bucket].store(4, Ordering::Relaxed);
        }
        main.add_key_increments(1_000);

        let counts = InputCounts::PrefilteredSketch {
            limit: prefilter.max_count,
            prefilter: PrefilterCountMinSketch::Packed(prefilter),
            main: Box::new(InputCounts::AtomicSketch(main)),
        };

        let all_depth_estimated = counts.unique_kmers();
        let saturated_prefilter_estimated = counts.unique_kmers_at_least(2);
        let high_depth_estimated = counts.unique_kmers_at_least(4);
        let split = counts.unique_kmer_estimate_split().unwrap();

        // Occupancy-based estimates are approximate, so assert ranges.
        assert!(
            (70..=80).contains(&all_depth_estimated),
            "prefilter all-depth estimate was {all_depth_estimated}"
        );
        assert!(
            (30..=40).contains(&saturated_prefilter_estimated),
            "prefilter threshold estimate was {saturated_prefilter_estimated}"
        );
        assert!(
            (30..=40).contains(&high_depth_estimated),
            "main high-depth estimate was {high_depth_estimated}"
        );
        // The split boundary sits just above the prefilter ceiling, and the
        // low/high halves must be consistent with the direct estimates.
        assert_eq!(split.low_depth_max, 3);
        assert_eq!(split.high_depth_min, 4);
        assert_eq!(split.high_depth_kmers, high_depth_estimated);
        assert_eq!(
            split.low_depth_kmers,
            all_depth_estimated.saturating_sub(high_depth_estimated)
        );
        assert!(
            (30..=50).contains(&split.low_depth_kmers),
            "prefilter low-depth split estimate was {}",
            split.low_depth_kmers
        );
    }
14122
    /// An explicit prefilter memory budget combined with a bounded main
    /// count-min table must also produce the two-stage prefiltered sketch,
    /// even with the prefilter flag itself left off.
    #[test]
    fn bounded_input_counts_builds_two_stage_prefiltered_sketch() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("reads.fq");
        // Three identical reads so at least some k-mers repeat.
        write_fastq(
            &path,
            &[
                ("r1", b"ACGTACGT", b"IIIIIIII"),
                ("r2", b"ACGTACGT", b"IIIIIIII"),
                ("r3", b"ACGTACGT", b"IIIIIIII"),
            ],
        );
        let config = Config {
            in1: Some(path),
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            count_min: crate::cli::CountMinSettings {
                cells: Some(128),
                hashes: Some(2),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: false,
                force_disabled: false,
                cells: None,
                hashes: Some(2),
                bits: None,
                // The explicit byte budget alone triggers the prefilter stage.
                memory_bytes: Some(1024),
                memory_fraction_micros: None,
            },
            ..Config::default()
        };

        let counts = build_input_counts(&config).unwrap();
        let InputCounts::PrefilteredSketch {
            prefilter,
            limit,
            main,
        } = counts
        else {
            panic!("prefilter memory plus bounded count-min should build a two-stage sketch");
        };
        // Default 2-bit cells saturate at 3; that ceiling is the limit.
        assert_eq!(prefilter.bits(), DEFAULT_PREFILTER_BITS);
        assert_eq!(prefilter.max_count(), 3);
        assert_eq!(limit, prefilter.max_count());
        assert_eq!(prefilter.update_mode(), CountMinUpdateMode::Conservative);
        assert!(matches!(*main, InputCounts::AtomicSketch(_)));
    }
14173
14174    #[test]
14175    fn trusted_build_pass_filter_reduces_non_singleton_depths() {
14176        let config = Config {
14177            build_passes: 2,
14178            ..Config::default()
14179        };
14180        let mut counts = CountMap::default();
14181        counts.insert(KmerKey::Short(7), 1);
14182        counts.insert(KmerKey::Short(11), 2);
14183        counts.insert(KmerKey::Short(13), 3);
14184
14185        apply_trusted_build_pass_filter(&config, &mut counts);
14186
14187        assert_eq!(counts.get(&KmerKey::Short(7)), Some(&1));
14188        assert_eq!(counts.get(&KmerKey::Short(11)), Some(&1));
14189        assert_eq!(counts.get(&KmerKey::Short(13)), Some(&2));
14190    }
14191
    /// With overlap error-correction in auto mode, the resolver samples the
    /// paired input like BBNorm's Java implementation; for this fixture the
    /// sample yields no qualifying pairs, so overlap repair ends up disabled
    /// and auto mode is consumed.
    #[test]
    fn ecco_auto_disables_overlap_repair_when_java_style_sample_is_empty() {
        let dir = tempfile::tempdir().unwrap();
        let r1_path = dir.path().join("r1.fq");
        let r2_path = dir.path().join("r2.fq");
        // Four identical pairs; r2 is the reverse complement of r1.
        let r1 = b"TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGAC";
        let r2 = b"GTCGCATATTTCAAGCACTAATTCGCTGCGGCACAACTAA";
        let q = b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII";
        write_fastq(
            &r1_path,
            &[
                ("overlap1/1", r1, q),
                ("overlap2/1", r1, q),
                ("overlap3/1", r1, q),
                ("overlap4/1", r1, q),
            ],
        );
        write_fastq(
            &r2_path,
            &[
                ("overlap1/2", r2, q),
                ("overlap2/2", r2, q),
                ("overlap3/2", r2, q),
                ("overlap4/2", r2, q),
            ],
        );
        let config = Config {
            in1: Some(r1_path),
            in2: Some(r2_path),
            error_correct: true,
            error_correct_first: true,
            error_correct_final: true,
            overlap_error_correct_auto: true,
            ..Config::default()
        };

        let resolved = resolve_overlap_error_correct_auto(&config).unwrap();

        // Auto mode is resolved (turned off) and overlap repair stays off.
        assert!(!resolved.overlap_error_correct_auto);
        assert!(!resolved.overlap_error_correct);
    }
14233
    /// Counterpart to the empty-sample test: 200 repeated pairs whose mates
    /// overlap (r2 differs from the reverse complement by one base) let the
    /// auto-mode sample find mergeable pairs, so overlap repair is enabled.
    #[test]
    fn ecco_auto_enables_overlap_repair_for_sampled_mergeable_pairs() {
        let dir = tempfile::tempdir().unwrap();
        let r1_path = dir.path().join("r1.fq");
        let r2_path = dir.path().join("r2.fq");
        let r1 = b"TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGAC";
        let r2 = b"GTCGCATATTTCAAGCACTACTTCGCTGCGGCACAACTAA";
        let q = b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII";
        write_repeated_fastq(&r1_path, "overlap/1_", r1, q, 200);
        write_repeated_fastq(&r2_path, "overlap/2_", r2, q, 200);
        let config = Config {
            in1: Some(r1_path),
            in2: Some(r2_path),
            error_correct: true,
            error_correct_first: true,
            error_correct_final: true,
            overlap_error_correct_auto: true,
            ..Config::default()
        };

        let resolved = resolve_overlap_error_correct_auto(&config).unwrap();

        // Auto mode is resolved and overlap repair comes on.
        assert!(!resolved.overlap_error_correct_auto);
        assert!(resolved.overlap_error_correct);
    }
14259
    /// The `add_bad_reads_countup` (abrc) switch controls whether tossed reads
    /// still update the kept-count table. The three calls below accumulate
    /// into the same `kept_counts` table, so depths grow across stages.
    #[test]
    fn countup_abrc_controls_tossed_read_table_updates() {
        let keys = vec![KmerKey::Short(7), KmerKey::Short(11)];
        let mut input_counts = CountMap::default();
        input_counts.insert(keys[0].clone(), 3);
        input_counts.insert(keys[1].clone(), 3);

        let base_config = Config {
            min_depth: 1,
            ..Config::default()
        };
        let mut kept_counts = OutputCounts::Exact(CountMap::default());
        // Tossed read, abrc off: the table must stay empty.
        update_countup_kept_counts_for_decision(
            &base_config,
            &mut kept_counts,
            &input_counts,
            &keys,
            true,
        );
        assert_eq!(kept_counts.unique_kmers(), 0);

        // Tossed read, abrc on: the tossed read's k-mers are still counted.
        let add_bad_config = Config {
            add_bad_reads_countup: true,
            ..base_config.clone()
        };
        update_countup_kept_counts_for_decision(
            &add_bad_config,
            &mut kept_counts,
            &input_counts,
            &keys,
            true,
        );
        assert_eq!(kept_counts.depth(&keys[0]), 1);
        assert_eq!(kept_counts.depth(&keys[1]), 1);

        // Kept read: always counted, regardless of abrc.
        update_countup_kept_counts_for_decision(
            &base_config,
            &mut kept_counts,
            &input_counts,
            &keys,
            false,
        );
        assert_eq!(kept_counts.depth(&keys[0]), 2);
        assert_eq!(kept_counts.depth(&keys[1]), 2);
    }
14305
    /// The precomputed countup decision plan must agree with the direct
    /// decision path: same toss verdict, and replaying its eligible-key list
    /// produces the same kept-count updates as the decision-based updater.
    #[test]
    fn countup_decision_plan_reuses_input_depth_gate_for_kept_updates() {
        // Key depths 0 / 3 / 4 against min_depth=2: only indices 1 and 2 pass.
        let keys = vec![KmerKey::Short(7), KmerKey::Short(11), KmerKey::Short(13)];
        let mut input_counts = CountMap::default();
        input_counts.insert(keys[0].clone(), 0);
        input_counts.insert(keys[1].clone(), 3);
        input_counts.insert(keys[2].clone(), 4);
        let kept_counts = CountMap::default();
        let config = Config {
            min_depth: 2,
            min_kmers_over_min_depth: 1,
            target_depth: 10,
            add_bad_reads_countup: true,
            ..Config::default()
        };

        let plan = countup_decision_plan(&config, &input_counts, &kept_counts, &keys, 10);
        // The plan's verdict matches the direct decision function.
        assert_eq!(
            plan.toss,
            decide_countup_pair(&config, &input_counts, &kept_counts, &keys, 10)
        );
        assert_eq!(plan.eligible_key_indices, vec![1, 2]);

        // Applying the plan...
        let mut planned_counts = OutputCounts::Exact(CountMap::default());
        update_countup_kept_counts_for_plan(&config, &mut planned_counts, &keys, &plan);

        // ...must match replaying the decision-based updater.
        let mut replayed_counts = OutputCounts::Exact(CountMap::default());
        update_countup_kept_counts_for_decision(
            &config,
            &mut replayed_counts,
            &input_counts,
            &keys,
            plan.toss,
        );

        assert_eq!(
            planned_counts.unique_kmers(),
            replayed_counts.unique_kmers()
        );
        // The depth-0 key is filtered by the gate; the others count once each.
        assert_eq!(planned_counts.depth(&keys[0]), 0);
        assert_eq!(planned_counts.depth(&keys[1]), 1);
        assert_eq!(planned_counts.depth(&keys[2]), 1);
    }
14349
    /// When `cells=` bounds the table, the countup kept-count store must be a
    /// packed sketch (not an exact map), and two updates to the same keys must
    /// register depth 2 through the sketch.
    #[test]
    fn countup_bounded_kept_counts_use_sketch_when_cells_are_constrained() {
        let keys = vec![KmerKey::Short(7), KmerKey::Short(11)];
        let mut input_counts = CountMap::default();
        input_counts.insert(keys[0].clone(), 3);
        input_counts.insert(keys[1].clone(), 3);
        let config = Config {
            min_depth: 1,
            // Deliberately tiny table so the sketch path must be chosen.
            count_min: crate::cli::CountMinSettings {
                cells: Some(1),
                hashes: Some(2),
                bits: Some(3),
                memory_bytes: None,
            },
            ..Config::default()
        };
        let mut kept_counts = new_output_counts(&config).unwrap();

        update_countup_kept_counts_for_decision(
            &config,
            &mut kept_counts,
            &input_counts,
            &keys,
            false,
        );

        let OutputCounts::Sketch(sketch) = kept_counts else {
            panic!("countup cells= should use a bounded kept-count sketch");
        };
        // A single backing word holds the whole constrained table.
        assert_eq!(sketch.words.len(), 1);
        // NOTE(review): depth 2 after one kept-pair update — presumably each
        // key is counted twice here (e.g. per-mate); verify against the
        // updater's semantics.
        assert_eq!(sketch.depth(&keys[0]), 2);
        assert_eq!(sketch.depth(&keys[1]), 2);
    }
14383
14384    #[test]
14385    fn countup_kept_count_sketch_uses_java_target_sized_cells() {
14386        let config = Config {
14387            count_up: true,
14388            target_depth: 100,
14389            threads: Some(1),
14390            count_min: crate::cli::CountMinSettings {
14391                cells: Some(10_000),
14392                hashes: Some(8),
14393                bits: Some(32),
14394                memory_bytes: None,
14395            },
14396            ..Config::default()
14397        };
14398
14399        let kept_counts = new_output_counts(&config).unwrap();
14400
14401        let OutputCounts::Sketch(sketch) = kept_counts else {
14402            panic!("countup kept-count table should use a packed sketch");
14403        };
14404        assert_eq!(sketch.bits, 8);
14405        assert_eq!(sketch.hashes, 3);
14406        assert_eq!(sketch.cells, 9_998);
14407        assert_eq!(
14408            sketch.layout.mask_seed,
14409            BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED
14410        );
14411    }
14412
14413    #[test]
14414    fn countup_kept_count_bits_use_adjusted_target_boundaries_like_bbnorm() {
14415        assert_eq!(
14416            countup_output_count_bits(&Config {
14417                count_up: true,
14418                target_depth: 16,
14419                ..Config::default()
14420            }),
14421            4
14422        );
14423        assert_eq!(
14424            countup_output_count_bits(&Config {
14425                count_up: true,
14426                target_depth: 17,
14427                ..Config::default()
14428            }),
14429            8
14430        );
14431        assert_eq!(
14432            countup_output_count_bits(&Config {
14433                count_up: true,
14434                target_depth: 268,
14435                ..Config::default()
14436            }),
14437            8
14438        );
14439        assert_eq!(
14440            countup_output_count_bits(&Config {
14441                count_up: true,
14442                target_depth: 269,
14443                ..Config::default()
14444            }),
14445            16
14446        );
14447    }
14448
14449    #[test]
14450    fn output_pair_analysis_is_only_required_for_rename_or_depth_bins() {
14451        assert!(!needs_output_pair_analysis(&Config::default()));
14452        assert!(needs_output_pair_analysis(&Config {
14453            rename_reads: true,
14454            ..Config::default()
14455        }));
14456        assert!(needs_output_pair_analysis(&Config {
14457            out_low1: Some(PathBuf::from("low.fq")),
14458            ..Config::default()
14459        }));
14460        assert!(needs_output_pair_analysis(&Config {
14461            out_high2: Some(PathBuf::from("high2.fq")),
14462            ..Config::default()
14463        }));
14464    }
14465
    /// With a prefilter present, the kept-count sketch takes the NEXT mask
    /// seed in the sequence (second seed plus one step), so the three tables
    /// (prefilter, main, kept) never share a seed.
    #[test]
    fn countup_kept_count_sketch_uses_next_mask_seed_after_prefilter_and_main() {
        let config = Config {
            count_up: true,
            target_depth: 100,
            threads: Some(1),
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                ..Default::default()
            },
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(3),
                bits: Some(32),
                memory_bytes: None,
            },
            ..Config::default()
        };

        let kept_counts = new_output_counts(&config).unwrap();

        let OutputCounts::Sketch(sketch) = kept_counts else {
            panic!("countup kept-count table should use a packed sketch");
        };
        // Seed is advanced one step past the second (main-table) seed.
        assert_eq!(
            sketch.layout.mask_seed,
            BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED + BBTOOLS_KCOUNT_ARRAY_MASK_SEED_STEP
        );
    }
14496
14497    #[test]
14498    fn multipass_caps_wide_count_min_bits_like_bbnorm() {
14499        let mut default_bits = Config {
14500            passes: 2,
14501            ..Config::default()
14502        };
14503        apply_bbtools_multipass_cell_bits_cap(&mut default_bits);
14504        assert_eq!(default_bits.count_min.bits, Some(16));
14505
14506        let mut explicit_wide_bits = Config {
14507            passes: 2,
14508            count_min: crate::cli::CountMinSettings {
14509                bits: Some(32),
14510                ..Default::default()
14511            },
14512            ..Config::default()
14513        };
14514        apply_bbtools_multipass_cell_bits_cap(&mut explicit_wide_bits);
14515        assert_eq!(explicit_wide_bits.count_min.bits, Some(16));
14516
14517        let mut explicit_narrow_bits = Config {
14518            passes: 2,
14519            count_min: crate::cli::CountMinSettings {
14520                bits: Some(8),
14521                ..Default::default()
14522            },
14523            ..Config::default()
14524        };
14525        apply_bbtools_multipass_cell_bits_cap(&mut explicit_narrow_bits);
14526        assert_eq!(explicit_narrow_bits.count_min.bits, Some(8));
14527
14528        let mut single_pass = Config {
14529            passes: 1,
14530            ..Config::default()
14531        };
14532        apply_bbtools_multipass_cell_bits_cap(&mut single_pass);
14533        assert_eq!(single_pass.count_min.bits, None);
14534    }
14535
14536    #[test]
14537    fn multipass_intermediate_pass_uses_bits1_like_bbnorm() {
14538        let config = Config {
14539            passes: 2,
14540            count_min_bits_first: Some(8),
14541            count_min: crate::cli::CountMinSettings {
14542                bits: Some(16),
14543                ..Default::default()
14544            },
14545            ..Config::default()
14546        };
14547
14548        let pass_config = pass_config_for_intermediate(
14549            &config,
14550            1,
14551            Path::new("in1.fq"),
14552            None,
14553            false,
14554            PathBuf::from("out1.fq"),
14555            None,
14556            None,
14557            None,
14558        );
14559
14560        assert_eq!(pass_config.count_min.bits, Some(8));
14561        assert_eq!(config.count_min.bits, Some(16));
14562    }
14563
14564    #[test]
14565    fn count_map_capacity_hint_uses_initialsize_and_prealloc() {
14566        let explicit = Config {
14567            table_initial_size: Some(1234),
14568            ..Config::default()
14569        };
14570        assert_eq!(count_map_capacity_hint(&explicit), Some(1234));
14571
14572        let paired_prealloc = Config {
14573            table_prealloc_fraction: Some(0.5),
14574            table_reads: Some(10),
14575            in2: Some(PathBuf::from("mate.fq")),
14576            k: 31,
14577            ..Config::default()
14578        };
14579        assert_eq!(preallocation_capacity_hint(&paired_prealloc), Some(700));
14580
14581        let larger_prealloc = Config {
14582            table_initial_size: Some(100),
14583            table_prealloc_fraction: Some(1.0),
14584            table_reads: Some(10),
14585            in2: Some(PathBuf::from("mate.fq")),
14586            k: 31,
14587            ..Config::default()
14588        };
14589        assert_eq!(count_map_capacity_hint(&larger_prealloc), Some(1400));
14590    }
14591
    #[test]
    fn countup_presort_prefers_low_error_reads_like_java() {
        // Mirrors BBNorm's countup presort: a read whose k-mers are all well
        // covered in the input table must sort ahead of a read containing
        // low-coverage ("error-looking") k-mers.
        let config = Config {
            k: 3,
            min_depth: 1,
            low_thresh: 1,
            high_thresh: 3,
            error_detect_ratio: 2,
            low_percentile: 0.20,
            ..Config::default()
        };
        // "clean": a homopolymer read whose every k-mer is seeded below at
        // depth 10, so no errors are detected.
        let clean = SequenceRecord {
            id: "clean".to_string(),
            numeric_id: 2,
            bases: b"AAAAAAAAAA".to_vec(),
            qualities: Some(vec![b'I'; 10]),
        };
        // "noisy": the C-containing half introduces k-mers that are absent
        // from the count table, so its sort key accumulates errors.
        let noisy = SequenceRecord {
            id: "noisy".to_string(),
            numeric_id: 1,
            bases: b"AAAAACCCCC".to_vec(),
            qualities: Some(vec![b'I'; 10]),
        };
        let mut input_counts = CountMap::default();
        for key in kmers_for_record(&clean, &config) {
            input_counts.insert(key, 10);
        }

        // Deliberately list the noisy pair first so the comparator, not the
        // initial ordering, determines the winner.
        let mut pairs = [
            CountupWorkPair {
                input_list_index: 0,
                sort_key: countup_sort_key(&config, &input_counts, &noisy, None, 0),
                r1: noisy,
                r2: None,
            },
            CountupWorkPair {
                input_list_index: 0,
                sort_key: countup_sort_key(&config, &input_counts, &clean, None, 1),
                r1: clean,
                r2: None,
            },
        ];
        pairs.sort_by(compare_countup_work_pairs);

        // The error-free read sorts first and the noisy read has strictly
        // more detected errors.
        assert_eq!(pairs[0].r1.id, "clean");
        assert_eq!(pairs[0].sort_key.errors, 0);
        assert!(pairs[1].sort_key.errors > pairs[0].sort_key.errors);
    }
14640
14641    #[test]
14642    fn countup_presort_tie_breaks_by_record_id_without_duplicate_key_id() {
14643        fn tied_pair(id: &str, original_index: usize) -> CountupWorkPair {
14644            CountupWorkPair {
14645                input_list_index: 0,
14646                sort_key: CountupSortKey {
14647                    errors: 0,
14648                    total_len: 8,
14649                    expected_errors: 0.0,
14650                    numeric_id: 0,
14651                    original_index,
14652                },
14653                r1: record(id, b"ACGTACGT"),
14654                r2: None,
14655            }
14656        }
14657
14658        let mut pairs = [tied_pair("read_b", 0), tied_pair("read_a", 1)];
14659        pairs.sort_by(compare_countup_work_pairs);
14660
14661        assert_eq!(pairs[0].r1.id, "read_a");
14662        assert_eq!(pairs[1].r1.id, "read_b");
14663    }
14664
    #[test]
    fn countup_spilled_runs_merge_like_in_memory_sort() {
        // Builds a work pair with a fully specified sort key so the expected
        // merge order is deterministic.
        fn work_pair(
            id: &str,
            errors: usize,
            len: usize,
            original_index: usize,
        ) -> CountupWorkPair {
            CountupWorkPair {
                input_list_index: 0,
                sort_key: CountupSortKey {
                    errors,
                    total_len: len,
                    expected_errors: errors as f64,
                    numeric_id: original_index as u64,
                    original_index,
                },
                r1: record(id, b"ACGTACGT"),
                r2: None,
            }
        }

        let config = Config::default();
        let mut temp_dir = None;
        let mut run_paths = Vec::new();
        let mut spill_summary = CountupSpillSummary::default();
        // Two deliberately unsorted runs: spilling should sort each run on
        // disk, and the streamed merge should interleave them globally.
        let mut first_run = vec![work_pair("worse", 2, 8, 2), work_pair("best", 0, 8, 0)];
        let mut second_run = vec![work_pair("longer", 1, 12, 1), work_pair("shorter", 1, 8, 3)];
        // Reference: a plain in-memory sort over all pairs.
        let mut expected = first_run.clone();
        expected.extend(second_run.clone());
        expected.sort_by(compare_countup_work_pairs);

        spill_countup_run(
            &config,
            &mut temp_dir,
            &mut run_paths,
            &mut spill_summary,
            &mut first_run,
        )
        .unwrap();
        spill_countup_run(
            &config,
            &mut temp_dir,
            &mut run_paths,
            &mut spill_summary,
            &mut second_run,
        )
        .unwrap();
        spill_summary.final_runs = run_paths.len();
        // Drain the spilled runs back through the merging work source.
        let source = CountupWorkSource {
            temp_dir,
            inner: CountupWorkSourceInner::Spilled(run_paths),
        };
        let mut iter = source.into_iter().unwrap();
        let mut actual_ids = Vec::new();
        while let Some(pair) = iter.next_pair().unwrap() {
            actual_ids.push(pair.r1.id);
        }
        let expected_ids: Vec<_> = expected.into_iter().map(|pair| pair.r1.id).collect();

        // The merged stream matches the in-memory sort exactly.
        assert_eq!(actual_ids, expected_ids);
        assert_eq!(actual_ids, ["best", "longer", "shorter", "worse"]);
        // Two initial runs, no compaction merges, bytes were written, and the
        // live-byte peak equals the final total (nothing was deleted).
        assert_eq!(spill_summary.initial_runs, 2);
        assert_eq!(spill_summary.merge_runs, 0);
        assert_eq!(spill_summary.final_runs, 2);
        assert!(spill_summary.bytes_written > 0);
        assert_eq!(
            spill_summary.peak_live_bytes,
            spill_summary.final_live_bytes
        );
    }
14736
14737    #[test]
14738    fn countup_spill_live_limit_aborts_initial_run() {
14739        let config = Config {
14740            max_countup_spill_live_bytes: Some(0),
14741            ..Config::default()
14742        };
14743        let mut temp_dir = None;
14744        let mut run_paths = Vec::new();
14745        let mut spill_summary = CountupSpillSummary::default();
14746        let mut run = vec![CountupWorkPair {
14747            input_list_index: 0,
14748            sort_key: CountupSortKey {
14749                errors: 0,
14750                total_len: 8,
14751                expected_errors: 0.0,
14752                numeric_id: 0,
14753                original_index: 0,
14754            },
14755            r1: record("read", b"ACGTACGT"),
14756            r2: None,
14757        }];
14758
14759        let err = spill_countup_run(
14760            &config,
14761            &mut temp_dir,
14762            &mut run_paths,
14763            &mut spill_summary,
14764            &mut run,
14765        )
14766        .unwrap_err()
14767        .to_string();
14768
14769        assert!(err.contains("maxcountupspillbytes"), "{err}");
14770        assert_eq!(spill_summary.initial_runs, 1);
14771        assert!(spill_summary.peak_live_bytes > 0);
14772        assert_eq!(run_paths.len(), 1);
14773    }
14774
14775    #[test]
14776    fn countup_spill_final_live_limit_aborts_initial_run() {
14777        let config = Config {
14778            max_countup_spill_final_live_bytes: Some(0),
14779            ..Config::default()
14780        };
14781        let mut temp_dir = None;
14782        let mut run_paths = Vec::new();
14783        let mut spill_summary = CountupSpillSummary::default();
14784        let mut run = vec![CountupWorkPair {
14785            input_list_index: 0,
14786            sort_key: CountupSortKey {
14787                errors: 0,
14788                total_len: 8,
14789                expected_errors: 0.0,
14790                numeric_id: 0,
14791                original_index: 0,
14792            },
14793            r1: record("read", b"ACGTACGT"),
14794            r2: None,
14795        }];
14796
14797        let err = spill_countup_run(
14798            &config,
14799            &mut temp_dir,
14800            &mut run_paths,
14801            &mut spill_summary,
14802            &mut run,
14803        )
14804        .unwrap_err()
14805        .to_string();
14806
14807        assert!(err.contains("maxcountupspillfinallivebytes"), "{err}");
14808        assert_eq!(spill_summary.initial_runs, 1);
14809        assert!(spill_summary.final_live_bytes > 0);
14810        assert_eq!(run_paths.len(), 1);
14811    }
14812
14813    #[test]
14814    fn countup_spill_initial_run_limit_aborts_initial_run() {
14815        let config = Config {
14816            max_countup_spill_initial_runs: Some(0),
14817            ..Config::default()
14818        };
14819        let mut temp_dir = None;
14820        let mut run_paths = Vec::new();
14821        let mut spill_summary = CountupSpillSummary::default();
14822        let mut run = vec![CountupWorkPair {
14823            input_list_index: 0,
14824            sort_key: CountupSortKey {
14825                errors: 0,
14826                total_len: 8,
14827                expected_errors: 0.0,
14828                numeric_id: 0,
14829                original_index: 0,
14830            },
14831            r1: record("read", b"ACGTACGT"),
14832            r2: None,
14833        }];
14834
14835        let err = spill_countup_run(
14836            &config,
14837            &mut temp_dir,
14838            &mut run_paths,
14839            &mut spill_summary,
14840            &mut run,
14841        )
14842        .unwrap_err()
14843        .to_string();
14844
14845        assert!(err.contains("maxcountupspillinitialruns"), "{err}");
14846        assert_eq!(spill_summary.initial_runs, 1);
14847    }
14848
    #[test]
    fn countup_compacted_run_group_preserves_sorted_order() {
        // Builds a work pair whose sort key makes the ids sort in a known
        // global order (a, aa, b, c, d, e by errors/length).
        fn work_pair(
            id: &str,
            errors: usize,
            len: usize,
            original_index: usize,
        ) -> CountupWorkPair {
            CountupWorkPair {
                input_list_index: 0,
                sort_key: CountupSortKey {
                    errors,
                    total_len: len,
                    expected_errors: errors as f64,
                    numeric_id: original_index as u64,
                    original_index,
                },
                r1: record(id, b"ACGTACGT"),
                r2: None,
            }
        }

        let dir = tempfile::tempdir().unwrap();
        let mut all_pairs = Vec::new();
        let mut paths = Vec::new();
        // Write three individually sorted runs to disk while remembering all
        // pairs, so the merged output can be checked against a global sort.
        for (run_index, mut run) in [
            vec![work_pair("c", 3, 8, 3), work_pair("a", 0, 8, 0)],
            vec![work_pair("d", 4, 8, 4), work_pair("b", 1, 8, 1)],
            vec![work_pair("e", 5, 8, 5), work_pair("aa", 1, 12, 2)],
        ]
        .into_iter()
        .enumerate()
        {
            all_pairs.extend(run.clone());
            run.sort_by(compare_countup_work_pairs);
            let path = dir.path().join(format!("run-{run_index}.bin"));
            write_countup_run(&path, &run).unwrap();
            paths.push(path);
        }
        all_pairs.sort_by(compare_countup_work_pairs);

        // Merge the run group into one file and read it back in order.
        let merged = dir.path().join("merged.bin");
        let merged_bytes = merge_countup_run_group(&paths, &merged).unwrap();
        let mut reader = CountupRunReader::open(&merged).unwrap();
        let mut actual_ids = Vec::new();
        while let Some(pair) = reader.next_pair().unwrap() {
            actual_ids.push(pair.r1.id);
        }
        let expected_ids: Vec<_> = all_pairs.into_iter().map(|pair| pair.r1.id).collect();

        // Sorted order is preserved, and the reported byte count matches the
        // merged file's size on disk.
        assert_eq!(actual_ids, expected_ids);
        assert_eq!(merged_bytes, merged.metadata().unwrap().len());
    }
14902
    #[test]
    fn countup_compaction_tracks_peak_and_final_temp_bytes() {
        let dir = tempfile::tempdir().unwrap();
        let mut paths = Vec::new();
        let mut spill_summary = CountupSpillSummary::default();

        // Seed one more run than the merge fan-in so compaction must fold a
        // full group into a merged run.
        for run_index in 0..=COUNTUP_SORT_MERGE_FANIN {
            let path = dir.path().join(format!("run-{run_index}.bin"));
            let pair = CountupWorkPair {
                input_list_index: 0,
                sort_key: CountupSortKey {
                    errors: run_index,
                    total_len: 8,
                    expected_errors: run_index as f64,
                    numeric_id: run_index as u64,
                    original_index: run_index,
                },
                r1: record(&format!("read-{run_index}"), b"ACGTACGT"),
                r2: None,
            };
            let bytes = write_countup_run(&path, &[pair]).unwrap();
            spill_summary.note_initial_run(bytes);
            paths.push(path);
        }

        let initial_live_bytes = spill_summary.final_live_bytes;
        compact_countup_runs(&Config::default(), &mut paths, &mut spill_summary).unwrap();
        spill_summary.final_runs = paths.len();
        // Cross-check the summary's live-byte accounting against the actual
        // sizes of the surviving run files.
        let final_live_from_files: u64 = paths
            .iter()
            .map(|path| path.metadata().unwrap().len())
            .sum();

        assert_eq!(spill_summary.initial_runs, COUNTUP_SORT_MERGE_FANIN + 1);
        assert_eq!(spill_summary.merge_runs, 2);
        assert_eq!(spill_summary.final_runs, 2);
        assert_eq!(spill_summary.final_live_bytes, final_live_from_files);
        // Compaction rewrites data, so total bytes written must exceed the
        // initial live footprint; the peak can never be below it.
        assert!(spill_summary.bytes_written > initial_live_bytes);
        assert!(spill_summary.peak_live_bytes >= initial_live_bytes);
    }
14943
    #[test]
    fn countup_spill_write_limit_aborts_compaction() {
        let dir = tempfile::tempdir().unwrap();
        let mut paths = Vec::new();
        let mut spill_summary = CountupSpillSummary::default();

        // Seed fan-in + 1 runs so compaction is forced to merge.
        for run_index in 0..=COUNTUP_SORT_MERGE_FANIN {
            let path = dir.path().join(format!("run-{run_index}.bin"));
            let pair = CountupWorkPair {
                input_list_index: 0,
                sort_key: CountupSortKey {
                    errors: run_index,
                    total_len: 8,
                    expected_errors: run_index as f64,
                    numeric_id: run_index as u64,
                    original_index: run_index,
                },
                r1: record(&format!("read-{run_index}"), b"ACGTACGT"),
                r2: None,
            };
            let bytes = write_countup_run(&path, &[pair]).unwrap();
            spill_summary.note_initial_run(bytes);
            paths.push(path);
        }
        // Cap the write budget at exactly what has been written so far, so
        // the first compaction merge pushes past the limit.
        let config = Config {
            max_countup_spill_write_bytes: Some(spill_summary.bytes_written),
            ..Config::default()
        };

        let err = compact_countup_runs(&config, &mut paths, &mut spill_summary)
            .unwrap_err()
            .to_string();

        // The error names the knob, merging was attempted, and the recorded
        // writes exceed the configured cap.
        assert!(err.contains("maxcountupspillwritebytes"), "{err}");
        assert!(spill_summary.merge_runs > 0);
        assert!(spill_summary.bytes_written > config.max_countup_spill_write_bytes.unwrap());
    }
14981
    #[test]
    fn countup_spill_run_limits_abort_compaction() {
        let dir = tempfile::tempdir().unwrap();
        let mut paths = Vec::new();
        let mut spill_summary = CountupSpillSummary::default();

        // Seed fan-in + 1 runs so compaction must perform a merge.
        for run_index in 0..=COUNTUP_SORT_MERGE_FANIN {
            let path = dir.path().join(format!("run-{run_index}.bin"));
            let pair = CountupWorkPair {
                input_list_index: 0,
                sort_key: CountupSortKey {
                    errors: run_index,
                    total_len: 8,
                    expected_errors: run_index as f64,
                    numeric_id: run_index as u64,
                    original_index: run_index,
                },
                r1: record(&format!("read-{run_index}"), b"ACGTACGT"),
                r2: None,
            };
            let bytes = write_countup_run(&path, &[pair]).unwrap();
            spill_summary.note_initial_run(bytes);
            paths.push(path);
        }
        // Case 1: a zero merge-run budget aborts once merging starts.
        let merge_limited = Config {
            max_countup_spill_merge_runs: Some(0),
            ..Config::default()
        };
        let mut merge_limited_paths = paths.clone();
        let err =
            compact_countup_runs(&merge_limited, &mut merge_limited_paths, &mut spill_summary)
                .unwrap_err()
                .to_string();
        assert!(err.contains("maxcountupspillmergeruns"), "{err}");

        // Case 2: rebuild a fresh set of runs (fresh summary, distinct file
        // names in the same temp dir), then cap the allowed final-run count
        // below what compaction would leave — this must also abort.
        let mut spill_summary = CountupSpillSummary::default();
        let mut paths = Vec::new();
        for run_index in 0..=COUNTUP_SORT_MERGE_FANIN {
            let path = dir.path().join(format!("final-run-{run_index}.bin"));
            let pair = CountupWorkPair {
                input_list_index: 0,
                sort_key: CountupSortKey {
                    errors: run_index,
                    total_len: 8,
                    expected_errors: run_index as f64,
                    numeric_id: run_index as u64,
                    original_index: run_index,
                },
                r1: record(&format!("final-read-{run_index}"), b"ACGTACGT"),
                r2: None,
            };
            let bytes = write_countup_run(&path, &[pair]).unwrap();
            spill_summary.note_initial_run(bytes);
            paths.push(path);
        }
        let final_limited = Config {
            max_countup_spill_final_runs: Some(1),
            ..Config::default()
        };
        let err = compact_countup_runs(&final_limited, &mut paths, &mut spill_summary)
            .unwrap_err()
            .to_string();
        assert!(err.contains("maxcountupspillfinalruns"), "{err}");
    }
15046
15047    #[test]
15048    fn countup_run_reader_uses_large_spill_buffer() {
15049        let dir = tempfile::tempdir().unwrap();
15050        let path = dir.path().join("run.bin");
15051        let pair = CountupWorkPair {
15052            input_list_index: 0,
15053            sort_key: CountupSortKey {
15054                errors: 0,
15055                total_len: 8,
15056                expected_errors: 0.0,
15057                numeric_id: 0,
15058                original_index: 0,
15059            },
15060            r1: record("read", b"ACGTACGT"),
15061            r2: None,
15062        };
15063
15064        write_countup_run(&path, &[pair]).unwrap();
15065        let reader = CountupRunReader::open(&path).unwrap();
15066
15067        assert_eq!(reader.reader.capacity(), COUNTUP_RUN_IO_BUFFER_CAPACITY);
15068    }
15069
15070    #[test]
15071    fn countup_work_pair_memory_hint_tracks_payload_size() {
15072        let small = CountupWorkPair {
15073            input_list_index: 0,
15074            sort_key: CountupSortKey {
15075                errors: 0,
15076                total_len: 4,
15077                expected_errors: 0.0,
15078                numeric_id: 0,
15079                original_index: 0,
15080            },
15081            r1: record("small", b"ACGT"),
15082            r2: None,
15083        };
15084        let large = CountupWorkPair {
15085            input_list_index: 0,
15086            sort_key: CountupSortKey {
15087                errors: 0,
15088                total_len: 400,
15089                expected_errors: 0.0,
15090                numeric_id: 1,
15091                original_index: 1,
15092            },
15093            r1: record("large", &vec![b'A'; 400]),
15094            r2: Some(record("large/2", &vec![b'C'; 400])),
15095        };
15096
15097        assert!(countup_work_pair_memory_hint(&large) > countup_work_pair_memory_hint(&small));
15098    }
15099
15100    #[test]
15101    fn countup_work_candidate_memory_hint_tracks_payload_size() {
15102        let small = CountupWorkCandidate {
15103            input_list_index: 0,
15104            original_index: 0,
15105            rand: 0.0,
15106            r1: record("small", b"ACGT"),
15107            r2: None,
15108        };
15109        let large = CountupWorkCandidate {
15110            input_list_index: 0,
15111            original_index: 1,
15112            rand: 0.0,
15113            r1: record("large", &vec![b'A'; 400]),
15114            r2: Some(record("large/2", &vec![b'C'; 400])),
15115        };
15116
15117        assert!(
15118            countup_work_candidate_memory_hint(&large) > countup_work_candidate_memory_hint(&small)
15119        );
15120    }
15121
15122    #[test]
15123    fn countup_prepass_chunk_ready_respects_pair_and_byte_limits() {
15124        assert!(!countup_prepass_chunk_ready(
15125            COUNTUP_PREPASS_CHUNK_PAIR_LIMIT - 1,
15126            COUNTUP_PREPASS_CHUNK_BYTE_LIMIT - 1
15127        ));
15128        assert!(countup_prepass_chunk_ready(
15129            COUNTUP_PREPASS_CHUNK_PAIR_LIMIT,
15130            0
15131        ));
15132        assert!(countup_prepass_chunk_ready(
15133            1,
15134            COUNTUP_PREPASS_CHUNK_BYTE_LIMIT
15135        ));
15136    }
15137
    #[test]
    fn countup_prepass_carries_tossed_reads_only_with_abrc() {
        let config = Config {
            k: 3,
            min_length: 11,
            target_depth: 2,
            min_depth: 1,
            min_kmers_over_min_depth: 3,
            ..Config::default()
        };
        // The derived prepass config relaxes the depth knobs (observed here:
        // target 2 -> 8, min depth -> 0, min k-mers over min depth -> 1).
        let prepass = countup_prepass_config(&config);
        assert_eq!(prepass.target_depth, 8);
        assert_eq!(prepass.min_depth, 0);
        assert_eq!(prepass.min_kmers_over_min_depth, 1);

        let input_counts = CountMap::default();
        // A 10 bp read fails min_length = 11; without the carry flag the
        // prepass excludes it...
        let mut filtered = record("short", b"AAAAAAAAAA");
        assert!(
            !countup_prepass_pair(&prepass, false, &input_counts, &mut filtered, None, 0.0,)
                .include
        );

        // ...but with the carry flag set, the tossed read is still included.
        let mut carried = record("short", b"AAAAAAAAAA");
        assert!(
            countup_prepass_pair(&prepass, true, &input_counts, &mut carried, None, 0.0,).include
        );
    }
15165
    #[test]
    fn countup_prepass_requires_both_mates_bad_like_java() {
        let config = Config {
            count_up: true,
            toss_error_reads: true,
            require_both_bad: false,
            k: 3,
            target_depth: 100,
            max_depth: Some(1000),
            min_depth: 1,
            min_kmers_over_min_depth: 1,
            error_detect_ratio: 2,
            high_thresh: 2,
            low_thresh: 1,
            ..Config::default()
        };
        // The prepass forces require_both_bad on even though the user config
        // left it off.
        let prepass = countup_prepass_config(&config);
        assert!(!config.require_both_bad);
        assert!(prepass.require_both_bad);

        // One mate is crafted to trip error detection via a depth spike
        // (10 -> 1 within the read); the other has uniform high depth.
        let mut bad_mate = record("bad", b"AAACCC");
        let mut good_mate = record("good", b"GGGGGG");
        let mut input_counts = CountMap::default();
        let bad_keys = kmers_for_record(&bad_mate, &prepass);
        for key in &bad_keys {
            input_counts.insert(key.clone(), 10);
        }
        // Overwrite two interior k-mers at depth 1 to create the spike; this
        // must happen after the depth-10 seeding above.
        input_counts.insert(bad_keys[1].clone(), 1);
        input_counts.insert(bad_keys[2].clone(), 1);
        for key in kmers_for_record(&good_mate, &prepass) {
            input_counts.insert(key, 10);
        }

        // Sanity check: only the bad mate is flagged by the analysis.
        assert!(analyze_pair(&prepass, &input_counts, &bad_mate, None).error1);
        assert!(!analyze_pair(&prepass, &input_counts, &good_mate, None).error1);
        // With require_both_bad forced on and only one bad mate, the pair is
        // still included.
        assert!(
            countup_prepass_pair(
                &prepass,
                false,
                &input_counts,
                &mut bad_mate,
                Some(&mut good_mate),
                0.0,
            )
            .include
        );
    }
15213
    #[test]
    fn countup_prepass_reuses_decision_analysis_for_sort_key_without_ecc() {
        let config = Config {
            count_up: true,
            k: 3,
            min_depth: 1,
            min_kmers_over_min_depth: 1,
            target_depth: 100,
            max_depth: Some(1000),
            ..Config::default()
        };
        let prepass = countup_prepass_config(&config);
        let mut read = record("read42", b"ACGTACGT");
        // Seed every k-mer of the read at depth 10 so the prepass keeps it.
        let mut input_counts = CountMap::default();
        for key in kmers_for_record(&read, &prepass) {
            input_counts.insert(key, 10);
        }

        // The prepass decision exposes its analysis; a sort key derived from
        // that cached analysis must match recomputing the key from scratch.
        let result = countup_prepass_pair(&prepass, false, &input_counts, &mut read, None, 0.0);
        let reused_key =
            countup_sort_key_from_analysis(&read, None, 42, result.sort_analysis.as_ref().unwrap());
        let replayed_key = countup_sort_key(&prepass, &input_counts, &read, None, 42);

        // Every sort-key component agrees between the two paths.
        assert!(result.include);
        assert_eq!(reused_key.errors, replayed_key.errors);
        assert_eq!(reused_key.total_len, replayed_key.total_len);
        assert_eq!(reused_key.numeric_id, replayed_key.numeric_id);
        assert_eq!(reused_key.original_index, replayed_key.original_index);
        assert_eq!(reused_key.expected_errors, replayed_key.expected_errors);
    }
15244
    #[test]
    fn countup_work_candidates_match_sequential_prepass_sort_keys() {
        let config = Config {
            count_up: true,
            k: 3,
            min_depth: 1,
            min_kmers_over_min_depth: 1,
            target_depth: 100,
            max_depth: Some(1000),
            ..Config::default()
        };
        let prepass = countup_prepass_config(&config);
        let clean = record("clean", b"ACGTACGT");
        let noisy = record("noisy", b"AAAACCCC");
        // Only the clean read's k-mers receive coverage; the noisy read's
        // k-mers stay absent from the table.
        let mut input_counts = CountMap::default();
        for key in kmers_for_record(&clean, &prepass) {
            input_counts.insert(key, 10);
        }
        let candidates = vec![
            CountupWorkCandidate {
                input_list_index: 0,
                original_index: 0,
                rand: 0.0,
                r1: noisy.clone(),
                r2: None,
            },
            CountupWorkCandidate {
                input_list_index: 0,
                original_index: 1,
                rand: 0.0,
                r1: clean.clone(),
                r2: None,
            },
        ];
        // Batch candidate processing (path under test) vs. the sequential
        // reference built directly from countup_sort_key.
        let mut actual =
            process_countup_work_candidates(&config, &prepass, &input_counts, candidates);
        let mut expected = vec![
            CountupWorkPair {
                input_list_index: 0,
                sort_key: countup_sort_key(&prepass, &input_counts, &noisy, None, 0),
                r1: noisy,
                r2: None,
            },
            CountupWorkPair {
                input_list_index: 0,
                sort_key: countup_sort_key(&prepass, &input_counts, &clean, None, 1),
                r1: clean,
                r2: None,
            },
        ];
        actual.sort_by(compare_countup_work_pairs);
        expected.sort_by(compare_countup_work_pairs);

        // Same reads in the same sorted order...
        let actual_ids: Vec<_> = actual.iter().map(|pair| pair.r1.id.as_str()).collect();
        let expected_ids: Vec<_> = expected.iter().map(|pair| pair.r1.id.as_str()).collect();
        assert_eq!(actual_ids, expected_ids);
        // ...with pairwise-identical sort-key components.
        for (actual, expected) in actual.iter().zip(&expected) {
            assert_eq!(actual.sort_key.errors, expected.sort_key.errors);
            assert_eq!(actual.sort_key.total_len, expected.sort_key.total_len);
            assert_eq!(
                actual.sort_key.original_index,
                expected.sort_key.original_index
            );
        }
    }
15310
15311    #[test]
15312    fn countup_length_filter_respects_keepall_override() {
15313        let read = record("short", b"ACGT");
15314        let filter_config = Config {
15315            min_length: 5,
15316            ..Config::default()
15317        };
15318        assert!(countup_length_toss(&filter_config, &read, None));
15319
15320        let keepall_config = Config {
15321            keep_all: true,
15322            ..filter_config
15323        };
15324        assert!(!countup_length_toss(&keepall_config, &read, None));
15325    }
15326
    #[test]
    fn countup_tossbadreads_applies_java_error_spike_rules() {
        // 20 synthetic k-mers: the first 8 look error-like (input depth 1,
        // kept depth 0) and the remaining 12 are solid (depth 10 in both
        // tables).
        let keys: Vec<_> = (0..20).map(KmerKey::Short).collect();
        let mut input_counts = CountMap::default();
        let mut kept_counts = CountMap::default();
        for (index, key) in keys.iter().enumerate() {
            let input_depth = if index < 8 { 1 } else { 10 };
            let kept_depth = if index < 8 { 0 } else { 10 };
            input_counts.insert(key.clone(), input_depth);
            kept_counts.insert(key.clone(), kept_depth);
        }
        let base_config = Config {
            min_depth: 1,
            min_kmers_over_min_depth: 1,
            target_depth: 10,
            low_thresh: 1,
            high_thresh: 10,
            error_detect_ratio: 2,
            ..Config::default()
        };

        // Base config: the decision is false for this error-heavy key set.
        assert!(!decide_countup_pair(
            &base_config,
            &input_counts,
            &kept_counts,
            &keys,
            10,
        ));

        // toss_error_reads flips the decision for the same inputs.
        let toss_errors_config = Config {
            toss_error_reads: true,
            ..base_config.clone()
        };
        assert!(decide_countup_pair(
            &toss_errors_config,
            &input_counts,
            &kept_counts,
            &keys,
            10,
        ));

        // keep_all overrides toss_error_reads and flips it back.
        let keepall_config = Config {
            keep_all: true,
            ..toss_errors_config
        };
        assert!(!decide_countup_pair(
            &keepall_config,
            &input_counts,
            &kept_counts,
            &keys,
            10,
        ));
    }
15380
15381    #[test]
15382    fn java_rng_matches_known_first_doubles() {
15383        let mut rng = JavaXoshiro::new(0);
15384        let values = [
15385            rng.next_double(),
15386            rng.next_double(),
15387            rng.next_double(),
15388            rng.next_double(),
15389        ];
15390        let expected = [
15391            0.02774461029305808,
15392            0.9419058303890074,
15393            0.3687890049137593,
15394            0.8390756877056451,
15395        ];
15396        for (actual, expected) in values.into_iter().zip(expected) {
15397            assert!((actual - expected).abs() < f64::EPSILON);
15398        }
15399    }
15400
15401    #[test]
15402    fn nondeterministic_seed_varies_between_requests() {
15403        let first = nondeterministic_seed();
15404        let second = nondeterministic_seed();
15405        assert_ne!(first, second);
15406    }
15407
15408    #[test]
15409    fn deterministic_coin_uses_java_read_rand_shape() {
15410        assert_eq!(deterministic_coin(Some(0.0), 7), 1);
15411        assert_eq!(deterministic_coin(Some(0.5), 7), 4);
15412        assert_eq!(deterministic_coin(Some(0.999_999), 7), 7);
15413    }
15414
15415    #[test]
15416    fn qtrim_right_uses_java_optimal_quality_scoring() {
15417        let config = Config {
15418            trim_right: true,
15419            trim_quality: 10.0,
15420            ..Config::default()
15421        };
15422        let mut read = quality_record("r1", b"ACGTACGT", b"IIII!!!!");
15423
15424        trim_record(&config, &mut read);
15425
15426        assert_eq!(read.bases, b"ACGT");
15427        assert_eq!(read.qualities.as_deref(), Some(&b"IIII"[..]));
15428    }
15429
15430    #[test]
15431    fn qtrim_left_uses_java_optimal_quality_scoring() {
15432        let config = Config {
15433            trim_left: true,
15434            trim_quality: 10.0,
15435            ..Config::default()
15436        };
15437        let mut read = quality_record("r1", b"ACGTACGT", b"!!!!IIII");
15438
15439        trim_record(&config, &mut read);
15440
15441        assert_eq!(read.bases, b"ACGT");
15442        assert_eq!(read.qualities.as_deref(), Some(&b"IIII"[..]));
15443    }
15444
    #[test]
    fn ecc_corrects_single_substitution_from_exact_counts() {
        // Single substitution at position 14, with 30x clean coverage in the
        // exact-count table: error correction should restore the clean base.
        let clean = b"ACGTTGCATGTCAGTACCGTAACGTTGCA";
        let mut mutant = clean.to_vec();
        mutant[14] = b'A';
        assert_ne!(mutant, clean);

        let config = Config {
            k: 7,
            min_quality: 0,
            min_prob: 0.0,
            error_correct: true,
            passes: 1,
            ..Config::default()
        };
        // Populate exact k-mer counts: 30 clean reads plus one mutant read.
        let mut counts = CountMap::default();
        for i in 0..30 {
            increment_pair_counts(
                &config,
                &mut counts,
                &record(&format!("clean{i}"), clean),
                None,
            );
        }
        increment_pair_counts(&config, &mut counts, &record("mutant", &mutant), None);

        let mut read = record("mutant", &mutant);
        let result = correct_read_errors(&config, &counts, &mut read);

        // Exactly one correction, not flagged, and the read now matches clean.
        assert_eq!(result.corrected, 1);
        assert!(!result.uncorrectable);
        assert_eq!(read.bases, clean);
    }
15478
    #[test]
    fn ecc_flags_high_quality_suspect_error_as_uncorrectable() {
        // Same fixture as the correction test, but with max_quality_to_correct
        // set to 0 no base qualifies for correction: the suspect site is left
        // untouched and the read is reported uncorrectable instead.
        let clean = b"ACGTTGCATGTCAGTACCGTAACGTTGCA";
        let mut mutant = clean.to_vec();
        mutant[14] = b'A';
        let config = Config {
            k: 7,
            min_quality: 0,
            min_prob: 0.0,
            error_correct: true,
            max_quality_to_correct: 0,
            passes: 1,
            ..Config::default()
        };
        // 30x clean coverage plus one mutant read in the exact-count table.
        let mut counts = CountMap::default();
        for i in 0..30 {
            increment_pair_counts(
                &config,
                &mut counts,
                &record(&format!("clean{i}"), clean),
                None,
            );
        }
        increment_pair_counts(&config, &mut counts, &record("mutant", &mutant), None);

        let mut read = record("mutant", &mutant);
        let result = correct_read_errors(&config, &counts, &mut read);

        assert_eq!(result.corrected, 0);
        assert!(result.uncorrectable);
        assert_eq!(read.bases, mutant);
    }
15511
    #[test]
    fn ecc_pair_rollback_restores_corrected_mate_when_partner_is_uncorrectable() {
        // Both mates carry the same substitution; the low-quality mate is
        // correctable but the high-quality mate is not (its quality exceeds
        // max_quality_to_correct). The pair-level rollback must undo the
        // successful correction so both reads come back unchanged.
        let clean = b"ACGTTGCATGTCAGTACCGTAACGTTGCA";
        let mut mutant = clean.to_vec();
        mutant[14] = b'A';
        let config = Config {
            k: 7,
            min_quality: 0,
            min_prob: 0.0,
            error_correct: true,
            max_quality_to_correct: 20,
            passes: 1,
            ..Config::default()
        };
        // 30x clean coverage plus one mutant read in the exact-count table.
        let mut counts = CountMap::default();
        for i in 0..30 {
            increment_pair_counts(
                &config,
                &mut counts,
                &record(&format!("clean{i}"), clean),
                None,
            );
        }
        increment_pair_counts(&config, &mut counts, &record("mutant", &mutant), None);

        // '!' is Q0 (below the Q20 ceiling); 'I' is Q40 (above it).
        let low_quality = vec![b'!'; mutant.len()];
        let high_quality = vec![b'I'; mutant.len()];
        let mut correctable = quality_record("lowq", &mutant, &low_quality);
        let mut uncorrectable = quality_record("highq", &mutant, &high_quality);
        let original_correctable = correctable.clone();
        let original_uncorrectable = uncorrectable.clone();

        let result = correct_pair_errors_with_rollback(
            &config,
            &counts,
            &mut correctable,
            Some(&mut uncorrectable),
        );

        // Corrections were attempted, but the pair is still uncorrectable and
        // both reads (bases and qualities) are restored to their originals.
        assert!(result.corrected > 0);
        assert!(result.uncorrectable);
        assert_eq!(correctable.bases, original_correctable.bases);
        assert_eq!(
            correctable.qualities.as_deref(),
            original_correctable.qualities.as_deref()
        );
        assert_eq!(uncorrectable.bases, original_uncorrectable.bases);
    }
15560
    #[test]
    fn ecc_marks_uncorrectable_errors_when_requested() {
        // With max_quality_to_correct: 0 the suspect base cannot be corrected;
        // mark_uncorrectable_errors instead demotes its quality to '2'
        // while leaving the base sequence intact.
        let clean = b"ACGTTGCATGTCAGTACCGTAACGTTGCA";
        let mut mutant = clean.to_vec();
        mutant[14] = b'A';
        let config = Config {
            k: 7,
            min_quality: 0,
            min_prob: 0.0,
            error_correct: true,
            max_quality_to_correct: 0,
            mark_uncorrectable_errors: true,
            passes: 1,
            ..Config::default()
        };
        // 30x clean coverage plus one mutant read in the exact-count table.
        let mut counts = CountMap::default();
        for i in 0..30 {
            increment_pair_counts(
                &config,
                &mut counts,
                &record(&format!("clean{i}"), clean),
                None,
            );
        }
        increment_pair_counts(&config, &mut counts, &record("mutant", &mutant), None);

        let mut read = record("mutant", &mutant);
        let result = correct_read_errors(&config, &counts, &mut read);

        assert_eq!(result.corrected, 0);
        assert_eq!(result.marked, 1);
        assert!(result.uncorrectable);
        assert_eq!(read.bases, mutant);
        // The substituted position keeps its base but gets quality '2'.
        assert_eq!(read.qualities.as_ref().unwrap()[14], b'2');
    }
15596
    #[test]
    fn ecc_mark_only_reduces_suspect_base_quality() {
        // mark_errors_only mode: the suspect base is never rewritten, only its
        // quality is demoted to '2' so downstream tools can see the flag.
        let clean = b"ACGTTGCATGTCAGTACCGTAACGTTGCA";
        let mut mutant = clean.to_vec();
        mutant[14] = b'A';
        let config = Config {
            k: 7,
            min_quality: 0,
            min_prob: 0.0,
            error_correct: true,
            mark_errors_only: true,
            passes: 1,
            ..Config::default()
        };
        // 30x clean coverage plus one mutant read in the exact-count table.
        let mut counts = CountMap::default();
        for i in 0..30 {
            increment_pair_counts(
                &config,
                &mut counts,
                &record(&format!("clean{i}"), clean),
                None,
            );
        }
        increment_pair_counts(&config, &mut counts, &record("mutant", &mutant), None);

        let mut read = record("mutant", &mutant);
        let result = correct_read_errors(&config, &counts, &mut read);

        assert_eq!(result.marked, 1);
        assert_eq!(read.bases, mutant);
        assert_eq!(read.qualities.as_ref().unwrap()[14], b'2');
    }
15629
15630    #[test]
15631    fn ecc_mark_only_marks_all_detected_sites_even_when_ecclimit_is_low() {
15632        let config = Config {
15633            k: 7,
15634            prefix_len: 2,
15635            max_errors_to_correct: 1,
15636            correct_from_right: false,
15637            ..Config::default()
15638        };
15639        let mut read = quality_record("marked", b"ACGTTGCATGTC", b"IIIIIIIIIIII");
15640        let coverage = vec![30, 30, 0, 30, 30, 0];
15641
15642        let result = mark_read_errors(&config, &mut read, &coverage);
15643
15644        assert_eq!(result.marked, 2);
15645        let qualities = read.qualities.as_deref().unwrap();
15646        assert_eq!(qualities[8], b'2');
15647        assert_eq!(qualities[11], b'2');
15648    }
15649
    #[test]
    fn overlap_ecc_repairs_lower_quality_mate_base() {
        // r2 is the reverse complement of r1 with one substitution at index 20,
        // which carries quality '#' (low) against r1's 'I' (high): the
        // overlap corrector should repair the low-quality base from its mate.
        let r1_bases = b"TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGAC";
        let r2_clean = b"GTCGCATATTTCAAGCACTACTTCGCTGCGGCACAACTAA";
        let mut r2_bases = r2_clean.to_vec();
        r2_bases[20] = b'A';
        let mut r1 = quality_record("r1", r1_bases, b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII");
        let mut r2 = quality_record("r2", &r2_bases, b"IIIIIIIIIIIIIIIIIIII#IIIIIIIIIIIIIIIIIII");
        let config = Config {
            overlap_error_correct: true,
            max_quality_to_correct: 20,
            ..Config::default()
        };

        let result = correct_pair_by_overlap(&config, &mut r1, &mut r2);

        assert_eq!(result.corrected, 1);
        assert_eq!(r1.bases, r1_bases);
        assert_eq!(r2.bases, r2_clean);
        // After correction the overlap qualities are rewritten: 'S' for
        // agreeing positions and 'G' at the disagreeing/corrected site
        // (index 19 on r1 mirrors index 20 on the reversed r2).
        assert_eq!(
            r1.qualities.as_deref(),
            Some(&b"SSSSSSSSSSSSSSSSSSSGSSSSSSSSSSSSSSSSSSSS"[..])
        );
        assert_eq!(
            r2.qualities.as_deref(),
            Some(&b"SSSSSSSSSSSSSSSSSSSSGSSSSSSSSSSSSSSSSSSS"[..])
        );
    }
15678
15679    #[test]
15680    fn overlap_ecc_skips_short_pairs_like_java_strict_mode() {
15681        let r1_bases = b"ACGTTGCATGTCAGTA";
15682        let r2_clean = b"TACTGACATGCAACGT";
15683        let mut r2_bases = r2_clean.to_vec();
15684        r2_bases[9] = b'T';
15685        let mut r1 = quality_record("r1", r1_bases, b"IIIIIIIIIIIIIIII");
15686        let mut r2 = quality_record("r2", &r2_bases, b"IIIIIIIII!IIIIII");
15687        let config = Config {
15688            overlap_error_correct: true,
15689            max_quality_to_correct: 20,
15690            ..Config::default()
15691        };
15692
15693        let result = correct_pair_by_overlap(&config, &mut r1, &mut r2);
15694
15695        assert_eq!(result.corrected, 0);
15696        assert_eq!(r1.bases, r1_bases);
15697        assert_eq!(r2.bases, r2_bases);
15698    }
15699
15700    #[test]
15701    fn overlap_ecc_skips_ambiguous_repetitive_pairs_like_java_strict_mode() {
15702        let r1_bases = b"ACGTTGCATGTCAGTAACGTTGCATGTCAGTAACGTTGCA";
15703        let r2_clean = b"TGCAACGTTACTGACATGCAACGTTACTGACATGCAACGT";
15704        let mut r2_bases = r2_clean.to_vec();
15705        r2_bases[20] = b'C';
15706        let mut r1 = quality_record("r1", r1_bases, b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII");
15707        let mut r2 = quality_record("r2", &r2_bases, b"IIIIIIIIIIIIIIIIIIII!IIIIIIIIIIIIIIIIIII");
15708        let config = Config {
15709            overlap_error_correct: true,
15710            max_quality_to_correct: 20,
15711            ..Config::default()
15712        };
15713
15714        let result = correct_pair_by_overlap(&config, &mut r1, &mut r2);
15715
15716        assert_eq!(result.corrected, 0);
15717        assert_eq!(r1.bases, r1_bases);
15718        assert_eq!(r2.bases, r2_bases);
15719    }
15720
15721    #[test]
15722    fn overlap_entropy_gate_keeps_java_strict_floor_for_high_entropy_fixture() {
15723        let bases = b"TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGAC";
15724        assert_eq!(overlap_entropy_min_overlap(bases), 12);
15725    }
15726
15727    #[test]
15728    fn overlap_entropy_gate_raises_min_overlap_for_low_complexity_reads() {
15729        let bases = b"AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT";
15730        assert_eq!(overlap_entropy_min_overlap(bases), 32);
15731    }
15732
    #[test]
    fn overlap_ecc_rejects_high_confidence_mismatch_like_java_expected_filter() {
        // Both mates report quality 'I' (Q40) at the mismatching site: a
        // high-confidence disagreement should be rejected rather than
        // "corrected", and neither bases nor qualities may change.
        let r1_bases = b"TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGAC";
        let mut r2_bases = b"GTCGCATATTTCAAGCACTACTTCGCTGCGGCACAACTAA".to_vec();
        r2_bases[20] = b'A';
        let mut r1 = quality_record("r1", r1_bases, b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII");
        let mut r2 = quality_record("r2", &r2_bases, b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII");
        let config = Config {
            overlap_error_correct: true,
            max_quality_to_correct: 41,
            ..Config::default()
        };

        let result = correct_pair_by_overlap(&config, &mut r1, &mut r2);

        assert_eq!(result.corrected, 0);
        assert_eq!(r1.bases, r1_bases);
        assert_eq!(r2.bases, r2_bases);
        // Qualities are untouched on rejection (no 'S'/'G' rewrite happened).
        assert_eq!(
            r1.qualities.as_deref(),
            Some(&b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII"[..])
        );
        assert_eq!(
            r2.qualities.as_deref(),
            Some(&b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII"[..])
        );
    }
15760
    #[test]
    fn overlap_ecc_rejects_low_confidence_tie_under_java_strict_mode() {
        // Every base on both mates is quality '!' (Q0): neither side of the
        // mismatch can be trusted over the other, so strict mode makes no
        // correction and leaves bases and qualities exactly as they were.
        let r1_bases = b"TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGAC";
        let mut r2_bases = b"GTCGCATATTTCAAGCACTACTTCGCTGCGGCACAACTAA".to_vec();
        r2_bases[20] = b'A';
        let mut r1 = quality_record("r1", r1_bases, b"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
        let mut r2 = quality_record("r2", &r2_bases, b"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
        let config = Config {
            overlap_error_correct: true,
            max_quality_to_correct: 41,
            ..Config::default()
        };

        let result = correct_pair_by_overlap(&config, &mut r1, &mut r2);

        assert_eq!(result.corrected, 0);
        assert_eq!(r1.bases, r1_bases);
        assert_eq!(r2.bases, r2_bases);
        // Qualities are untouched on rejection.
        assert_eq!(
            r1.qualities.as_deref(),
            Some(&b"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"[..])
        );
        assert_eq!(
            r2.qualities.as_deref(),
            Some(&b"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"[..])
        );
    }
15788
    #[test]
    fn overlap_ecc_rejects_quality_weighted_multimismatch_candidate_like_java() {
        // r2 carries two low-quality positions ('!' and '\'') inside the
        // overlap; the quality-weighted scoring should reject this candidate
        // overlap outright instead of correcting multiple mismatches.
        let r1_bases = b"CAGTAACCAATGCCTGTTGAGATGCCAGACGCGTAACCAAAA";
        let r2_bases = b"TTTTGCTAACGCGTCTGGCATCTCAACAGGCATTGGTTAC";
        let mut r1 = quality_record(
            "r1",
            r1_bases,
            b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII",
        );
        let mut r2 = quality_record("r2", r2_bases, b"IIIII!I'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII");
        let original_r1 = r1.clone();
        let original_r2 = r2.clone();
        let config = Config {
            overlap_error_correct: true,
            max_quality_to_correct: 41,
            ..Config::default()
        };

        let result = correct_pair_by_overlap(&config, &mut r1, &mut r2);

        // No corrections, and both reads keep their original bases.
        assert_eq!(result.corrected, 0);
        assert_eq!(r1.bases, original_r1.bases);
        assert_eq!(r2.bases, original_r2.bases);
    }
15813
    #[test]
    fn trim_after_marking_defers_qtrim_until_after_ecc_marking() {
        // Pipeline-order test: with trim_after_marking, quality trimming runs
        // only after the ECC marking step has demoted suspect qualities, so
        // the marked tail (position 26) is trimmed away along with everything
        // past it, and the surviving qualities are the untouched 'I' run.
        let clean = b"ACGTTGCATGTCAGTACCGTAACGTTGCA";
        let mut mutant = clean.to_vec();
        mutant[26] = b'A';
        let config = Config {
            k: 7,
            min_quality: 0,
            min_prob: 0.0,
            error_correct: true,
            mark_errors_only: true,
            trim_after_marking: true,
            trim_right: true,
            trim_optimal: false,
            trim_quality: 20.0,
            keep_all: true,
            passes: 1,
            ..Config::default()
        };
        // 30x clean coverage plus one mutant read in the exact-count table.
        let mut counts = CountMap::default();
        for i in 0..30 {
            increment_pair_counts(
                &config,
                &mut counts,
                &record(&format!("clean{i}"), clean),
                None,
            );
        }
        increment_pair_counts(&config, &mut counts, &record("mutant", &mutant), None);

        // Run the full pair-normalization path on the single mutant read.
        let input = vec![(0, record("mutant", &mutant), None, 0.0)];
        let pairs = normalize_pair_chunk(&config, &counts, &input);

        assert_eq!(pairs[0].out_r1.bases, b"ACGTTGCATGTCAGTACCGTAACGTTAC");
        assert_eq!(
            pairs[0].out_r1.qualities.as_deref(),
            Some(&b"IIIIIIIIIIIIIIIIIIIIIIIIIIII"[..])
        );
    }
15853
15854    #[test]
15855    fn bad_kmer_fraction_lowers_dynamic_toss_target_like_bbnorm() {
15856        let config = Config {
15857            target_depth: 100,
15858            max_depth: Some(125),
15859            target_bad_percent_low: 0.2,
15860            target_bad_percent_high: 0.8,
15861            ..Config::default()
15862        };
15863        let clean = PairAnalysis::default();
15864        assert_eq!(dynamic_depth_limits(&config, &clean), (100, 125));
15865
15866        let noisy = PairAnalysis {
15867            low_kmer_count: 5,
15868            total_kmer_count: 10,
15869            ..PairAnalysis::default()
15870        };
15871        assert_eq!(dynamic_depth_limits(&config, &noisy), (35, 35));
15872    }
15873
15874    #[test]
15875    fn multipass_bad_depth_targets_match_java_pass_shape() {
15876        let config = Config {
15877            passes: 3,
15878            target_depth: 100,
15879            target_bad_percent_low: 0.2,
15880            target_bad_percent_high: 0.8,
15881            ..Config::default()
15882        };
15883
15884        let first_target = intermediate_target_depth(&config, 1);
15885        assert_eq!(first_target, 400);
15886        assert_eq!(
15887            intermediate_bad_depth_targets(&config, 1, first_target),
15888            (30, 120)
15889        );
15890
15891        let second_target = intermediate_target_depth(&config, 2);
15892        assert_eq!(second_target, 200);
15893        assert_eq!(
15894            intermediate_bad_depth_targets(&config, 2, second_target),
15895            (20, 80)
15896        );
15897    }
15898
15899    #[test]
15900    fn qtrim_keeps_java_min_result_shape_for_all_bad_reads() {
15901        let config = Config {
15902            trim_right: true,
15903            trim_quality: 10.0,
15904            ..Config::default()
15905        };
15906        let mut read = quality_record("r1", b"ACGT", b"!!!!");
15907
15908        trim_record(&config, &mut read);
15909
15910        assert_eq!(read.bases, b"A");
15911        assert_eq!(read.qualities.as_deref(), Some(&b"!"[..]));
15912    }
15913
15914    #[test]
15915    fn qtrim_window_uses_java_sliding_threshold() {
15916        let config = Config {
15917            trim_right: true,
15918            trim_quality: 10.0,
15919            trim_optimal: false,
15920            trim_window: true,
15921            trim_window_length: 4,
15922            ..Config::default()
15923        };
15924        let mut read = quality_record("r1", b"ACGTACGTACGT", b"IIIIIII!!!!!");
15925
15926        trim_record(&config, &mut read);
15927
15928        assert_eq!(read.bases, b"ACGTACG");
15929        assert_eq!(read.qualities.as_deref(), Some(&b"IIIIIII"[..]));
15930    }
15931
15932    #[test]
15933    fn output_hash_patterns_match_bbnorm_pair_expansion() {
15934        let paths = prepare_output_paths(Some(Path::new("reads#.fq")), None, true);
15935        assert_eq!(paths.first, Some(PathBuf::from("reads1.fq")));
15936        assert_eq!(paths.second, Some(PathBuf::from("reads2.fq")));
15937
15938        let paths = prepare_output_paths(
15939            Some(Path::new("reads#.fq")),
15940            Some(Path::new("mate.fq")),
15941            true,
15942        );
15943        assert_eq!(paths.first, Some(PathBuf::from("reads1.fq")));
15944        assert_eq!(paths.second, Some(PathBuf::from("mate.fq")));
15945
15946        let paths = prepare_output_paths(Some(Path::new("single#.fq")), None, false);
15947        assert_eq!(paths.first, Some(PathBuf::from("single1.fq")));
15948        assert_eq!(paths.second, None);
15949    }
15950}