use crate::cli::{CARDINALITY_MAX_BUCKETS, Config};
use crate::kmer::{
    KmerKey, canonical_short_code, for_each_kmer_for_record, unfiltered_kmer_windows_for_record,
};
use crate::peaks::write_peaks;
use crate::seqio::{
    BaseSettings, QualitySettings, SeqFormat, SequenceReader, SequenceRecord, SequenceSettings,
    SequenceWriter, create_output_with_append, detect_interleaved_input_with_gzip_threads,
};
use anyhow::{Context, Result, bail, ensure};
use rayon::prelude::*;
use rustc_hash::FxHashMap;
use std::alloc::{Layout, alloc_zeroed};
use std::cmp::Ordering as CmpOrdering;
use std::collections::{BTreeMap, BinaryHeap};
use std::fs;
use std::io::{BufReader, BufWriter, ErrorKind, Read, Write};
use std::path::{Path, PathBuf};
use std::process::{Child, ChildStdin, ChildStdout, Command, Stdio};
use std::sync::{
    Mutex, OnceLock,
    atomic::{AtomicU32, AtomicU64, AtomicUsize, Ordering},
};
use std::time::{Instant, SystemTime, UNIX_EPOCH};

pub type CountMap = FxHashMap<KmerKey, u64>;

#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CardinalityEstimate {
    pub k: usize,
    pub buckets: usize,
    pub estimated_unique_kmers: u64,
}

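// Streaming cardinality sketch in the HyperLogLog style: each observed k-mer
// hash updates one of `buckets` registers with the largest leading-zero rank
// seen, so distinct-k-mer counts can be estimated in O(buckets) memory.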
struct KmerCardinalityEstimator {
    k: usize,
    buckets: usize,
    seed: u64,
    registers: Vec<u8>,
}

impl KmerCardinalityEstimator {
    fn from_config(config: &Config) -> Self {
        let buckets = config.cardinality.buckets.clamp(1, CARDINALITY_MAX_BUCKETS);
        Self {
            k: config.cardinality.k.unwrap_or(config.k),
            buckets,
            seed: config.cardinality.seed,
            registers: vec![0; buckets],
        }
    }

    fn observe_pair(&mut self, config: &Config, r1: &SequenceRecord, r2: Option<&SequenceRecord>) {
        self.observe_record(config, r1);
        if let Some(mate) = r2 {
            self.observe_record(config, mate);
        }
    }

    fn observe_record(&mut self, config: &Config, record: &SequenceRecord) {
        for_each_kmer_for_record(record, config, |kmer| self.observe_key(&kmer));
    }

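    // One observation: salt the raw key by variant so short codes and long-k
    // hashes cannot collide, pick a register with the 128-bit multiply-shift
    // reduction (no modulo), and record the leading-zero rank of a rehash.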
    fn observe_key(&mut self, key: &KmerKey) {
        let raw = raw_kmer_key(key);
        let kind_salt = match key {
            KmerKey::Short(_) => 0x9E37_79B9_7F4A_7C15,
            KmerKey::LongHash(_) => 0xD1B5_4A32_D192_ED03,
        };
        let hash = mix_seed(raw ^ self.seed ^ kind_salt);
        let bucket = (((hash as u128) * (self.buckets as u128)) >> 64) as usize;
        let rank_hash = mix_seed(hash ^ 0x94D0_49BB_1331_11EB);
        let rank = rank_hash.leading_zeros().saturating_add(1).min(64) as u8;
        if let Some(slot) = self.registers.get_mut(bucket) {
            *slot = (*slot).max(rank);
        }
    }

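    // Standard HyperLogLog combination: alpha(m) * m^2 over the harmonic sum
    // of 2^-register, falling back to linear counting (m * ln(m / V) for V
    // empty registers) while the raw estimate is small.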
    fn estimate(&self) -> CardinalityEstimate {
        let m = self.buckets as f64;
        let zero_count = self
            .registers
            .iter()
            .filter(|&&register| register == 0)
            .count();
        let inverse_sum: f64 = self
            .registers
            .iter()
            .map(|&register| 2f64.powi(-(i32::from(register))))
            .sum();
        let raw_estimate = hll_alpha(self.buckets) * m * m / inverse_sum.max(f64::MIN_POSITIVE);
        let corrected = if raw_estimate <= 2.5 * m && zero_count > 0 {
            m * (m / zero_count as f64).ln()
        } else {
            raw_estimate
        };
        CardinalityEstimate {
            k: self.k,
            buckets: self.buckets,
            estimated_unique_kmers: corrected.round().max(0.0) as u64,
        }
    }
}

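// Bias-correction factors from the HyperLogLog paper: tabulated values for
// 16, 32, and 64 registers, and the closed-form approximation for larger m.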
fn hll_alpha(buckets: usize) -> f64 {
    match buckets {
        16 => 0.673,
        32 => 0.697,
        64 => 0.709,
        _ => 0.7213 / (1.0 + 1.079 / buckets as f64),
    }
}

trait CountLookup: Sync {
    fn depth(&self, key: &KmerKey) -> u64;
    fn unique_kmers(&self) -> usize;
    fn unique_kmers_at_least(&self, min_depth: u64) -> usize;
}

impl CountLookup for CountMap {
    fn depth(&self, key: &KmerKey) -> u64 {
        self.get(key).copied().unwrap_or(0)
    }

    fn unique_kmers(&self) -> usize {
        self.len()
    }

    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
        if min_depth <= 1 {
            return self.len();
        }
        self.values().filter(|&&depth| depth >= min_depth).count()
    }
}

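// Input-side count storage. Counts live either in an exact hash map or in a
// count-min sketch variant; the prefiltered form routes each k-mer through a
// small low-depth sketch and only lets it occupy the main table once it has
// proven to be at least `limit` deep.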
enum InputCounts {
    Exact(CountMap),
    Sketch(PackedCountMinSketch),
    AtomicSketch(AtomicCountMinSketch),
    AtomicPackedSketch(AtomicPackedCountMinSketch),
    PrefilteredSketch {
        prefilter: PrefilterCountMinSketch,
        limit: u64,
        main: Box<InputCounts>,
    },
}

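// Counting-time view of the prefilter: a k-mer is counted in the main table
// only once the prefilter sketch reports at least `limit` occurrences
// (clamped to the prefilter's saturation value).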
#[derive(Clone, Copy)]
struct PrefilterGate<'a> {
    sketch: &'a PrefilterCountMinSketch,
    limit: u64,
}

impl<'a> PrefilterGate<'a> {
    fn new(sketch: &'a PrefilterCountMinSketch, limit: u64) -> Self {
        Self {
            sketch,
            limit: limit.min(sketch.max_count()),
        }
    }

    fn should_count_in_main(&self, key: &KmerKey) -> bool {
        self.sketch.depth(key) >= self.limit
    }
}

impl CountLookup for InputCounts {
    fn depth(&self, key: &KmerKey) -> u64 {
        match self {
            Self::Exact(counts) => counts.depth(key),
            Self::Sketch(sketch) => sketch.depth(key),
            Self::AtomicSketch(sketch) => sketch.depth(key),
            Self::AtomicPackedSketch(sketch) => sketch.depth(key),
            Self::PrefilteredSketch {
                prefilter,
                limit,
                main,
            } => {
                let prefilter_depth = prefilter.depth(key);
                if prefilter_depth < *limit {
                    prefilter_depth
                } else {
                    main.depth(key)
                }
            }
        }
    }

    fn unique_kmers(&self) -> usize {
        match self {
            Self::Exact(counts) => counts.unique_kmers(),
            Self::Sketch(sketch) => sketch.unique_kmers(),
            Self::AtomicSketch(sketch) => sketch.unique_kmers(),
            Self::AtomicPackedSketch(sketch) => sketch.unique_kmers(),
            Self::PrefilteredSketch { prefilter, .. } => prefilter.unique_kmers(),
        }
    }

    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
        match self {
            Self::Exact(counts) => counts.unique_kmers_at_least(min_depth),
            Self::Sketch(sketch) => sketch.unique_kmers_at_least(min_depth),
            Self::AtomicSketch(sketch) => sketch.unique_kmers_at_least(min_depth),
            Self::AtomicPackedSketch(sketch) => sketch.unique_kmers_at_least(min_depth),
            Self::PrefilteredSketch {
                prefilter,
                limit,
                main,
            } => {
                if min_depth < *limit {
                    prefilter.unique_kmers_at_least(min_depth)
                } else {
                    main.unique_kmers_at_least(min_depth)
                }
            }
        }
    }
}

impl InputCounts {
    #[cfg(test)]
    fn unique_kmer_estimate_split(&self) -> Option<UniqueKmerEstimateSplit> {
        self.unique_kmer_estimate().1
    }

    fn unique_kmer_estimate(&self) -> (usize, Option<UniqueKmerEstimateSplit>) {
        match self {
            Self::PrefilteredSketch {
                prefilter, main, ..
            } => {
                let low_depth_max = prefilter.max_count();
                let high_depth_min = low_depth_max.saturating_add(1);
                let total = prefilter.unique_kmers();
                let high_depth_kmers = main.unique_kmers_at_least(high_depth_min);
                (
                    total,
                    Some(UniqueKmerEstimateSplit {
                        low_depth_max,
                        low_depth_kmers: total.saturating_sub(high_depth_kmers),
                        high_depth_min,
                        high_depth_kmers,
                    }),
                )
            }
            _ => (self.unique_kmers(), None),
        }
    }

    fn sketch_layouts(&self) -> Vec<SketchLayoutSummary> {
        let mut layouts = Vec::new();
        self.append_sketch_layouts(&mut layouts, "input_main");
        layouts
    }

    fn append_sketch_layouts(&self, layouts: &mut Vec<SketchLayoutSummary>, table: &'static str) {
        match self {
            Self::Exact(_) => {}
            Self::Sketch(sketch) => layouts.push(sketch.layout_summary(table, None)),
            Self::AtomicSketch(sketch) => layouts.push(sketch.layout_summary(table, None)),
            Self::AtomicPackedSketch(sketch) => layouts.push(sketch.layout_summary(table, None)),
            Self::PrefilteredSketch {
                prefilter,
                limit,
                main,
            } => {
                layouts.push(prefilter.layout_summary("input_prefilter", Some(*limit)));
                main.append_sketch_layouts(layouts, "input_main");
            }
        }
    }
}

enum OutputCounts {
    Exact(CountMap),
    Sketch(PackedCountMinSketch),
    AtomicSketch(AtomicCountMinSketch),
}

impl CountLookup for OutputCounts {
    fn depth(&self, key: &KmerKey) -> u64 {
        match self {
            Self::Exact(counts) => counts.depth(key),
            Self::Sketch(sketch) => sketch.depth(key),
            Self::AtomicSketch(sketch) => sketch.depth(key),
        }
    }

    fn unique_kmers(&self) -> usize {
        match self {
            Self::Exact(counts) => counts.unique_kmers(),
            Self::Sketch(sketch) => sketch.unique_kmers(),
            Self::AtomicSketch(sketch) => sketch.unique_kmers(),
        }
    }

    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
        match self {
            Self::Exact(counts) => counts.unique_kmers_at_least(min_depth),
            Self::Sketch(sketch) => sketch.unique_kmers_at_least(min_depth),
            Self::AtomicSketch(sketch) => sketch.unique_kmers_at_least(min_depth),
        }
    }
}

impl OutputCounts {
    #[cfg(test)]
    fn depth_hist(&self, hist_len: usize) -> Vec<u64> {
        match self {
            Self::Exact(counts) => count_map_depth_hist(counts, hist_len),
            Self::Sketch(sketch) => sketch.depth_hist(hist_len),
            Self::AtomicSketch(sketch) => sketch.depth_hist(hist_len),
        }
    }

    fn sparse_depth_hist(&self, hist_len: usize) -> SparseHist {
        match self {
            Self::Exact(counts) => count_map_sparse_depth_hist(counts, hist_len),
            Self::Sketch(sketch) => sketch.sparse_depth_hist(hist_len),
            Self::AtomicSketch(sketch) => sketch.sparse_depth_hist(hist_len),
        }
    }

    fn append_sketch_layouts(&self, layouts: &mut Vec<SketchLayoutSummary>, table: &'static str) {
        match self {
            Self::Exact(_) => {}
            Self::Sketch(sketch) => layouts.push(sketch.layout_summary(table, None)),
            Self::AtomicSketch(sketch) => layouts.push(sketch.layout_summary(table, None)),
        }
    }
}

#[derive(Debug, Clone)]
struct PackedCountMinSketch {
    cells: usize,
    hashes: usize,
    bits: u8,
    max_count: u64,
    layout: KCountArrayLayout,
    update_mode: CountMinUpdateMode,
    words: Vec<u64>,
    increments: u64,
    occupied_slots: usize,
    tracked_slots: Option<Vec<usize>>,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CountMinUpdateMode {
    Conservative,
    Independent,
}

impl CountMinUpdateMode {
    fn as_str(self) -> &'static str {
        match self {
            Self::Conservative => "conservative",
            Self::Independent => "independent",
        }
    }
}

struct AtomicCountMinSketch {
    cells: usize,
    hashes: usize,
    max_count: u32,
    layout: KCountArrayLayout,
    update_mode: CountMinUpdateMode,
    parallel_replay: bool,
    cells_by_hash: Vec<AtomicU32>,
    locks: Vec<Mutex<()>>,
    increments: AtomicU64,
    occupied_slots: AtomicUsize,
}

enum PrefilterCountMinSketch {
    Packed(PackedCountMinSketch),
    AtomicPacked(AtomicPackedCountMinSketch),
}

struct AtomicPackedCountMinSketch {
    cells: usize,
    hashes: usize,
    bits: u8,
    max_count: u64,
    layout: KCountArrayLayout,
    update_mode: CountMinUpdateMode,
    words: Vec<AtomicU64>,
    locks: Vec<Mutex<()>>,
    increments: AtomicU64,
    occupied_slots: AtomicUsize,
}

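// Layout constants mirroring BBTools' KCountArray hashing so sketch contents
// line up with the Java implementation: 64 mask entries per table and cell
// values confined to Java's non-negative long range.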
const BBTOOLS_HASH_BITS: u32 = 6;
const BBTOOLS_HASH_ARRAY_LENGTH: usize = 1 << BBTOOLS_HASH_BITS;
const BBTOOLS_HASH_CELL_MASK: u64 = (BBTOOLS_HASH_ARRAY_LENGTH as u64) - 1;
const BBTOOLS_LONG_MAX_VALUE: u64 = i64::MAX as u64;
type BbtoolsHashMaskTable = [[u64; BBTOOLS_HASH_ARRAY_LENGTH]; 8];
type BbtoolsHashMaskRef = &'static BbtoolsHashMaskTable;
type BbtoolsHashMaskCache = FxHashMap<u64, BbtoolsHashMaskRef>;

#[derive(Debug, Clone, Copy)]
struct KCountArrayLayout {
    array_mask: u64,
    array_bits: u32,
    cells_per_array: usize,
    mask_seed: u64,
    masks: BbtoolsHashMaskRef,
}

const COUNT_PARALLEL_CHUNK_SIZE: usize = 8192;
const COUNT_CHUNK_LOCAL_MAP_MAX_CAPACITY: usize = 131_072;
const COUNTUP_SORT_RUN_PAIR_LIMIT: usize = 65_536;
const COUNTUP_SORT_RUN_BYTE_LIMIT: usize = 64 * 1024 * 1024;
const COUNTUP_SORT_MERGE_FANIN: usize = 128;
const COUNTUP_RUN_IO_BUFFER_CAPACITY: usize = 1024 * 1024;
const COUNTUP_PREPASS_CHUNK_PAIR_LIMIT: usize = 1024;
const COUNTUP_PREPASS_CHUNK_BYTE_LIMIT: usize = 16 * 1024 * 1024;
const HIST_PARALLEL_CHUNK_SIZE: usize = 1024;
const NORMALIZE_PARALLEL_CHUNK_SIZE: usize = 1024;
const PAIRED_ANALYSIS_JOIN_MIN_BASES: usize = 1024;
const COVERAGE_PAR_SORT_MIN_WINDOWS: usize = 4096;
const OVERLAP_AUTO_SAMPLE_PAIRS: u64 = 1_000_000;
const ATOMIC_SKETCH_PAR_REPLAY_MIN_KEYS: usize = 16_384;
const PACKED_SKETCH_TRACKED_SLOT_LIMIT: usize = 8_000_000;
const OVERLAP_AUTO_SAMPLE_INTERVAL: u64 = 100;
const OVERLAP_AUTO_ENABLE_FRACTION: f64 = 0.25;
const DEFAULT_PREFILTER_CELLS: usize = 1 << 20;
const DEFAULT_PREFILTER_BITS: u8 = 2;
const DEFAULT_PREFILTER_FRACTION_MICROS: u32 = 350_000;
const OUTPUT_COUNT_MIN_AUTO_FRACTION_MICROS: u32 = 250_000;
const OUTPUT_COUNT_MIN_AUTO_MIN_MEMORY_BYTES: usize = 64 * 1024 * 1024;
const AUTO_COUNT_MIN_FALLBACK_MEMORY_BYTES: usize = 2 * 1024 * 1024 * 1024;
const AUTO_COUNT_MIN_MAX_MEMORY_BYTES: usize = 2 * 1024 * 1024 * 1024;
const AUTO_COUNT_MIN_MIN_MEMORY_BYTES: usize = 256 * 1024 * 1024;
const BBTOOLS_MEMORY_HEADROOM_BYTES: usize = 96_000_000;
const EXPLICIT_COUNT_MIN_SAFE_MEMORY_PERCENT: usize = 85;
const BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS: usize = 2;
const BBTOOLS_KCOUNT_ARRAY_SHARD_MIN_CELLS: usize = 64;
const BBTOOLS_KCOUNT_ARRAY_MAX_HASHES: usize = 8;
const BBTOOLS_KCOUNT_ARRAY_LOCKS: usize = 1999;
const BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED: u64 = 0;
const BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED: u64 = 7;
const BBTOOLS_KCOUNT_ARRAY_MASK_SEED_STEP: u64 = 7;
const BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED: u64 =
    BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED + BBTOOLS_KCOUNT_ARRAY_MASK_SEED_STEP;
const PEAK_COMPACT_ZERO_TAIL: usize = 32;
static NONDETERMINISTIC_SEED_COUNTER: AtomicU64 = AtomicU64::new(0);
type AnalysisPair = (SequenceRecord, Option<SequenceRecord>, Option<f64>);
type NormalizationInput = (usize, SequenceRecord, Option<SequenceRecord>, f64);
type SparseHist = FxHashMap<usize, u64>;
type SparseReadDepthHist = FxHashMap<usize, (u64, u64)>;

struct InputHistSinks<'a> {
    depth: Option<&'a mut SparseHist>,
    read: Option<&'a mut SparseReadDepthHist>,
}

#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct UniqueKmerEstimateSplit {
    pub low_depth_max: u64,
    pub low_depth_kmers: usize,
    pub high_depth_min: u64,
    pub high_depth_kmers: usize,
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SketchLayoutSummary {
    pub table: &'static str,
    pub kind: &'static str,
    pub cells: usize,
    pub hashes: usize,
    pub bits: u8,
    pub arrays: usize,
    pub cells_per_array: usize,
    pub mask_seed: u64,
    pub update_mode: &'static str,
    pub max_count: u64,
    pub memory_bytes: usize,
    pub prefilter_limit: Option<u64>,
}

#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct StageTiming {
    pub name: &'static str,
    pub elapsed_micros: u128,
}

#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct CountupSpillSummary {
    pub initial_runs: usize,
    pub merge_runs: usize,
    pub final_runs: usize,
    pub bytes_written: u64,
    pub peak_live_bytes: u64,
    pub final_live_bytes: u64,
}

impl CountupSpillSummary {
    pub fn has_spills(&self) -> bool {
        self.initial_runs > 0 || self.merge_runs > 0 || self.bytes_written > 0
    }

    fn note_initial_run(&mut self, bytes: u64) {
        self.initial_runs = self.initial_runs.saturating_add(1);
        self.note_written(bytes);
    }

    fn note_merge_run(&mut self, bytes: u64) {
        self.merge_runs = self.merge_runs.saturating_add(1);
        self.note_written(bytes);
    }

    fn note_written(&mut self, bytes: u64) {
        self.bytes_written = self.bytes_written.saturating_add(bytes);
        self.final_live_bytes = self.final_live_bytes.saturating_add(bytes);
        self.peak_live_bytes = self.peak_live_bytes.max(self.final_live_bytes);
    }

    fn note_removed(&mut self, bytes: u64) {
        self.final_live_bytes = self.final_live_bytes.saturating_sub(bytes);
    }
}

#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RunSummary {
    pub reads_in: u64,
    pub bases_in: u64,
    pub reads_kept: u64,
    pub reads_tossed: u64,
    pub bases_kept: u64,
    pub bases_tossed: u64,
    pub unique_kmers_in: usize,
    pub unique_kmers_in_split: Option<UniqueKmerEstimateSplit>,
    pub unique_kmers_out: Option<usize>,
    pub cardinality_in: Option<CardinalityEstimate>,
    pub cardinality_out: Option<CardinalityEstimate>,
    pub sketch_layouts: Vec<SketchLayoutSummary>,
    pub stage_timings: Vec<StageTiming>,
    pub countup_spill: CountupSpillSummary,
}

#[derive(Debug, Clone, Default)]
struct ReadAnalysis {
    depth_al: Option<u64>,
    true_depth: Option<u64>,
    min_true_depth: Option<u64>,
    low_kmer_count: usize,
    total_kmer_count: usize,
    error: bool,
    had_kmer_windows: bool,
    coverage_desc: Vec<i64>,
}

#[derive(Debug, Clone, Default)]
struct PairAnalysis {
    read1: ReadAnalysis,
    read2: Option<ReadAnalysis>,
    depth_proxy_al: Option<u64>,
    max_true_depth: Option<u64>,
    low_kmer_count: usize,
    total_kmer_count: usize,
    error1: bool,
    error2: bool,
}

#[derive(Debug, Clone, Default)]
struct PairDecision {
    toss: bool,
    analysis: PairAnalysis,
}

#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct CountupDecisionPlan {
    toss: bool,
    eligible_key_indices: Vec<usize>,
}

#[derive(Debug, Clone)]
struct NormalizedPair {
    input_list_index: usize,
    r1: SequenceRecord,
    r2: Option<SequenceRecord>,
    out_r1: SequenceRecord,
    out_r2: Option<SequenceRecord>,
    decision: PairDecision,
    uncorrectable: bool,
    read_count: u64,
    base_count: u64,
}

#[derive(Debug, Clone)]
struct CountupWorkPair {
    input_list_index: usize,
    sort_key: CountupSortKey,
    r1: SequenceRecord,
    r2: Option<SequenceRecord>,
}

#[derive(Debug, Clone)]
struct CountupWorkCandidate {
    input_list_index: usize,
    original_index: usize,
    rand: f64,
    r1: SequenceRecord,
    r2: Option<SequenceRecord>,
}

struct CountupWorkBuild {
    source: CountupWorkSource,
    input_hist: Option<SparseHist>,
    input_read_hist: Option<SparseReadDepthHist>,
    input_hist_elapsed_micros: u128,
    format1: SeqFormat,
    format2: Option<SeqFormat>,
    spill_summary: CountupSpillSummary,
}

struct CountupChunkBuild {
    work_pairs: Vec<CountupWorkPair>,
    depth_hist: SparseHist,
    read_hist: SparseReadDepthHist,
}

struct CountupInputHistAccumulator<'a> {
    wants_depth_hist: bool,
    wants_read_hist: bool,
    depth_hist: &'a mut SparseHist,
    read_hist: &'a mut SparseReadDepthHist,
}

#[derive(Debug, Clone)]
struct CountupSortKey {
    errors: usize,
    total_len: usize,
    expected_errors: f64,
    numeric_id: u64,
    original_index: usize,
}

struct CountupPrepassResult {
    include: bool,
    sort_analysis: Option<PairAnalysis>,
}

struct CountupWorkSource {
    temp_dir: Option<tempfile::TempDir>,
    inner: CountupWorkSourceInner,
}

enum CountupWorkSourceInner {
    Memory(Vec<CountupWorkPair>),
    Spilled(Vec<PathBuf>),
}

struct CountupWorkIter {
    _temp_dir: Option<tempfile::TempDir>,
    inner: CountupWorkIterInner,
}

enum CountupWorkIterInner {
    Memory(std::vec::IntoIter<CountupWorkPair>),
    Spilled(CountupRunMerger),
}

struct CountupRunMerger {
    readers: Vec<CountupRunReader>,
    heap: BinaryHeap<CountupRunHead>,
}

struct CountupRunReader {
    reader: BufReader<fs::File>,
}

struct CountupRunHead {
    pair: CountupWorkPair,
    run_index: usize,
}

#[derive(Debug, Clone, Copy, Default)]
struct CorrectionResult {
    corrected: usize,
    marked: usize,
    uncorrectable: bool,
}

#[derive(Debug, Clone, Copy)]
struct CorrectionTarget {
    low: i64,
    lower_bound: i64,
    upper_bound: i64,
    mult: i64,
}

#[derive(Debug, Clone)]
struct InputLists {
    first: Vec<PathBuf>,
    second: Option<Vec<PathBuf>>,
}

#[derive(Debug, Clone, Default)]
struct ReadDepthHistogram {
    reads: Vec<u64>,
    bases: Vec<u64>,
}

impl ReadDepthHistogram {
    fn new(len: usize) -> Self {
        Self {
            reads: vec![0; len],
            bases: vec![0; len],
        }
    }
}

#[derive(Debug, Clone, Copy, Default)]
struct BaseCounts {
    a: u64,
    c: u64,
    g: u64,
    t: u64,
    n: u64,
}

impl BaseCounts {
    fn total(self) -> u64 {
        self.a + self.c + self.g + self.t + self.n
    }
}

#[derive(Debug, Clone, Default)]
struct BaseContentHistogram {
    first: Vec<BaseCounts>,
    second: Vec<BaseCounts>,
}

#[derive(Debug, Clone, Copy, Default)]
struct MatchCounts {
    matches: u64,
    n: u64,
}

#[derive(Debug, Clone, Default)]
struct AlignmentFallbackHistograms {
    first_match: Vec<MatchCounts>,
    second_match: Vec<MatchCounts>,
    quality_match: Vec<u64>,
    read_count: u64,
    base_count: u64,
    pair_count: u64,
    paired: bool,
}

#[derive(Debug, Clone, Default)]
struct QualitySideHistograms {
    overall: Vec<u64>,
    first_counts: Vec<u64>,
    second_counts: Vec<u64>,
    first_avg: Vec<u64>,
    second_avg: Vec<u64>,
    first_by_pos: Vec<Vec<u64>>,
    second_by_pos: Vec<Vec<u64>>,
    paired: bool,
}

#[derive(Debug, Clone, Default)]
struct ReadLocalSideHistograms {
    quality: Option<QualitySideHistograms>,
    length: Option<ReadDepthHistogram>,
    gc: Option<ReadDepthHistogram>,
    base: Option<BaseContentHistogram>,
    entropy: Option<Vec<u64>>,
    identity: Option<ReadDepthHistogram>,
    alignment: Option<AlignmentFallbackHistograms>,
    barcodes: Option<BTreeMap<String, u64>>,
}

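// Pseudo-random source whose update step matches xoshiro256+ (result is
// s0 + s3). It is seeded through mix_seed with an escape from the all-zero
// state and a short warm-up, so deterministic runs replay identical
// sampling decisions.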
#[derive(Debug, Clone)]
struct JavaXoshiro {
    s0: u64,
    s1: u64,
    s2: u64,
    s3: u64,
}

impl JavaXoshiro {
    fn new(seed: u64) -> Self {
        let mut rng = Self {
            s0: seed,
            s1: mix_seed(seed),
            s2: 0,
            s3: 0,
        };
        rng.s2 = mix_seed(rng.s1);
        rng.s3 = mix_seed(rng.s2);
        if rng.s0 == 0 && rng.s1 == 0 && rng.s2 == 0 && rng.s3 == 0 {
            rng.s0 = 0x5DEECE66D;
            rng.s1 = 0xB;
            rng.s2 = 0xCCA;
            rng.s3 = 0xF00;
        }
        for _ in 0..4 {
            rng.next_long();
        }
        rng
    }

    fn next_long(&mut self) -> u64 {
        let result = self.s0.wrapping_add(self.s3);
        let t = self.s1 << 17;

        self.s2 ^= self.s0;
        self.s3 ^= self.s1;
        self.s1 ^= self.s2;
        self.s0 ^= self.s3;

        self.s2 ^= t;
        self.s3 = self.s3.rotate_left(45);

        result
    }

    fn next_double(&mut self) -> f64 {
        ((self.next_long() >> 11) as f64) * (1.0 / ((1u64 << 53) as f64))
    }
}

fn run_random_seed(config: &Config) -> u64 {
    if config.deterministic {
        0
    } else {
        nondeterministic_seed()
    }
}

fn nondeterministic_seed() -> u64 {
    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|duration| duration.as_nanos() as u64)
        .unwrap_or(0);
    let counter = NONDETERMINISTIC_SEED_COUNTER.fetch_add(1, Ordering::Relaxed);
    nanos ^ ((std::process::id() as u64) << 32) ^ mix_seed(counter.wrapping_add(0x9E37_79B9))
}

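// SplitMix64 finalizer: a cheap, well-mixed bijection on u64, used here for
// seeding the RNG, salting counters, and hashing k-mer keys into buckets.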
fn mix_seed(mut x: u64) -> u64 {
    x = x.wrapping_add(0x9E37_79B9_7F4A_7C15);
    x = (x ^ (x >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
    x = (x ^ (x >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
    x ^ (x >> 31)
}

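// Unified source of read pairs: a single (possibly interleaved) stream, an
// explicit in1/in2 pair, or comma-separated input lists, with format checks
// and an optional per-file pair limit.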
struct PrimaryReaders {
    r1: SequenceReader,
    r2: Option<SequenceReader>,
    interleaved: bool,
    input_list1: Vec<PathBuf>,
    input_list2: Option<Vec<PathBuf>>,
    input_list_index: usize,
    settings: SequenceSettings,
    limit_per_file: Option<u64>,
    pairs_seen_in_file: u64,
    format1: SeqFormat,
    format2: Option<SeqFormat>,
    next_pair_numeric_id: u64,
    gzip_threads: Option<usize>,
}

impl PrimaryReaders {
    fn open(config: &Config, limit_per_file: Option<u64>) -> Result<Self> {
        let in1 = config.in1.as_ref().context("missing in1")?;
        let sequence_settings = sequence_settings(config);
        let input_list = primary_input_lists(config);
        let first_path = input_list
            .as_ref()
            .and_then(|paths| paths.first.first())
            .unwrap_or(in1);
        let r2_path = input_list
            .as_ref()
            .and_then(|paths| paths.second.as_ref())
            .and_then(|paths| paths.first())
            .or(config.in2.as_ref());
        let gzip_threads = gzip_threads_for_paths(
            config.gzip_threads,
            [Some(first_path.as_path()), r2_path.map(PathBuf::as_path)],
        );
        let r1 =
            open_sequence_reader_with_gzip_threads(first_path, sequence_settings, gzip_threads)?;
        let interleaved = input_list.is_none()
            && config.in2.is_none()
            && (config.interleaved
                || (config.test_interleaved
                    && detect_interleaved_input_with_gzip_threads(
                        first_path,
                        sequence_settings,
                        config.gzip_threads,
                    )?));
        let r2 = r2_path
            .map(|path| {
                open_sequence_reader_with_gzip_threads(path, sequence_settings, gzip_threads)
            })
            .transpose()?;
        if let Some(r2_ref) = &r2
            && r1.format() != r2_ref.format()
        {
            bail!("paired inputs must use the same FASTA/FASTQ format");
        }
        let format1 = r1.format();
        let format2 = if interleaved {
            Some(format1)
        } else {
            r2.as_ref().map(SequenceReader::format)
        };
        Ok(Self {
            r1,
            r2,
            interleaved,
            input_list1: input_list
                .as_ref()
                .map(|paths| paths.first.clone())
                .unwrap_or_default(),
            input_list2: input_list.and_then(|paths| paths.second),
            input_list_index: 0,
            settings: sequence_settings,
            limit_per_file,
            pairs_seen_in_file: 0,
            format1,
            format2,
            next_pair_numeric_id: 0,
            gzip_threads: config.gzip_threads,
        })
    }

    fn format1(&self) -> SeqFormat {
        self.format1
    }

    fn format2(&self) -> Option<SeqFormat> {
        self.format2
    }

    fn input_list_index(&self) -> usize {
        self.input_list_index
    }

    fn next_pair(&mut self) -> Result<Option<(SequenceRecord, Option<SequenceRecord>)>> {
        if !self.input_list1.is_empty() {
            return self.next_list_record();
        }
        if limit_reached(self.limit_per_file, self.pairs_seen_in_file) {
            return Ok(None);
        }
        let r1 = self.r1.next_record()?;
        if self.interleaved {
            return match r1 {
                Some(mut record) => {
                    let mut mate = self
                        .r1
                        .next_record()?
                        .context("interleaved input ended after an unmatched first mate record")?;
                    record.numeric_id = self.next_pair_numeric_id;
                    mate.numeric_id = self.next_pair_numeric_id;
                    self.next_pair_numeric_id += 1;
                    self.pairs_seen_in_file += 1;
                    Ok(Some((record, Some(mate))))
                }
                None => Ok(None),
            };
        }

        let r2 = match &mut self.r2 {
            Some(reader) => reader.next_record()?,
            None => None,
        };

        match (r1, r2) {
            (None, None) => Ok(None),
            (Some(record), mate) => {
                self.pairs_seen_in_file += 1;
                Ok(Some((record, mate)))
            }
            (None, Some(_)) => bail!("in2 has more records than in1"),
        }
    }

    fn next_list_record(&mut self) -> Result<Option<(SequenceRecord, Option<SequenceRecord>)>> {
        loop {
            if limit_reached(self.limit_per_file, self.pairs_seen_in_file) {
                if !self.advance_list_reader()? {
                    return Ok(None);
                }
                continue;
            }
            let had_r2 = self.r2.is_some();
            let r1 = self.r1.next_record()?;
            let r2 = match &mut self.r2 {
                Some(reader) => reader.next_record()?,
                None => None,
            };
            match (r1, r2) {
                (Some(record), Some(mate)) => {
                    self.pairs_seen_in_file += 1;
                    return Ok(Some((record, Some(mate))));
                }
                (Some(record), None) if !had_r2 => {
                    self.pairs_seen_in_file += 1;
                    return Ok(Some((record, None)));
                }
                (Some(_), None) => bail!("in2 has fewer records than in1"),
                (None, Some(_)) => bail!("in2 has more records than in1"),
                (None, None) => {
                    if !self.advance_list_reader()? {
                        return Ok(None);
                    }
                }
            }
        }
    }

    fn advance_list_reader(&mut self) -> Result<bool> {
        if self.input_list_index + 1 >= self.input_list1.len() {
            return Ok(false);
        }
        self.input_list_index += 1;
        let path = &self.input_list1[self.input_list_index];
        let second_path = self
            .input_list2
            .as_ref()
            .and_then(|paths| paths.get(self.input_list_index));
        let gzip_threads = gzip_threads_for_paths(
            self.gzip_threads,
            [Some(path.as_path()), second_path.map(PathBuf::as_path)],
        );
        let reader =
            SequenceReader::from_path_with_gzip_threads(path, self.settings, gzip_threads)?;
        if reader.format() != self.format1 {
            bail!("comma-separated input list entries must use the same FASTA/FASTQ format");
        }
        self.r2 = self
            .input_list2
            .as_ref()
            .and_then(|paths| paths.get(self.input_list_index))
            .map(|path| {
                SequenceReader::from_path_with_gzip_threads(path, self.settings, gzip_threads)
            })
            .transpose()?;
        if let Some(r2_ref) = &self.r2
            && Some(r2_ref.format()) != self.format2
        {
            bail!("comma-separated paired input list entries must use the same FASTA/FASTQ format");
        }
        self.r1 = reader;
        self.pairs_seen_in_file = 0;
        Ok(true)
    }
}

fn open_sequence_reader(
    config: &Config,
    path: &Path,
    settings: SequenceSettings,
) -> Result<SequenceReader> {
    SequenceReader::from_path_with_gzip_threads(path, settings, config.gzip_threads)
}

fn open_sequence_reader_with_gzip_threads(
    path: &Path,
    settings: SequenceSettings,
    gzip_threads: Option<usize>,
) -> Result<SequenceReader> {
    SequenceReader::from_path_with_gzip_threads(path, settings, gzip_threads)
}

fn open_paired_sequence_readers(
    config: &Config,
    path1: &Path,
    path2: &Path,
    settings: SequenceSettings,
) -> Result<(SequenceReader, SequenceReader)> {
    let gzip_threads = gzip_threads_for_paths(config.gzip_threads, [Some(path1), Some(path2)]);
    let reader1 = open_sequence_reader_with_gzip_threads(path1, settings, gzip_threads)?;
    let reader2 = open_sequence_reader_with_gzip_threads(path2, settings, gzip_threads)?;
    Ok((reader1, reader2))
}

fn gzip_threads_for_paths<'a>(
    gzip_threads: Option<usize>,
    paths: impl IntoIterator<Item = Option<&'a Path>>,
) -> Option<usize> {
    let gzip_streams = paths
        .into_iter()
        .flatten()
        .filter(|path| path_uses_gzip(path))
        .count();
    gzip_threads_for_streams(gzip_threads, gzip_streams)
}

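// Divide the configured gzip thread budget evenly across the open gzip
// streams, keeping at least one thread per stream.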
fn gzip_threads_for_streams(gzip_threads: Option<usize>, gzip_streams: usize) -> Option<usize> {
    gzip_threads.map(|threads| {
        if threads <= 1 || gzip_streams <= 1 {
            threads
        } else {
            (threads / gzip_streams).max(1)
        }
    })
}

fn path_uses_gzip(path: &Path) -> bool {
    path.extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
}

struct OptionalWriters {
    interleaved_output: bool,
    current_output_list_index: usize,
    keep_plan: OutputPathPlan,
    toss_plan: OutputPathPlan,
    low_plan: OutputPathPlan,
    mid_plan: OutputPathPlan,
    high_plan: OutputPathPlan,
    uncorrected_plan: OutputPathPlan,
    keep1: Option<SequenceWriter>,
    keep2: Option<SequenceWriter>,
    toss1: Option<SequenceWriter>,
    toss2: Option<SequenceWriter>,
    low1: Option<SequenceWriter>,
    low2: Option<SequenceWriter>,
    mid1: Option<SequenceWriter>,
    mid2: Option<SequenceWriter>,
    high1: Option<SequenceWriter>,
    high2: Option<SequenceWriter>,
    uncorrected1: Option<SequenceWriter>,
    uncorrected2: Option<SequenceWriter>,
}

impl OptionalWriters {
    fn open(config: &Config, _format1: SeqFormat, format2: Option<SeqFormat>) -> Result<Self> {
        if format2.is_none() && has_second_output(config) {
            bail!(
                "second-output paths require paired input; interleaved auto-detection did not detect paired records"
            );
        }
        let paired = format2.is_some();
        let input_list_len = primary_input_lists(config)
            .map(|paths| paths.first.len())
            .unwrap_or(1);
        let keep_plan = prepare_output_path_plan(
            config.out1.as_deref(),
            config.out2.as_deref(),
            paired,
            input_list_len,
        )?;
        let toss_plan = prepare_output_path_plan(
            config.out_toss1.as_deref(),
            config.out_toss2.as_deref(),
            paired,
            input_list_len,
        )?;
        let low_plan = prepare_output_path_plan(
            config.out_low1.as_deref(),
            config.out_low2.as_deref(),
            paired,
            input_list_len,
        )?;
        let mid_plan = prepare_output_path_plan(
            config.out_mid1.as_deref(),
            config.out_mid2.as_deref(),
            paired,
            input_list_len,
        )?;
        let high_plan = prepare_output_path_plan(
            config.out_high1.as_deref(),
            config.out_high2.as_deref(),
            paired,
            input_list_len,
        )?;
        let uncorrected_plan = prepare_output_path_plan(
            config.out_uncorrected1.as_deref(),
            config.out_uncorrected2.as_deref(),
            paired,
            input_list_len,
        )?;
        let output_gzip_threads = output_gzip_threads_for_plans(
            config.gzip_threads,
            [
                &keep_plan,
                &toss_plan,
                &low_plan,
                &mid_plan,
                &high_plan,
                &uncorrected_plan,
            ],
            0,
        )?;
        let (keep1, keep2) = open_output_pair(
            keep_plan.pair_for_index(0)?,
            config.overwrite,
            config.append,
            config.quality_out_offset,
            config.fake_quality,
            config.fasta_wrap,
            output_gzip_threads,
        )?;
        let (toss1, toss2) = open_output_pair(
            toss_plan.pair_for_index(0)?,
            config.overwrite,
            config.append,
            config.quality_out_offset,
            config.fake_quality,
            config.fasta_wrap,
            output_gzip_threads,
        )?;
        let (low1, low2) = open_output_pair(
            low_plan.pair_for_index(0)?,
            config.overwrite,
            config.append,
            config.quality_out_offset,
            config.fake_quality,
            config.fasta_wrap,
            output_gzip_threads,
        )?;
        let (mid1, mid2) = open_output_pair(
            mid_plan.pair_for_index(0)?,
            config.overwrite,
            config.append,
            config.quality_out_offset,
            config.fake_quality,
            config.fasta_wrap,
            output_gzip_threads,
        )?;
        let (high1, high2) = open_output_pair(
            high_plan.pair_for_index(0)?,
            config.overwrite,
            config.append,
            config.quality_out_offset,
            config.fake_quality,
            config.fasta_wrap,
            output_gzip_threads,
        )?;
        let (uncorrected1, uncorrected2) = open_output_pair(
            uncorrected_plan.pair_for_index(0)?,
            config.overwrite,
            config.append,
            config.quality_out_offset,
            config.fake_quality,
            config.fasta_wrap,
            output_gzip_threads,
        )?;
        Ok(Self {
            interleaved_output: paired,
            current_output_list_index: 0,
            keep_plan,
            toss_plan,
            low_plan,
            mid_plan,
            high_plan,
            uncorrected_plan,
            keep1,
            keep2,
            toss1,
            toss2,
            low1,
            low2,
            mid1,
            mid2,
            high1,
            high2,
            uncorrected1,
            uncorrected2,
        })
    }

    fn sync_to_input_list_index(&mut self, config: &Config, index: usize) -> Result<()> {
        if self.current_output_list_index == index {
            return Ok(());
        }
        self.flush()?;
        let output_gzip_threads = output_gzip_threads_for_plans(
            config.gzip_threads,
            [
                &self.keep_plan,
                &self.toss_plan,
                &self.low_plan,
                &self.mid_plan,
                &self.high_plan,
                &self.uncorrected_plan,
            ],
            index,
        )?;
        reopen_output_pair_if_fanout(
            &self.keep_plan,
            index,
            &mut self.keep1,
            &mut self.keep2,
            config,
            output_gzip_threads,
        )?;
        reopen_output_pair_if_fanout(
            &self.toss_plan,
            index,
            &mut self.toss1,
            &mut self.toss2,
            config,
            output_gzip_threads,
        )?;
        reopen_output_pair_if_fanout(
            &self.low_plan,
            index,
            &mut self.low1,
            &mut self.low2,
            config,
            output_gzip_threads,
        )?;
        reopen_output_pair_if_fanout(
            &self.mid_plan,
            index,
            &mut self.mid1,
            &mut self.mid2,
            config,
            output_gzip_threads,
        )?;
        reopen_output_pair_if_fanout(
            &self.high_plan,
            index,
            &mut self.high1,
            &mut self.high2,
            config,
            output_gzip_threads,
        )?;
        reopen_output_pair_if_fanout(
            &self.uncorrected_plan,
            index,
            &mut self.uncorrected1,
            &mut self.uncorrected2,
            config,
            output_gzip_threads,
        )?;
        self.current_output_list_index = index;
        Ok(())
    }

    fn write_pair(
        &mut self,
        toss: bool,
        r1: &SequenceRecord,
        r2: Option<&SequenceRecord>,
    ) -> Result<()> {
        if toss {
            write_to_optional_pair(
                &mut self.toss1,
                &mut self.toss2,
                self.interleaved_output,
                r1,
                r2,
            )?;
        } else {
            write_to_optional_pair(
                &mut self.keep1,
                &mut self.keep2,
                self.interleaved_output,
                r1,
                r2,
            )?;
        }
        Ok(())
    }

    fn write_depth_bin(
        &mut self,
        config: &Config,
        analysis: &PairAnalysis,
        r1: &SequenceRecord,
        r2: Option<&SequenceRecord>,
    ) -> Result<()> {
        let d1 = bin_depth(analysis.read1.depth_al);
        let d2 = analysis
            .read2
            .as_ref()
            .map(|read| bin_depth(read.depth_al))
            .unwrap_or(-1);
        let target = if d1 < config.low_bin_depth && d2 < config.low_bin_depth {
            DepthBin::Low
        } else if (d1 < config.low_bin_depth || d1 > config.high_bin_depth)
            && (d2 < config.low_bin_depth || d2 >= config.high_bin_depth)
        {
            DepthBin::High
        } else {
            DepthBin::Mid
        };

        match target {
            DepthBin::Low => write_to_optional_pair(
                &mut self.low1,
                &mut self.low2,
                self.interleaved_output,
                r1,
                r2,
            )?,
            DepthBin::Mid => write_to_optional_pair(
                &mut self.mid1,
                &mut self.mid2,
                self.interleaved_output,
                r1,
                r2,
            )?,
            DepthBin::High => write_to_optional_pair(
                &mut self.high1,
                &mut self.high2,
                self.interleaved_output,
                r1,
                r2,
            )?,
        }
        Ok(())
    }

    fn write_uncorrected(
        &mut self,
        r1: &SequenceRecord,
        r2: Option<&SequenceRecord>,
    ) -> Result<()> {
        write_to_optional_pair(
            &mut self.uncorrected1,
            &mut self.uncorrected2,
            self.interleaved_output,
            r1,
            r2,
        )
    }

    fn flush(&mut self) -> Result<()> {
        for writer in [
            self.keep1.as_mut(),
            self.keep2.as_mut(),
            self.toss1.as_mut(),
            self.toss2.as_mut(),
            self.low1.as_mut(),
            self.low2.as_mut(),
            self.mid1.as_mut(),
            self.mid2.as_mut(),
            self.high1.as_mut(),
            self.high2.as_mut(),
            self.uncorrected1.as_mut(),
            self.uncorrected2.as_mut(),
        ]
        .into_iter()
        .flatten()
        {
            writer.flush()?;
        }
        Ok(())
    }
}

#[derive(Debug, Clone, Copy)]
enum DepthBin {
    Low,
    Mid,
    High,
}

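// Interleaved fallback: with paired data but only a first writer, the mate is
// appended to that same stream; otherwise each mate goes to its own writer.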
fn write_to_optional_pair(
    writer1: &mut Option<SequenceWriter>,
    writer2: &mut Option<SequenceWriter>,
    interleaved_output: bool,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
) -> Result<()> {
    if let Some(writer) = writer1.as_mut() {
        writer.write_record(r1)?;
        if interleaved_output && writer2.is_none() {
            if let Some(mate) = r2 {
                writer.write_record(mate)?;
            }
            return Ok(());
        }
    }
    if let (Some(writer), Some(mate)) = (writer2.as_mut(), r2) {
        writer.write_record(mate)?;
    }
    Ok(())
}

fn has_second_output(config: &Config) -> bool {
    config.out2.is_some()
        || config.out_toss2.is_some()
        || config.out_low2.is_some()
        || config.out_mid2.is_some()
        || config.out_high2.is_some()
        || config.out_uncorrected2.is_some()
}

fn depth_bin_outputs_enabled(config: &Config) -> bool {
    config.out_low1.is_some()
        || config.out_low2.is_some()
        || config.out_mid1.is_some()
        || config.out_mid2.is_some()
        || config.out_high1.is_some()
        || config.out_high2.is_some()
}

fn needs_output_pair_analysis(config: &Config) -> bool {
    config.rename_reads || depth_bin_outputs_enabled(config)
}

#[derive(Debug, Clone)]
struct OutputPathPair {
    first: Option<PathBuf>,
    second: Option<PathBuf>,
}

#[derive(Debug, Clone)]
struct OutputPathPlan {
    pairs: Vec<OutputPathPair>,
    fanout: bool,
}

impl OutputPathPlan {
    fn pair_for_index(&self, index: usize) -> Result<&OutputPathPair> {
        if self.fanout {
            self.pairs
                .get(index)
                .with_context(|| format!("missing output path list entry for input {}", index + 1))
        } else {
            self.pairs.first().context("missing output path plan entry")
        }
    }
}

fn prepare_output_paths(
    first: Option<&Path>,
    second: Option<&Path>,
    paired: bool,
) -> OutputPathPair {
    let second = match second {
        Some(path) => Some(path.to_path_buf()),
        None if paired => first.and_then(|path| replace_hash_in_path(path, "2")),
        None => None,
    };
    let first =
        first.map(|path| replace_hash_in_path(path, "1").unwrap_or_else(|| path.to_path_buf()));
    OutputPathPair { first, second }
}

fn prepare_output_path_plan(
    first: Option<&Path>,
    second: Option<&Path>,
    paired: bool,
    input_list_len: usize,
) -> Result<OutputPathPlan> {
    if input_list_len > 1
        && let Some(first_values) = output_path_values(first)
        && first_values.len() > 1
    {
        let second_values = output_path_values(second);
        let fanout_len = second_values
            .as_ref()
            .map(|values| first_values.len().min(values.len()))
            .unwrap_or(first_values.len());
        let mut pairs = Vec::with_capacity(fanout_len);
        for index in 0..fanout_len {
            let mut first_path = first_values[index].clone();
            let second_path = if let Some(values) = &second_values {
                Some(values[index].clone())
            } else if paired {
                if let Some(second_path) = replace_hash_in_path(&first_path, "2") {
                    first_path = replace_hash_in_path(&first_path, "1").unwrap_or(first_path);
                    Some(second_path)
                } else {
                    None
                }
            } else {
                None
            };
            pairs.push(OutputPathPair {
                first: Some(first_path),
                second: second_path,
            });
        }
        return Ok(OutputPathPlan {
            pairs,
            fanout: true,
        });
    }

    if input_list_len > 1
        && let Some(second_values) = output_path_values(second)
        && second_values.len() > 1
    {
        let first_path =
            first.map(|path| replace_hash_in_path(path, "1").unwrap_or_else(|| path.to_path_buf()));
        return Ok(OutputPathPlan {
            pairs: vec![OutputPathPair {
                first: first_path,
                second: Some(second_values[0].clone()),
            }],
            fanout: false,
        });
    }

    Ok(OutputPathPlan {
        pairs: vec![prepare_output_paths(first, second, paired)],
        fanout: false,
    })
}

fn output_path_values(path: Option<&Path>) -> Option<Vec<PathBuf>> {
    let path = path?;
    if path.exists() {
        return Some(vec![path.to_path_buf()]);
    }
    let text = path.to_string_lossy();
    if text.contains(',') {
        let paths = split_path_list(&text);
        if paths.len() > 1 {
            return Some(paths);
        }
    }
    Some(vec![path.to_path_buf()])
}

fn reopen_output_pair_if_fanout(
    plan: &OutputPathPlan,
    index: usize,
    first: &mut Option<SequenceWriter>,
    second: &mut Option<SequenceWriter>,
    config: &Config,
    gzip_threads: Option<usize>,
) -> Result<()> {
    if !plan.fanout {
        return Ok(());
    }
    *first = None;
    *second = None;
    let (new_first, new_second) = open_output_pair(
        plan.pair_for_index(index)?,
        config.overwrite,
        config.append,
        config.quality_out_offset,
        config.fake_quality,
        config.fasta_wrap,
        gzip_threads,
    )?;
    *first = new_first;
    *second = new_second;
    Ok(())
}

fn output_gzip_threads_for_plans<'a>(
    gzip_threads: Option<usize>,
    plans: impl IntoIterator<Item = &'a OutputPathPlan>,
    index: usize,
) -> Result<Option<usize>> {
    let mut gzip_streams = 0usize;
    for plan in plans {
        gzip_streams =
            gzip_streams.saturating_add(output_pair_gzip_streams(plan.pair_for_index(index)?));
    }
    Ok(gzip_threads_for_streams(gzip_threads, gzip_streams))
}

fn output_pair_gzip_streams(pair: &OutputPathPair) -> usize {
    [pair.first.as_deref(), pair.second.as_deref()]
        .into_iter()
        .flatten()
        .filter(|path| path_uses_gzip(path))
        .count()
}

fn open_output_pair(
    pair: &OutputPathPair,
    overwrite: bool,
    append: bool,
    quality_out_offset: u8,
    fake_quality: u8,
    fasta_wrap: usize,
    gzip_threads: Option<usize>,
) -> Result<(Option<SequenceWriter>, Option<SequenceWriter>)> {
    let first = open_sequence_writer(
        pair.first.as_deref(),
        overwrite,
        append,
        quality_out_offset,
        fake_quality,
        fasta_wrap,
        gzip_threads,
    )?;
    let second = open_sequence_writer(
        pair.second.as_deref(),
        overwrite,
        append,
        quality_out_offset,
        fake_quality,
        fasta_wrap,
        gzip_threads,
    )?;
    Ok((first, second))
}

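// BBTools-style `#` wildcard in output names: the first `#` is rewritten to
// "1" or "2" so a single template (e.g. out=reads_#.fq) yields per-mate paths.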
1701fn replace_hash_in_path(path: &Path, replacement: &str) -> Option<PathBuf> {
1702 let text = path.to_string_lossy();
1703 if text.contains('#') {
1704 Some(PathBuf::from(text.replacen('#', replacement, 1)))
1705 } else {
1706 None
1707 }
1708}
1709
1710fn bin_depth(depth: Option<u64>) -> i64 {
1711 depth
1712 .and_then(|value| i64::try_from(value).ok())
1713 .unwrap_or(-1)
1714}
1715
1716pub fn run(config: &Config) -> Result<RunSummary> {
1717 let resolved_config;
1718 let config = if config.overlap_error_correct_auto {
1719 resolved_config = resolve_overlap_error_correct_auto(config)?;
1720 &resolved_config
1721 } else {
1722 config
1723 };
1724 if config.passes > 1 {
1725 return run_multipass(config);
1726 }
1727 run_single_pass(config)
1728}
1729
1730fn resolve_overlap_error_correct_auto(config: &Config) -> Result<Config> {
1731 let mut resolved = config.clone();
1732 resolved.overlap_error_correct_auto = false;
1733 resolved.overlap_error_correct = sampled_overlap_fraction(config)?
1734 .is_some_and(|fraction| fraction > OVERLAP_AUTO_ENABLE_FRACTION);
1735 Ok(resolved)
1736}
1737
1738fn sampled_overlap_fraction(config: &Config) -> Result<Option<f64>> {
1739 let mut readers = PrimaryReaders::open(config, Some(OVERLAP_AUTO_SAMPLE_PAIRS))?;
1740 let mut sampled = 0u64;
1741 let mut seen = 0u64;
1742 let mut mergeable = 0u64;
1743 while let Some((r1, r2)) = readers.next_pair()? {
1744 let Some(r2) = r2 else {
1745 return Ok(None);
1746 };
1747 seen += 1;
1748 if !seen.is_multiple_of(OVERLAP_AUTO_SAMPLE_INTERVAL) {
1749 continue;
1750 }
1751 sampled += 1;
1752 if best_pair_overlap(&r1, &r2).is_some() {
1753 mergeable += 1;
1754 }
1755 }
1756 if sampled == 0 {
1757 Ok(None)
1758 } else {
1759 Ok(Some(mergeable as f64 / sampled as f64))
1760 }
1761}
1762
1763fn run_multipass(config: &Config) -> Result<RunSummary> {
1764 let mut multipass_config = config.clone();
1765 apply_bbtools_multipass_cell_bits_cap(&mut multipass_config);
1766 let config = &multipass_config;
1767 let temp_dir = managed_temp_dir(config, "bbnorm-rs-multipass-")?;
1768 let paired = config.in2.is_some() || config.interleaved;
1769 let separate_pair_outputs = paired && config.out2.is_some();
1770 let temp_ext = temp_sequence_extension(config);
1771 let mut last_in1 = config.in1.clone().context("missing in1")?;
1772 let mut last_in2 = config.in2.clone();
1773 let mut last_interleaved = config.interleaved;
1774
1775 for pass in 1..config.passes {
1776 let temp1 = temp_dir.path().join(format!("pass{pass}.r1.{temp_ext}"));
1777 let temp2 = separate_pair_outputs
1778 .then(|| temp_dir.path().join(format!("pass{pass}.r2.{temp_ext}")));
1779 let mut pass_config = pass_config_for_intermediate(
1780 config,
1781 pass,
1782 &last_in1,
1783 last_in2.as_deref(),
1784 last_interleaved,
1785 temp1.clone(),
1786 temp2.clone(),
1787 None,
1788 None,
1789 );
1790 run_single_pass(&pass_config)
1791 .with_context(|| format!("running Rust multipass intermediate pass {pass}"))?;
1792
1793 last_in1 = temp1;
1794 last_in2 = temp2;
1795 last_interleaved = paired && last_in2.is_none();
1796 pass_config.notes.clear();
1797 }
1798
1799 let mut final_config = config.clone();
1800 final_config.in1 = Some(last_in1);
1801 final_config.in2 = last_in2;
1802 final_config.interleaved = last_interleaved;
1803 final_config.test_interleaved = !last_interleaved && final_config.in2.is_none();
1804 final_config.extra.clear();
1805 final_config.hist_in = None;
1806 final_config.rhist_in = None;
1807 final_config.peaks_in = None;
1808 final_config.match_hist_out = None;
1809 final_config.insert_hist_out = None;
1810 final_config.quality_accuracy_hist_out = None;
1811 final_config.indel_hist_out = None;
1812 final_config.error_hist_out = None;
1813 final_config.quality_hist_out = None;
1814 final_config.base_quality_hist_out = None;
1815 final_config.quality_count_hist_out = None;
1816 final_config.average_quality_hist_out = None;
1817 final_config.overall_base_quality_hist_out = None;
1818 final_config.length_hist_out = None;
1819 final_config.gc_hist_out = None;
1820 final_config.base_hist_out = None;
1821 final_config.entropy_hist_out = None;
1822 final_config.identity_hist_out = None;
1823 final_config.target_bad_percent_low = 1.0;
1824 final_config.target_bad_percent_high = 1.0;
1825 final_config.error_correct = config.error_correct_final;
1826 final_config.overlap_error_correct = config.overlap_error_correct && config.error_correct_final;
1827 final_config.passes = 1;
1828 let final_toss1 = config.out_toss1.as_ref().map(|_| {
1829 temp_dir
1830 .path()
1831 .join(format!("pass{}.final.toss1.{temp_ext}", config.passes))
1832 });
1833 let final_toss2 = config.out_toss2.as_ref().map(|_| {
1834 temp_dir
1835 .path()
1836 .join(format!("pass{}.final.toss2.{temp_ext}", config.passes))
1837 });
1838 final_config.out_toss1 = final_toss1.clone();
1839 final_config.out_toss2 = final_toss2.clone();
1840
1841 let summary = run_single_pass(&final_config).context("running Rust multipass final pass")?;
1842
1843 if let Some(path) = final_toss1
1844 && let Some(output) = config.out_toss1.as_deref()
1845 {
1846 write_multipass_fragments(
1847 &[path],
1848 output,
1849 config.overwrite,
1850 config.append,
1851 "multipass toss output",
1852 )?;
1853 }
1854 if let Some(path) = final_toss2
1855 && let Some(output) = config.out_toss2.as_deref()
1856 {
1857 write_multipass_fragments(
1858 &[path],
1859 output,
1860 config.overwrite,
1861 config.append,
1862 "multipass paired toss output",
1863 )?;
1864 }
1865 Ok(summary)
1866}
1867
1868fn apply_bbtools_multipass_cell_bits_cap(config: &mut Config) {
1869 if config.passes > 1 && config.count_min.bits.unwrap_or(32) > 16 {
1870 config.count_min.bits = Some(16);
1871 }
1872}
1873
1874fn managed_temp_dir(config: &Config, prefix: &str) -> Result<tempfile::TempDir> {
1875 let mut builder = tempfile::Builder::new();
1876 builder.prefix(prefix);
1877 if config.use_temp_dir
1878 && let Some(dir) = config.temp_dir.as_deref()
1879 {
1880 fs::create_dir_all(dir)
1881 .with_context(|| format!("creating temporary directory parent {}", dir.display()))?;
1882 return builder
1883 .tempdir_in(dir)
1884 .with_context(|| format!("creating managed temporary directory in {}", dir.display()));
1885 }
1886 builder
1887 .tempdir()
1888 .context("creating managed temporary directory")
1889}
1890
1891fn write_multipass_fragments(
1892 fragments: &[PathBuf],
1893 output: &Path,
1894 overwrite: bool,
1895 append: bool,
1896 label: &str,
1897) -> Result<()> {
1898 let mut writer = create_output_with_append(output, overwrite, append)
1899 .with_context(|| format!("opening {label} {}", output.display()))?;
1900 for fragment in fragments {
1901 if fragment.exists() {
1902 let mut input = std::fs::File::open(fragment)
1903 .with_context(|| format!("opening multipass fragment {}", fragment.display()))?;
1904 std::io::copy(&mut input, &mut writer)
1905 .with_context(|| format!("copying multipass fragment {}", fragment.display()))?;
1906 }
1907 }
1908 writer
1909 .flush()
1910 .with_context(|| format!("flushing {label} {}", output.display()))?;
1911 Ok(())
1912}
1913
1914#[allow(clippy::too_many_arguments)]
1915fn pass_config_for_intermediate(
1916 config: &Config,
1917 pass: usize,
1918 in1: &Path,
    in2: Option<&Path>,
    interleaved: bool,
    out1: PathBuf,
    out2: Option<PathBuf>,
    out_toss1: Option<PathBuf>,
    out_toss2: Option<PathBuf>,
) -> Config {
    let mut pass_config = config.clone();
    let target = intermediate_target_depth(config, pass);
    let (target_bad_low, target_bad_high) = intermediate_bad_depth_targets(config, pass, target);
    pass_config.in1 = Some(in1.to_path_buf());
    pass_config.in2 = in2.map(Path::to_path_buf);
    pass_config.interleaved = interleaved;
    pass_config.test_interleaved = !interleaved && pass_config.in2.is_none();
    pass_config.extra = if pass == 1 {
        config.extra.clone()
    } else {
        Vec::new()
    };
    pass_config.out1 = Some(out1);
    pass_config.out2 = out2;
    pass_config.out_toss1 = out_toss1;
    pass_config.out_toss2 = out_toss2;
    pass_config.out_low1 = None;
    pass_config.out_low2 = None;
    pass_config.out_mid1 = None;
    pass_config.out_mid2 = None;
    pass_config.out_high1 = None;
    pass_config.out_high2 = None;
    pass_config.out_uncorrected1 = None;
    pass_config.out_uncorrected2 = None;
    // Input-side histogram, peak, and per-read side outputs describe the
    // original reads, so they are only emitted on the first pass.
    pass_config.hist_in = (pass == 1).then(|| config.hist_in.clone()).flatten();
    pass_config.rhist_in = (pass == 1).then(|| config.rhist_in.clone()).flatten();
    pass_config.peaks_in = (pass == 1).then(|| config.peaks_in.clone()).flatten();
    pass_config.match_hist_out = (pass == 1).then(|| config.match_hist_out.clone()).flatten();
    pass_config.insert_hist_out = (pass == 1)
        .then(|| config.insert_hist_out.clone())
        .flatten();
    pass_config.quality_accuracy_hist_out = (pass == 1)
        .then(|| config.quality_accuracy_hist_out.clone())
        .flatten();
    pass_config.indel_hist_out = (pass == 1).then(|| config.indel_hist_out.clone()).flatten();
    pass_config.error_hist_out = (pass == 1).then(|| config.error_hist_out.clone()).flatten();
    pass_config.quality_hist_out = (pass == 1)
        .then(|| config.quality_hist_out.clone())
        .flatten();
    pass_config.base_quality_hist_out = (pass == 1)
        .then(|| config.base_quality_hist_out.clone())
        .flatten();
    pass_config.quality_count_hist_out = (pass == 1)
        .then(|| config.quality_count_hist_out.clone())
        .flatten();
    pass_config.average_quality_hist_out = (pass == 1)
        .then(|| config.average_quality_hist_out.clone())
        .flatten();
    pass_config.overall_base_quality_hist_out = (pass == 1)
        .then(|| config.overall_base_quality_hist_out.clone())
        .flatten();
    pass_config.length_hist_out = (pass == 1)
        .then(|| config.length_hist_out.clone())
        .flatten();
    pass_config.gc_hist_out = (pass == 1).then(|| config.gc_hist_out.clone()).flatten();
    pass_config.base_hist_out = (pass == 1).then(|| config.base_hist_out.clone()).flatten();
    pass_config.entropy_hist_out = (pass == 1)
        .then(|| config.entropy_hist_out.clone())
        .flatten();
    pass_config.identity_hist_out = (pass == 1)
        .then(|| config.identity_hist_out.clone())
        .flatten();
    pass_config.hist_out = None;
    pass_config.rhist_out = None;
    pass_config.peaks_out = None;
    if let Some(bits) = config.count_min_bits_first {
        pass_config.count_min.bits = Some(bits);
    }
    pass_config.target_depth = target;
    pass_config.target_bad_percent_low = target_bad_low as f64 / target as f64;
    pass_config.target_bad_percent_high = target_bad_high as f64 / target as f64;
    pass_config.max_depth = Some(target + target / 4);
    pass_config.min_depth =
        config
            .min_depth
            .min(if config.passes > 2 && pass < config.passes - 1 {
                2
            } else {
                3
            });
    pass_config.min_kmers_over_min_depth = if config.passes > 2 && pass < config.passes - 1 {
        config.min_kmers_over_min_depth.min(5)
    } else {
        config.min_kmers_over_min_depth
    };
    pass_config.depth_percentile = (config.depth_percentile.max(0.4) * 1.2).min(0.8);
    pass_config.toss_error_reads = if config.passes > 2 && pass < config.passes - 1 {
        false
    } else {
        config.toss_error_reads_first
    };
    pass_config.discard_bad_only = if config.passes > 2 && pass < config.passes - 1 {
        true
    } else {
        config.discard_bad_only_first
    };
    pass_config.low_percentile = if config.passes > 2 && pass < config.passes - 1 {
        0.0
    } else {
        config.low_percentile
    };
    pass_config.error_detect_ratio = if config.passes > 2 && pass < config.passes - 1 {
        if config.error_detect_ratio > 100 {
            100 + (config.error_detect_ratio - 100) / 2
        } else {
            config.error_detect_ratio
        }
    } else {
        config.error_detect_ratio
    };
    pass_config.fix_spikes = false;
    pass_config.count_up = false;
    pass_config.error_correct = config.error_correct_first;
    pass_config.overlap_error_correct = config.overlap_error_correct && config.error_correct_first;
    pass_config.rename_reads = false;
    pass_config.overwrite = true;
    pass_config.append = false;
    pass_config.passes = 1;
    pass_config.notes.clear();
    pass_config
}

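/// Depth target for intermediate pass `pass`: in a multi-pass (>2) run the
/// last intermediate pass uses `target_depth_first` (or 2x the final target),
/// while earlier passes, and plain two-pass runs, aim higher (2x
/// `target_depth_first`, or 4x the final target) so later passes still have
/// headroom to normalize down.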
fn intermediate_target_depth(config: &Config, pass: usize) -> u64 {
    if config.passes > 2 && pass == config.passes - 1 {
        config
            .target_depth_first
            .unwrap_or_else(|| config.target_depth.saturating_mul(2))
    } else if config.passes > 2 {
        config
            .target_depth_first
            .map(|target| target.saturating_mul(2))
            .unwrap_or_else(|| config.target_depth.saturating_mul(4))
    } else {
        config
            .target_depth_first
            .unwrap_or_else(|| config.target_depth.saturating_mul(4))
    }
}

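/// Derives low/high bad-depth cutoffs for an intermediate pass from the final
/// bad-percent settings, relaxed by 1.5x on early passes of a multi-pass run
/// and clamped into `1..=target` with `low <= high`.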
fn intermediate_bad_depth_targets(config: &Config, pass: usize, target: u64) -> (u64, u64) {
    let early_multiplier = if config.passes > 2 && pass < config.passes - 1 {
        1.5
    } else {
        1.0
    };
    let target_f = config.target_depth as f64;
    let low = (target_f * config.target_bad_percent_low * early_multiplier)
        .ceil()
        .max(1.0) as u64;
    let high = (target_f * config.target_bad_percent_high * early_multiplier)
        .ceil()
        .max(1.0) as u64;
    let low = low.min(target);
    let high = high.min(target).max(low);
    (low, high)
}

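/// Chooses the extension for intermediate temp files: "fa" when any configured
/// input or output endpoint looks like FASTA (optionally gzipped), otherwise "fq".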
fn temp_sequence_extension(config: &Config) -> &'static str {
    for path in [
        config.out1.as_ref(),
        config.in1.as_ref(),
        config.out2.as_ref(),
        config.in2.as_ref(),
    ]
    .into_iter()
    .flatten()
    {
        let text = path.to_string_lossy().to_ascii_lowercase();
        if text.ends_with(".fa")
            || text.ends_with(".fasta")
            || text.ends_with(".fna")
            || text.ends_with(".fa.gz")
            || text.ends_with(".fasta.gz")
            || text.ends_with(".fna.gz")
        {
            return "fa";
        }
    }
    "fq"
}

fn cardinality_kmer_config(config: &Config) -> Config {
    let mut cardinality_config = config.clone();
    if let Some(k) = config.cardinality.k {
        cardinality_config.k = k;
    }
    if config.cardinality.min_probability > 0.0 {
        cardinality_config.min_prob = config.cardinality.min_probability;
    }
    cardinality_config
}

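/// Streams the primary input in chunks and feeds every k-mer into a fresh
/// cardinality estimator, returning the resulting unique-k-mer estimate.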
fn estimate_primary_cardinality(
    config: &Config,
    cardinality_config: &Config,
) -> Result<CardinalityEstimate> {
    let mut estimator = KmerCardinalityEstimator::from_config(config);
    let mut readers = PrimaryReaders::open(config, config.table_reads)?;
    let mut chunk = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);

    while let Some((r1, r2)) = readers.next_pair()? {
        chunk.push((r1, r2));
        if chunk.len() >= HIST_PARALLEL_CHUNK_SIZE {
            observe_cardinality_chunk(&mut estimator, cardinality_config, &chunk);
            chunk.clear();
        }
    }
    if !chunk.is_empty() {
        observe_cardinality_chunk(&mut estimator, cardinality_config, &chunk);
    }
    Ok(estimator.estimate())
}

fn observe_cardinality_chunk(
    estimator: &mut KmerCardinalityEstimator,
    config: &Config,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
) {
    for (r1, r2) in pairs {
        estimator.observe_pair(config, r1, r2.as_ref());
    }
}

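/// Runs one normalization pass end to end: builds input counts, emits the
/// requested input-side histograms (fused with the normalize pass when
/// possible), normalizes reads, then emits output-side histograms, peaks,
/// and summary statistics, recording per-stage timings throughout.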
fn run_single_pass(config: &Config) -> Result<RunSummary> {
    if config.count_up {
        return run_countup(config);
    }
    let mut stage_timings = Vec::new();
    let cardinality_config = cardinality_kmer_config(config);
    let random_seed = run_random_seed(config);
    let input_counts = build_input_counts_with_stage_timings(config, &mut stage_timings)?;
    let input_cardinality = if config.cardinality.input {
        let started = Instant::now();
        let estimate = estimate_primary_cardinality(config, &cardinality_config)?;
        record_stage_timing(&mut stage_timings, "input_cardinality", started);
        Some(estimate)
    } else {
        None
    };

    let wants_input_hist = config.hist_in.is_some() || config.peaks_in.is_some();
    let wants_input_rhist = config.rhist_in.is_some();
    // Unless trimming happens after marking, the input histograms can be
    // accumulated during the normalize pass instead of a separate read pass.
    let fuse_input_hist_with_normalize =
        (wants_input_hist || wants_input_rhist) && !config.trim_after_marking;
    let mut input_rhist_written_with_hist = false;
    let started = Instant::now();
    let mut fused_input_hist = fuse_input_hist_with_normalize.then(SparseHist::default);
    let mut fused_input_read_hist =
        fuse_input_hist_with_normalize.then(SparseReadDepthHist::default);
    if fuse_input_hist_with_normalize {
        input_rhist_written_with_hist = wants_input_rhist;
    } else if wants_input_hist && wants_input_rhist {
        let (hist, read_hist) =
            collect_primary_sparse_hist_and_read_hist(config, &input_counts, None, random_seed)?;
        if let Some(path) = &config.hist_in {
            write_sparse_depth_hist(path, &hist, config.hist_len, config)?;
        }
        if let Some(path) = &config.peaks_in {
            let dense_hist = sparse_hist_to_peak_dense(&hist, config.hist_len);
            write_peaks(path, &dense_hist, config)?;
        }
        if let Some(path) = &config.rhist_in {
            write_sparse_read_depth_hist(path, &read_hist, config.hist_len, config)?;
            input_rhist_written_with_hist = true;
        }
    } else if wants_input_hist {
        let hist = collect_primary_sparse_hist(config, &input_counts, None, random_seed)?;
        if let Some(path) = &config.hist_in {
            write_sparse_depth_hist(path, &hist, config.hist_len, config)?;
        }
        if let Some(path) = &config.peaks_in {
            let dense_hist = sparse_hist_to_peak_dense(&hist, config.hist_len);
            write_peaks(path, &dense_hist, config)?;
        }
    }
    record_stage_timing(&mut stage_timings, "input_hist", started);

    if input_rhist_written_with_hist {
        // Already written above; record a zero-length stage for consistency.
        record_stage_timing(&mut stage_timings, "input_rhist", Instant::now());
    } else if let Some(path) = &config.rhist_in {
        let started = Instant::now();
        let hist = collect_primary_sparse_read_hist(config, &input_counts, None, random_seed)?;
        write_sparse_read_depth_hist(path, &hist, config.hist_len, config)?;
        record_stage_timing(&mut stage_timings, "input_rhist", started);
    }

    let started = Instant::now();
    emit_read_local_side_outputs(config)?;
    record_stage_timing(&mut stage_timings, "side_outputs", started);

    let started = Instant::now();
    let mut output_counts =
        if config.hist_out.is_some() || config.rhist_out.is_some() || config.peaks_out.is_some() {
            Some(new_output_counts(config)?)
        } else {
            None
        };
    let mut output_cardinality = config
        .cardinality
        .output
        .then(|| KmerCardinalityEstimator::from_config(config));
    record_stage_timing(&mut stage_timings, "output_count_init", started);

    let started = Instant::now();
    let mut summary = normalize_primary(
        config,
        &input_counts,
        output_counts.as_mut(),
        output_cardinality.as_mut(),
        &cardinality_config,
        random_seed,
        InputHistSinks {
            depth: fused_input_hist.as_mut(),
            read: fused_input_read_hist.as_mut(),
        },
    )?;
    record_stage_timing(&mut stage_timings, "normalize", started);

    if let Some(hist) = fused_input_hist.as_ref() {
        if let Some(path) = &config.hist_in {
            write_sparse_depth_hist(path, hist, config.hist_len, config)?;
        }
        if let Some(path) = &config.peaks_in {
            let dense_hist = sparse_hist_to_peak_dense(hist, config.hist_len);
            write_peaks(path, &dense_hist, config)?;
        }
    }
    if let (Some(path), Some(read_hist)) = (&config.rhist_in, fused_input_read_hist.as_ref()) {
        write_sparse_read_depth_hist(path, read_hist, config.hist_len, config)?;
    }

    let started = Instant::now();
    (summary.unique_kmers_in, summary.unique_kmers_in_split) = input_counts.unique_kmer_estimate();
    summary.cardinality_in = input_cardinality;
    summary.cardinality_out = output_cardinality
        .as_ref()
        .map(KmerCardinalityEstimator::estimate);
    summary.sketch_layouts = input_counts.sketch_layouts();
    if let Some(counts) = output_counts.as_mut() {
        apply_output_count_adjustments(config, counts);
    }
    summary.unique_kmers_out = output_counts.as_ref().map(CountLookup::unique_kmers);
    if let Some(counts) = output_counts.as_ref() {
        counts.append_sketch_layouts(&mut summary.sketch_layouts, "output_kept");
    }
    record_stage_timing(&mut stage_timings, "summary_counts", started);

    let wants_output_hist = config.hist_out.is_some() || config.peaks_out.is_some();
    let wants_output_rhist = config.rhist_out.is_some();
    let mut output_rhist_written_with_hist = false;
    let started = Instant::now();
    if let Some(counts) = &output_counts {
        if wants_output_hist && wants_output_rhist {
            let (hist, read_hist) = collect_primary_sparse_hist_and_read_hist(
                config,
                counts,
                Some(&input_counts),
                random_seed,
            )?;
            if let Some(path) = &config.hist_out {
                write_sparse_depth_hist(path, &hist, config.hist_len, config)?;
            }
            if let Some(path) = &config.peaks_out {
                let dense_hist = sparse_hist_to_peak_dense(&hist, config.hist_len);
                write_peaks(path, &dense_hist, config)?;
            }
            if let Some(path) = &config.rhist_out {
                write_sparse_read_depth_hist(path, &read_hist, config.hist_len, config)?;
                output_rhist_written_with_hist = true;
            }
        } else if wants_output_hist {
            let hist =
                collect_primary_sparse_hist(config, counts, Some(&input_counts), random_seed)?;
            if let Some(path) = &config.hist_out {
                write_sparse_depth_hist(path, &hist, config.hist_len, config)?;
            }
            if let Some(path) = &config.peaks_out {
                let dense_hist = sparse_hist_to_peak_dense(&hist, config.hist_len);
                write_peaks(path, &dense_hist, config)?;
            }
        }
    }
    record_stage_timing(&mut stage_timings, "output_hist", started);

    if output_rhist_written_with_hist {
        record_stage_timing(&mut stage_timings, "output_rhist", Instant::now());
    } else if let (Some(path), Some(counts)) = (&config.rhist_out, &output_counts) {
        let started = Instant::now();
        let hist =
            collect_primary_sparse_read_hist(config, counts, Some(&input_counts), random_seed)?;
        write_sparse_read_depth_hist(path, &hist, config.hist_len, config)?;
        record_stage_timing(&mut stage_timings, "output_rhist", started);
    }

    summary.stage_timings = stage_timings;
    Ok(summary)
}

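/// Count-up normalization: instead of sampling reads down from the input
/// counts, a prepass re-sorts surviving pairs (roughly best-first, by the
/// count-up sort key), and each pair is kept only while its k-mers are still
/// needed to reach the target depth in `kept_counts`.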
fn run_countup(config: &Config) -> Result<RunSummary> {
    let mut stage_timings = Vec::new();
    let cardinality_config = cardinality_kmer_config(config);
    let input_counts = build_input_counts_with_stage_timings(config, &mut stage_timings)?;
    let input_cardinality = if config.cardinality.input {
        let started = Instant::now();
        let estimate = estimate_primary_cardinality(config, &cardinality_config)?;
        record_stage_timing(&mut stage_timings, "input_cardinality", started);
        Some(estimate)
    } else {
        None
    };

    let wants_input_hist = config.hist_in.is_some() || config.peaks_in.is_some();
    let wants_input_rhist = config.rhist_in.is_some();

    let started = Instant::now();
    emit_read_local_side_outputs(config)?;
    record_stage_timing(&mut stage_timings, "side_outputs", started);

    let random_seed = run_random_seed(config);
    let started = Instant::now();
    let work_build = collect_countup_work_source(
        config,
        &input_counts,
        random_seed,
        wants_input_hist,
        wants_input_rhist,
    )?;
    // Histogram accumulation happens inside the work-source build; subtract
    // its share so the two stages are reported separately.
    let countup_work_elapsed = started
        .elapsed()
        .as_micros()
        .saturating_sub(work_build.input_hist_elapsed_micros);
    record_stage_timing_micros(
        &mut stage_timings,
        "input_hist",
        work_build.input_hist_elapsed_micros,
    );
    if let (Some(path), Some(hist)) = (&config.hist_in, &work_build.input_hist) {
        write_sparse_depth_hist(path, hist, config.hist_len, config)?;
    }
    if let (Some(path), Some(hist)) = (&config.peaks_in, &work_build.input_hist) {
        let dense_hist = sparse_hist_to_peak_dense(hist, config.hist_len);
        write_peaks(path, &dense_hist, config)?;
    }
    if let (Some(path), Some(hist)) = (&config.rhist_in, &work_build.input_read_hist) {
        write_sparse_read_depth_hist(path, hist, config.hist_len, config)?;
    }
    record_stage_timing_micros(&mut stage_timings, "input_rhist", 0);
    record_stage_timing_micros(
        &mut stage_timings,
        "countup_work_source",
        countup_work_elapsed,
    );
    let format1 = work_build.format1;
    let format2 = work_build.format2;
    let countup_spill = work_build.spill_summary;
    let mut work_pairs = work_build.source.into_iter()?;
    let mut writers = OptionalWriters::open(config, format1, format2)?;
    let mut summary = RunSummary {
        cardinality_in: input_cardinality,
        countup_spill,
        ..RunSummary::default()
    };
    let mut kept_counts = new_output_counts(config)?;
    let mut output_cardinality = config
        .cardinality
        .output
        .then(|| KmerCardinalityEstimator::from_config(config));
    // Count up toward 95% of the configured target depth.
    let adjusted_target = ((config.target_depth as f64) * 0.95).round().max(1.0) as u64;
    let started = Instant::now();

    while let Some(CountupWorkPair {
        input_list_index,
        mut r1,
        mut r2,
        ..
    }) = work_pairs.next_pair()?
    {
        writers.sync_to_input_list_index(config, input_list_index)?;
        let keys = unique_pair_kmers(config, &r1, r2.as_ref());
        let mut decision_plan =
            countup_decision_plan(config, &input_counts, &kept_counts, &keys, adjusted_target);
        if countup_length_toss(config, &r1, r2.as_ref()) {
            decision_plan.toss = true;
        }
        update_countup_kept_counts_for_plan(config, &mut kept_counts, &keys, &decision_plan);

        let output_analysis = needs_output_pair_analysis(config)
            .then(|| analyze_pair(config, &input_counts, &r1, r2.as_ref()));
        let mut correction = CorrectionResult::default();
        if config.error_correct && !decision_plan.toss {
            correction =
                correct_pair_errors_with_rollback(config, &input_counts, &mut r1, r2.as_mut());
        }
        if config.trim_after_marking && config.error_correct {
            trim_pair(config, &mut r1, r2.as_mut());
        }
        let (out_r1, out_r2) = match output_analysis.as_ref() {
            Some(analysis) => maybe_rename_pair(config, &r1, r2.as_ref(), analysis),
            None => (r1.clone(), r2.clone()),
        };
        let read_count = 1 + u64::from(r2.is_some());
        let base_count = r1.len() as u64 + r2.as_ref().map(|r| r.len() as u64).unwrap_or(0);

        summary.reads_in += read_count;
        summary.bases_in += base_count;
        if decision_plan.toss {
            summary.reads_tossed += read_count;
            summary.bases_tossed += base_count;
        } else {
            summary.reads_kept += read_count;
            summary.bases_kept += base_count;
            if let Some(estimator) = output_cardinality.as_mut() {
                estimator.observe_pair(&cardinality_config, &r1, r2.as_ref());
            }
        }
        writers.write_pair(decision_plan.toss, &out_r1, out_r2.as_ref())?;
        if correction.uncorrectable {
            writers.write_uncorrected(&r1, r2.as_ref())?;
        }
        if let Some(analysis) = output_analysis.as_ref()
            && depth_bin_outputs_enabled(config)
        {
            writers.write_depth_bin(config, analysis, &out_r1, out_r2.as_ref())?;
        }
    }
    writers.flush()?;
    record_stage_timing(&mut stage_timings, "countup_normalize", started);

    let started = Instant::now();
    if config.hist_out.is_some() || config.peaks_out.is_some() || config.rhist_out.is_some() {
        apply_output_count_adjustments(config, &mut kept_counts);
    }
    record_stage_timing(&mut stage_timings, "output_count_adjust", started);

    let started = Instant::now();
    let output_hist = if config.hist_out.is_some() || config.peaks_out.is_some() {
        Some(kept_counts.sparse_depth_hist(config.hist_len))
    } else {
        None
    };
    if let (Some(path), Some(hist)) = (&config.hist_out, &output_hist) {
        write_sparse_depth_hist(path, hist, config.hist_len, config)?;
    }
    if let (Some(path), Some(hist)) = (&config.peaks_out, &output_hist) {
        let dense_hist = sparse_hist_to_peak_dense(hist, config.hist_len);
        write_peaks(path, &dense_hist, config)?;
    }
    record_stage_timing(&mut stage_timings, "output_hist", started);

    if let Some(path) = &config.rhist_out {
        let started = Instant::now();
        let hist = collect_primary_sparse_read_hist(config, &kept_counts, Some(&input_counts), 0)?;
        write_sparse_read_depth_hist(path, &hist, config.hist_len, config)?;
        record_stage_timing(&mut stage_timings, "output_rhist", started);
    }

    let started = Instant::now();
    (summary.unique_kmers_in, summary.unique_kmers_in_split) = input_counts.unique_kmer_estimate();
    summary.unique_kmers_out = Some(kept_counts.unique_kmers());
    summary.cardinality_out = output_cardinality
        .as_ref()
        .map(KmerCardinalityEstimator::estimate);
    summary.sketch_layouts = input_counts.sketch_layouts();
    kept_counts.append_sketch_layouts(&mut summary.sketch_layouts, "countup_kept");
    record_stage_timing(&mut stage_timings, "summary_counts", started);
    summary.stage_timings = stage_timings;
    Ok(summary)
}

fn record_stage_timing(timings: &mut Vec<StageTiming>, name: &'static str, started: Instant) {
    timings.push(StageTiming {
        name,
        elapsed_micros: started.elapsed().as_micros(),
    });
}

fn record_stage_timing_micros(
    timings: &mut Vec<StageTiming>,
    name: &'static str,
    elapsed_micros: u128,
) {
    timings.push(StageTiming {
        name,
        elapsed_micros,
    });
}

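/// Builds the sorted work source for count-up mode. Candidates are processed
/// in bounded chunks (optionally accumulating input histograms as a side
/// effect); surviving pairs are buffered in memory until the run limits are
/// hit, after which sorted runs are spilled to temp files and later merged.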
fn collect_countup_work_source(
    config: &Config,
    input_counts: &dyn CountLookup,
    random_seed: u64,
    wants_input_hist: bool,
    wants_input_rhist: bool,
) -> Result<CountupWorkBuild> {
    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
    let format1 = readers.format1();
    let format2 = readers.format2();
    let presort_config = countup_prepass_config(config);
    let mut rng = JavaXoshiro::new(random_seed);
    let mut work_pairs = Vec::new();
    let mut work_pair_bytes = 0usize;
    let mut run_paths = Vec::new();
    let mut temp_dir = None;
    let mut spill_summary = CountupSpillSummary::default();
    let mut input_hist = wants_input_hist.then(SparseHist::default);
    let mut input_read_hist = wants_input_rhist.then(SparseReadDepthHist::default);
    let mut input_hist_elapsed_micros = 0u128;
    let mut candidates = Vec::with_capacity(COUNTUP_PREPASS_CHUNK_PAIR_LIMIT);
    let mut candidate_bytes = 0usize;
    let mut original_index = 0usize;
    while let Some((r1, r2)) = readers.next_pair()? {
        let candidate = CountupWorkCandidate {
            input_list_index: readers.input_list_index(),
            original_index,
            rand: rng.next_double(),
            r1,
            r2,
        };
        candidate_bytes =
            candidate_bytes.saturating_add(countup_work_candidate_memory_hint(&candidate));
        candidates.push(candidate);
        if countup_prepass_chunk_ready(candidates.len(), candidate_bytes) {
            let chunk = std::mem::take(&mut candidates);
            let chunk_build = process_countup_work_candidate_chunk(
                config,
                &presort_config,
                input_counts,
                wants_input_hist,
                wants_input_rhist,
                chunk,
            );
            let hist_started = Instant::now();
            if let Some(input_hist) = input_hist.as_mut() {
                merge_sparse_hist(input_hist, chunk_build.depth_hist);
            }
            if let Some(input_read_hist) = input_read_hist.as_mut() {
                merge_sparse_read_depth_hist(input_read_hist, chunk_build.read_hist);
            }
            input_hist_elapsed_micros =
                input_hist_elapsed_micros.saturating_add(hist_started.elapsed().as_micros());
            append_countup_work_pairs(
                config,
                &mut temp_dir,
                &mut run_paths,
                &mut spill_summary,
                &mut work_pairs,
                &mut work_pair_bytes,
                chunk_build.work_pairs,
            )?;
            candidates = Vec::with_capacity(COUNTUP_PREPASS_CHUNK_PAIR_LIMIT);
            candidate_bytes = 0;
        }
        original_index += 1;
    }
    if !candidates.is_empty() {
        let chunk_build = process_countup_work_candidate_chunk(
            config,
            &presort_config,
            input_counts,
            wants_input_hist,
            wants_input_rhist,
            candidates,
        );
        let hist_started = Instant::now();
        if let Some(input_hist) = input_hist.as_mut() {
            merge_sparse_hist(input_hist, chunk_build.depth_hist);
        }
        if let Some(input_read_hist) = input_read_hist.as_mut() {
            merge_sparse_read_depth_hist(input_read_hist, chunk_build.read_hist);
        }
        input_hist_elapsed_micros =
            input_hist_elapsed_micros.saturating_add(hist_started.elapsed().as_micros());
        append_countup_work_pairs(
            config,
            &mut temp_dir,
            &mut run_paths,
            &mut spill_summary,
            &mut work_pairs,
            &mut work_pair_bytes,
            chunk_build.work_pairs,
        )?;
    }
    let source = if run_paths.is_empty() {
        // Everything fit in memory: a single in-memory sort suffices.
        work_pairs.sort_by(compare_countup_work_pairs);
        CountupWorkSource {
            temp_dir: None,
            inner: CountupWorkSourceInner::Memory(work_pairs),
        }
    } else {
        // Spill the tail, compact runs down to the merge fan-in, and stream
        // the merged result from disk.
        if !work_pairs.is_empty() {
            spill_countup_run(
                config,
                &mut temp_dir,
                &mut run_paths,
                &mut spill_summary,
                &mut work_pairs,
            )?;
        }
        compact_countup_runs(config, &mut run_paths, &mut spill_summary)?;
        spill_summary.final_runs = run_paths.len();
        enforce_countup_spill_limits(config, &spill_summary, run_paths.len())?;
        CountupWorkSource {
            temp_dir,
            inner: CountupWorkSourceInner::Spilled(run_paths),
        }
    };
    Ok(CountupWorkBuild {
        source,
        input_hist,
        input_read_hist,
        input_hist_elapsed_micros,
        format1,
        format2,
        spill_summary,
    })
}

fn countup_prepass_chunk_ready(candidate_count: usize, candidate_bytes: usize) -> bool {
    candidate_count >= COUNTUP_PREPASS_CHUNK_PAIR_LIMIT
        || candidate_bytes >= COUNTUP_PREPASS_CHUNK_BYTE_LIMIT
}

fn process_countup_work_candidates(
    config: &Config,
    presort_config: &Config,
    input_counts: &dyn CountLookup,
    candidates: Vec<CountupWorkCandidate>,
) -> Vec<CountupWorkPair> {
    candidates
        .into_par_iter()
        .filter_map(|candidate| {
            countup_work_pair_from_candidate(config, presort_config, input_counts, candidate)
        })
        .collect()
}

fn process_countup_work_candidate_chunk(
    config: &Config,
    presort_config: &Config,
    input_counts: &dyn CountLookup,
    wants_depth_hist: bool,
    wants_read_hist: bool,
    candidates: Vec<CountupWorkCandidate>,
) -> CountupChunkBuild {
    if !wants_depth_hist && !wants_read_hist {
        return CountupChunkBuild {
            work_pairs: process_countup_work_candidates(
                config,
                presort_config,
                input_counts,
                candidates,
            ),
            depth_hist: SparseHist::default(),
            read_hist: SparseReadDepthHist::default(),
        };
    }

    candidates
        .into_par_iter()
        .fold(
            || CountupChunkBuild {
                work_pairs: Vec::new(),
                depth_hist: SparseHist::default(),
                read_hist: SparseReadDepthHist::default(),
            },
            |mut local, candidate| {
                let mut hist = CountupInputHistAccumulator {
                    wants_depth_hist,
                    wants_read_hist,
                    depth_hist: &mut local.depth_hist,
                    read_hist: &mut local.read_hist,
                };
                if let Some(work_pair) = countup_work_pair_from_candidate_with_input_hists(
                    config,
                    presort_config,
                    input_counts,
                    candidate,
                    &mut hist,
                ) {
                    local.work_pairs.push(work_pair);
                }
                local
            },
        )
        .reduce(
            || CountupChunkBuild {
                work_pairs: Vec::new(),
                depth_hist: SparseHist::default(),
                read_hist: SparseReadDepthHist::default(),
            },
            |mut left, mut right| {
                left.work_pairs.append(&mut right.work_pairs);
                merge_sparse_hist(&mut left.depth_hist, right.depth_hist);
                merge_sparse_read_depth_hist(&mut left.read_hist, right.read_hist);
                left
            },
        )
}

fn countup_work_pair_from_candidate(
    config: &Config,
    presort_config: &Config,
    input_counts: &dyn CountLookup,
    mut candidate: CountupWorkCandidate,
) -> Option<CountupWorkPair> {
    if !config.trim_after_marking {
        trim_pair(config, &mut candidate.r1, candidate.r2.as_mut());
    }
    let prepass_result = countup_prepass_pair(
        presort_config,
        config.add_bad_reads_countup,
        input_counts,
        &mut candidate.r1,
        candidate.r2.as_mut(),
        candidate.rand,
    );
    countup_work_pair_from_prepass_result(presort_config, input_counts, candidate, prepass_result)
}

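/// Same as `countup_work_pair_from_candidate`, but also folds the pair into
/// the requested input depth/read histograms. With `trim_after_marking` the
/// histograms are taken from a trimmed copy while the prepass sees the
/// untrimmed records; otherwise one shared trim and analysis drives both.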
fn countup_work_pair_from_candidate_with_input_hists(
    config: &Config,
    presort_config: &Config,
    input_counts: &dyn CountLookup,
    mut candidate: CountupWorkCandidate,
    hist: &mut CountupInputHistAccumulator<'_>,
) -> Option<CountupWorkPair> {
    if !config.trim_after_marking {
        trim_pair(config, &mut candidate.r1, candidate.r2.as_mut());
        let (hist_analysis, prepass_analysis) = analyze_pair_for_two_configs(
            config,
            presort_config,
            input_counts,
            &candidate.r1,
            candidate.r2.as_ref(),
        );
        increment_countup_input_hists_from_analysis(
            config,
            hist,
            &candidate.r1,
            candidate.r2.as_ref(),
            &hist_analysis,
        );
        let prepass_result = countup_prepass_pair_from_analysis(
            presort_config,
            config.add_bad_reads_countup,
            input_counts,
            &mut candidate.r1,
            candidate.r2.as_mut(),
            candidate.rand,
            prepass_analysis,
        );
        return countup_work_pair_from_prepass_result(
            presort_config,
            input_counts,
            candidate,
            prepass_result,
        );
    }

    // With trim_after_marking, histogram a trimmed copy of the pair while the
    // prepass operates on the untrimmed records.
    let mut hist_r1 = candidate.r1.clone();
    let mut hist_r2 = candidate.r2.clone();
    trim_pair(config, &mut hist_r1, hist_r2.as_mut());
    let hist_analysis = analyze_pair(config, input_counts, &hist_r1, hist_r2.as_ref());
    increment_countup_input_hists_from_analysis(
        config,
        hist,
        &hist_r1,
        hist_r2.as_ref(),
        &hist_analysis,
    );
    let prepass_result = countup_prepass_pair(
        presort_config,
        config.add_bad_reads_countup,
        input_counts,
        &mut candidate.r1,
        candidate.r2.as_mut(),
        candidate.rand,
    );
    countup_work_pair_from_prepass_result(presort_config, input_counts, candidate, prepass_result)
}

fn increment_countup_input_hists_from_analysis(
    config: &Config,
    hist: &mut CountupInputHistAccumulator<'_>,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
    analysis: &PairAnalysis,
) {
    if hist.wants_depth_hist {
        increment_sparse_hist_from_analysis(hist.depth_hist, &analysis.read1, config.hist_len);
        if let Some(read2_analysis) = &analysis.read2 {
            increment_sparse_hist_from_analysis(hist.depth_hist, read2_analysis, config.hist_len);
        }
    }
    if hist.wants_read_hist {
        increment_sparse_read_hist(hist.read_hist, &analysis.read1, r1.len(), config.hist_len);
        if let (Some(read2_analysis), Some(read2)) = (&analysis.read2, r2) {
            increment_sparse_read_hist(
                hist.read_hist,
                read2_analysis,
                read2.len(),
                config.hist_len,
            );
        }
    }
}

fn countup_work_pair_from_prepass_result(
    presort_config: &Config,
    input_counts: &dyn CountLookup,
    candidate: CountupWorkCandidate,
    prepass_result: CountupPrepassResult,
) -> Option<CountupWorkPair> {
    if !prepass_result.include {
        return None;
    }
    let sort_key = prepass_result.sort_analysis.as_ref().map_or_else(
        || {
            countup_sort_key(
                presort_config,
                input_counts,
                &candidate.r1,
                candidate.r2.as_ref(),
                candidate.original_index,
            )
        },
        |analysis| {
            countup_sort_key_from_analysis(
                &candidate.r1,
                candidate.r2.as_ref(),
                candidate.original_index,
                analysis,
            )
        },
    );
    Some(CountupWorkPair {
        input_list_index: candidate.input_list_index,
        sort_key,
        r1: candidate.r1,
        r2: candidate.r2,
    })
}

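/// Appends freshly built work pairs to the in-memory buffer, spilling a
/// sorted run to disk whenever the pair-count or byte-size limits are reached.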
fn append_countup_work_pairs(
    config: &Config,
    temp_dir: &mut Option<tempfile::TempDir>,
    run_paths: &mut Vec<PathBuf>,
    spill_summary: &mut CountupSpillSummary,
    work_pairs: &mut Vec<CountupWorkPair>,
    work_pair_bytes: &mut usize,
    new_pairs: Vec<CountupWorkPair>,
) -> Result<()> {
    for work_pair in new_pairs {
        *work_pair_bytes =
            (*work_pair_bytes).saturating_add(countup_work_pair_memory_hint(&work_pair));
        work_pairs.push(work_pair);
        if work_pairs.len() >= COUNTUP_SORT_RUN_PAIR_LIMIT
            || *work_pair_bytes >= COUNTUP_SORT_RUN_BYTE_LIMIT
        {
            spill_countup_run(config, temp_dir, run_paths, spill_summary, work_pairs)?;
            *work_pair_bytes = 0;
        }
    }
    Ok(())
}

fn countup_work_pair_memory_hint(pair: &CountupWorkPair) -> usize {
    std::mem::size_of::<CountupWorkPair>()
        .saturating_add(countup_sort_key_memory_hint(&pair.sort_key))
        .saturating_add(sequence_record_memory_hint(&pair.r1))
        .saturating_add(pair.r2.as_ref().map_or(0, sequence_record_memory_hint))
}

fn countup_work_candidate_memory_hint(candidate: &CountupWorkCandidate) -> usize {
    std::mem::size_of::<CountupWorkCandidate>()
        .saturating_add(sequence_record_memory_hint(&candidate.r1))
        .saturating_add(candidate.r2.as_ref().map_or(0, sequence_record_memory_hint))
}

fn countup_sort_key_memory_hint(key: &CountupSortKey) -> usize {
    let _ = key;
    std::mem::size_of::<CountupSortKey>()
}

fn sequence_record_memory_hint(record: &SequenceRecord) -> usize {
    std::mem::size_of::<SequenceRecord>()
        .saturating_add(record.id.capacity())
        .saturating_add(record.bases.capacity())
        .saturating_add(record.qualities.as_ref().map_or(0, Vec::capacity))
}

fn spill_countup_run(
    config: &Config,
    temp_dir: &mut Option<tempfile::TempDir>,
    run_paths: &mut Vec<PathBuf>,
    spill_summary: &mut CountupSpillSummary,
    work_pairs: &mut Vec<CountupWorkPair>,
) -> Result<()> {
    if work_pairs.is_empty() {
        return Ok(());
    }
    let dir = match temp_dir {
        Some(dir) => dir,
        None => temp_dir.insert(managed_temp_dir(config, "bbnorm-rs-countup-")?),
    };
    work_pairs.sort_by(compare_countup_work_pairs);
    let path = dir
        .path()
        .join(format!("countup-run-{:06}.bin", run_paths.len()));
    let bytes = write_countup_run(&path, work_pairs)?;
    spill_summary.note_initial_run(bytes);
    run_paths.push(path);
    enforce_countup_spill_limits(config, spill_summary, run_paths.len())?;
    work_pairs.clear();
    Ok(())
}

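/// Repeatedly merges spill runs in groups of `COUNTUP_SORT_MERGE_FANIN` until
/// few enough remain for a single final merge, deleting each round's inputs.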
fn compact_countup_runs(
    config: &Config,
    run_paths: &mut Vec<PathBuf>,
    spill_summary: &mut CountupSpillSummary,
) -> Result<()> {
    if run_paths.len() <= COUNTUP_SORT_MERGE_FANIN {
        return Ok(());
    }
    let run_dir = run_paths
        .first()
        .and_then(|path| path.parent())
        .context("count-up spill runs had no parent directory")?
        .to_path_buf();
    let mut round = 0usize;
    while run_paths.len() > COUNTUP_SORT_MERGE_FANIN {
        let old_paths = std::mem::take(run_paths);
        for (group_index, group) in old_paths.chunks(COUNTUP_SORT_MERGE_FANIN).enumerate() {
            let merged_path =
                run_dir.join(format!("countup-merge-{round:03}-{group_index:06}.bin"));
            let merged_bytes = merge_countup_run_group(group, &merged_path)?;
            spill_summary.note_merge_run(merged_bytes);
            run_paths.push(merged_path);
            enforce_countup_spill_limits(config, spill_summary, run_paths.len())?;
        }
        for path in old_paths {
            let removed_bytes = path.metadata().map(|metadata| metadata.len()).unwrap_or(0);
            match fs::remove_file(&path) {
                Ok(()) => spill_summary.note_removed(removed_bytes),
                Err(err) if err.kind() == ErrorKind::NotFound => {}
                Err(err) => {
                    return Err(err).with_context(|| {
                        format!("removing compacted count-up run {}", path.display())
                    });
                }
            }
        }
        round += 1;
    }
    Ok(())
}

fn enforce_countup_spill_limits(
    config: &Config,
    spill_summary: &CountupSpillSummary,
    live_run_count: usize,
) -> Result<()> {
    if let Some(limit) = config.max_countup_spill_initial_runs
        && spill_summary.initial_runs > limit
    {
        bail!(
            "count-up spill exceeded maxcountupspillinitialruns: initial spill runs {} > limit {}",
            spill_summary.initial_runs,
            limit
        );
    }
    if let Some(limit) = config.max_countup_spill_merge_runs
        && spill_summary.merge_runs > limit
    {
        bail!(
            "count-up spill exceeded maxcountupspillmergeruns: merge spill runs {} > limit {}",
            spill_summary.merge_runs,
            limit
        );
    }
    if let Some(limit) = config.max_countup_spill_final_runs
        && live_run_count > limit
    {
        bail!(
            "count-up spill exceeded maxcountupspillfinalruns: live spill runs {} > limit {}",
            live_run_count,
            limit
        );
    }
    if let Some(limit) = config.max_countup_spill_live_bytes
        && spill_summary.peak_live_bytes > limit
    {
        bail!(
            "count-up spill exceeded maxcountupspillbytes: peak live spill bytes {} > limit {}",
            spill_summary.peak_live_bytes,
            limit
        );
    }
    if let Some(limit) = config.max_countup_spill_final_live_bytes
        && spill_summary.final_live_bytes > limit
    {
        bail!(
            "count-up spill exceeded maxcountupspillfinallivebytes: current/final live spill bytes {} > limit {}",
            spill_summary.final_live_bytes,
            limit
        );
    }
    if let Some(limit) = config.max_countup_spill_write_bytes
        && spill_summary.bytes_written > limit
    {
        bail!(
            "count-up spill exceeded maxcountupspillwritebytes: cumulative spill bytes written {} > limit {}",
            spill_summary.bytes_written,
            limit
        );
    }
    Ok(())
}

fn merge_countup_run_group(paths: &[PathBuf], output_path: &Path) -> Result<u64> {
    let mut merger = CountupRunMerger::new(paths)?;
    let file = fs::File::create(output_path)
        .with_context(|| format!("creating compacted count-up run {}", output_path.display()))?;
    let mut writer = BufWriter::with_capacity(COUNTUP_RUN_IO_BUFFER_CAPACITY, file);
    while let Some(pair) = merger.next_pair()? {
        write_countup_work_pair(&mut writer, &pair)?;
    }
    writer
        .flush()
        .with_context(|| format!("flushing compacted count-up run {}", output_path.display()))?;
    output_path
        .metadata()
        .map(|metadata| metadata.len())
        .with_context(|| format!("checking compacted count-up run {}", output_path.display()))
}

impl CountupWorkSource {
    fn into_iter(self) -> Result<CountupWorkIter> {
        let CountupWorkSource { temp_dir, inner } = self;
        let inner = match inner {
            CountupWorkSourceInner::Memory(work_pairs) => {
                CountupWorkIterInner::Memory(work_pairs.into_iter())
            }
            CountupWorkSourceInner::Spilled(paths) => {
                CountupWorkIterInner::Spilled(CountupRunMerger::new(&paths)?)
            }
        };
        Ok(CountupWorkIter {
            _temp_dir: temp_dir,
            inner,
        })
    }
}

impl CountupWorkIter {
    fn next_pair(&mut self) -> Result<Option<CountupWorkPair>> {
        match &mut self.inner {
            CountupWorkIterInner::Memory(iter) => Ok(iter.next()),
            CountupWorkIterInner::Spilled(merger) => merger.next_pair(),
        }
    }
}

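// K-way merge over sorted spill runs: a binary heap holds the head pair of
// each run, ordered so the globally smallest pair is popped next and refilled
// from the run it came from.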
impl CountupRunMerger {
    fn new(paths: &[PathBuf]) -> Result<Self> {
        let mut readers = Vec::with_capacity(paths.len());
        let mut heap = BinaryHeap::new();
        for path in paths {
            let mut reader = CountupRunReader::open(path)?;
            if let Some(pair) = reader.next_pair()? {
                heap.push(CountupRunHead {
                    pair,
                    run_index: readers.len(),
                });
            }
            readers.push(reader);
        }
        Ok(Self { readers, heap })
    }

    fn next_pair(&mut self) -> Result<Option<CountupWorkPair>> {
        let Some(head) = self.heap.pop() else {
            return Ok(None);
        };
        let pair = head.pair;
        if let Some(next) = self.readers[head.run_index].next_pair()? {
            self.heap.push(CountupRunHead {
                pair: next,
                run_index: head.run_index,
            });
        }
        Ok(Some(pair))
    }
}

impl CountupRunReader {
    fn open(path: &Path) -> Result<Self> {
        let file = fs::File::open(path)
            .with_context(|| format!("opening count-up run {}", path.display()))?;
        Ok(Self {
            reader: BufReader::with_capacity(COUNTUP_RUN_IO_BUFFER_CAPACITY, file),
        })
    }

    fn next_pair(&mut self) -> Result<Option<CountupWorkPair>> {
        read_countup_work_pair(&mut self.reader)
    }
}

impl PartialEq for CountupRunHead {
    fn eq(&self, other: &Self) -> bool {
        compare_countup_work_pairs(&self.pair, &other.pair) == CmpOrdering::Equal
            && self.run_index == other.run_index
    }
}

impl Eq for CountupRunHead {}

impl PartialOrd for CountupRunHead {
    fn partial_cmp(&self, other: &Self) -> Option<CmpOrdering> {
        Some(self.cmp(other))
    }
}

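// `BinaryHeap` is a max-heap, so the comparison is reversed (with ties broken
// by run index, also reversed) to make it behave as a min-heap over work pairs.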
impl Ord for CountupRunHead {
    fn cmp(&self, other: &Self) -> CmpOrdering {
        compare_countup_work_pairs(&other.pair, &self.pair)
            .then_with(|| other.run_index.cmp(&self.run_index))
    }
}

fn write_countup_run(path: &Path, work_pairs: &[CountupWorkPair]) -> Result<u64> {
    let file = fs::File::create(path)
        .with_context(|| format!("creating count-up run {}", path.display()))?;
    let mut writer = BufWriter::with_capacity(COUNTUP_RUN_IO_BUFFER_CAPACITY, file);
    for pair in work_pairs {
        write_countup_work_pair(&mut writer, pair)?;
    }
    writer
        .flush()
        .with_context(|| format!("flushing count-up run {}", path.display()))?;
    path.metadata()
        .map(|metadata| metadata.len())
        .with_context(|| format!("checking count-up run {}", path.display()))
}

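/// On-disk record layout (all integers little-endian, usize widened to u64):
/// input_list_index, sort-key fields (errors, total_len, expected_errors as
/// f64 bits, numeric_id, original_index), record 1, a has-mate flag byte,
/// then optionally record 2. `read_countup_work_pair` mirrors this exactly.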
fn write_countup_work_pair(writer: &mut impl Write, pair: &CountupWorkPair) -> Result<()> {
    write_usize(writer, pair.input_list_index)?;
    write_usize(writer, pair.sort_key.errors)?;
    write_usize(writer, pair.sort_key.total_len)?;
    writer.write_all(&pair.sort_key.expected_errors.to_le_bytes())?;
    writer.write_all(&pair.sort_key.numeric_id.to_le_bytes())?;
    write_usize(writer, pair.sort_key.original_index)?;
    write_sequence_record(writer, &pair.r1)?;
    write_bool(writer, pair.r2.is_some())?;
    if let Some(r2) = &pair.r2 {
        write_sequence_record(writer, r2)?;
    }
    Ok(())
}

fn read_countup_work_pair(reader: &mut impl Read) -> Result<Option<CountupWorkPair>> {
    let Some(input_list_index) = read_usize_opt(reader)? else {
        return Ok(None);
    };
    let errors = read_usize(reader)?;
    let total_len = read_usize(reader)?;
    let expected_errors = read_f64(reader)?;
    let numeric_id = read_u64(reader)?;
    let original_index = read_usize(reader)?;
    let r1 = read_sequence_record(reader)?;
    let has_r2 = read_bool(reader)?;
    let r2 = has_r2.then(|| read_sequence_record(reader)).transpose()?;
    Ok(Some(CountupWorkPair {
        input_list_index,
        sort_key: CountupSortKey {
            errors,
            total_len,
            expected_errors,
            numeric_id,
            original_index,
        },
        r1,
        r2,
    }))
}

fn write_sequence_record(writer: &mut impl Write, record: &SequenceRecord) -> Result<()> {
    write_string(writer, &record.id)?;
    writer.write_all(&record.numeric_id.to_le_bytes())?;
    write_bytes(writer, &record.bases)?;
    write_bool(writer, record.qualities.is_some())?;
    if let Some(qualities) = &record.qualities {
        write_bytes(writer, qualities)?;
    }
    Ok(())
}

fn read_sequence_record(reader: &mut impl Read) -> Result<SequenceRecord> {
    let id = read_string(reader)?;
    let numeric_id = read_u64(reader)?;
    let bases = read_bytes(reader)?;
    let has_qualities = read_bool(reader)?;
    let qualities = has_qualities.then(|| read_bytes(reader)).transpose()?;
    Ok(SequenceRecord {
        id,
        numeric_id,
        bases,
        qualities,
    })
}

fn write_string(writer: &mut impl Write, value: &str) -> Result<()> {
    write_bytes(writer, value.as_bytes())
}

fn read_string(reader: &mut impl Read) -> Result<String> {
    let bytes = read_bytes(reader)?;
    String::from_utf8(bytes).context("count-up run contained invalid UTF-8 id")
}

fn write_bytes(writer: &mut impl Write, bytes: &[u8]) -> Result<()> {
    write_usize(writer, bytes.len())?;
    writer.write_all(bytes)?;
    Ok(())
}

fn read_bytes(reader: &mut impl Read) -> Result<Vec<u8>> {
    let len = read_usize(reader)?;
    let mut bytes = vec![0; len];
    reader.read_exact(&mut bytes)?;
    Ok(bytes)
}

fn write_bool(writer: &mut impl Write, value: bool) -> Result<()> {
    writer.write_all(&[u8::from(value)])?;
    Ok(())
}

fn read_bool(reader: &mut impl Read) -> Result<bool> {
    let mut buf = [0; 1];
    reader.read_exact(&mut buf)?;
    Ok(buf[0] != 0)
}

fn write_usize(writer: &mut impl Write, value: usize) -> Result<()> {
    writer.write_all(&(value as u64).to_le_bytes())?;
    Ok(())
}

fn read_usize(reader: &mut impl Read) -> Result<usize> {
    let value = read_u64(reader)?;
    usize::try_from(value).context("count-up run usize field exceeded this platform")
}

fn read_usize_opt(reader: &mut impl Read) -> Result<Option<usize>> {
    let Some(value) = read_u64_opt(reader)? else {
        return Ok(None);
    };
    Ok(Some(usize::try_from(value).context(
        "count-up run usize field exceeded this platform",
    )?))
}

fn read_u64(reader: &mut impl Read) -> Result<u64> {
    let mut buf = [0; 8];
    reader.read_exact(&mut buf)?;
    Ok(u64::from_le_bytes(buf))
}

fn read_u64_opt(reader: &mut impl Read) -> Result<Option<u64>> {
    let mut buf = [0; 8];
    match reader.read_exact(&mut buf) {
        Ok(()) => Ok(Some(u64::from_le_bytes(buf))),
        Err(err) if err.kind() == ErrorKind::UnexpectedEof => Ok(None),
        Err(err) => Err(err.into()),
    }
}

fn read_f64(reader: &mut impl Read) -> Result<f64> {
    let mut buf = [0; 8];
    reader.read_exact(&mut buf)?;
    Ok(f64::from_le_bytes(buf))
}

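/// Builds the relaxed configuration for the count-up presort pass: depth
/// targets are scaled up 4x (with bad percentages scaled down to match),
/// min-depth thresholds halved, and `require_both_bad` set, so the prepass
/// only discards clearly hopeless pairs.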
fn countup_prepass_config(config: &Config) -> Config {
    let mut prepass = config.clone();
    prepass.count_up = false;
    prepass.require_both_bad = true;
    prepass.target_depth = config.target_depth.saturating_mul(4).max(1);
    prepass.target_bad_percent_low = config.target_bad_percent_low / 4.0;
    prepass.target_bad_percent_high = config.target_bad_percent_high / 4.0;
    prepass.max_depth = config.max_depth.map(|depth| depth.saturating_mul(4).max(1));
    prepass.min_depth = config.min_depth / 2;
    prepass.min_kmers_over_min_depth = config.min_kmers_over_min_depth / 2;
    prepass.low_percentile = 0.20;
    prepass
}

fn countup_prepass_pair(
    prepass_config: &Config,
    add_bad_reads_countup: bool,
    input_counts: &dyn CountLookup,
    r1: &mut SequenceRecord,
    r2: Option<&mut SequenceRecord>,
    rand: f64,
) -> CountupPrepassResult {
    let analysis = analyze_pair(prepass_config, input_counts, r1, r2.as_deref());
    countup_prepass_pair_from_analysis(
        prepass_config,
        add_bad_reads_countup,
        input_counts,
        r1,
        r2,
        rand,
        analysis,
    )
}

fn countup_prepass_pair_from_analysis(
    prepass_config: &Config,
    add_bad_reads_countup: bool,
    input_counts: &dyn CountLookup,
    r1: &mut SequenceRecord,
    mut r2: Option<&mut SequenceRecord>,
    rand: f64,
    analysis: PairAnalysis,
) -> CountupPrepassResult {
    let decision =
        decide_pair_from_analysis(prepass_config, r1, r2.as_deref(), analysis, Some(rand));
    let include = !decision.toss || add_bad_reads_countup;
    if prepass_config.error_correct && !decision.toss {
        let correction =
            correct_pair_errors_with_rollback(prepass_config, input_counts, r1, r2.as_deref_mut());
        if (!correction.uncorrectable || prepass_config.mark_uncorrectable_errors)
            && prepass_config.trim_after_marking
        {
            trim_pair(prepass_config, r1, r2);
        }
        return CountupPrepassResult {
            include,
            sort_analysis: None,
        };
    }
    CountupPrepassResult {
        include,
        sort_analysis: include.then_some(decision.analysis),
    }
}

fn compare_countup_work_pairs(left: &CountupWorkPair, right: &CountupWorkPair) -> CmpOrdering {
    left.input_list_index
        .cmp(&right.input_list_index)
        .then_with(|| compare_countup_sort_key(&left.sort_key, &right.sort_key))
        .then_with(|| left.r1.id.cmp(&right.r1.id))
        .then_with(|| {
            left.sort_key
                .original_index
                .cmp(&right.sort_key.original_index)
        })
}

fn compare_countup_sort_key(left: &CountupSortKey, right: &CountupSortKey) -> CmpOrdering {
    left.errors
        .cmp(&right.errors)
        .then_with(|| right.total_len.cmp(&left.total_len))
        .then_with(|| {
            left.expected_errors
                .partial_cmp(&right.expected_errors)
                .unwrap_or(CmpOrdering::Equal)
        })
        .then_with(|| left.numeric_id.cmp(&right.numeric_id))
}

fn countup_sort_key(
    config: &Config,
    input_counts: &dyn CountLookup,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
    original_index: usize,
) -> CountupSortKey {
    let analysis = analyze_pair(config, input_counts, r1, r2);
    countup_sort_key_from_analysis(r1, r2, original_index, &analysis)
}

fn countup_sort_key_from_analysis(
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
    original_index: usize,
    analysis: &PairAnalysis,
) -> CountupSortKey {
    CountupSortKey {
        errors: analysis.low_kmer_count,
        total_len: r1.len() + r2.map(SequenceRecord::len).unwrap_or(0),
        expected_errors: expected_errors(r1) + r2.map(expected_errors).unwrap_or(0.0),
        numeric_id: r1.numeric_id,
        original_index,
    }
}

fn expected_errors(record: &SequenceRecord) -> f64 {
    let Some(qualities) = &record.qualities else {
        return 0.0;
    };
    record
        .bases
        .iter()
        .zip(qualities)
        .map(|(&base, &quality)| {
            let q = if is_defined_base(base) {
                quality.saturating_sub(33)
            } else {
                0
            };
            phred_error_probability(q)
        })
        .sum()
}

fn phred_error_probability(q: u8) -> f64 {
    match q {
        0 => 0.75,
        1 => 0.70,
        _ => 10f64.powf(-0.1 * f64::from(q)),
    }
}

fn unique_pair_kmers(
    config: &Config,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
) -> Vec<KmerKey> {
    let mut keys = Vec::with_capacity(pair_kmer_window_capacity(config, r1, r2));
    fill_unique_pair_kmers(config, r1, r2, &mut keys);
    keys
}

fn fill_unique_pair_kmers(
    config: &Config,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
    keys: &mut Vec<KmerKey>,
) {
    keys.clear();
    let required = pair_kmer_window_capacity(config, r1, r2);
    if keys.capacity() < required {
        keys.reserve(required - keys.capacity());
    }
    for_each_kmer_for_record(r1, config, |key| keys.push(key));
    if let Some(mate) = r2 {
        for_each_kmer_for_record(mate, config, |key| keys.push(key));
    }
    keys.sort_unstable();
    keys.dedup();
}

fn pair_kmer_window_capacity(
    config: &Config,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
) -> usize {
    record_kmer_window_capacity(config.k, r1)
        .saturating_add(r2.map_or(0, |mate| record_kmer_window_capacity(config.k, mate)))
}

fn record_kmer_window_capacity(k: usize, record: &SequenceRecord) -> usize {
    if k == 0 {
        0
    } else {
        record.bases.len().saturating_sub(k).saturating_add(1)
    }
}

#[cfg(test)]
fn decide_countup_pair(
    config: &Config,
    input_counts: &dyn CountLookup,
    kept_counts: &dyn CountLookup,
    keys: &[KmerKey],
    target_depth: u64,
) -> bool {
    countup_decision_plan(config, input_counts, kept_counts, keys, target_depth).toss
}

fn countup_decision_plan(
    config: &Config,
    input_counts: &dyn CountLookup,
    kept_counts: &dyn CountLookup,
    keys: &[KmerKey],
    target_depth: u64,
) -> CountupDecisionPlan {
    let unique = keys.len();
    if unique == 0 {
        return CountupDecisionPlan {
            toss: !config.keep_all,
            eligible_key_indices: Vec::new(),
        };
    }

    let mut desired = 0usize;
    let mut needed = 0usize;
    let mut badly_needed = 0usize;
    let mut input_depths = config.toss_error_reads.then(Vec::new);
    let mut eligible_key_indices = Vec::with_capacity(keys.len());
    for (index, key) in keys.iter().enumerate() {
        let input_depth = input_counts.depth(key);
        if let Some(depths) = &mut input_depths {
            depths.push(input_depth);
        }
        if input_depth >= config.min_depth {
            desired += 1;
            eligible_key_indices.push(index);
            let kept_depth = kept_counts.depth(key);
            if kept_depth < target_depth {
                needed += 1;
                // "Badly needed": kept depth is under 3/4 of what this k-mer
                // could still contribute, min(target, input depth).
                if kept_depth < target_depth.min(input_depth).saturating_mul(3) / 4 {
                    badly_needed += 1;
                }
            }
        }
    }

    // Keep when at least ~1/6 of the unique k-mers are still needed, or ~1/24
    // are badly needed, subject to absolute floors of 8 and 2 respectively.
    let threshold_needed = 8usize.max(unique.div_ceil(6));
    let threshold_badly_needed = 2usize.max(unique.div_ceil(24));
    let keep = (needed >= threshold_needed || badly_needed >= threshold_badly_needed)
        && (desired >= config.min_kmers_over_min_depth || unique < config.min_kmers_over_min_depth);
    let mut toss = !keep;
    if config.toss_error_reads
        && let Some(mut depths) = input_depths
    {
        let errors = countup_error_count(&mut depths, config);
        if errors > 8 && needed < 2 * threshold_needed && badly_needed < 2 * threshold_badly_needed
        {
            toss = true;
        }
        if errors > unique / 2
            && needed < 3 * threshold_needed
            && badly_needed < 4 * threshold_badly_needed
        {
            toss = true;
        }
    }
    CountupDecisionPlan {
        toss: if config.keep_all { false } else { toss },
        eligible_key_indices,
    }
}

3581fn countup_error_count(depths: &mut [u64], config: &Config) -> usize {
3582 depths.sort_unstable();
3583 let mut previous: Option<u64> = None;
3584 for (index, &depth) in depths.iter().enumerate() {
3585 if let Some(prev) = previous
3586 && ((depth >= config.high_thresh && prev <= config.low_thresh)
3587 || depth >= prev.saturating_mul(config.error_detect_ratio))
3588 {
3589 return depths.len() - index;
3590 }
3591 previous = Some(depth);
3592 }
3593 0
3594}
3595
3596#[cfg(test)]
3597fn increment_countup_kept_counts(
3598 config: &Config,
3599 kept_counts: &mut OutputCounts,
3600 input_counts: &dyn CountLookup,
3601 keys: &[KmerKey],
3602) {
3603 let mut atomic_increments = 0u64;
3604 for key in keys {
3605 if input_counts.depth(key) >= config.min_depth {
3606 match kept_counts {
3607 OutputCounts::Exact(counts) => {
3608 *counts.entry(key.clone()).or_insert(0) += 1;
3609 }
3610 OutputCounts::Sketch(sketch) => sketch.increment(key),
3611 OutputCounts::AtomicSketch(sketch) => {
3612 sketch.increment_key(key);
3613 atomic_increments = atomic_increments.saturating_add(1);
3614 }
3615 }
3616 }
3617 }
3618 if let OutputCounts::AtomicSketch(sketch) = kept_counts {
3619 sketch.add_key_increments(atomic_increments);
3620 }
3621}
3622
3623#[cfg(test)]
3624fn update_countup_kept_counts_for_decision(
3625 config: &Config,
3626 kept_counts: &mut OutputCounts,
3627 input_counts: &dyn CountLookup,
3628 keys: &[KmerKey],
3629 toss: bool,
3630) {
3631 if !toss || config.add_bad_reads_countup {
3632 increment_countup_kept_counts(config, kept_counts, input_counts, keys);
3633 }
3634}
3635
3636fn update_countup_kept_counts_for_plan(
3637 config: &Config,
3638 kept_counts: &mut OutputCounts,
3639 keys: &[KmerKey],
3640 plan: &CountupDecisionPlan,
3641) {
3642 if plan.toss && !config.add_bad_reads_countup {
3643 return;
3644 }
3645 let mut atomic_increments = 0u64;
3646 for &index in &plan.eligible_key_indices {
3647 let Some(key) = keys.get(index) else {
3648 continue;
3649 };
3650 match kept_counts {
3651 OutputCounts::Exact(counts) => {
3652 *counts.entry(key.clone()).or_insert(0) += 1;
3653 }
3654 OutputCounts::Sketch(sketch) => sketch.increment(key),
3655 OutputCounts::AtomicSketch(sketch) => {
3656 sketch.increment_key(key);
3657 atomic_increments = atomic_increments.saturating_add(1);
3658 }
3659 }
3660 }
3661 if let OutputCounts::AtomicSketch(sketch) = kept_counts {
3662 sketch.add_key_increments(atomic_increments);
3663 }
3664}
3665
3666fn countup_length_toss(config: &Config, r1: &SequenceRecord, r2: Option<&SequenceRecord>) -> bool {
3667 !config.keep_all
3668 && (r1.len() < config.min_length || r2.is_some_and(|mate| mate.len() < config.min_length))
3669}
3670
3671#[cfg(test)]
3672fn count_map_depth_hist(counts: &CountMap, hist_len: usize) -> Vec<u64> {
3673 let mut hist = vec![0; hist_len];
3674 for &depth in counts.values() {
3675 let idx = (depth as usize).min(hist_len.saturating_sub(1));
3676 hist[idx] += depth;
3677 }
3678 hist
3679}
3680
3681fn count_map_sparse_depth_hist(counts: &CountMap, hist_len: usize) -> SparseHist {
3682 let Some(last_index) = hist_len.checked_sub(1) else {
3683 return SparseHist::default();
3684 };
3685 let mut hist = SparseHist::default();
3686 for &depth in counts.values() {
3687 add_depth_to_sparse_hist(&mut hist, depth, last_index);
3688 }
3689 hist
3690}
3691
3692#[cfg(test)]
3693fn add_depth_to_dynamic_hist(local: &mut Vec<u64>, depth: u64, last_index: usize) {
3694 if depth == 0 {
3695 return;
3696 }
3697 let idx = usize_from_u64_saturating(depth).min(last_index);
3698 if idx >= local.len() {
3699 local.resize(idx + 1, 0);
3700 }
3701 local[idx] = local[idx].saturating_add(depth);
3702}
3703
3704#[cfg(test)]
3705fn merge_dynamic_depth_hist(mut left: Vec<u64>, right: Vec<u64>) -> Vec<u64> {
3706 if right.len() > left.len() {
3707 left.resize(right.len(), 0);
3708 }
3709 for (index, value) in right.into_iter().enumerate() {
3710 left[index] = left[index].saturating_add(value);
3711 }
3712 left
3713}
3714
3715fn add_depth_to_sparse_hist(local: &mut SparseHist, depth: u64, last_index: usize) {
3716 if depth == 0 {
3717 return;
3718 }
3719 let idx = usize_from_u64_saturating(depth).min(last_index);
3720 let entry = local.entry(idx).or_insert(0);
3721 *entry = entry.saturating_add(depth);
3722}
3723
3724fn merge_sparse_depth_hist(mut left: SparseHist, right: SparseHist) -> SparseHist {
3725 merge_sparse_hist(&mut left, right);
3726 left
3727}
3728
#[cfg(test)]
fn build_input_counts(config: &Config) -> Result<InputCounts> {
    let mut stage_timings = Vec::new();
    build_input_counts_with_stage_timings(config, &mut stage_timings)
}

fn build_input_counts_with_stage_timings(
    config: &Config,
    stage_timings: &mut Vec<StageTiming>,
) -> Result<InputCounts> {
    let started = Instant::now();
    let counts = build_input_counts_inner(config, stage_timings)?;
    record_stage_timing(stage_timings, "input_counting", started);
    Ok(counts)
}

fn build_input_counts_inner(
    config: &Config,
    stage_timings: &mut Vec<StageTiming>,
) -> Result<InputCounts> {
    if use_bounded_input_sketch(config) {
        return build_sketch_input_counts(config, stage_timings);
    }
    let started = Instant::now();
    let mut counts = new_count_map(config);
    count_primary(config, &mut counts)?;
    for extra in &config.extra {
        count_single_file(config, extra, &mut counts, None)?;
    }
    apply_trusted_build_pass_filter(config, &mut counts);
    apply_prefilter_collision_estimates(config, &mut counts);
    apply_count_min_collision_estimates(config, &mut counts);
    record_stage_timing(stage_timings, "input_exact_counting", started);
    Ok(InputCounts::Exact(counts))
}

fn use_bounded_input_sketch(config: &Config) -> bool {
    if config.force_exact_counts {
        return false;
    }
    config.count_min.cells.is_some()
        || config.count_min.memory_bytes.is_some()
        || automatic_count_min_should_use(config)
}

fn gpu_counting_supported(config: &Config) -> bool {
    config.gpu_counting
        && config.gpu_helper.is_some()
        && config.k <= 31
        && !use_prefilter_collision_estimates(config)
}

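/// Picks a bounded-sketch strategy for input counting, from most to least
/// specialized: a prefilter pass feeding a gated main sketch (atomic or
/// packed), then a plain atomic 32-bit sketch (optionally GPU-assisted),
/// then an atomic packed sketch, and finally the packed sketch as the
/// fallback.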
fn build_sketch_input_counts(
    config: &Config,
    stage_timings: &mut Vec<StageTiming>,
) -> Result<InputCounts> {
    validate_gpu_counting_request(config)?;
    if use_prefilter_collision_estimates(config) {
        let started = Instant::now();
        let mut prefilter = new_input_prefilter_count_min_sketch(config)?;
        count_primary_prefilter_sketch(config, &mut prefilter)?;
        for extra in &config.extra {
            count_single_file_prefilter_sketch(config, extra, &mut prefilter, None)?;
        }
        let prefilter_limit = prefilter.max_count();
        record_stage_timing(stage_timings, "input_prefilter_counting", started);

        if use_atomic_count_min_sketch(config) {
            let started = Instant::now();
            let sketch = new_atomic_count_min_sketch_with_mask_seed(
                config,
                BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED,
            )?;
            count_primary_atomic_sketch(
                config,
                &sketch,
                Some(PrefilterGate::new(&prefilter, prefilter_limit)),
            )?;
            for extra in &config.extra {
                count_single_file_atomic_sketch(
                    config,
                    extra,
                    &sketch,
                    None,
                    Some(PrefilterGate::new(&prefilter, prefilter_limit)),
                )?;
            }
            record_stage_timing(stage_timings, "input_main_counting", started);
            return Ok(InputCounts::PrefilteredSketch {
                prefilter,
                limit: prefilter_limit,
                main: Box::new(InputCounts::AtomicSketch(sketch)),
            });
        }

        let started = Instant::now();
        let mut sketch = new_bounded_count_min_sketch_with_mask_seed(
            config,
            BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED,
        )?;
        count_primary_sketch(
            config,
            &mut sketch,
            Some(PrefilterGate::new(&prefilter, prefilter_limit)),
        )?;
        for extra in &config.extra {
            count_single_file_sketch(
                config,
                extra,
                &mut sketch,
                None,
                Some(PrefilterGate::new(&prefilter, prefilter_limit)),
            )?;
        }
        record_stage_timing(stage_timings, "input_main_counting", started);
        return Ok(InputCounts::PrefilteredSketch {
            prefilter,
            limit: prefilter_limit,
            main: Box::new(InputCounts::Sketch(sketch)),
        });
    }

    if use_atomic_count_min_sketch(config) {
        let started = Instant::now();
        let sketch = new_atomic_count_min_sketch(config)?;
        if gpu_counting_supported(config) {
            count_primary_gpu_reduced_runs_atomic_sketch(config, &sketch)?;
        } else {
            count_primary_atomic_sketch(config, &sketch, None)?;
        }
        for extra in &config.extra {
            count_single_file_atomic_sketch(config, extra, &sketch, None, None)?;
        }
        record_stage_timing(stage_timings, "input_main_counting", started);
        return Ok(InputCounts::AtomicSketch(sketch));
    }
    if use_atomic_packed_input_sketch(config) {
        let started = Instant::now();
        let sketch = new_atomic_packed_count_min_sketch(config)?;
        count_primary_atomic_packed_sketch(config, &sketch)?;
        for extra in &config.extra {
            count_single_file_atomic_packed_sketch(config, extra, &sketch, None)?;
        }
        record_stage_timing(stage_timings, "input_main_counting", started);
        return Ok(InputCounts::AtomicPackedSketch(sketch));
    }
    let started = Instant::now();
    let mut sketch = new_bounded_count_min_sketch(config)?;
    if gpu_counting_supported(config) {
        count_primary_gpu_reduced_runs_sketch(config, &mut sketch)?;
    } else {
        count_primary_sketch(config, &mut sketch, None)?;
    }
    for extra in &config.extra {
        count_single_file_sketch(config, extra, &mut sketch, None, None)?;
    }
    record_stage_timing(stage_timings, "input_main_counting", started);
    Ok(InputCounts::Sketch(sketch))
}

fn validate_gpu_counting_request(config: &Config) -> Result<()> {
    if !config.gpu_counting {
        return Ok(());
    }
    ensure!(
        config.gpu_helper.is_some(),
        "gpucounting=t requires gpuhelper=<cuda_kmer_reduce_runs binary>"
    );
    ensure!(
        config.k <= 31,
        "gpucounting=t currently supports short k-mers only (k<=31)"
    );
    ensure!(
        !use_prefilter_collision_estimates(config),
        "gpucounting=t currently supports only the main bounded sketch and is incompatible with prefilter=t"
    );
    Ok(())
}

fn new_output_counts(config: &Config) -> Result<OutputCounts> {
    if use_bounded_input_sketch(config) {
        if config.count_up {
            return new_countup_output_counts(config);
        }
        if use_atomic_count_min_sketch(config) {
            new_atomic_output_count_min_sketch(config).map(OutputCounts::AtomicSketch)
        } else {
            new_bounded_output_count_min_sketch(config).map(OutputCounts::Sketch)
        }
    } else {
        Ok(OutputCounts::Exact(new_count_map(config)))
    }
}

fn new_atomic_output_count_min_sketch(config: &Config) -> Result<AtomicCountMinSketch> {
    let hashes = config
        .count_min
        .hashes
        .unwrap_or(3)
        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
    let total_cells = output_count_min_total_cells(config, 32);
    ensure_count_min_budget_fits_memory(
        "output_kept",
        total_cells,
        32,
        output_count_min_memory_bytes(config, 32),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells = count_min_table_cells_from_total_bits_with_min_arrays(total_cells, 32, min_arrays);
    let update_mode = count_min_update_mode(config, 32, hashes);
    AtomicCountMinSketch::new_with_min_arrays_and_update_mode(
        cells,
        hashes,
        min_arrays,
        update_mode,
        kept_output_mask_seed(config),
    )
    .map(|sketch| sketch.with_parallel_replay(!config.deterministic))
}

fn new_bounded_output_count_min_sketch(config: &Config) -> Result<PackedCountMinSketch> {
    let hashes = config
        .count_min
        .hashes
        .unwrap_or(3)
        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
    let bits = config.count_min.bits.unwrap_or(32);
    let total_cells = output_count_min_total_cells(config, bits);
    ensure_count_min_budget_fits_memory(
        "output_kept",
        total_cells,
        bits,
        output_count_min_memory_bytes(config, bits),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells =
        count_min_table_cells_from_total_bits_with_min_arrays(total_cells, bits, min_arrays);
    PackedCountMinSketch::new_with_min_arrays_and_mask_seed(
        cells,
        hashes,
        bits,
        min_arrays,
        kept_output_mask_seed(config),
    )
    .map(|sketch| sketch.with_update_mode(count_min_update_mode(config, bits, hashes)))
}

fn new_countup_output_counts(config: &Config) -> Result<OutputCounts> {
    let bits = countup_output_count_bits(config);
    let hashes = 3;
    let total_cells = countup_output_total_cells(config, bits);
    ensure_count_min_budget_fits_memory(
        "count-up output",
        total_cells,
        bits,
        config
            .count_min
            .memory_bytes
            .or_else(|| automatic_count_min_memory_bytes(config)),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells =
        count_min_table_cells_from_total_bits_with_min_arrays(total_cells, bits, min_arrays);
    PackedCountMinSketch::new_with_min_arrays_and_mask_seed(
        cells,
        hashes,
        bits,
        min_arrays,
        countup_output_mask_seed(config),
    )
    .map(|sketch| sketch.with_update_mode(count_min_update_mode(config, bits, hashes)))
    .map(OutputCounts::Sketch)
}

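// Cell width for count-up output scales with the (slightly discounted)
// target depth: counts only need to reach ~95% of `target_depth` before a
// read is considered satisfied. For example, target_depth = 100 adjusts to
// 95, which needs 8-bit cells; target_depth = 15 adjusts to 14, which fits
// in 4-bit cells.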
fn countup_output_count_bits(config: &Config) -> u8 {
    let target = countup_adjusted_target_depth(config);
    if target <= 15 {
        4
    } else if target <= 255 {
        8
    } else {
        16
    }
}

fn countup_adjusted_target_depth(config: &Config) -> u64 {
    ((config.target_depth as f64) * 0.95).round().max(1.0) as u64
}

fn countup_output_total_cells(config: &Config, bits: u8) -> usize {
    config
        .count_min
        .cells
        .unwrap_or_else(|| count_min_cells_from_memory(count_min_memory_bytes(config), bits))
        .max(1)
}

fn countup_output_mask_seed(config: &Config) -> u64 {
    kept_output_mask_seed(config)
}

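// The kept-output table must hash differently from every table built before
// it (the main input sketch, plus the prefilter when one is in use), so its
// mask seed is offset by one seed step per preceding table.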
fn kept_output_mask_seed(config: &Config) -> u64 {
    let preceding_tables = if use_prefilter_collision_estimates(config) {
        2
    } else {
        1
    };
    BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED
        .saturating_add(BBTOOLS_KCOUNT_ARRAY_MASK_SEED_STEP.saturating_mul(preceding_tables))
}

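// When a prefilter fraction is configured, the main table only gets the
// remaining share of the cell budget. Worked example: with a base budget of
// 10_000_000 cells and fraction_micros = 200_000 (20%), main_fraction is
// 800_000 and the table keeps 10_000_000 * 800_000 / 1_000_000 = 8_000_000
// cells.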
fn output_count_min_total_cells(config: &Config, bits: u8) -> usize {
    let base = config
        .count_min
        .cells
        .unwrap_or_else(|| {
            count_min_cells_from_memory(output_count_min_memory_bytes(config, bits), bits)
        })
        .max(1);
    let Some(fraction_micros) = prefilter_memory_fraction_micros(config) else {
        return cap_main_cells_to_short_kmer_space(config, base);
    };
    let main_fraction = 1_000_000usize.saturating_sub(fraction_micros as usize);
    base.saturating_mul(main_fraction)
        .checked_div(1_000_000)
        .unwrap_or(0)
        .max(1)
}

fn output_count_min_memory_bytes(config: &Config, _bits: u8) -> Option<usize> {
    if config.count_min.cells.is_some() {
        return config
            .count_min
            .memory_bytes
            .or(config.auto_count_min_memory_bytes)
            .or_else(|| automatic_count_min_memory_bytes(config));
    }
    if config.count_min.memory_bytes.is_some() {
        return count_min_memory_bytes(config);
    }
    automatic_count_min_memory_bytes(config).map(output_count_min_auto_memory_bytes)
}

fn output_count_min_auto_memory_bytes(memory_bytes: usize) -> usize {
    let min_memory = OUTPUT_COUNT_MIN_AUTO_MIN_MEMORY_BYTES.min(memory_bytes);
    scale_by_micros(memory_bytes, OUTPUT_COUNT_MIN_AUTO_FRACTION_MICROS)
        .max(min_memory)
        .min(memory_bytes)
        .max(1)
}

fn use_atomic_count_min_sketch(config: &Config) -> bool {
    config.count_min.bits.unwrap_or(32) == 32
}

fn use_atomic_packed_input_sketch(config: &Config) -> bool {
    !config.deterministic
        && config.count_min.bits.unwrap_or(32) < 32
        && !use_prefilter_collision_estimates(config)
        && !gpu_counting_supported(config)
}

fn new_atomic_count_min_sketch(config: &Config) -> Result<AtomicCountMinSketch> {
    new_atomic_count_min_sketch_with_mask_seed(config, BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED)
}

fn new_atomic_count_min_sketch_with_mask_seed(
    config: &Config,
    mask_seed: u64,
) -> Result<AtomicCountMinSketch> {
    let hashes = config
        .count_min
        .hashes
        .unwrap_or(3)
        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
    let total_cells = main_count_min_total_cells(config, 32);
    ensure_count_min_budget_fits_memory(
        "main",
        total_cells,
        32,
        config
            .count_min
            .memory_bytes
            .or(config.auto_count_min_memory_bytes),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells = count_min_table_cells_from_total_bits_with_min_arrays(total_cells, 32, min_arrays);
    let update_mode = count_min_update_mode(config, 32, hashes);
    AtomicCountMinSketch::new_with_min_arrays_and_update_mode(
        cells,
        hashes,
        min_arrays,
        update_mode,
        mask_seed,
    )
    .map(|sketch| sketch.with_parallel_replay(!config.deterministic))
}

fn new_atomic_packed_count_min_sketch(config: &Config) -> Result<AtomicPackedCountMinSketch> {
    new_atomic_packed_count_min_sketch_with_mask_seed(config, BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED)
}

fn new_atomic_packed_count_min_sketch_with_mask_seed(
    config: &Config,
    mask_seed: u64,
) -> Result<AtomicPackedCountMinSketch> {
    let bits = config.count_min.bits.unwrap_or(32);
    let hashes = config
        .count_min
        .hashes
        .unwrap_or(BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS);
    let total_cells = main_count_min_total_cells(config, bits);
    ensure_count_min_budget_fits_memory(
        "count-min sketch",
        total_cells,
        bits,
        config
            .count_min
            .memory_bytes
            .or(config.auto_count_min_memory_bytes),
    )?;
    let min_arrays = hashes.max(BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS);
    let cells =
        count_min_table_cells_from_total_bits_with_min_arrays(total_cells, bits, min_arrays);
    AtomicPackedCountMinSketch::new_with_min_arrays_and_update_mode(
        cells,
        hashes,
        bits,
        min_arrays,
        count_min_update_mode(config, bits, hashes),
        mask_seed,
    )
}

fn new_bounded_count_min_sketch(config: &Config) -> Result<PackedCountMinSketch> {
    new_bounded_count_min_sketch_with_mask_seed(config, BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED)
}

fn new_bounded_count_min_sketch_with_mask_seed(
    config: &Config,
    mask_seed: u64,
) -> Result<PackedCountMinSketch> {
    let hashes = config
        .count_min
        .hashes
        .unwrap_or(3)
        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
    let bits = config.count_min.bits.unwrap_or(32);
    let total_cells = main_count_min_total_cells(config, bits);
    ensure_count_min_budget_fits_memory(
        "main",
        total_cells,
        bits,
        config
            .count_min
            .memory_bytes
            .or(config.auto_count_min_memory_bytes),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells =
        count_min_table_cells_from_total_bits_with_min_arrays(total_cells, bits, min_arrays);
    PackedCountMinSketch::new_with_min_arrays_and_mask_seed(
        cells, hashes, bits, min_arrays, mask_seed,
    )
    .map(|sketch| sketch.with_update_mode(count_min_update_mode(config, bits, hashes)))
}

fn new_prefilter_count_min_sketch(config: &Config) -> Result<PackedCountMinSketch> {
    let hashes = config
        .prefilter
        .hashes
        .unwrap_or_else(|| default_prefilter_hashes(config))
        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
    let bits = config.prefilter.bits.unwrap_or(DEFAULT_PREFILTER_BITS);
    let total_cells = prefilter_total_cells(config, bits).max(1);
    ensure_count_min_budget_fits_memory(
        "prefilter",
        total_cells,
        bits,
        config
            .prefilter
            .memory_bytes
            .or(config.count_min.memory_bytes)
            .or(config.auto_count_min_memory_bytes),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells =
        count_min_table_cells_from_total_bits_with_min_arrays(total_cells, bits, min_arrays);
    PackedCountMinSketch::new_with_min_arrays(cells, hashes, bits, min_arrays)
        .map(|sketch| sketch.with_update_mode(count_min_update_mode(config, bits, hashes)))
}

fn new_input_prefilter_count_min_sketch(config: &Config) -> Result<PrefilterCountMinSketch> {
    if config.deterministic {
        return new_prefilter_count_min_sketch(config).map(PrefilterCountMinSketch::Packed);
    }
    new_atomic_packed_prefilter_count_min_sketch(config).map(PrefilterCountMinSketch::AtomicPacked)
}

fn new_atomic_packed_prefilter_count_min_sketch(
    config: &Config,
) -> Result<AtomicPackedCountMinSketch> {
    let hashes = config
        .prefilter
        .hashes
        .unwrap_or_else(|| default_prefilter_hashes(config))
        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
    let bits = config.prefilter.bits.unwrap_or(DEFAULT_PREFILTER_BITS);
    let total_cells = prefilter_total_cells(config, bits).max(1);
    ensure_count_min_budget_fits_memory(
        "prefilter",
        total_cells,
        bits,
        config
            .prefilter
            .memory_bytes
            .or(config.count_min.memory_bytes)
            .or(config.auto_count_min_memory_bytes),
    )?;
    let min_arrays = kcount_array_min_arrays(config);
    let cells =
        count_min_table_cells_from_total_bits_with_min_arrays(total_cells, bits, min_arrays);
    AtomicPackedCountMinSketch::new_with_min_arrays_and_update_mode(
        cells,
        hashes,
        bits,
        min_arrays,
        count_min_update_mode(config, bits, hashes),
        BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
    )
}

fn default_prefilter_hashes(config: &Config) -> usize {
    let main_hashes = config
        .count_min
        .hashes
        .unwrap_or(3)
        .clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
    main_hashes.div_ceil(2)
}

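// Conservative update only raises the cells that are still below the new
// minimum-plus-increment target, which tightens count-min overestimates;
// independent update bumps every hashed cell unconditionally. Example: with
// cells (3, 3, 9) and an increment of 1, conservative raises the first two
// to 4 and leaves 9 alone, while independent would move all three.
// Conservative needs more than one hash and more than one bit per cell to
// pay off, hence the guard below.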
fn count_min_update_mode(config: &Config, bits: u8, hashes: usize) -> CountMinUpdateMode {
    if bits > 1 && hashes > 1 && config.locked_increment.unwrap_or(true) {
        CountMinUpdateMode::Conservative
    } else {
        CountMinUpdateMode::Independent
    }
}

fn count_min_memory_bytes(config: &Config) -> Option<usize> {
    config
        .count_min
        .memory_bytes
        .or_else(|| automatic_count_min_memory_bytes(config))
}

fn main_count_min_total_cells(config: &Config, bits: u8) -> usize {
    let base = config
        .count_min
        .cells
        .unwrap_or_else(|| count_min_cells_from_memory(count_min_memory_bytes(config), bits))
        .max(1);
    let Some(fraction_micros) = prefilter_memory_fraction_micros(config) else {
        return cap_main_cells_to_short_kmer_space(config, base);
    };
    let main_fraction = 1_000_000usize.saturating_sub(fraction_micros as usize);
    base.saturating_mul(main_fraction)
        .checked_div(1_000_000)
        .unwrap_or(0)
        .max(1)
}

fn cap_main_cells_to_short_kmer_space(config: &Config, cells: usize) -> usize {
    if use_prefilter_collision_estimates(config) {
        return cells;
    }
    short_kmer_space_cells(config.k)
        .map(|cap| cells.min(cap))
        .unwrap_or(cells)
        .max(1)
}

fn short_kmer_space_cells(k: usize) -> Option<usize> {
    if k >= 32 {
        return None;
    }
    1usize.checked_shl((2 * k) as u32)
}

fn prefilter_memory_fraction_micros(config: &Config) -> Option<u32> {
    if config.prefilter.force_disabled {
        return None;
    }
    if config.prefilter.cells.is_some() || config.prefilter.memory_bytes.is_some() {
        return None;
    }
    if let Some(fraction) = config
        .prefilter
        .memory_fraction_micros
        .filter(|fraction| *fraction > 0)
    {
        return Some(fraction);
    }
    if config.prefilter.enabled && use_bounded_input_sketch(config) {
        return Some(DEFAULT_PREFILTER_FRACTION_MICROS);
    }
    None
}

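// Fixed-point scaling in millionths, e.g. scale_by_micros(4_000_000, 250_000)
// = 1_000_000. The saturating multiply keeps huge budgets from overflowing,
// at the cost of rounding down.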
fn scale_by_micros(value: usize, micros: u32) -> usize {
    value
        .saturating_mul(micros as usize)
        .checked_div(1_000_000)
        .unwrap_or(0)
}

fn zeroed_u64_vec(len: usize) -> Result<Vec<u64>> {
    unsafe { zeroed_vec_with_layout::<u64>(len, "u64") }
}

fn zeroed_atomic_u32_vec(len: usize) -> Result<Vec<AtomicU32>> {
    unsafe { zeroed_vec_with_layout::<AtomicU32>(len, "AtomicU32") }
}

fn zeroed_atomic_u64_vec(len: usize) -> Result<Vec<AtomicU64>> {
    unsafe { zeroed_vec_with_layout::<AtomicU64>(len, "AtomicU64") }
}

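// Safety contract for the helper below: callers must only instantiate it
// with types for which the all-zero bit pattern is a valid value (true for
// the u64, AtomicU32, and AtomicU64 wrappers above). The allocation uses the
// global allocator with the exact `Layout::array::<T>` layout, which is what
// `Vec::from_raw_parts` requires for the returned vector to be safely
// dropped or reallocated.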
unsafe fn zeroed_vec_with_layout<T>(len: usize, type_name: &str) -> Result<Vec<T>> {
    if len == 0 {
        return Ok(Vec::new());
    }
    let layout = Layout::array::<T>(len)
        .with_context(|| format!("allocating zeroed {type_name} vector layout"))?;
    let ptr = unsafe { alloc_zeroed(layout) };
    if ptr.is_null() {
        bail!("allocating zeroed {type_name} vector failed for {len} elements");
    }
    Ok(unsafe { Vec::from_raw_parts(ptr.cast::<T>(), len, len) })
}

fn count_min_cells_from_memory(memory_bytes: Option<usize>, bits: u8) -> usize {
    let Some(memory_bytes) = memory_bytes else {
        return DEFAULT_PREFILTER_CELLS;
    };
    let bits_total = memory_bytes.saturating_mul(8);
    let bits_per_cell = bits.max(1) as usize;
    (bits_total / bits_per_cell).max(1)
}

fn count_min_total_bytes(total_cells: usize, bits: u8) -> Result<usize> {
    let total_cells = total_cells.max(1);
    let bits = bits.max(1) as usize;
    let total_bits = total_cells
        .checked_mul(bits)
        .context("bounded count-min sketch size overflowed")?;
    Ok(total_bits.div_ceil(8).max(1))
}
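
// Illustrative checks for the pure sizing arithmetic above; the inputs and
// expected values are worked examples chosen for this sketch, not normative
// tool defaults.
#[cfg(test)]
mod count_min_sizing_examples {
    use super::*;

    #[test]
    fn sizing_arithmetic_matches_worked_examples() {
        // 1 MiB of 2-bit cells holds 4 Mi cells.
        assert_eq!(count_min_cells_from_memory(Some(1 << 20), 2), 1 << 22);
        // 1000 cells at 2 bits each occupy ceil(2000 / 8) = 250 bytes.
        assert_eq!(count_min_total_bytes(1000, 2).unwrap(), 250);
        // 250_000 micros scales a value to one quarter.
        assert_eq!(scale_by_micros(4_000_000, 250_000), 1_000_000);
    }
}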

fn packed_sketch_should_track_slots(cells: usize) -> bool {
    cells <= PACKED_SKETCH_TRACKED_SLOT_LIMIT
}

fn safe_explicit_count_min_bytes(available: usize) -> usize {
    available
        .saturating_mul(EXPLICIT_COUNT_MIN_SAFE_MEMORY_PERCENT)
        .checked_div(100)
        .unwrap_or(0)
        .max(1)
}

fn count_min_safe_budget_bytes(
    configured_memory_bytes: Option<usize>,
    available_memory_bytes: Option<usize>,
) -> Option<usize> {
    let safe_available = available_memory_bytes.map(safe_explicit_count_min_bytes);
    match (configured_memory_bytes, safe_available) {
        (Some(configured), Some(available)) => Some(configured.min(available)),
        (Some(configured), None) => Some(configured),
        (None, Some(available)) => Some(available),
        (None, None) => None,
    }
}

fn ensure_count_min_budget_fits_ceiling(
    label: &str,
    total_cells: usize,
    bits: u8,
    safe_budget: usize,
) -> Result<()> {
    let requested = count_min_total_bytes(total_cells, bits)?;
    if requested > safe_budget {
        bail!(
            "{label} count-min table requests {requested} bytes ({total_cells} cells x {} bits), exceeding the safe memory budget of {safe_budget} bytes; reduce cells/matrixbits/sketchmemory/mem",
            bits.max(1)
        );
    }
    Ok(())
}

fn ensure_count_min_budget_fits_memory(
    label: &str,
    total_cells: usize,
    bits: u8,
    configured_memory_bytes: Option<usize>,
) -> Result<()> {
    if let Some(safe_budget) =
        count_min_safe_budget_bytes(configured_memory_bytes, system_available_memory_bytes())
    {
        ensure_count_min_budget_fits_ceiling(label, total_cells, bits, safe_budget)
    } else {
        count_min_total_bytes(total_cells, bits).map(|_| ())
    }
}

#[cfg(test)]
fn count_min_table_cells_from_total(total_cells: usize, hashes: usize) -> usize {
    let _ = hashes;
    count_min_table_cells_from_total_bits(total_cells, 32)
}

#[cfg(test)]
fn count_min_table_cells_from_total_bits(total_cells: usize, bits: u8) -> usize {
    count_min_table_cells_from_total_bits_with_min_arrays(
        total_cells,
        bits,
        BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS,
    )
}

fn count_min_table_cells_from_total_bits_with_min_arrays(
    total_cells: usize,
    bits: u8,
    min_arrays: usize,
) -> usize {
    let total_cells = total_cells.max(1);
    let arrays = kcount_array_count(total_cells, bits, min_arrays);
    if arrays <= 1 {
        return prime_at_most(total_cells);
    }
    prime_at_most(total_cells.div_ceil(arrays)).saturating_mul(arrays)
}

fn kcount_array_min_arrays(config: &Config) -> usize {
    kcount_array_min_arrays_for_threads(config.threads.unwrap_or_else(rayon::current_num_threads))
}

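// Doubles the array count from the configured minimum until it covers the
// thread count, so the result is the minimum times a power of two; with an
// assumed minimum of 2, twelve threads would round up to 16 shards.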
fn kcount_array_min_arrays_for_threads(threads: usize) -> usize {
    let target = threads.max(BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS);
    let mut arrays = BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS;
    while arrays < target {
        let next = arrays.saturating_mul(2);
        if next == arrays {
            break;
        }
        arrays = next;
    }
    arrays
}

fn kcount_array_lock_index(key: &KmerKey) -> usize {
    let raw = match key {
        KmerKey::Short(raw) | KmerKey::LongHash(raw) => *raw,
    };
    ((raw & (i64::MAX as u64)) % BBTOOLS_KCOUNT_ARRAY_LOCKS as u64) as usize
}

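// Shard the table into enough arrays that each shard stays under i32::MAX
// 32-bit words, mirroring BBTools' KCountArray, whose Java backing arrays
// are limited to 31-bit indices; tiny tables stay unsharded.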
fn kcount_array_count(desired_cells: usize, bits: u8, min_arrays: usize) -> usize {
    if desired_cells < BBTOOLS_KCOUNT_ARRAY_SHARD_MIN_CELLS {
        return 1;
    }
    let bits = bits.clamp(1, 64) as usize;
    let min_arrays = kcount_array_min_arrays_for_threads(min_arrays);
    let words = desired_cells
        .saturating_mul(bits)
        .saturating_add(31)
        .checked_div(32)
        .unwrap_or(usize::MAX)
        .max(min_arrays);
    let mut arrays = min_arrays;
    while words / arrays >= i32::MAX as usize {
        arrays = arrays.saturating_mul(2);
    }
    while arrays > desired_cells {
        arrays /= 2;
    }
    arrays.max(1)
}

fn prime_at_most(value: usize) -> usize {
    if value <= 2 {
        return value.max(1);
    }

    let mut candidate = if value.is_multiple_of(2) {
        value - 1
    } else {
        value
    };
    while candidate > 2 {
        if is_prime(candidate) {
            return candidate;
        }
        candidate -= 2;
    }
    2
}

fn is_prime(value: usize) -> bool {
    if value <= 3 {
        return value > 1;
    }
    if value.is_multiple_of(2) || value.is_multiple_of(3) {
        return false;
    }

    let mut divisor = 5usize;
    while divisor <= value / divisor {
        if value.is_multiple_of(divisor) || value.is_multiple_of(divisor + 2) {
            return false;
        }
        divisor += 6;
    }
    true
}
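
// Quick sanity checks for the trial-division helpers above, using small
// known values.
#[cfg(test)]
mod prime_helper_examples {
    use super::*;

    #[test]
    fn prime_helpers_match_known_values() {
        assert!(is_prime(2) && is_prime(97));
        assert!(!is_prime(1) && !is_prime(91)); // 91 = 7 * 13
        assert_eq!(prime_at_most(100), 97);
        assert_eq!(prime_at_most(3), 3);
    }
}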

fn automatic_count_min_should_use(config: &Config) -> bool {
    if !config.auto_count_min || config.force_exact_counts {
        return false;
    }
    if config
        .table_reads
        .or(config.max_reads)
        .is_some_and(|reads| reads >= config.auto_count_min_read_threshold)
    {
        return true;
    }
    input_metadata_bytes(config)
        .is_some_and(|bytes| bytes >= config.auto_count_min_input_bytes as u64)
}

fn automatic_count_min_memory_bytes(config: &Config) -> Option<usize> {
    if !automatic_count_min_should_use(config) {
        return None;
    }
    let raw_memory = config
        .auto_count_min_memory_bytes
        .unwrap_or_else(default_auto_count_min_memory_bytes);
    Some(automatic_count_min_filter_memory_bytes(config, raw_memory))
}

fn automatic_count_min_filter_memory_bytes(config: &Config, raw_memory: usize) -> usize {
    let usable = bbtools_usable_table_memory_bytes(config, raw_memory).max(1);
    if config.count_up {
        (usable / 2).max(1)
    } else {
        usable
    }
}

fn default_auto_count_min_memory_bytes() -> usize {
    system_available_memory_bytes()
        .map(|bytes| {
            (bytes / 4).clamp(
                AUTO_COUNT_MIN_MIN_MEMORY_BYTES,
                AUTO_COUNT_MIN_MAX_MEMORY_BYTES,
            )
        })
        .unwrap_or(AUTO_COUNT_MIN_FALLBACK_MEMORY_BYTES)
}

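// Approximates BBTools' usable-table-memory heuristic: take the larger of
// 73% of memory after a fixed headroom and a flat 45% of memory, then carve
// out per-thread histogram buffers and halve the budget when multiple build
// passes each need their own tables. Illustrative numbers, assuming a 500 MB
// headroom constant: for 8 GB, max(0.73 * 7.5 GB, 0.45 * 8 GB) is roughly
// 5.5 GB usable before the histogram and multi-pass deductions.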
fn bbtools_usable_table_memory_bytes(config: &Config, memory_bytes: usize) -> usize {
    let after_headroom = memory_bytes.saturating_sub(BBTOOLS_MEMORY_HEADROOM_BYTES) as f64 * 0.73;
    let fraction = memory_bytes as f64 * 0.45;
    let mut usable = after_headroom.max(fraction).max(1.0) as usize;
    if histogram_memory_is_reserved(config) {
        let threads = config
            .threads
            .unwrap_or_else(rayon::current_num_threads)
            .max(1);
        let hist_bytes = config
            .hist_len
            .saturating_mul(8)
            .saturating_mul(threads.saturating_add(1));
        usable = usable.saturating_sub(hist_bytes);
    }
    if config.build_passes > 1 {
        usable /= 2;
    }
    usable.max(1)
}

fn histogram_memory_is_reserved(config: &Config) -> bool {
    config.hist_in.is_some()
        || config.hist_out.is_some()
        || config.peaks_in.is_some()
        || config.peaks_out.is_some()
}

fn system_available_memory_bytes() -> Option<usize> {
    let text = fs::read_to_string("/proc/meminfo").ok()?;
    for line in text.lines() {
        if let Some(rest) = line.strip_prefix("MemAvailable:") {
            let kb = rest.split_whitespace().next()?.parse::<usize>().ok()?;
            return kb.checked_mul(1024);
        }
    }
    None
}

fn input_metadata_bytes(config: &Config) -> Option<u64> {
    let mut total = 0u64;
    let mut found = false;
    for path in input_metadata_paths(config) {
        let Ok(metadata) = fs::metadata(path) else {
            continue;
        };
        if metadata.is_file() {
            total = total.saturating_add(metadata.len());
            found = true;
        }
    }
    found.then_some(total)
}

fn input_metadata_paths(config: &Config) -> Vec<PathBuf> {
    let mut paths = Vec::new();
    if let Some(path) = &config.in1 {
        paths.extend(metadata_path_expansion(path));
    }
    if let Some(path) = &config.in2 {
        paths.extend(metadata_path_expansion(path));
    }
    for path in &config.extra {
        paths.extend(metadata_path_expansion(path));
    }
    paths
}

fn metadata_path_expansion(path: &Path) -> Vec<PathBuf> {
    if path.exists() {
        return vec![path.to_path_buf()];
    }
    let text = path.to_string_lossy();
    if text.contains(',') {
        split_path_list(&text)
    } else {
        vec![path.to_path_buf()]
    }
}

fn apply_output_count_adjustments(config: &Config, counts: &mut OutputCounts) {
    let OutputCounts::Exact(counts) = counts else {
        return;
    };
    apply_trusted_build_pass_filter(config, counts);
    apply_prefilter_collision_estimates(config, counts);
    apply_count_min_collision_estimates(config, counts);
}

fn apply_trusted_build_pass_filter(config: &Config, counts: &mut CountMap) {
    if config.build_passes <= 1 || counts.len() < 2 {
        return;
    }
    let decrement = (config.build_passes as u64).saturating_sub(1);
    for count in counts.values_mut() {
        if *count > 1 {
            *count = count.saturating_sub(decrement).max(1);
        }
    }
}

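// Exact counts are replayed through a throwaway prefilter-sized count-min
// sketch so the reported depths pick up the same collision inflation a real
// prefiltered run would see; keys whose estimate saturates the small sketch
// keep their exact count. The entries are snapshotted (in sorted key order)
// up front because the map is overwritten during read-back.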
fn apply_prefilter_collision_estimates(config: &Config, counts: &mut CountMap) {
    if config.force_exact_counts {
        return;
    }
    if !use_prefilter_collision_estimates(config) {
        return;
    }
    if counts.len() < 2 {
        return;
    }
    let entries = sorted_count_entries(counts);
    let Ok(mut sketch) = new_prefilter_count_min_sketch(config) else {
        return;
    };
    sketch.add_key_counts(counts);

    for (key, exact) in entries {
        let estimate = sketch.depth(&key);
        if estimate < sketch.max_count {
            counts.insert(key, estimate);
        } else {
            counts.insert(key, exact);
        }
    }
}

fn use_prefilter_collision_estimates(config: &Config) -> bool {
    if config.prefilter.force_disabled {
        return false;
    }
    config.prefilter.cells.is_some()
        || config.prefilter.hashes.is_some()
        || config.prefilter.memory_bytes.is_some()
        || config
            .prefilter
            .memory_fraction_micros
            .is_some_and(|fraction| fraction > 0)
        || (config.prefilter.enabled && use_bounded_input_sketch(config))
}

fn prefilter_total_cells(config: &Config, bits: u8) -> usize {
    if let Some(cells) = config.prefilter.cells {
        return cells.max(1);
    }
    if let Some(memory_bytes) = config.prefilter.memory_bytes {
        return count_min_cells_from_memory(Some(memory_bytes), bits);
    }
    if let Some(fraction_micros) = prefilter_memory_fraction_micros(config) {
        if let Some(total_cells) = config.count_min.cells {
            let main_bits = config.count_min.bits.unwrap_or(32).max(1) as usize;
            let prefilter_bits = scale_by_micros(
                total_cells.max(1).saturating_mul(main_bits),
                fraction_micros,
            )
            .max(bits.max(1) as usize);
            return (prefilter_bits / bits.max(1) as usize).max(1);
        }
        if let Some(memory_bytes) =
            count_min_memory_bytes(config).or(config.auto_count_min_memory_bytes)
        {
            let prefilter_memory = scale_by_micros(memory_bytes, fraction_micros).max(1);
            return count_min_cells_from_memory(Some(prefilter_memory), bits);
        }
    }
    DEFAULT_PREFILTER_CELLS
}

fn apply_count_min_collision_estimates(config: &Config, counts: &mut CountMap) {
    if config.force_exact_counts {
        return;
    }
    let Some(cells) = config.count_min.cells else {
        return;
    };
    if cells == 0 || counts.len() < 2 {
        return;
    }
    let entries = sorted_count_entries(counts);
    let Ok(mut sketch) = new_bounded_count_min_sketch(config) else {
        return;
    };
    sketch.add_key_counts(counts);

    for (key, exact) in entries {
        let exact = exact.min(sketch.max_count);
        let estimate = sketch.depth(&key).max(exact).min(sketch.max_count);
        counts.insert(key, estimate);
    }
}

fn sorted_count_entries(counts: &CountMap) -> Vec<(KmerKey, u64)> {
    let mut entries: Vec<_> = counts
        .iter()
        .map(|(key, &count)| (key.clone(), count))
        .collect();
    entries.sort_unstable_by(|(left, _), (right, _)| left.cmp(right));
    entries
}

impl PackedCountMinSketch {
    #[cfg(test)]
    fn new(cells: usize, hashes: usize, bits: u8) -> Result<Self> {
        Self::new_with_min_arrays(cells, hashes, bits, BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS)
    }

    fn new_with_min_arrays(
        cells: usize,
        hashes: usize,
        bits: u8,
        min_arrays: usize,
    ) -> Result<Self> {
        Self::new_with_min_arrays_and_mask_seed(
            cells,
            hashes,
            bits,
            min_arrays,
            BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
        )
    }

    fn new_with_min_arrays_and_mask_seed(
        cells: usize,
        hashes: usize,
        bits: u8,
        min_arrays: usize,
        mask_seed: u64,
    ) -> Result<Self> {
        let cells = cells.max(1);
        let hashes = hashes.clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
        let bits = bits.clamp(1, 64);
        let layout = KCountArrayLayout::new_with_min_arrays_and_mask_seed(
            cells, bits, min_arrays, mask_seed,
        );
        let word_count = if bits == 64 {
            cells
        } else {
            let total_bits = cells
                .checked_mul(bits as usize)
                .context("bounded sketch bit count overflowed")?;
            total_bits.div_ceil(64)
        };
        let words = zeroed_u64_vec(word_count).context("allocating bounded count-min sketch")?;
        Ok(Self {
            cells,
            hashes,
            bits,
            max_count: count_min_max_count(bits),
            layout,
            update_mode: CountMinUpdateMode::Conservative,
            words,
            increments: 0,
            occupied_slots: 0,
            tracked_slots: packed_sketch_should_track_slots(cells).then(Vec::new),
        })
    }

    fn with_update_mode(mut self, update_mode: CountMinUpdateMode) -> Self {
        self.update_mode = update_mode;
        self
    }

    fn layout_summary(
        &self,
        table: &'static str,
        prefilter_limit: Option<u64>,
    ) -> SketchLayoutSummary {
        SketchLayoutSummary {
            table,
            kind: "packed",
            cells: self.cells,
            hashes: self.hashes,
            bits: self.bits,
            arrays: self.layout.array_count(),
            cells_per_array: self.layout.cells_per_array,
            mask_seed: self.layout.mask_seed,
            update_mode: self.update_mode.as_str(),
            max_count: self.max_count,
            memory_bytes: self.estimated_memory_bytes(),
            prefilter_limit,
        }
    }

    fn estimated_memory_bytes(&self) -> usize {
        self.words
            .len()
            .saturating_mul(std::mem::size_of::<u64>())
            .saturating_add(self.tracked_slot_memory_bytes())
    }

    fn tracked_slot_memory_bytes(&self) -> usize {
        self.tracked_slots.as_ref().map_or(0, |slots| {
            slots
                .capacity()
                .saturating_mul(std::mem::size_of::<usize>())
        })
    }

    fn increment(&mut self, key: &KmerKey) {
        self.add_key_count(key, 1);
        self.increments = self.increments.saturating_add(1);
    }

    fn add_key_count(&mut self, key: &KmerKey, count: u64) {
        let _ = self.increment_and_return_unincremented(key, count);
    }

    fn increment_and_return_unincremented(&mut self, key: &KmerKey, count: u64) -> u64 {
        if count == 0 {
            return self.depth(key);
        }
        if self.update_mode == CountMinUpdateMode::Independent {
            return self.increment_independent_and_return_unincremented(key, count);
        }
        if self.bits == 2 && self.hashes == 2 {
            return self.increment_2bit_2hash_conservative_and_return_unincremented(key, count);
        }
        if self.bits == 16 && self.hashes == 3 {
            return self.increment_16bit_3hash_conservative_and_return_unincremented(key, count);
        }
        let target_increment = count.min(self.max_count);
        let mut slots = [0usize; 16];
        let mut min_depth = self.max_count;
        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
        for slot in slots.iter().take(self.hashes) {
            min_depth = min_depth.min(self.cell(*slot));
        }
        if min_depth >= self.max_count {
            return min_depth;
        }
        let target = min_depth
            .saturating_add(target_increment)
            .min(self.max_count);
        let mut previous_min = self.max_count;
        for slot in slots.iter().take(self.hashes) {
            let previous = self.cell(*slot);
            previous_min = previous_min.min(previous);
            if previous < target {
                self.set_cell_with_previous(*slot, previous, target);
            }
        }
        previous_min
    }

    fn increment_16bit_3hash_conservative_and_return_unincremented(
        &mut self,
        key: &KmerKey,
        count: u64,
    ) -> u64 {
        let [first, second, third] = count_min_three_buckets_raw(raw_kmer_key(key), self.layout);
        let first_depth = self.cell_16bit(first);
        let second_depth = self.cell_16bit(second);
        let third_depth = self.cell_16bit(third);
        let min_depth = first_depth.min(second_depth).min(third_depth);
        if min_depth >= self.max_count {
            return min_depth;
        }
        let target = min_depth
            .saturating_add(count.min(self.max_count))
            .min(self.max_count);
        if first_depth < target {
            self.set_cell_16bit_with_previous(first, first_depth, target);
        }
        if second_depth < target {
            self.set_cell_16bit_with_previous(second, second_depth, target);
        }
        if third_depth < target {
            self.set_cell_16bit_with_previous(third, third_depth, target);
        }
        min_depth
    }

    fn increment_2bit_2hash_conservative_and_return_unincremented(
        &mut self,
        key: &KmerKey,
        count: u64,
    ) -> u64 {
        let [first, second] = count_min_two_buckets(key, self.layout);
        let first_depth = self.cell_2bit(first);
        let second_depth = self.cell_2bit(second);
        let min_depth = first_depth.min(second_depth);
        if min_depth >= self.max_count {
            return min_depth;
        }
        let target = min_depth
            .saturating_add(count.min(self.max_count))
            .min(self.max_count);
        if first_depth < target {
            self.set_cell_2bit_with_previous(first, first_depth, target);
        }
        if second_depth < target {
            self.set_cell_2bit_with_previous(second, second_depth, target);
        }
        min_depth
    }

    fn increment_independent_and_return_unincremented(&mut self, key: &KmerKey, count: u64) -> u64 {
        if count == 0 {
            return self.depth(key);
        }
        let increment = count.min(self.max_count);
        let mut previous_min = self.max_count;
        let mut slots = [0usize; 16];
        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
        for slot in slots.iter().take(self.hashes) {
            let previous = self.cell(*slot);
            previous_min = previous_min.min(previous);
            let next = previous.saturating_add(increment).min(self.max_count);
            self.set_cell_with_previous(*slot, previous, next);
        }
        previous_min
    }

    fn add_key_counts(&mut self, counts: &CountMap) {
        if self.update_mode == CountMinUpdateMode::Conservative
            && self.bits == 16
            && self.hashes == 3
        {
            for (key, count) in counts {
                let _ =
                    self.increment_16bit_3hash_conservative_and_return_unincremented(key, *count);
            }
            return;
        }
        for (key, count) in counts {
            self.add_key_count(key, *count);
        }
    }

    fn add_key_increments(&mut self, key_increments: u64) {
        self.increments = self.increments.saturating_add(key_increments);
    }

    fn depth_16bit_3hash(&self, key: &KmerKey) -> u64 {
        let [first, second, third] = count_min_three_buckets_raw(raw_kmer_key(key), self.layout);
        self.cell_16bit(first)
            .min(self.cell_16bit(second))
            .min(self.cell_16bit(third))
    }

    fn occupied_slots_at_least(&self, min_depth: u64) -> usize {
        if min_depth > self.max_count {
            return 0;
        }
        if min_depth <= 1 {
            return self.occupied_slots;
        }
        let min_depth = min_depth.max(1);
        if let Some(slots) = &self.tracked_slots {
            return slots
                .par_iter()
                .filter(|&&slot| self.cell(slot) >= min_depth)
                .count();
        }
        (0..self.cells)
            .into_par_iter()
            .filter(|&slot| self.cell(slot) >= min_depth)
            .count()
    }

    fn cell(&self, slot: usize) -> u64 {
        if self.bits == 64 {
            return self.words[slot];
        }
        if self.bits == 16 {
            return self.cell_16bit(slot);
        }
        if self.bits == 2 {
            return self.cell_2bit(slot);
        }
        let bit = slot * self.bits as usize;
        let word = bit / 64;
        let offset = bit % 64;
        let mask = (1u64 << self.bits) - 1;
        if offset + self.bits as usize <= 64 {
            (self.words[word] >> offset) & mask
        } else {
            let low_bits = 64 - offset;
            let high_bits = self.bits as usize - low_bits;
            let low = self.words[word] >> offset;
            let high = self.words[word + 1] & ((1u64 << high_bits) - 1);
            ((high << low_bits) | low) & mask
        }
    }

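    // Fast paths for the common cell widths. For 16-bit cells, each u64 word
    // holds four cells: slot >> 2 selects the word and (slot & 3) << 4 the
    // bit offset, so slot 5 lands in word 1 at bit 16. The 2-bit layout packs
    // 32 cells per word the same way.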
    fn cell_16bit(&self, slot: usize) -> u64 {
        let word = slot >> 2;
        let offset = (slot & 3) << 4;
        (self.words[word] >> offset) & 0xffff
    }

    fn cell_2bit(&self, slot: usize) -> u64 {
        let word = slot >> 5;
        let offset = (slot & 31) << 1;
        (self.words[word] >> offset) & 3
    }

    #[cfg(test)]
    fn set_cell(&mut self, slot: usize, value: u64) {
        let previous = self.cell(slot);
        self.set_cell_with_previous(slot, previous, value);
    }

    fn set_cell_with_previous(&mut self, slot: usize, previous: u64, value: u64) {
        let value = value.min(self.max_count);
        self.set_cell_raw(slot, value);
        self.note_cell_transition(previous, value, slot);
    }

    fn set_cell_raw(&mut self, slot: usize, value: u64) {
        if self.bits == 64 {
            self.words[slot] = value;
            return;
        }
        if self.bits == 16 {
            self.set_cell_16bit_raw(slot, value);
            return;
        }
        if self.bits == 2 {
            self.set_cell_2bit_raw(slot, value);
            return;
        }
        let bit = slot * self.bits as usize;
        let word = bit / 64;
        let offset = bit % 64;
        let mask = (1u64 << self.bits) - 1;
        if offset + self.bits as usize <= 64 {
            let shifted_mask = mask << offset;
            self.words[word] = (self.words[word] & !shifted_mask) | ((value & mask) << offset);
        } else {
            let low_bits = 64 - offset;
            let high_bits = self.bits as usize - low_bits;
            let low_mask = ((1u64 << low_bits) - 1) << offset;
            self.words[word] =
                (self.words[word] & !low_mask) | ((value & ((1u64 << low_bits) - 1)) << offset);
            let high_mask = (1u64 << high_bits) - 1;
            self.words[word + 1] =
                (self.words[word + 1] & !high_mask) | ((value >> low_bits) & high_mask);
        }
    }

    fn set_cell_16bit_raw(&mut self, slot: usize, value: u64) {
        let word = slot >> 2;
        let offset = (slot & 3) << 4;
        let shifted_mask = 0xffffu64 << offset;
        self.words[word] = (self.words[word] & !shifted_mask) | ((value & 0xffff) << offset);
    }

    fn set_cell_16bit_with_previous(&mut self, slot: usize, previous: u64, value: u64) {
        let value = value.min(self.max_count);
        self.set_cell_16bit_raw(slot, value);
        self.note_cell_transition(previous, value, slot);
    }

    fn set_cell_2bit_with_previous(&mut self, slot: usize, previous: u64, value: u64) {
        let value = value.min(self.max_count);
        self.set_cell_2bit_raw(slot, value);
        self.note_cell_transition(previous, value, slot);
    }

    fn set_cell_2bit_raw(&mut self, slot: usize, value: u64) {
        let word = slot >> 5;
        let offset = (slot & 31) << 1;
        let shifted_mask = 3u64 << offset;
        self.words[word] = (self.words[word] & !shifted_mask) | ((value & 3) << offset);
    }

    fn note_cell_transition(&mut self, previous: u64, value: u64, slot: usize) {
        match (previous == 0, value == 0) {
            (true, false) => {
                self.occupied_slots = self.occupied_slots.saturating_add(1);
                if let Some(slots) = &mut self.tracked_slots {
                    if slots.len() < PACKED_SKETCH_TRACKED_SLOT_LIMIT {
                        slots.push(slot);
                    } else {
                        self.tracked_slots = None;
                    }
                }
            }
            (false, true) => {
                self.occupied_slots = self.occupied_slots.saturating_sub(1);
                if let Some(slots) = &mut self.tracked_slots
                    && let Some(index) = slots.iter().position(|&tracked| tracked == slot)
                {
                    slots.swap_remove(index);
                }
            }
            _ => {}
        }
    }

    #[cfg(test)]
    fn depth_hist(&self, hist_len: usize) -> Vec<u64> {
        let Some(last_index) = hist_len.checked_sub(1) else {
            return Vec::new();
        };
        if let Some(slots) = &self.tracked_slots {
            let mut hist = slots
                .par_iter()
                .fold(Vec::new, |mut local, &slot| {
                    add_depth_to_dynamic_hist(&mut local, self.cell(slot), last_index);
                    local
                })
                .reduce(Vec::new, merge_dynamic_depth_hist);
            hist.resize(hist_len, 0);
            return hist;
        }
        let mut hist = (0..self.cells)
            .into_par_iter()
            .fold(Vec::new, |mut local, slot| {
                add_depth_to_dynamic_hist(&mut local, self.cell(slot), last_index);
                local
            })
            .reduce(Vec::new, merge_dynamic_depth_hist);
        hist.resize(hist_len, 0);
        hist
    }

    fn sparse_depth_hist(&self, hist_len: usize) -> SparseHist {
        let Some(last_index) = hist_len.checked_sub(1) else {
            return SparseHist::default();
        };
        if let Some(slots) = &self.tracked_slots {
            return slots
                .par_iter()
                .fold(SparseHist::default, |mut local, &slot| {
                    add_depth_to_sparse_hist(&mut local, self.cell(slot), last_index);
                    local
                })
                .reduce(SparseHist::default, merge_sparse_depth_hist);
        }
        (0..self.cells)
            .into_par_iter()
            .fold(SparseHist::default, |mut local, slot| {
                add_depth_to_sparse_hist(&mut local, self.cell(slot), last_index);
                local
            })
            .reduce(SparseHist::default, merge_sparse_depth_hist)
    }
}

impl PrefilterCountMinSketch {
    fn max_count(&self) -> u64 {
        match self {
            Self::Packed(sketch) => sketch.max_count,
            Self::AtomicPacked(sketch) => sketch.max_count,
        }
    }

    #[cfg(test)]
    fn bits(&self) -> u8 {
        match self {
            Self::Packed(sketch) => sketch.bits,
            Self::AtomicPacked(sketch) => sketch.bits,
        }
    }

    #[cfg(test)]
    fn update_mode(&self) -> CountMinUpdateMode {
        match self {
            Self::Packed(sketch) => sketch.update_mode,
            Self::AtomicPacked(sketch) => sketch.update_mode,
        }
    }

    fn layout_summary(
        &self,
        table: &'static str,
        prefilter_limit: Option<u64>,
    ) -> SketchLayoutSummary {
        match self {
            Self::Packed(sketch) => sketch.layout_summary(table, prefilter_limit),
            Self::AtomicPacked(sketch) => sketch.layout_summary(table, prefilter_limit),
        }
    }
}

impl CountLookup for PrefilterCountMinSketch {
    fn depth(&self, key: &KmerKey) -> u64 {
        match self {
            Self::Packed(sketch) => sketch.depth(key),
            Self::AtomicPacked(sketch) => sketch.depth(key),
        }
    }

    fn unique_kmers(&self) -> usize {
        match self {
            Self::Packed(sketch) => sketch.unique_kmers(),
            Self::AtomicPacked(sketch) => sketch.unique_kmers(),
        }
    }

    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
        match self {
            Self::Packed(sketch) => sketch.unique_kmers_at_least(min_depth),
            Self::AtomicPacked(sketch) => sketch.unique_kmers_at_least(min_depth),
        }
    }
}

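// The atomic sketch keeps one AtomicU32 per cell. Independent updates are
// plain atomic read-modify-writes; conservative updates need a consistent
// view of all hashed cells, so they serialize through a striped lock chosen
// by kcount_array_lock_index, while readers stay lock-free with relaxed
// loads.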
impl AtomicCountMinSketch {
    #[cfg(test)]
    fn new(cells: usize, hashes: usize) -> Result<Self> {
        Self::new_with_min_arrays(cells, hashes, BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS)
    }

    #[cfg(test)]
    fn new_with_min_arrays(cells: usize, hashes: usize, min_arrays: usize) -> Result<Self> {
        Self::new_with_min_arrays_and_update_mode(
            cells,
            hashes,
            min_arrays,
            CountMinUpdateMode::Conservative,
            BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
        )
    }

    fn new_with_min_arrays_and_update_mode(
        cells: usize,
        hashes: usize,
        min_arrays: usize,
        update_mode: CountMinUpdateMode,
        mask_seed: u64,
    ) -> Result<Self> {
        let cells = cells.max(1);
        let hashes = hashes.clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
        let layout =
            KCountArrayLayout::new_with_min_arrays_and_mask_seed(cells, 32, min_arrays, mask_seed);
        let cells_by_hash =
            zeroed_atomic_u32_vec(cells).context("allocating atomic count-min sketch")?;
        let locks = atomic_count_min_locks(update_mode)?;
        Ok(Self {
            cells,
            hashes,
            max_count: i32::MAX as u32,
            layout,
            update_mode,
            parallel_replay: false,
            cells_by_hash,
            locks,
            increments: AtomicU64::new(0),
            occupied_slots: AtomicUsize::new(0),
        })
    }

    fn with_parallel_replay(mut self, parallel_replay: bool) -> Self {
        self.parallel_replay = parallel_replay;
        self
    }

    fn layout_summary(
        &self,
        table: &'static str,
        prefilter_limit: Option<u64>,
    ) -> SketchLayoutSummary {
        SketchLayoutSummary {
            table,
            kind: "atomic",
            cells: self.cells,
            hashes: self.hashes,
            bits: 32,
            arrays: self.layout.array_count(),
            cells_per_array: self.layout.cells_per_array,
            mask_seed: self.layout.mask_seed,
            update_mode: self.update_mode.as_str(),
            max_count: u64::from(self.max_count),
            memory_bytes: self
                .cells_by_hash
                .len()
                .saturating_mul(std::mem::size_of::<AtomicU32>())
                .saturating_add(
                    self.locks
                        .len()
                        .saturating_mul(std::mem::size_of::<Mutex<()>>()),
                )
                .saturating_add(std::mem::size_of::<AtomicUsize>()),
            prefilter_limit,
        }
    }

    fn increment_key(&self, key: &KmerKey) {
        self.add_key_count(key, 1);
    }

    fn add_key_count(&self, key: &KmerKey, count: u64) {
        let (_, newly_occupied) = self.increment_and_count_newly_occupied(key, count);
        self.add_occupied_slots(newly_occupied);
    }

    #[cfg(test)]
    fn increment_and_return_unincremented(&self, key: &KmerKey, count: u64) -> u64 {
        let (previous_min, newly_occupied) = self.increment_and_count_newly_occupied(key, count);
        self.add_occupied_slots(newly_occupied);
        previous_min
    }

    fn add_key_count_counting_newly_occupied(&self, key: &KmerKey, count: u64) -> usize {
        self.increment_and_count_newly_occupied(key, count).1
    }

    fn add_key_count_unlocked_counting_newly_occupied(&self, key: &KmerKey, count: u64) -> usize {
        if self.update_mode == CountMinUpdateMode::Independent {
            self.increment_independent_and_count_newly_occupied(key, count)
                .1
        } else {
            self.increment_conservative_unlocked_and_count_newly_occupied(key, count)
                .1
        }
    }

    fn increment_and_count_newly_occupied(&self, key: &KmerKey, count: u64) -> (u64, usize) {
        if count == 0 {
            return (self.depth(key), 0);
        }
        if self.update_mode == CountMinUpdateMode::Independent {
            return self.increment_independent_and_count_newly_occupied(key, count);
        }
        let _guard = self.lock_for_key(key);
        self.increment_conservative_unlocked_and_count_newly_occupied(key, count)
    }

    fn increment_conservative_unlocked_and_count_newly_occupied(
        &self,
        key: &KmerKey,
        count: u64,
    ) -> (u64, usize) {
        let target_increment = count.min(u64::from(self.max_count)) as u32;
        if self.hashes == 3 {
            return self.increment_conservative_three_unlocked_and_count_newly_occupied(
                key,
                target_increment,
            );
        }
        let mut slots = [0usize; 16];
        let mut min_depth = self.max_count;
        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
        for slot in slots.iter().take(self.hashes) {
            min_depth = min_depth.min(self.cells_by_hash[*slot].load(Ordering::Relaxed));
        }
        if min_depth >= self.max_count {
            return (u64::from(min_depth), 0);
        }
        let target = min_depth
            .saturating_add(target_increment)
            .min(self.max_count);
        let mut previous_min = self.max_count;
        let mut newly_occupied = 0usize;
        for slot in slots.iter().take(self.hashes) {
            let (previous, cell_newly_occupied) =
                raise_atomic_cell_to_at_least(&self.cells_by_hash[*slot], target);
            previous_min = previous_min.min(previous);
            newly_occupied += usize::from(cell_newly_occupied);
        }
        (u64::from(previous_min), newly_occupied)
    }

    fn increment_conservative_three_unlocked_and_count_newly_occupied(
        &self,
        key: &KmerKey,
        target_increment: u32,
    ) -> (u64, usize) {
        let [first, second, third] = count_min_three_buckets(key, self.layout);
        let first_depth = self.cells_by_hash[first].load(Ordering::Relaxed);
        let second_depth = self.cells_by_hash[second].load(Ordering::Relaxed);
        let third_depth = self.cells_by_hash[third].load(Ordering::Relaxed);
        let min_depth = first_depth.min(second_depth).min(third_depth);
        if min_depth >= self.max_count {
            return (u64::from(min_depth), 0);
        }
        let target = min_depth
            .saturating_add(target_increment)
            .min(self.max_count);
        let (first_previous, first_new) =
            raise_atomic_cell_to_at_least(&self.cells_by_hash[first], target);
        let (second_previous, second_new) =
            raise_atomic_cell_to_at_least(&self.cells_by_hash[second], target);
        let (third_previous, third_new) =
            raise_atomic_cell_to_at_least(&self.cells_by_hash[third], target);
        (
            u64::from(first_previous.min(second_previous).min(third_previous)),
            usize::from(first_new) + usize::from(second_new) + usize::from(third_new),
        )
    }

    fn lock_for_key(&self, key: &KmerKey) -> std::sync::MutexGuard<'_, ()> {
        let lock_index = kcount_array_lock_index(key);
        self.locks[lock_index]
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    fn increment_independent_and_count_newly_occupied(
        &self,
        key: &KmerKey,
        count: u64,
    ) -> (u64, usize) {
        if count == 0 {
            return (self.depth(key), 0);
        }
        let increment = count.min(u64::from(self.max_count)) as u32;
        let mut previous_min = self.max_count;
        let mut newly_occupied = 0usize;
        let mut slots = [0usize; 16];
        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
        for slot in slots.iter().take(self.hashes) {
            let (previous, cell_newly_occupied) = increment_atomic_cell_saturating(
                &self.cells_by_hash[*slot],
                increment,
                self.max_count,
            );
            previous_min = previous_min.min(previous);
            newly_occupied += usize::from(cell_newly_occupied);
        }
        (u64::from(previous_min), newly_occupied)
    }

    fn add_key_counts(&self, counts: &CountMap) {
        let newly_occupied =
            if self.parallel_replay && counts.len() >= ATOMIC_SKETCH_PAR_REPLAY_MIN_KEYS {
                counts
                    .par_iter()
                    .map(|(key, count)| self.add_key_count_counting_newly_occupied(key, *count))
                    .sum()
            } else {
                counts
                    .iter()
                    .map(|(key, count)| {
                        self.add_key_count_unlocked_counting_newly_occupied(key, *count)
                    })
                    .sum()
            };
        self.add_occupied_slots(newly_occupied);
    }

    fn add_key_increments(&self, key_increments: u64) {
        self.increments.fetch_add(key_increments, Ordering::Relaxed);
    }

5550 fn add_occupied_slots(&self, newly_occupied: usize) {
5551 if newly_occupied > 0 {
5552 self.occupied_slots
5553 .fetch_add(newly_occupied, Ordering::Relaxed);
5554 }
5555 }
5556
5557 fn occupied_slots_at_least(&self, min_depth: u64) -> usize {
5558 if min_depth > u64::from(self.max_count) {
5559 return 0;
5560 }
5561 if min_depth <= 1 {
5562 return self.occupied_slots.load(Ordering::Relaxed);
5563 }
5564 let min_depth = min_depth.max(1) as u32;
5565 self.cells_by_hash
5566 .par_iter()
5567 .filter(|cell| cell.load(Ordering::Relaxed) >= min_depth)
5568 .count()
5569 }
5570
5571 #[cfg(test)]
5572 fn depth_hist(&self, hist_len: usize) -> Vec<u64> {
5573 let Some(last_index) = hist_len.checked_sub(1) else {
5574 return Vec::new();
5575 };
5576 let mut hist = self
5577 .cells_by_hash
5578 .par_iter()
5579 .fold(Vec::new, |mut local, cell| {
5580 add_depth_to_dynamic_hist(
5581 &mut local,
5582 u64::from(cell.load(Ordering::Relaxed)),
5583 last_index,
5584 );
5585 local
5586 })
5587 .reduce(Vec::new, merge_dynamic_depth_hist);
5588 hist.resize(hist_len, 0);
5589 hist
5590 }
5591
5592 fn sparse_depth_hist(&self, hist_len: usize) -> SparseHist {
5593 let Some(last_index) = hist_len.checked_sub(1) else {
5594 return SparseHist::default();
5595 };
5596 self.cells_by_hash
5597 .par_iter()
5598 .fold(SparseHist::default, |mut local, cell| {
5599 add_depth_to_sparse_hist(
5600 &mut local,
5601 u64::from(cell.load(Ordering::Relaxed)),
5602 last_index,
5603 );
5604 local
5605 })
5606 .reduce(SparseHist::default, merge_sparse_depth_hist)
5607 }
5608}
5609
5610impl AtomicPackedCountMinSketch {
5611 fn new_with_min_arrays_and_update_mode(
5612 cells: usize,
5613 hashes: usize,
5614 bits: u8,
5615 min_arrays: usize,
5616 update_mode: CountMinUpdateMode,
5617 mask_seed: u64,
5618 ) -> Result<Self> {
5619 let cells = cells.max(1);
5620 let hashes = hashes.clamp(1, BBTOOLS_KCOUNT_ARRAY_MAX_HASHES);
5621 ensure!(
5622 bits.is_power_of_two() && bits <= 64,
5623 "atomic packed count-min sketches require power-of-two cell bits up to 64"
5624 );
5625 let layout = KCountArrayLayout::new_with_min_arrays_and_mask_seed(
5626 cells, bits, min_arrays, mask_seed,
5627 );
5628 let word_count = if bits == 64 {
5629 cells
5630 } else {
5631 let cells_per_word = 64 / bits as usize;
5632 cells.div_ceil(cells_per_word)
5633 };
5634 let words = zeroed_atomic_u64_vec(word_count)
5635 .context("allocating atomic packed count-min sketch")?;
5636 let locks = atomic_count_min_locks(update_mode)?;
5637 Ok(Self {
5638 cells,
5639 hashes,
5640 bits,
5641 max_count: count_min_max_count(bits),
5642 layout,
5643 update_mode,
5644 words,
5645 locks,
5646 increments: AtomicU64::new(0),
5647 occupied_slots: AtomicUsize::new(0),
5648 })
5649 }
5650
5651 fn layout_summary(
5652 &self,
5653 table: &'static str,
5654 prefilter_limit: Option<u64>,
5655 ) -> SketchLayoutSummary {
5656 SketchLayoutSummary {
5657 table,
5658 kind: "atomic_packed",
5659 cells: self.cells,
5660 hashes: self.hashes,
5661 bits: self.bits,
5662 arrays: self.layout.array_count(),
5663 cells_per_array: self.layout.cells_per_array,
5664 mask_seed: self.layout.mask_seed,
5665 update_mode: self.update_mode.as_str(),
5666 max_count: self.max_count,
5667 memory_bytes: self
5668 .words
5669 .len()
5670 .saturating_mul(std::mem::size_of::<AtomicU64>())
5671 .saturating_add(
5672 self.locks
5673 .len()
5674 .saturating_mul(std::mem::size_of::<Mutex<()>>()),
5675 )
5676 .saturating_add(std::mem::size_of::<AtomicUsize>()),
5677 prefilter_limit,
5678 }
5679 }
5680
5681 #[cfg(test)]
5682 fn add_key_count(&self, key: &KmerKey, count: u64) {
5683 let (_, newly_occupied) = self.increment_and_count_newly_occupied(key, count);
5684 self.add_occupied_slots(newly_occupied);
5685 }
5686
5687 fn add_key_count_counting_newly_occupied(&self, key: &KmerKey, count: u64) -> usize {
5688 self.increment_and_count_newly_occupied(key, count).1
5689 }
5690
5691 fn increment_and_count_newly_occupied(&self, key: &KmerKey, count: u64) -> (u64, usize) {
5692 if count == 0 {
5693 return (self.depth(key), 0);
5694 }
5695 if self.update_mode == CountMinUpdateMode::Independent {
5696 return self.increment_independent_and_count_newly_occupied(key, count);
5697 }
5698 let _guard = self.lock_for_key(key);
5699 let target_increment = count.min(self.max_count);
5700 let mut slots = [0usize; 16];
5701 let mut min_depth = self.max_count;
5702 fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
5703 for slot in slots.iter().take(self.hashes) {
5704 min_depth = min_depth.min(self.cell(*slot));
5705 }
5706 if min_depth >= self.max_count {
5707 return (min_depth, 0);
5708 }
5709 let target = min_depth
5710 .saturating_add(target_increment)
5711 .min(self.max_count);
5712 let mut previous_min = self.max_count;
5713 let mut newly_occupied = 0usize;
5714 for slot in slots.iter().take(self.hashes) {
5715 let (previous, cell_newly_occupied) = self.raise_cell_to_at_least(*slot, target);
5716 previous_min = previous_min.min(previous);
5717 newly_occupied += usize::from(cell_newly_occupied);
5718 }
5719 (previous_min, newly_occupied)
5720 }
5721
5722 fn increment_independent_and_count_newly_occupied(
5723 &self,
5724 key: &KmerKey,
5725 count: u64,
5726 ) -> (u64, usize) {
5727 if count == 0 {
5728 return (self.depth(key), 0);
5729 }
5730 let increment = count.min(self.max_count);
5731 let mut previous_min = self.max_count;
5732 let mut newly_occupied = 0usize;
5733 let mut slots = [0usize; 16];
5734 fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
5735 for slot in slots.iter().take(self.hashes) {
5736 let (previous, cell_newly_occupied) = self.increment_cell_saturating(*slot, increment);
5737 previous_min = previous_min.min(previous);
5738 newly_occupied += usize::from(cell_newly_occupied);
5739 }
5740 (previous_min, newly_occupied)
5741 }
5742
5743 fn add_key_increments(&self, key_increments: u64) {
5744 self.increments.fetch_add(key_increments, Ordering::Relaxed);
5745 }
5746
5747 fn add_occupied_slots(&self, newly_occupied: usize) {
5748 if newly_occupied > 0 {
5749 self.occupied_slots
5750 .fetch_add(newly_occupied, Ordering::Relaxed);
5751 }
5752 }
5753
5754 fn lock_for_key(&self, key: &KmerKey) -> std::sync::MutexGuard<'_, ()> {
5755 let lock_index = kcount_array_lock_index(key);
5756 self.locks[lock_index]
5757 .lock()
5758 .unwrap_or_else(|poisoned| poisoned.into_inner())
5759 }
5760
5761 fn cell(&self, slot: usize) -> u64 {
5762 let position = self.cell_position(slot);
5763 (self.words[position.word].load(Ordering::Relaxed) >> position.shift) & position.mask
5764 }
5765
5766 fn raise_cell_to_at_least(&self, slot: usize, target: u64) -> (u64, bool) {
5767 let target = target.min(self.max_count);
5768 let position = self.cell_position(slot);
5769 let cell = &self.words[position.word];
5770 let mut current = cell.load(Ordering::Relaxed);
5771 loop {
5772 let previous = (current >> position.shift) & position.mask;
5773 if previous >= target {
5774 return (previous, false);
5775 }
5776 let next = replace_packed_cell(current, position, target);
5777 match cell.compare_exchange_weak(current, next, Ordering::Relaxed, Ordering::Relaxed) {
5778 Ok(_) => return (previous, previous == 0 && target > 0),
5779 Err(observed) => current = observed,
5780 }
5781 }
5782 }
5783
5784 fn increment_cell_saturating(&self, slot: usize, increment: u64) -> (u64, bool) {
5785 let increment = increment.min(self.max_count);
5786 let position = self.cell_position(slot);
5787 let cell = &self.words[position.word];
5788 let mut current = cell.load(Ordering::Relaxed);
5789 loop {
5790 let previous = (current >> position.shift) & position.mask;
5791 if previous >= self.max_count {
5792 return (previous, false);
5793 }
5794 let next_value = previous.saturating_add(increment).min(self.max_count);
5795 let next = replace_packed_cell(current, position, next_value);
5796 match cell.compare_exchange_weak(current, next, Ordering::Relaxed, Ordering::Relaxed) {
5797 Ok(_) => return (previous, previous == 0 && next_value > 0),
5798 Err(observed) => current = observed,
5799 }
5800 }
5801 }
5802
5803 fn cell_position(&self, slot: usize) -> PackedCellPosition {
5804 if self.bits == 64 {
5805 return PackedCellPosition {
5806 word: slot,
5807 shift: 0,
5808 mask: u64::MAX,
5809 };
5810 }
5811 let cells_per_word = 64 / self.bits as usize;
5812 let word = slot / cells_per_word;
5813 let shift = (slot % cells_per_word) * self.bits as usize;
5814 let mask = (1u64 << self.bits) - 1;
5815 PackedCellPosition { word, shift, mask }
5816 }
5817
5818 fn occupied_slots_at_least(&self, min_depth: u64) -> usize {
5819 if min_depth > self.max_count {
5820 return 0;
5821 }
5822 if min_depth <= 1 {
5823 return self.occupied_slots.load(Ordering::Relaxed);
5824 }
5825 let min_depth = min_depth.max(1);
5826 (0..self.cells)
5827 .into_par_iter()
5828 .filter(|&slot| self.cell(slot) >= min_depth)
5829 .count()
5830 }
5831}
5832
5833#[derive(Debug, Clone, Copy)]
5834struct PackedCellPosition {
5835 word: usize,
5836 shift: usize,
5837 mask: u64,
5838}
5839
5840fn replace_packed_cell(word: u64, position: PackedCellPosition, value: u64) -> u64 {
5841 let shifted_mask = position.mask << position.shift;
5842 (word & !shifted_mask) | ((value & position.mask) << position.shift)
5843}
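
// A minimal sketch of the bit-field update above: with a hand-built 8-bit
// cell at shift 8 (constructed directly rather than via cell_position), a
// replace touches only the target field and truncates oversized values.
#[cfg(test)]
#[test]
fn replace_packed_cell_updates_only_the_target_field() {
    let position = PackedCellPosition {
        word: 0,
        shift: 8,
        mask: 0xFF,
    };
    // 0xAABB_CCDD with the byte at bits 8..16 replaced by 0x11.
    assert_eq!(replace_packed_cell(0xAABB_CCDD, position, 0x11), 0xAABB_11DD);
    // Values wider than the mask are truncated into the field.
    assert_eq!(replace_packed_cell(0, position, 0x1FF), 0xFF00);
}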

fn raise_atomic_cell_to_at_least(cell: &AtomicU32, target: u32) -> (u32, bool) {
    let mut current = cell.load(Ordering::Relaxed);
    loop {
        if current >= target {
            return (current, false);
        }
        match cell.compare_exchange_weak(current, target, Ordering::Relaxed, Ordering::Relaxed) {
            Ok(_) => return (current, current == 0 && target > 0),
            Err(observed) => current = observed,
        }
    }
}

fn increment_atomic_cell_saturating(
    cell: &AtomicU32,
    increment: u32,
    max_count: u32,
) -> (u32, bool) {
    let mut current = cell.load(Ordering::Relaxed);
    loop {
        if current >= max_count {
            return (current, false);
        }
        let next = current.saturating_add(increment).min(max_count);
        match cell.compare_exchange_weak(current, next, Ordering::Relaxed, Ordering::Relaxed) {
            Ok(_) => return (current, current == 0 && next > 0),
            Err(observed) => current = observed,
        }
    }
}
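
// Minimal checks for the two lock-free CAS helpers above: the monotone
// raise reports the previous value plus first-occupancy, and the
// saturating add stops at max_count.
#[cfg(test)]
#[test]
fn atomic_cell_updates_report_previous_value_and_occupancy() {
    let cell = AtomicU32::new(0);
    assert_eq!(raise_atomic_cell_to_at_least(&cell, 5), (0, true));
    // Raising to a smaller target is a no-op and not newly occupied.
    assert_eq!(raise_atomic_cell_to_at_least(&cell, 3), (5, false));

    let cell = AtomicU32::new(0);
    assert_eq!(increment_atomic_cell_saturating(&cell, 4, 6), (0, true));
    // 4 + 4 saturates at max_count = 6; the cell was already occupied.
    assert_eq!(increment_atomic_cell_saturating(&cell, 4, 6), (4, false));
    assert_eq!(cell.load(Ordering::Relaxed), 6);
}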

fn atomic_count_min_locks(update_mode: CountMinUpdateMode) -> Result<Vec<Mutex<()>>> {
    if update_mode == CountMinUpdateMode::Independent {
        return Ok(Vec::new());
    }
    let mut locks = Vec::new();
    locks
        .try_reserve_exact(BBTOOLS_KCOUNT_ARRAY_LOCKS)
        .context("allocating atomic count-min sketch locks")?;
    locks.resize_with(BBTOOLS_KCOUNT_ARRAY_LOCKS, || Mutex::new(()));
    Ok(locks)
}

impl CountLookup for PackedCountMinSketch {
    fn depth(&self, key: &KmerKey) -> u64 {
        if self.bits == 16 && self.hashes == 3 {
            return self.depth_16bit_3hash(key);
        }
        let mut slots = [0usize; 16];
        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
        slots
            .iter()
            .take(self.hashes)
            .map(|&slot| self.cell(slot))
            .min()
            .unwrap_or(0)
    }

    fn unique_kmers(&self) -> usize {
        self.unique_kmers_at_least(1)
    }

    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
        let occupied = self.occupied_slots_at_least(min_depth);
        estimate_unique_kmers_from_occupied(self.cells, occupied, self.hashes, self.increments)
    }
}

impl CountLookup for AtomicCountMinSketch {
    fn depth(&self, key: &KmerKey) -> u64 {
        let mut slots = [0usize; 16];
        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
        slots
            .iter()
            .take(self.hashes)
            .map(|&slot| u64::from(self.cells_by_hash[slot].load(Ordering::Relaxed)))
            .min()
            .unwrap_or(0)
    }

    fn unique_kmers(&self) -> usize {
        self.unique_kmers_at_least(1)
    }

    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
        let occupied = self.occupied_slots_at_least(min_depth);
        let increments = self.increments.load(Ordering::Relaxed);
        estimate_unique_kmers_from_occupied(self.cells, occupied, self.hashes, increments)
    }
}

impl CountLookup for AtomicPackedCountMinSketch {
    fn depth(&self, key: &KmerKey) -> u64 {
        let mut slots = [0usize; 16];
        fill_count_min_buckets(key, self.hashes, self.layout, &mut slots);
        slots
            .iter()
            .take(self.hashes)
            .map(|&slot| self.cell(slot))
            .min()
            .unwrap_or(0)
    }

    fn unique_kmers(&self) -> usize {
        self.unique_kmers_at_least(1)
    }

    fn unique_kmers_at_least(&self, min_depth: u64) -> usize {
        let occupied = self.occupied_slots_at_least(min_depth);
        let increments = self.increments.load(Ordering::Relaxed);
        estimate_unique_kmers_from_occupied(self.cells, occupied, self.hashes, increments)
    }
}

fn estimate_unique_kmers_from_occupied(
    total_slots: usize,
    occupied_slots: usize,
    hashes: usize,
    increments: u64,
) -> usize {
    if occupied_slots == 0 || total_slots == 0 {
        return 0;
    }
    let increment_cap = usize_from_u64_saturating(increments);
    if occupied_slots >= total_slots {
        return increment_cap;
    }
    let used_fraction = occupied_slots as f64 / total_slots as f64;
    let hash_count = hashes.max(1) as f64;
    let one_hash_fraction = 1.0 - (1.0 - used_fraction).powf(1.0 / hash_count);
    let estimate = (-(total_slots as f64) * (1.0 - one_hash_fraction).ln()).round();
    let estimate = estimate.max(1.0) as usize;
    estimate.min(increment_cap)
}
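
// A worked example for the occupancy-based estimator: with one hash row and
// 10% of 1000 slots occupied, the linear-counting style correction gives
// round(-1000 * ln(0.9)) = 105 distinct keys, capped by the total number of
// increments seen.
#[cfg(test)]
#[test]
fn unique_kmer_estimate_from_occupancy_matches_hand_computation() {
    assert_eq!(estimate_unique_kmers_from_occupied(1000, 0, 1, 1_000_000), 0);
    assert_eq!(estimate_unique_kmers_from_occupied(1000, 100, 1, 1_000_000), 105);
    // A saturated table falls back to the increment cap.
    assert_eq!(estimate_unique_kmers_from_occupied(10, 10, 3, 1234), 1234);
}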

fn usize_from_u64_saturating(value: u64) -> usize {
    usize::try_from(value).unwrap_or(usize::MAX)
}

fn count_min_max_count(bits: u8) -> u64 {
    if bits >= 31 {
        i32::MAX as u64
    } else {
        (1u64 << bits.max(1)) - 1
    }
}
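
// Cell-width saturation values: (2^bits - 1) for narrow cells, clamped to
// i32::MAX at 31 bits and above to mirror the BBTools counter limit.
#[cfg(test)]
#[test]
fn count_min_max_count_saturates_at_i32_max() {
    assert_eq!(count_min_max_count(2), 3);
    assert_eq!(count_min_max_count(16), 65_535);
    assert_eq!(count_min_max_count(31), i32::MAX as u64);
    assert_eq!(count_min_max_count(64), i32::MAX as u64);
}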

impl KCountArrayLayout {
    #[cfg(test)]
    fn new(cells: usize, bits: u8) -> Self {
        Self::new_with_min_arrays(cells, bits, BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS)
    }

    #[cfg(test)]
    fn new_with_min_arrays(cells: usize, bits: u8, min_arrays: usize) -> Self {
        Self::new_with_min_arrays_and_mask_seed(
            cells,
            bits,
            min_arrays,
            BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
        )
    }

    fn new_with_min_arrays_and_mask_seed(
        cells: usize,
        bits: u8,
        min_arrays: usize,
        mask_seed: u64,
    ) -> Self {
        let cells = cells.max(1);
        let arrays = kcount_array_count(cells, bits, min_arrays);
        let cells_per_array = (cells / arrays).max(1);
        Self {
            array_mask: arrays.saturating_sub(1) as u64,
            array_bits: arrays.trailing_zeros(),
            cells_per_array,
            mask_seed,
            masks: bbtools_hash_masks(mask_seed),
        }
    }

    fn array_count(self) -> usize {
        self.array_mask.saturating_add(1) as usize
    }

    fn bucket(self, hashed: u64) -> usize {
        if self.cells_per_array <= 1 && self.array_mask == 0 {
            return 0;
        }
        let array_num = (hashed & self.array_mask) as usize;
        let cell = ((hashed >> self.array_bits) % self.cells_per_array as u64) as usize;
        array_num * self.cells_per_array + cell
    }
}
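
// A small property check, assuming only what bucket() guarantees above:
// every bucket index is below array_count() * cells_per_array, so sketch
// storage sized that way cannot be indexed out of bounds.
#[cfg(test)]
#[test]
fn layout_buckets_stay_in_range() {
    let layout = KCountArrayLayout::new(1_000, 32);
    let capacity = layout.array_count() * layout.cells_per_array;
    for hashed in [0u64, 1, 0xDEAD_BEEF, u64::MAX] {
        assert!(layout.bucket(hashed) < capacity);
    }
}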

#[cfg(test)]
fn count_min_bucket(key: &KmerKey, hash_index: usize, cells: usize) -> usize {
    count_min_bucket_with_layout(key, hash_index, KCountArrayLayout::new(cells, 32))
}

#[cfg(test)]
fn count_min_bucket_with_layout(
    key: &KmerKey,
    hash_index: usize,
    layout: KCountArrayLayout,
) -> usize {
    let hashed = bbtools_count_min_row_hash_with_masks(raw_kmer_key(key), hash_index, layout.masks);
    layout.bucket(hashed)
}

#[inline]
fn fill_count_min_buckets(
    key: &KmerKey,
    hashes: usize,
    layout: KCountArrayLayout,
    slots: &mut [usize; 16],
) {
    let hashes = hashes.min(slots.len());
    if hashes == 0 {
        return;
    }
    let mut hashed = bbtools_mask_hash_with_masks(raw_kmer_key(key), 0, layout.masks);
    slots[0] = layout.bucket(hashed);
    for (hash_index, slot) in slots.iter_mut().enumerate().take(hashes).skip(1) {
        hashed = hashed.rotate_right(BBTOOLS_HASH_BITS);
        hashed = bbtools_mask_hash_with_masks(hashed, hash_index, layout.masks);
        *slot = layout.bucket(hashed);
    }
}

#[inline]
fn count_min_three_buckets(key: &KmerKey, layout: KCountArrayLayout) -> [usize; 3] {
    count_min_three_buckets_raw(raw_kmer_key(key), layout)
}

#[inline]
fn count_min_three_buckets_raw(raw_key: u64, layout: KCountArrayLayout) -> [usize; 3] {
    let mut hashed = bbtools_mask_hash_with_masks(raw_key, 0, layout.masks);
    let first = layout.bucket(hashed);
    hashed = bbtools_mask_hash_with_masks(hashed.rotate_right(BBTOOLS_HASH_BITS), 1, layout.masks);
    let second = layout.bucket(hashed);
    hashed = bbtools_mask_hash_with_masks(hashed.rotate_right(BBTOOLS_HASH_BITS), 2, layout.masks);
    [first, second, layout.bucket(hashed)]
}
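
// The dedicated three-hash path must agree with the generic bucket filler;
// the three-hash fast paths in the sketches above rely on this equivalence.
#[cfg(test)]
#[test]
fn three_bucket_fast_path_matches_generic_filler() {
    let layout = KCountArrayLayout::new(4_096, 32);
    for raw in [0u64, 1, 42, 0x0123_4567_89AB_CDEF] {
        let key = KmerKey::Short(raw);
        let mut slots = [0usize; 16];
        fill_count_min_buckets(&key, 3, layout, &mut slots);
        assert_eq!(
            count_min_three_buckets(&key, layout),
            [slots[0], slots[1], slots[2]]
        );
    }
}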

#[inline]
fn count_min_two_buckets(key: &KmerKey, layout: KCountArrayLayout) -> [usize; 2] {
    let mut hashed = bbtools_mask_hash_with_masks(raw_kmer_key(key), 0, layout.masks);
    let first = layout.bucket(hashed);
    hashed = bbtools_mask_hash_with_masks(hashed.rotate_right(BBTOOLS_HASH_BITS), 1, layout.masks);
    [first, layout.bucket(hashed)]
}

#[cfg(test)]
fn bbtools_count_min_row_hash_with_masks(
    raw_key: u64,
    hash_index: usize,
    masks: &BbtoolsHashMaskTable,
) -> u64 {
    let mut key = bbtools_mask_hash_with_masks(raw_key, 0, masks);
    for row in 1..=hash_index {
        key = key.rotate_right(BBTOOLS_HASH_BITS);
        key = bbtools_mask_hash_with_masks(key, row, masks);
    }
    key
}

#[cfg(test)]
fn bbtools_mask_hash(key: u64, row: usize, mask_seed: u64) -> u64 {
    let masks = bbtools_hash_masks(mask_seed);
    bbtools_mask_hash_with_masks(key, row, masks)
}

#[inline]
fn bbtools_mask_hash_with_masks(mut key: u64, row: usize, masks: &BbtoolsHashMaskTable) -> u64 {
    let row = row & 7;
    let mut cell =
        ((key & BBTOOLS_LONG_MAX_VALUE) % (BBTOOLS_HASH_ARRAY_LENGTH as u64 - 1)) as usize;

    if row == 0 {
        key ^= masks[(row + 4) & 7][cell];
        cell = ((key >> 5) & BBTOOLS_HASH_CELL_MASK) as usize;
    }

    key ^ masks[row][cell]
}
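
// The row index is taken modulo 8 above, so row 8 reuses the row-0 masks; a
// quick determinism check using the default mask seed.
#[cfg(test)]
#[test]
fn mask_hash_row_index_wraps_modulo_eight() {
    let key = 0x0123_4567_89AB_CDEF;
    let seed = BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED;
    assert_eq!(bbtools_mask_hash(key, 8, seed), bbtools_mask_hash(key, 0, seed));
}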

fn bbtools_hash_masks(mask_seed: u64) -> BbtoolsHashMaskRef {
    static SEED0_MASKS: OnceLock<BbtoolsHashMaskTable> = OnceLock::new();
    static SEED7_MASKS: OnceLock<BbtoolsHashMaskTable> = OnceLock::new();
    static SEED14_MASKS: OnceLock<BbtoolsHashMaskTable> = OnceLock::new();
    static OTHER_MASKS: OnceLock<Mutex<BbtoolsHashMaskCache>> = OnceLock::new();
    match mask_seed {
        BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED => {
            SEED0_MASKS.get_or_init(|| make_bbtools_hash_masks(mask_seed))
        }
        BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED => {
            SEED7_MASKS.get_or_init(|| make_bbtools_hash_masks(mask_seed))
        }
        BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED => {
            SEED14_MASKS.get_or_init(|| make_bbtools_hash_masks(mask_seed))
        }
        _ => {
            let cache = OTHER_MASKS.get_or_init(|| Mutex::new(FxHashMap::default()));
            let mut cache = cache
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner());
            if let Some(&masks) = cache.get(&mask_seed) {
                return masks;
            }
            // Tables for ad-hoc seeds are leaked on purpose: callers hold
            // &'static references, and only a handful of distinct seeds
            // ever occur in one run.
            let masks = Box::leak(Box::new(make_bbtools_hash_masks(mask_seed)));
            cache.insert(mask_seed, masks);
            masks
        }
    }
}

fn make_bbtools_hash_masks(mask_seed: u64) -> BbtoolsHashMaskTable {
    let mut masks = [[0u64; BBTOOLS_HASH_ARRAY_LENGTH]; 8];
    let mut rng = BbtoolsXoshiro::new(mask_seed);
    for row_masks in &mut masks {
        fill_bbtools_hash_mask_row(row_masks, &mut rng);
    }
    masks
}

fn fill_bbtools_hash_mask_row(
    row_masks: &mut [u64; BBTOOLS_HASH_ARRAY_LENGTH],
    rng: &mut BbtoolsXoshiro,
) {
    let mut low_cells = [0u8; BBTOOLS_HASH_ARRAY_LENGTH];
    let mut rotated_cells = [0u8; BBTOOLS_HASH_ARRAY_LENGTH];

    for mask in row_masks {
        let (value, low_cell, rotated_cell) = loop {
            let mut value = rng.next_long();
            while (value & 0xffff_ffff).count_ones() < 16 {
                value |= 1u64 << rng.next_power_of_two_int(32);
            }
            while (value & 0xffff_ffff).count_ones() > 16 {
                value &= !(1u64 << rng.next_power_of_two_int(32));
            }
            while (value & 0xffff_ffff_0000_0000).count_ones() < 16 {
                value |= 1u64 << (rng.next_power_of_two_int(32) + 32);
            }
            while (value & 0xffff_ffff_0000_0000).count_ones() > 16 {
                value &= !(1u64 << (rng.next_power_of_two_int(32) + 32));
            }

            let low_cell = (value & BBTOOLS_HASH_CELL_MASK) as usize;
            let rotated_cell =
                (((value as i64) >> BBTOOLS_HASH_BITS) as u64 & BBTOOLS_HASH_CELL_MASK) as usize;
            if low_cells[low_cell] == 0 && rotated_cells[rotated_cell] == 0 {
                break (value & BBTOOLS_LONG_MAX_VALUE, low_cell, rotated_cell);
            }
        };

        *mask = value;
        low_cells[low_cell] = low_cells[low_cell].saturating_add(1);
        rotated_cells[rotated_cell] = rotated_cells[rotated_cell].saturating_add(1);
    }
}

struct BbtoolsXoshiro {
    s0: u64,
    s1: u64,
    s2: u64,
    s3: u64,
}

impl BbtoolsXoshiro {
    fn new(seed: u64) -> Self {
        let mut rng = Self {
            s0: seed,
            s1: Self::mix_seed(seed),
            s2: 0,
            s3: 0,
        };
        rng.s2 = Self::mix_seed(rng.s1);
        rng.s3 = Self::mix_seed(rng.s2);
        if rng.s0 == 0 && rng.s1 == 0 && rng.s2 == 0 && rng.s3 == 0 {
            rng.s0 = 0x5DEECE66D;
            rng.s1 = 0xB;
            rng.s2 = 0xCCA;
            rng.s3 = 0xF00;
        }
        for _ in 0..4 {
            rng.next_long();
        }
        rng
    }

    fn mix_seed(mut value: u64) -> u64 {
        value = value.wrapping_add(0x9E37_79B9_7F4A_7C15);
        value = (value ^ (value >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
        value = (value ^ (value >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
        value ^ (value >> 31)
    }

    fn next_long(&mut self) -> u64 {
        let result = self.s0.wrapping_add(self.s3);
        let t = self.s1 << 17;

        self.s2 ^= self.s0;
        self.s3 ^= self.s1;
        self.s1 ^= self.s2;
        self.s0 ^= self.s3;

        self.s2 ^= t;
        self.s3 = self.s3.rotate_left(45);

        result
    }

    fn next_power_of_two_int(&mut self, bound: u32) -> u32 {
        debug_assert!(bound.is_power_of_two());
        (self.next_long() as u32) & (bound - 1)
    }
}
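
// The generator above is a xoshiro256+-style design with a SplitMix64-style
// seeding pass; a quick check that the stream is deterministic for a fixed
// seed and that the bounded draw masks down to the power-of-two bound.
#[cfg(test)]
#[test]
fn bbtools_xoshiro_is_deterministic_and_bounded() {
    let mut a = BbtoolsXoshiro::new(12345);
    let mut b = BbtoolsXoshiro::new(12345);
    for _ in 0..8 {
        assert_eq!(a.next_long(), b.next_long());
    }
    for _ in 0..64 {
        assert!(a.next_power_of_two_int(32) < 32);
    }
}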

fn new_count_map(config: &Config) -> CountMap {
    let mut counts = CountMap::default();
    if let Some(capacity) = count_map_capacity_hint(config) {
        let _ = counts.try_reserve(capacity);
    }
    counts
}

fn count_map_with_capacity(capacity: usize) -> CountMap {
    let mut counts = CountMap::default();
    if capacity > 0 {
        let _ = counts.try_reserve(capacity);
    }
    counts
}

fn count_chunk_local_map(
    config: &Config,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
) -> CountMap {
    count_map_with_capacity(count_chunk_local_map_capacity(config, pairs))
}

fn count_chunk_local_map_capacity(
    config: &Config,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
) -> usize {
    let total_windows: usize = pairs
        .iter()
        .map(|(r1, r2)| pair_kmer_window_capacity(config, r1, r2.as_ref()))
        .sum();
    if total_windows == 0 {
        return 0;
    }
    total_windows
        .div_ceil(rayon::current_num_threads().max(1))
        .clamp(64, COUNT_CHUNK_LOCAL_MAP_MAX_CAPACITY)
}

fn count_map_capacity_hint(config: &Config) -> Option<usize> {
    let explicit = config.table_initial_size;
    let prealloc = preallocation_capacity_hint(config);
    explicit.max(prealloc)
}

fn preallocation_capacity_hint(config: &Config) -> Option<usize> {
    let fraction = config.table_prealloc_fraction?;
    let reads = config.table_reads.or(config.max_reads)?;
    let reads = usize::try_from(reads).ok()?;
    if reads == 0 || fraction <= 0.0 {
        return None;
    }
    let mates = if config.in2.is_some() || config.interleaved {
        2usize
    } else {
        1usize
    };
    let kmers_per_read_hint = 100usize.saturating_sub(config.k).saturating_add(1).max(1);
    let raw = reads
        .saturating_mul(mates)
        .saturating_mul(kmers_per_read_hint);
    Some(((raw as f64) * fraction).ceil().max(1.0) as usize)
}
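
// A worked example of the hint above under assumed settings (the numbers
// are illustrative, not tied to any real Config): 1,000,000 single-end
// reads at k = 31 give 100 - 31 + 1 = 70 assumed k-mer windows per read,
// and a prealloc fraction of 0.5 yields ceil(1_000_000 * 1 * 70 * 0.5) =
// 35_000_000 reserved entries.
#[cfg(test)]
#[test]
fn preallocation_hint_arithmetic_example() {
    let reads = 1_000_000usize;
    let mates = 1usize;
    let kmers_per_read_hint = 100usize.saturating_sub(31).saturating_add(1).max(1);
    let raw = reads * mates * kmers_per_read_hint;
    let hint = ((raw as f64) * 0.5).ceil().max(1.0) as usize;
    assert_eq!(hint, 35_000_000);
}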

fn count_primary(config: &Config, counts: &mut CountMap) -> Result<()> {
    if let Some(paths) = primary_input_lists(config) {
        if let Some(first) = paths.first.first() {
            if let Some(second) = paths.second.as_ref().and_then(|paths| paths.first()) {
                count_paired_files(config, first, second, counts, config.table_reads)?;
            } else {
                count_single_file(config, first, counts, config.table_reads)?;
            }
        }
        for path in paths.first.iter().skip(1) {
            count_single_file(config, path, counts, None)?;
        }
        if let Some(second) = &paths.second {
            for path in second.iter().skip(1) {
                count_single_file(config, path, counts, None)?;
            }
        }
        return Ok(());
    }

    let mut readers = PrimaryReaders::open(config, config.table_reads)?;
    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
    while let Some((r1, r2)) = readers.next_pair()? {
        chunk.push((r1, r2));
        if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
            increment_counts_from_pair_chunk(config, counts, &chunk);
            chunk.clear();
        }
    }
    if !chunk.is_empty() {
        increment_counts_from_pair_chunk(config, counts, &chunk);
    }
    Ok(())
}

fn count_primary_sketch(
    config: &Config,
    sketch: &mut PackedCountMinSketch,
    prefilter: Option<PrefilterGate<'_>>,
) -> Result<()> {
    if let Some(paths) = primary_input_lists(config) {
        if let Some(first) = paths.first.first() {
            if let Some(second) = paths.second.as_ref().and_then(|paths| paths.first()) {
                count_paired_files_sketch(
                    config,
                    first,
                    second,
                    sketch,
                    config.table_reads,
                    prefilter,
                )?;
            } else {
                count_single_file_sketch(config, first, sketch, config.table_reads, prefilter)?;
            }
        }
        for path in paths.first.iter().skip(1) {
            count_single_file_sketch(config, path, sketch, None, prefilter)?;
        }
        if let Some(second) = &paths.second {
            for path in second.iter().skip(1) {
                count_single_file_sketch(config, path, sketch, None, prefilter)?;
            }
        }
        return Ok(());
    }

    let mut readers = PrimaryReaders::open(config, config.table_reads)?;
    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
    while let Some((r1, r2)) = readers.next_pair()? {
        chunk.push((r1, r2));
        if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
            increment_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
            chunk.clear();
        }
    }
    if !chunk.is_empty() {
        increment_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
    }
    Ok(())
}

fn count_primary_prefilter_sketch(
    config: &Config,
    sketch: &mut PrefilterCountMinSketch,
) -> Result<()> {
    match sketch {
        PrefilterCountMinSketch::Packed(sketch) => count_primary_sketch(config, sketch, None),
        PrefilterCountMinSketch::AtomicPacked(sketch) => {
            count_primary_atomic_packed_sketch(config, sketch)
        }
    }
}

fn count_primary_atomic_packed_sketch(
    config: &Config,
    sketch: &AtomicPackedCountMinSketch,
) -> Result<()> {
    if let Some(paths) = primary_input_lists(config) {
        if let Some(first) = paths.first.first() {
            if let Some(second) = paths.second.as_ref().and_then(|paths| paths.first()) {
                count_paired_files_atomic_packed_sketch(
                    config,
                    first,
                    second,
                    sketch,
                    config.table_reads,
                )?;
            } else {
                count_single_file_atomic_packed_sketch(config, first, sketch, config.table_reads)?;
            }
        }
        for path in paths.first.iter().skip(1) {
            count_single_file_atomic_packed_sketch(config, path, sketch, None)?;
        }
        if let Some(second) = &paths.second {
            for path in second.iter().skip(1) {
                count_single_file_atomic_packed_sketch(config, path, sketch, None)?;
            }
        }
        return Ok(());
    }

    let mut readers = PrimaryReaders::open(config, config.table_reads)?;
    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
    while let Some((r1, r2)) = readers.next_pair()? {
        chunk.push((r1, r2));
        if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
            increment_atomic_packed_sketch_from_pair_chunk(config, sketch, &chunk);
            chunk.clear();
        }
    }
    if !chunk.is_empty() {
        increment_atomic_packed_sketch_from_pair_chunk(config, sketch, &chunk);
    }
    Ok(())
}

fn count_primary_atomic_sketch(
    config: &Config,
    sketch: &AtomicCountMinSketch,
    prefilter: Option<PrefilterGate<'_>>,
) -> Result<()> {
    if let Some(paths) = primary_input_lists(config) {
        if let Some(first) = paths.first.first() {
            if let Some(second) = paths.second.as_ref().and_then(|paths| paths.first()) {
                count_paired_files_atomic_sketch(
                    config,
                    first,
                    second,
                    sketch,
                    config.table_reads,
                    prefilter,
                )?;
            } else {
                count_single_file_atomic_sketch(
                    config,
                    first,
                    sketch,
                    config.table_reads,
                    prefilter,
                )?;
            }
        }
        for path in paths.first.iter().skip(1) {
            count_single_file_atomic_sketch(config, path, sketch, None, prefilter)?;
        }
        if let Some(second) = &paths.second {
            for path in second.iter().skip(1) {
                count_single_file_atomic_sketch(config, path, sketch, None, prefilter)?;
            }
        }
        return Ok(());
    }

    let mut readers = PrimaryReaders::open(config, config.table_reads)?;
    let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
    while let Some((r1, r2)) = readers.next_pair()? {
        chunk.push((r1, r2));
        if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
            increment_atomic_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
            chunk.clear();
        }
    }
    if !chunk.is_empty() {
        increment_atomic_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
    }
    Ok(())
}

fn count_primary_gpu_reduced_runs_sketch(
    config: &Config,
    sketch: &mut PackedCountMinSketch,
) -> Result<()> {
    for_each_gpu_reduced_chunk_run(config, |key, count| {
        sketch.add_key_count(&key, count);
        sketch.add_key_increments(count);
    })
}

fn count_primary_gpu_reduced_runs_atomic_sketch(
    config: &Config,
    sketch: &AtomicCountMinSketch,
) -> Result<()> {
    for_each_gpu_reduced_chunk_run(config, |key, count| {
        sketch.add_key_count(&key, count);
        sketch.add_key_increments(count);
    })
}

fn for_each_gpu_reduced_chunk_run<F>(config: &Config, mut f: F) -> Result<()>
where
    F: FnMut(KmerKey, u64),
{
    let helper = config
        .gpu_helper
        .as_ref()
        .context("gpucounting=t requires gpuhelper=<cuda_kmer_reduce_runs binary>")?;
    if !helper.exists() {
        bail!("gpuhelper does not exist: {}", helper.display());
    }
    ensure!(
        config.k <= 31,
        "gpucounting=t currently supports short k-mers only (k<=31)"
    );
    ensure!(
        !use_prefilter_collision_estimates(config),
        "gpucounting=t currently supports the main bounded sketch without prefilter=t"
    );
    let temp_dir = config.temp_dir.clone().unwrap_or_else(std::env::temp_dir);
    fs::create_dir_all(&temp_dir)
        .with_context(|| format!("creating GPU counting temp dir {}", temp_dir.display()))?;
    let token = format!(
        "{}_{}",
        std::process::id(),
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_nanos()
    );
    let kmers_path = temp_dir.join(format!("bbnorm-rs-gpu-kmers-{token}.u64"));
    let runs_path = temp_dir.join(format!("bbnorm-rs-gpu-runs-{token}.bin"));
    let result = (|| {
        let mut readers = PrimaryReaders::open(config, config.table_reads)?;
        let mut persistent = config
            .gpu_persistent
            .then(|| PersistentGpuReducer::start(helper))
            .transpose()?;
        let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
        let mut persistent_keys = Vec::new();
        while let Some((r1, r2)) = readers.next_pair()? {
            chunk.push((r1, r2));
            if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
                if let Some(reducer) = &mut persistent {
                    reduce_gpu_pair_chunk_persistent(
                        config,
                        reducer,
                        &chunk,
                        &mut persistent_keys,
                        &mut f,
                    )?;
                } else {
                    reduce_gpu_pair_chunk(config, helper, &kmers_path, &runs_path, &chunk, &mut f)?;
                }
                chunk.clear();
            }
        }
        if !chunk.is_empty() {
            if let Some(reducer) = &mut persistent {
                reduce_gpu_pair_chunk_persistent(
                    config,
                    reducer,
                    &chunk,
                    &mut persistent_keys,
                    &mut f,
                )?;
            } else {
                reduce_gpu_pair_chunk(config, helper, &kmers_path, &runs_path, &chunk, &mut f)?;
            }
        }
        if let Some(reducer) = persistent {
            reducer.finish()?;
        }
        Ok(())
    })();
    let _ = fs::remove_file(&kmers_path);
    let _ = fs::remove_file(&runs_path);
    result
}

fn reduce_gpu_pair_chunk<F>(
    config: &Config,
    helper: &Path,
    kmers_path: &Path,
    runs_path: &Path,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
    f: &mut F,
) -> Result<()>
where
    F: FnMut(KmerKey, u64),
{
    write_pair_chunk_short_kmers(config, pairs, kmers_path)?;
    if fs::metadata(kmers_path)?.len() == 0 {
        return Ok(());
    }
    let status = Command::new(helper)
        .arg(kmers_path)
        .arg(runs_path)
        .status()
        .with_context(|| format!("running GPU helper {}", helper.display()))?;
    if !status.success() {
        bail!("GPU helper failed with status {status}");
    }
    replay_reduced_runs_file(runs_path, f)?;
    let _ = fs::remove_file(kmers_path);
    let _ = fs::remove_file(runs_path);
    Ok(())
}

fn reduce_gpu_pair_chunk_persistent<F>(
    config: &Config,
    reducer: &mut PersistentGpuReducer,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
    keys: &mut Vec<u64>,
    f: &mut F,
) -> Result<()>
where
    F: FnMut(KmerKey, u64),
{
    collect_pair_chunk_short_kmers(config, pairs, keys)?;
    if keys.is_empty() {
        return Ok(());
    }
    reducer.reduce(keys, f)
}

fn write_pair_chunk_short_kmers(
    config: &Config,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
    path: &Path,
) -> Result<()> {
    let mut writer = BufWriter::new(
        fs::File::create(path).with_context(|| format!("create {}", path.display()))?,
    );
    let mut keys = Vec::new();
    collect_pair_chunk_short_kmers(config, pairs, &mut keys)?;
    for raw in keys {
        writer.write_all(&raw.to_le_bytes())?;
    }
    writer.flush()?;
    Ok(())
}

fn collect_pair_chunk_short_kmers(
    config: &Config,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
    out: &mut Vec<u64>,
) -> Result<()> {
    out.clear();
    let mut keys = Vec::new();
    for (r1, r2) in pairs {
        if config.remove_duplicate_kmers {
            fill_unique_pair_kmers(config, r1, r2.as_ref(), &mut keys);
            for key in &keys {
                out.push(short_kmer_raw(key)?);
            }
        } else {
            let mut write_error = None;
            for_each_kmer_for_record(r1, config, |key| match short_kmer_raw(&key) {
                Ok(raw) => out.push(raw),
                Err(err) => {
                    write_error = Some(err);
                }
            });
            if let Some(mate) = r2 {
                for_each_kmer_for_record(mate, config, |key| match short_kmer_raw(&key) {
                    Ok(raw) => out.push(raw),
                    Err(err) => {
                        write_error = Some(err);
                    }
                });
            }
            if let Some(err) = write_error {
                return Err(err);
            }
        }
    }
    Ok(())
}

fn short_kmer_raw(key: &KmerKey) -> Result<u64> {
    let KmerKey::Short(raw) = key else {
        bail!("GPU counting helper only accepts short k-mer keys");
    };
    Ok(*raw)
}
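// Wire protocol for the persistent GPU helper (as implemented below): each
// request is a little-endian u64 key count followed by that many u64 keys;
// each response is a little-endian u64 unique-run count followed by that
// many 12-byte records (u64 key, u32 count). A key count of u64::MAX tells
// the helper to exit.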
struct PersistentGpuReducer {
    child: Child,
    stdin: BufWriter<ChildStdin>,
    stdout: BufReader<ChildStdout>,
}

impl PersistentGpuReducer {
    fn start(helper: &Path) -> Result<Self> {
        let mut child = Command::new(helper)
            .stdin(Stdio::piped())
            .stdout(Stdio::piped())
            .stderr(Stdio::inherit())
            .spawn()
            .with_context(|| format!("starting persistent GPU helper {}", helper.display()))?;
        let stdin = child
            .stdin
            .take()
            .context("persistent GPU helper stdin was not piped")?;
        let stdout = child
            .stdout
            .take()
            .context("persistent GPU helper stdout was not piped")?;
        Ok(Self {
            child,
            stdin: BufWriter::new(stdin),
            stdout: BufReader::new(stdout),
        })
    }

    fn reduce<F>(&mut self, keys: &[u64], f: &mut F) -> Result<()>
    where
        F: FnMut(KmerKey, u64),
    {
        let count = keys.len() as u64;
        self.stdin.write_all(&count.to_le_bytes())?;
        for key in keys {
            self.stdin.write_all(&key.to_le_bytes())?;
        }
        self.stdin.flush()?;

        let mut unique_buf = [0u8; 8];
        self.stdout
            .read_exact(&mut unique_buf)
            .context("reading persistent GPU helper unique count")?;
        let unique = u64::from_le_bytes(unique_buf);
        let mut record = [0u8; 12];
        for _ in 0..unique {
            self.stdout
                .read_exact(&mut record)
                .context("reading persistent GPU helper reduced run")?;
            let key = u64::from_le_bytes(record[0..8].try_into().unwrap());
            let count = u32::from_le_bytes(record[8..12].try_into().unwrap());
            f(KmerKey::Short(key), u64::from(count));
        }
        Ok(())
    }

    fn finish(mut self) -> Result<()> {
        self.stdin.write_all(&u64::MAX.to_le_bytes())?;
        self.stdin.flush()?;
        drop(self.stdin);
        let status = self
            .child
            .wait()
            .context("waiting for persistent GPU helper")?;
        if !status.success() {
            bail!("persistent GPU helper failed with status {status}");
        }
        Ok(())
    }
}

fn replay_reduced_runs_file<F>(path: &Path, f: &mut F) -> Result<()>
where
    F: FnMut(KmerKey, u64),
{
    let mut reader =
        BufReader::new(fs::File::open(path).with_context(|| format!("open {}", path.display()))?);
    let mut record = [0u8; 12];
    loop {
        match reader.read_exact(&mut record) {
            Ok(()) => {
                let key = u64::from_le_bytes(record[0..8].try_into().unwrap());
                let count = u32::from_le_bytes(record[8..12].try_into().unwrap());
                f(KmerKey::Short(key), u64::from(count));
            }
            Err(err) if err.kind() == ErrorKind::UnexpectedEof => break,
            Err(err) => return Err(err).context("reading GPU reduced runs"),
        }
    }
    Ok(())
}
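
// Round-trip check for the reduced-runs file format: 12-byte records of
// little-endian (u64 key, u32 count), written to a temp path chosen for
// this test.
#[cfg(test)]
#[test]
fn replay_reduced_runs_file_round_trips_records() -> Result<()> {
    let path = std::env::temp_dir().join(format!("bbnorm-rs-test-runs-{}.bin", std::process::id()));
    let mut bytes = Vec::new();
    for (key, count) in [(7u64, 3u32), (9, 1)] {
        bytes.extend_from_slice(&key.to_le_bytes());
        bytes.extend_from_slice(&count.to_le_bytes());
    }
    fs::write(&path, bytes)?;
    let mut seen = Vec::new();
    replay_reduced_runs_file(&path, &mut |key, count| {
        if let KmerKey::Short(raw) = key {
            seen.push((raw, count));
        }
    })?;
    let _ = fs::remove_file(&path);
    assert_eq!(seen, vec![(7, 3), (9, 1)]);
    Ok(())
}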
6819
6820fn count_single_file(
6821 config: &Config,
6822 path: &Path,
6823 counts: &mut CountMap,
6824 limit: Option<u64>,
6825) -> Result<()> {
6826 let mut reader = open_sequence_reader(config, path, sequence_settings(config))?;
6827 let mut reads_seen = 0u64;
6828 let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
6829 while let Some(record) = reader.next_record()? {
6830 if limit_reached(limit, reads_seen) {
6831 break;
6832 }
6833 chunk.push((record, None));
6834 if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
6835 increment_counts_from_pair_chunk(config, counts, &chunk);
6836 chunk.clear();
6837 }
6838 reads_seen += 1;
6839 }
6840 if !chunk.is_empty() {
6841 increment_counts_from_pair_chunk(config, counts, &chunk);
6842 }
6843 Ok(())
6844}
6845
6846fn count_single_file_sketch(
6847 config: &Config,
6848 path: &Path,
6849 sketch: &mut PackedCountMinSketch,
6850 limit: Option<u64>,
6851 prefilter: Option<PrefilterGate<'_>>,
6852) -> Result<()> {
6853 let mut reader = open_sequence_reader(config, path, sequence_settings(config))?;
6854 let mut reads_seen = 0u64;
6855 let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
6856 while let Some(record) = reader.next_record()? {
6857 if limit_reached(limit, reads_seen) {
6858 break;
6859 }
6860 chunk.push((record, None));
6861 if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
6862 increment_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
6863 chunk.clear();
6864 }
6865 reads_seen += 1;
6866 }
6867 if !chunk.is_empty() {
6868 increment_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
6869 }
6870 Ok(())
6871}
6872
6873fn count_single_file_prefilter_sketch(
6874 config: &Config,
6875 path: &Path,
6876 sketch: &mut PrefilterCountMinSketch,
6877 limit: Option<u64>,
6878) -> Result<()> {
6879 match sketch {
6880 PrefilterCountMinSketch::Packed(sketch) => {
6881 count_single_file_sketch(config, path, sketch, limit, None)
6882 }
6883 PrefilterCountMinSketch::AtomicPacked(sketch) => {
6884 count_single_file_atomic_packed_sketch(config, path, sketch, limit)
6885 }
6886 }
6887}
6888
6889fn count_single_file_atomic_packed_sketch(
6890 config: &Config,
6891 path: &Path,
6892 sketch: &AtomicPackedCountMinSketch,
6893 limit: Option<u64>,
6894) -> Result<()> {
6895 let mut reader = open_sequence_reader(config, path, sequence_settings(config))?;
6896 let mut reads_seen = 0u64;
6897 let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
6898 while let Some(record) = reader.next_record()? {
6899 if limit_reached(limit, reads_seen) {
6900 break;
6901 }
6902 chunk.push((record, None));
6903 if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
6904 increment_atomic_packed_sketch_from_pair_chunk(config, sketch, &chunk);
6905 chunk.clear();
6906 }
6907 reads_seen += 1;
6908 }
6909 if !chunk.is_empty() {
6910 increment_atomic_packed_sketch_from_pair_chunk(config, sketch, &chunk);
6911 }
6912 Ok(())
6913}
6914
6915fn count_single_file_atomic_sketch(
6916 config: &Config,
6917 path: &Path,
6918 sketch: &AtomicCountMinSketch,
6919 limit: Option<u64>,
6920 prefilter: Option<PrefilterGate<'_>>,
6921) -> Result<()> {
6922 let mut reader = open_sequence_reader(config, path, sequence_settings(config))?;
6923 let mut reads_seen = 0u64;
6924 let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
6925 while let Some(record) = reader.next_record()? {
6926 if limit_reached(limit, reads_seen) {
6927 break;
6928 }
6929 chunk.push((record, None));
6930 if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
6931 increment_atomic_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
6932 chunk.clear();
6933 }
6934 reads_seen += 1;
6935 }
6936 if !chunk.is_empty() {
6937 increment_atomic_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
6938 }
6939 Ok(())
6940}
6941
6942fn count_paired_files(
6943 config: &Config,
6944 path1: &Path,
6945 path2: &Path,
6946 counts: &mut CountMap,
6947 limit: Option<u64>,
6948) -> Result<()> {
6949 let settings = sequence_settings(config);
6950 let (mut reader1, mut reader2) = open_paired_sequence_readers(config, path1, path2, settings)?;
6951 if reader1.format() != reader2.format() {
6952 bail!("paired inputs must use the same FASTA/FASTQ format");
6953 }
6954
6955 let mut pairs_seen = 0u64;
6956 let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
6957 loop {
6958 if limit_reached(limit, pairs_seen) {
6959 break;
6960 }
6961 match (reader1.next_record()?, reader2.next_record()?) {
6962 (None, None) => break,
6963 (Some(read1), Some(read2)) => {
6964 chunk.push((read1, Some(read2)));
6965 if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
6966 increment_counts_from_pair_chunk(config, counts, &chunk);
6967 chunk.clear();
6968 }
6969 pairs_seen += 1;
6970 }
6971 (Some(_), None) => bail!(
6972 "{} has fewer records than {}",
6973 path2.display(),
6974 path1.display()
6975 ),
6976 (None, Some(_)) => bail!(
6977 "{} has fewer records than {}",
6978 path1.display(),
6979 path2.display()
6980 ),
6981 }
6982 }
6983 if !chunk.is_empty() {
6984 increment_counts_from_pair_chunk(config, counts, &chunk);
6985 }
6986 Ok(())
6987}
6988
6989fn count_paired_files_sketch(
6990 config: &Config,
6991 path1: &Path,
6992 path2: &Path,
6993 sketch: &mut PackedCountMinSketch,
6994 limit: Option<u64>,
6995 prefilter: Option<PrefilterGate<'_>>,
6996) -> Result<()> {
6997 let settings = sequence_settings(config);
6998 let (mut reader1, mut reader2) = open_paired_sequence_readers(config, path1, path2, settings)?;
6999 if reader1.format() != reader2.format() {
7000 bail!("paired inputs must use the same FASTA/FASTQ format");
7001 }
7002
7003 let mut pairs_seen = 0u64;
7004 let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
7005 loop {
7006 if limit_reached(limit, pairs_seen) {
7007 break;
7008 }
7009 match (reader1.next_record()?, reader2.next_record()?) {
7010 (None, None) => break,
7011 (Some(read1), Some(read2)) => {
7012 chunk.push((read1, Some(read2)));
7013 if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
7014 increment_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
7015 chunk.clear();
7016 }
7017 pairs_seen += 1;
7018 }
7019 (Some(_), None) => bail!(
7020 "{} has fewer records than {}",
7021 path2.display(),
7022 path1.display()
7023 ),
7024 (None, Some(_)) => bail!(
7025 "{} has fewer records than {}",
7026 path1.display(),
7027 path2.display()
7028 ),
7029 }
7030 }
7031 if !chunk.is_empty() {
7032 increment_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
7033 }
7034 Ok(())
7035}
7036
7037fn count_paired_files_atomic_packed_sketch(
7038 config: &Config,
7039 path1: &Path,
7040 path2: &Path,
7041 sketch: &AtomicPackedCountMinSketch,
7042 limit: Option<u64>,
7043) -> Result<()> {
7044 let settings = sequence_settings(config);
7045 let (mut reader1, mut reader2) = open_paired_sequence_readers(config, path1, path2, settings)?;
7046 if reader1.format() != reader2.format() {
7047 bail!("paired inputs must use the same FASTA/FASTQ format");
7048 }
7049
7050 let mut pairs_seen = 0u64;
7051 let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
7052 loop {
7053 if limit_reached(limit, pairs_seen) {
7054 break;
7055 }
7056 match (reader1.next_record()?, reader2.next_record()?) {
7057 (None, None) => break,
7058 (Some(read1), Some(read2)) => {
7059 chunk.push((read1, Some(read2)));
7060 if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
7061 increment_atomic_packed_sketch_from_pair_chunk(config, sketch, &chunk);
7062 chunk.clear();
7063 }
7064 pairs_seen += 1;
7065 }
7066 (Some(_), None) => bail!(
7067 "{} has fewer records than {}",
7068 path2.display(),
7069 path1.display()
7070 ),
7071 (None, Some(_)) => bail!(
7072 "{} has fewer records than {}",
7073 path1.display(),
7074 path2.display()
7075 ),
7076 }
7077 }
7078 if !chunk.is_empty() {
7079 increment_atomic_packed_sketch_from_pair_chunk(config, sketch, &chunk);
7080 }
7081 Ok(())
7082}
7083
7084fn count_paired_files_atomic_sketch(
7085 config: &Config,
7086 path1: &Path,
7087 path2: &Path,
7088 sketch: &AtomicCountMinSketch,
7089 limit: Option<u64>,
7090 prefilter: Option<PrefilterGate<'_>>,
7091) -> Result<()> {
7092 let settings = sequence_settings(config);
7093 let (mut reader1, mut reader2) = open_paired_sequence_readers(config, path1, path2, settings)?;
7094 if reader1.format() != reader2.format() {
7095 bail!("paired inputs must use the same FASTA/FASTQ format");
7096 }
7097
7098 let mut pairs_seen = 0u64;
7099 let mut chunk = Vec::with_capacity(COUNT_PARALLEL_CHUNK_SIZE);
7100 loop {
7101 if limit_reached(limit, pairs_seen) {
7102 break;
7103 }
7104 match (reader1.next_record()?, reader2.next_record()?) {
7105 (None, None) => break,
7106 (Some(read1), Some(read2)) => {
7107 chunk.push((read1, Some(read2)));
7108 if chunk.len() >= COUNT_PARALLEL_CHUNK_SIZE {
7109 increment_atomic_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
7110 chunk.clear();
7111 }
7112 pairs_seen += 1;
7113 }
7114 (Some(_), None) => bail!(
7115 "{} has fewer records than {}",
7116 path2.display(),
7117 path1.display()
7118 ),
7119 (None, Some(_)) => bail!(
7120 "{} has fewer records than {}",
7121 path1.display(),
7122 path2.display()
7123 ),
7124 }
7125 }
7126 if !chunk.is_empty() {
7127 increment_atomic_sketch_from_pair_chunk(config, sketch, &chunk, prefilter);
7128 }
7129 Ok(())
7130}
7131
7132fn normalize_primary(
7133 config: &Config,
7134 input_counts: &dyn CountLookup,
7135 mut output_counts: Option<&mut OutputCounts>,
7136 mut output_cardinality: Option<&mut KmerCardinalityEstimator>,
7137 cardinality_config: &Config,
7138 random_seed: u64,
7139 mut input_hist: InputHistSinks<'_>,
7140) -> Result<RunSummary> {
7141 let mut readers = PrimaryReaders::open(config, config.max_reads)?;
7142 let format1 = readers.format1();
7143 let format2 = readers.format2();
7144 let mut writers = OptionalWriters::open(config, format1, format2)?;
7145 let mut summary = RunSummary::default();
7146 let mut rng = JavaXoshiro::new(random_seed);
7147 let mut chunk = Vec::with_capacity(NORMALIZE_PARALLEL_CHUNK_SIZE);
7148
7149 while let Some((r1, r2)) = readers.next_pair()? {
7150 chunk.push((readers.input_list_index(), r1, r2, rng.next_double()));
7151 if chunk.len() >= NORMALIZE_PARALLEL_CHUNK_SIZE {
7152 let pairs = normalize_pair_chunk(config, input_counts, &chunk);
7153 write_normalized_pairs(
7154 config,
7155 &mut writers,
7156 &mut output_counts,
7157 &mut output_cardinality,
7158 cardinality_config,
7159 &mut summary,
7160 &pairs,
7161 &mut input_hist,
7162 )?;
7163 chunk.clear();
7164 }
7165 }
7166 if !chunk.is_empty() {
7167 let pairs = normalize_pair_chunk(config, input_counts, &chunk);
7168 write_normalized_pairs(
7169 config,
7170 &mut writers,
7171 &mut output_counts,
7172 &mut output_cardinality,
7173 cardinality_config,
7174 &mut summary,
7175 &pairs,
7176 &mut input_hist,
7177 )?;
7178 }
7179
7180 writers.flush()?;
7181 Ok(summary)
7182}
7183
7184fn normalize_pair_chunk(
7185 config: &Config,
7186 input_counts: &dyn CountLookup,
7187 pairs: &[NormalizationInput],
7188) -> Vec<NormalizedPair> {
7189 pairs
7190 .par_iter()
7191 .map(|(input_list_index, r1, r2, rand)| {
7192 let mut r1 = r1.clone();
7193 let mut r2 = r2.clone();
7194 if !config.trim_after_marking {
7195 trim_pair(config, &mut r1, r2.as_mut());
7196 }
7197 let decision = decide_pair(config, input_counts, &r1, r2.as_ref(), Some(*rand));
7198 let mut correction = CorrectionResult::default();
7199 if config.error_correct && !decision.toss {
7200 correction =
7201 correct_pair_errors_with_rollback(config, input_counts, &mut r1, r2.as_mut());
7202 }
7203 if config.trim_after_marking && config.error_correct {
7204 trim_pair(config, &mut r1, r2.as_mut());
7205 }
7206 let (out_r1, out_r2) = maybe_rename_pair(config, &r1, r2.as_ref(), &decision.analysis);
7207 let read_count = 1 + u64::from(r2.is_some());
7208 let base_count = r1.len() as u64 + r2.as_ref().map(|r| r.len() as u64).unwrap_or(0);
7209 NormalizedPair {
7210 input_list_index: *input_list_index,
7211 r1,
7212 r2,
7213 out_r1,
7214 out_r2,
7215 decision,
7216 uncorrectable: correction.uncorrectable,
7217 read_count,
7218 base_count,
7219 }
7220 })
7221 .collect()
7222}
7223
7224#[allow(clippy::too_many_arguments)]
7225fn write_normalized_pairs(
7226 config: &Config,
7227 writers: &mut OptionalWriters,
7228 output_counts: &mut Option<&mut OutputCounts>,
7229 output_cardinality: &mut Option<&mut KmerCardinalityEstimator>,
7230 cardinality_config: &Config,
7231 summary: &mut RunSummary,
7232 pairs: &[NormalizedPair],
7233 input_hist: &mut InputHistSinks<'_>,
7234) -> Result<()> {
7235 for pair in pairs {
7236 writers.sync_to_input_list_index(config, pair.input_list_index)?;
7237 summary.reads_in += pair.read_count;
7238 summary.bases_in += pair.base_count;
7239
7240 if let Some(hist) = input_hist.depth.as_deref_mut() {
7241 increment_sparse_hist_from_analysis(
7242 hist,
7243 &pair.decision.analysis.read1,
7244 config.hist_len,
7245 );
7246 if let Some(read2) = &pair.decision.analysis.read2 {
7247 increment_sparse_hist_from_analysis(hist, read2, config.hist_len);
7248 }
7249 }
7250 if let Some(read_hist) = input_hist.read.as_deref_mut() {
7251 increment_sparse_read_hist(
7252 read_hist,
7253 &pair.decision.analysis.read1,
7254 pair.r1.len(),
7255 config.hist_len,
7256 );
7257 if let (Some(read2_analysis), Some(read2)) =
7258 (&pair.decision.analysis.read2, pair.r2.as_ref())
7259 {
7260 increment_sparse_read_hist(read_hist, read2_analysis, read2.len(), config.hist_len);
7261 }
7262 }
7263
7264 if pair.decision.toss {
7265 summary.reads_tossed += pair.read_count;
7266 summary.bases_tossed += pair.base_count;
7267 } else {
7268 summary.reads_kept += pair.read_count;
7269 summary.bases_kept += pair.base_count;
7270 }
7271
7272 writers.write_pair(pair.decision.toss, &pair.out_r1, pair.out_r2.as_ref())?;
7273 if pair.uncorrectable {
7274 writers.write_uncorrected(&pair.r1, pair.r2.as_ref())?;
7275 }
7276 if depth_bin_outputs_enabled(config) {
7277 writers.write_depth_bin(
7278 config,
7279 &pair.decision.analysis,
7280 &pair.out_r1,
7281 pair.out_r2.as_ref(),
7282 )?;
7283 }
7284 }
7285 if let Some(counts) = output_counts.as_mut() {
7286 increment_output_counts_from_normalized_chunk(config, counts, pairs);
7287 }
7288 if let Some(estimator) = output_cardinality.as_mut() {
7289 for pair in pairs.iter().filter(|pair| !pair.decision.toss) {
7290 estimator.observe_pair(cardinality_config, &pair.r1, pair.r2.as_ref());
7291 }
7292 }
7293 Ok(())
7294}
7295
7296fn increment_output_counts_from_normalized_chunk(
7297 config: &Config,
7298 counts: &mut OutputCounts,
7299 pairs: &[NormalizedPair],
7300) {
7301 match counts {
7302 OutputCounts::Exact(counts) => {
7303 let chunk_counts = pairs
7304 .par_iter()
7305 .filter(|pair| !pair.decision.toss)
7306 .fold(CountMap::default, |mut local_counts, pair| {
7307 increment_pair_counts(config, &mut local_counts, &pair.r1, pair.r2.as_ref());
7308 local_counts
7309 })
7310 .reduce(CountMap::default, |mut left, right| {
7311 merge_count_maps(&mut left, right);
7312 left
7313 });
7314 merge_count_maps(counts, chunk_counts);
7315 }
7316 OutputCounts::Sketch(sketch) => {
7317 increment_sketch_from_normalized_chunk(config, sketch, pairs);
7318 }
7319 OutputCounts::AtomicSketch(sketch) => {
7320 increment_atomic_sketch_from_normalized_chunk(config, sketch, pairs);
7321 }
7322 }
7323}
7324
fn increment_atomic_sketch_from_normalized_chunk(
    config: &Config,
    sketch: &AtomicCountMinSketch,
    pairs: &[NormalizedPair],
) {
    if !config.deterministic {
        let (key_increments, newly_occupied) = pairs
            .par_iter()
            .filter(|pair| !pair.decision.toss)
            .map(|pair| {
                increment_pair_atomic_sketch_direct(
                    config,
                    sketch,
                    &pair.r1,
                    pair.r2.as_ref(),
                    None,
                )
            })
            .reduce(
                || (0u64, 0usize),
                |left, right| {
                    (
                        left.0.saturating_add(right.0),
                        left.1.saturating_add(right.1),
                    )
                },
            );
        sketch.add_key_increments(key_increments);
        sketch.add_occupied_slots(newly_occupied);
        return;
    }

    let chunk_counts = pairs
        .par_iter()
        .filter(|pair| !pair.decision.toss)
        .fold(CountMap::default, |mut local_counts, pair| {
            increment_pair_counts(config, &mut local_counts, &pair.r1, pair.r2.as_ref());
            local_counts
        })
        .reduce(CountMap::default, |mut left, right| {
            merge_count_maps(&mut left, right);
            left
        });
    let key_increments = chunk_counts.values().copied().sum();
    sketch.add_key_counts(&chunk_counts);
    sketch.add_key_increments(key_increments);
}

fn increment_sketch_from_normalized_chunk(
    config: &Config,
    sketch: &mut PackedCountMinSketch,
    pairs: &[NormalizedPair],
) {
    let chunk_counts = pairs
        .par_iter()
        .filter(|pair| !pair.decision.toss)
        .fold(CountMap::default, |mut local_counts, pair| {
            increment_pair_counts(config, &mut local_counts, &pair.r1, pair.r2.as_ref());
            local_counts
        })
        .reduce(CountMap::default, |mut left, right| {
            merge_count_maps(&mut left, right);
            left
        });
    let key_increments = chunk_counts.values().copied().sum();
    sketch.add_key_counts(&chunk_counts);
    sketch.add_key_increments(key_increments);
}

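// The collect_primary_* functions below re-read the primary input, trim each
// pair, and histogram it against the supplied k-mer counts in dense or sparse
// form. Pairs are buffered into chunks of HIST_PARALLEL_CHUNK_SIZE so each
// chunk can be binned in one parallel pass; the per-pair random draw used by
// the keep filter is taken in reader order, before any parallel work, so the
// results stay deterministic.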
#[cfg(test)]
fn collect_primary_hist(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    random_seed: u64,
) -> Result<Vec<u64>> {
    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
    let mut hist = vec![0u64; config.hist_len];
    let mut rng = JavaXoshiro::new(random_seed);
    let mut chunk = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);

    while let Some((mut r1, mut r2)) = readers.next_pair()? {
        trim_pair(config, &mut r1, r2.as_mut());
        let rand = keep_filter_counts.map(|_| rng.next_double());
        chunk.push((r1, r2, rand));
        if chunk.len() >= HIST_PARALLEL_CHUNK_SIZE {
            increment_hist_from_pair_chunk(
                config,
                hist_counts,
                keep_filter_counts,
                &mut hist,
                &chunk,
            );
            chunk.clear();
        }
    }
    if !chunk.is_empty() {
        increment_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, &mut hist, &chunk);
    }

    Ok(hist)
}

fn collect_primary_sparse_hist(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    random_seed: u64,
) -> Result<SparseHist> {
    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
    let mut hist = SparseHist::default();
    let mut rng = JavaXoshiro::new(random_seed);
    let mut chunk = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);

    while let Some((mut r1, mut r2)) = readers.next_pair()? {
        trim_pair(config, &mut r1, r2.as_mut());
        let rand = keep_filter_counts.map(|_| rng.next_double());
        chunk.push((r1, r2, rand));
        if chunk.len() >= HIST_PARALLEL_CHUNK_SIZE {
            let chunk_hist =
                sparse_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, &chunk);
            merge_sparse_hist(&mut hist, chunk_hist);
            chunk.clear();
        }
    }
    if !chunk.is_empty() {
        let chunk_hist =
            sparse_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, &chunk);
        merge_sparse_hist(&mut hist, chunk_hist);
    }

    Ok(hist)
}

#[cfg(test)]
fn collect_primary_read_hist(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    random_seed: u64,
) -> Result<ReadDepthHistogram> {
    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
    let mut hist = ReadDepthHistogram::new(config.hist_len);
    let mut rng = JavaXoshiro::new(random_seed);
    let mut chunk = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);

    while let Some((mut r1, mut r2)) = readers.next_pair()? {
        trim_pair(config, &mut r1, r2.as_mut());
        let rand = keep_filter_counts.map(|_| rng.next_double());
        chunk.push((r1, r2, rand));
        if chunk.len() >= HIST_PARALLEL_CHUNK_SIZE {
            increment_read_hist_from_pair_chunk(
                config,
                hist_counts,
                keep_filter_counts,
                &mut hist,
                &chunk,
            );
            chunk.clear();
        }
    }
    if !chunk.is_empty() {
        increment_read_hist_from_pair_chunk(
            config,
            hist_counts,
            keep_filter_counts,
            &mut hist,
            &chunk,
        );
    }

    Ok(hist)
}

fn collect_primary_sparse_read_hist(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    random_seed: u64,
) -> Result<SparseReadDepthHist> {
    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
    let mut hist = SparseReadDepthHist::default();
    let mut rng = JavaXoshiro::new(random_seed);
    let mut chunk = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);

    while let Some((mut r1, mut r2)) = readers.next_pair()? {
        trim_pair(config, &mut r1, r2.as_mut());
        let rand = keep_filter_counts.map(|_| rng.next_double());
        chunk.push((r1, r2, rand));
        if chunk.len() >= HIST_PARALLEL_CHUNK_SIZE {
            let chunk_hist =
                sparse_read_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, &chunk);
            merge_sparse_read_depth_hist(&mut hist, chunk_hist);
            chunk.clear();
        }
    }
    if !chunk.is_empty() {
        let chunk_hist =
            sparse_read_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, &chunk);
        merge_sparse_read_depth_hist(&mut hist, chunk_hist);
    }

    Ok(hist)
}

#[cfg(test)]
fn collect_primary_hist_and_read_hist(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    random_seed: u64,
) -> Result<(Vec<u64>, ReadDepthHistogram)> {
    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
    let mut depth_hist = vec![0u64; config.hist_len];
    let mut read_hist = ReadDepthHistogram::new(config.hist_len);
    let mut rng = JavaXoshiro::new(random_seed);
    let mut chunk = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);

    while let Some((mut r1, mut r2)) = readers.next_pair()? {
        trim_pair(config, &mut r1, r2.as_mut());
        let rand = keep_filter_counts.map(|_| rng.next_double());
        chunk.push((r1, r2, rand));
        if chunk.len() >= HIST_PARALLEL_CHUNK_SIZE {
            increment_hist_and_read_hist_from_pair_chunk(
                config,
                hist_counts,
                keep_filter_counts,
                &mut depth_hist,
                &mut read_hist,
                &chunk,
            );
            chunk.clear();
        }
    }
    if !chunk.is_empty() {
        increment_hist_and_read_hist_from_pair_chunk(
            config,
            hist_counts,
            keep_filter_counts,
            &mut depth_hist,
            &mut read_hist,
            &chunk,
        );
    }

    Ok((depth_hist, read_hist))
}

fn collect_primary_sparse_hist_and_read_hist(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    random_seed: u64,
) -> Result<(SparseHist, SparseReadDepthHist)> {
    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
    let mut depth_hist = SparseHist::default();
    let mut read_hist = SparseReadDepthHist::default();
    let mut rng = JavaXoshiro::new(random_seed);
    let mut chunk = Vec::with_capacity(HIST_PARALLEL_CHUNK_SIZE);

    while let Some((mut r1, mut r2)) = readers.next_pair()? {
        trim_pair(config, &mut r1, r2.as_mut());
        let rand = keep_filter_counts.map(|_| rng.next_double());
        chunk.push((r1, r2, rand));
        if chunk.len() >= HIST_PARALLEL_CHUNK_SIZE {
            let (chunk_depth_hist, chunk_read_hist) = sparse_hist_and_read_hist_from_pair_chunk(
                config,
                hist_counts,
                keep_filter_counts,
                &chunk,
            );
            merge_sparse_hist(&mut depth_hist, chunk_depth_hist);
            merge_sparse_read_depth_hist(&mut read_hist, chunk_read_hist);
            chunk.clear();
        }
    }
    if !chunk.is_empty() {
        let (chunk_depth_hist, chunk_read_hist) = sparse_hist_and_read_hist_from_pair_chunk(
            config,
            hist_counts,
            keep_filter_counts,
            &chunk,
        );
        merge_sparse_hist(&mut depth_hist, chunk_depth_hist);
        merge_sparse_read_depth_hist(&mut read_hist, chunk_read_hist);
    }

    Ok((depth_hist, read_hist))
}

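/// Emits every side output derived from a single pass over the trimmed input
/// reads (quality, length, GC, base-content, entropy, identity, alignment
/// fallback, and barcode histograms), skipping the pass entirely when none of
/// them were requested.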
fn emit_read_local_side_outputs(config: &Config) -> Result<()> {
    if !read_local_side_outputs_enabled(config) {
        return Ok(());
    }

    let mut hist = collect_read_local_side_hists(config)?;
    if let Some(quality) = hist.quality.take() {
        emit_quality_side_outputs(config, &quality)?;
    }
    if let (Some(path), Some(length)) = (&config.length_hist_out, hist.length.as_ref()) {
        write_length_hist(path, length, config)?;
    }
    if let (Some(path), Some(gc)) = (&config.gc_hist_out, hist.gc.as_ref()) {
        write_gc_hist(path, gc, config)?;
    }
    if let (Some(path), Some(base)) = (&config.base_hist_out, hist.base.as_ref()) {
        write_base_content_hist(path, base, config)?;
    }
    if let (Some(path), Some(entropy)) = (&config.entropy_hist_out, hist.entropy.as_ref()) {
        write_entropy_hist(path, entropy, config)?;
    }
    if let (Some(path), Some(identity)) = (&config.identity_hist_out, hist.identity.as_ref()) {
        write_identity_hist(path, identity, config)?;
    }
    if let Some(alignment) = hist.alignment.as_ref() {
        emit_alignment_fallback_side_outputs(config, alignment)?;
    }
    if let (Some(path), Some(barcodes)) = (&config.barcode_stats_out, hist.barcodes.as_ref()) {
        write_barcode_stats(path, barcodes, config)?;
    }
    Ok(())
}

fn read_local_side_outputs_enabled(config: &Config) -> bool {
    config.quality_hist_out.is_some()
        || config.base_quality_hist_out.is_some()
        || config.quality_count_hist_out.is_some()
        || config.average_quality_hist_out.is_some()
        || config.overall_base_quality_hist_out.is_some()
        || config.length_hist_out.is_some()
        || config.gc_hist_out.is_some()
        || config.base_hist_out.is_some()
        || config.entropy_hist_out.is_some()
        || config.identity_hist_out.is_some()
        || config.barcode_stats_out.is_some()
        || alignment_fallback_side_outputs_enabled(config)
}

fn quality_side_outputs_enabled(config: &Config) -> bool {
    config.quality_hist_out.is_some()
        || config.base_quality_hist_out.is_some()
        || config.quality_count_hist_out.is_some()
        || config.average_quality_hist_out.is_some()
        || config.overall_base_quality_hist_out.is_some()
}

fn alignment_fallback_side_outputs_enabled(config: &Config) -> bool {
    config.match_hist_out.is_some()
        || config.insert_hist_out.is_some()
        || config.quality_accuracy_hist_out.is_some()
        || config.indel_hist_out.is_some()
        || config.error_hist_out.is_some()
}

fn emit_quality_side_outputs(config: &Config, hist: &QualitySideHistograms) -> Result<()> {
    if let Some(path) = &config.quality_hist_out {
        write_quality_hist(path, &hist.overall, config)?;
    }
    if let Some(path) = &config.quality_count_hist_out {
        write_quality_count_hist(
            path,
            &hist.first_counts,
            &hist.second_counts,
            hist.paired,
            config,
        )?;
    }
    if let Some(path) = &config.average_quality_hist_out {
        write_average_quality_hist(path, &hist.first_avg, &hist.second_avg, hist.paired, config)?;
    }
    if let Some(path) = &config.overall_base_quality_hist_out {
        write_overall_base_quality_hist(path, &hist.overall, config)?;
    }
    if let Some(path) = &config.base_quality_hist_out {
        write_base_quality_hist(path, hist, config)?;
    }
    Ok(())
}

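/// Single reader pass that fills only the histograms whose outputs were
/// requested; every other `Option` field stays `None`, so unrequested
/// histograms cost neither allocation nor per-read work.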
fn collect_read_local_side_hists(config: &Config) -> Result<ReadLocalSideHistograms> {
    let mut readers = PrimaryReaders::open(config, config.max_reads)?;
    let quality_len = side_hist_len(config);
    let side_len = side_hist_len(config);
    let mut hist = ReadLocalSideHistograms {
        quality: quality_side_outputs_enabled(config).then(|| QualitySideHistograms {
            overall: vec![0; quality_len],
            first_counts: vec![0; quality_len],
            second_counts: vec![0; quality_len],
            first_avg: vec![0; quality_len],
            second_avg: vec![0; quality_len],
            first_by_pos: vec![vec![0; quality_len]; side_len],
            second_by_pos: vec![vec![0; quality_len]; side_len],
            paired: false,
        }),
        length: config
            .length_hist_out
            .is_some()
            .then(|| ReadDepthHistogram::new(side_len)),
        gc: config
            .gc_hist_out
            .is_some()
            .then(|| ReadDepthHistogram::new(gc_hist_len(config))),
        base: config
            .base_hist_out
            .is_some()
            .then(|| BaseContentHistogram {
                first: vec![BaseCounts::default(); side_len],
                second: vec![BaseCounts::default(); side_len],
            }),
        entropy: config
            .entropy_hist_out
            .is_some()
            .then(|| vec![0u64; config.entropy_bins.saturating_add(1).max(1)]),
        identity: config
            .identity_hist_out
            .is_some()
            .then(|| ReadDepthHistogram::new(config.identity_bins.saturating_add(1).max(1))),
        alignment: alignment_fallback_side_outputs_enabled(config).then(|| {
            AlignmentFallbackHistograms {
                first_match: vec![MatchCounts::default(); side_len],
                second_match: vec![MatchCounts::default(); side_len],
                quality_match: vec![0; quality_len],
                ..AlignmentFallbackHistograms::default()
            }
        }),
        barcodes: config.barcode_stats_out.is_some().then(BTreeMap::new),
    };

    while let Some((mut r1, mut r2)) = readers.next_pair()? {
        trim_pair(config, &mut r1, r2.as_mut());
        if let Some(barcodes) = hist.barcodes.as_mut() {
            increment_barcode_stats(barcodes, &r1, r2.is_some());
        }
        increment_read_local_side_hists(config, &mut hist, &r1, false);
        if let Some(mate) = r2.as_ref() {
            increment_read_local_side_hists(config, &mut hist, mate, true);
        }
    }

    Ok(hist)
}

fn side_hist_len(config: &Config) -> usize {
    config.side_hist_len.unwrap_or(config.hist_len).max(1)
}

fn gc_hist_len(config: &Config) -> usize {
    config.gc_bins.unwrap_or(101).max(1)
}

fn increment_length_hist(hist: &mut ReadDepthHistogram, read_len: usize) {
    let idx = read_len.min(hist.reads.len().saturating_sub(1));
    hist.reads[idx] += 1;
    hist.bases[idx] += read_len as u64;
}

fn increment_read_local_side_hists(
    config: &Config,
    hist: &mut ReadLocalSideHistograms,
    record: &SequenceRecord,
    second: bool,
) {
    if let Some(quality) = hist.quality.as_mut() {
        if second {
            quality.paired = true;
        }
        increment_quality_side_hists(config, quality, record, second);
    }
    if let Some(length) = hist.length.as_mut() {
        increment_length_hist(length, record.len());
    }
    if let Some(gc) = hist.gc.as_mut() {
        increment_gc_hist(gc, record);
    }
    if let Some(base) = hist.base.as_mut() {
        if second {
            increment_base_content_hist(&mut base.second, record);
        } else {
            increment_base_content_hist(&mut base.first, record);
        }
    }
    if let Some(entropy) = hist.entropy.as_mut() {
        increment_entropy_hist(config, entropy, record);
    }
    if let Some(identity) = hist.identity.as_mut() {
        increment_sequence_identity_hist(identity, record);
    }
    if let Some(alignment) = hist.alignment.as_mut() {
        increment_alignment_fallback_hists(config, alignment, record, second);
    }
}

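/// Bins a read by GC fraction over its unambiguous bases: the bin index is
/// `gc * bins / acgt`, so reads with no A/C/G/T/U bases land in bin 0 and a
/// pure-GC read lands in the top bin.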
fn increment_gc_hist(hist: &mut ReadDepthHistogram, record: &SequenceRecord) {
    let mut gc = 0usize;
    let mut acgt = 0usize;
    for base in &record.bases {
        match *base {
            b'G' | b'C' | b'g' | b'c' => {
                gc += 1;
                acgt += 1;
            }
            b'A' | b'T' | b'U' | b'a' | b't' | b'u' => acgt += 1,
            _ => {}
        }
    }
    let idx = if acgt == 0 {
        0
    } else {
        ((gc * hist.reads.len()) / acgt).min(hist.reads.len().saturating_sub(1))
    };
    hist.reads[idx] += 1;
    hist.bases[idx] += record.len() as u64;
}

fn increment_quality_side_hists(
    config: &Config,
    hist: &mut QualitySideHistograms,
    record: &SequenceRecord,
    second: bool,
) {
    if record.is_empty() {
        return;
    }

    let quality_len = hist.overall.len();
    let last_quality_idx = quality_len.saturating_sub(1);
    let (counts, avg_counts, by_pos) = if second {
        (
            &mut hist.second_counts,
            &mut hist.second_avg,
            &mut hist.second_by_pos,
        )
    } else {
        (
            &mut hist.first_counts,
            &mut hist.first_avg,
            &mut hist.first_by_pos,
        )
    };

    let mut sum = 0usize;
    for idx in 0..record.len() {
        let quality = record_quality_at(config, record, idx).min(last_quality_idx);
        hist.overall[quality] += 1;
        counts[quality] += 1;
        sum += quality;
        if idx < by_pos.len() {
            by_pos[idx][quality] += 1;
        }
    }

    let avg = ((sum as f64) / (record.len() as f64)).round() as usize;
    avg_counts[avg.min(last_quality_idx)] += 1;
}

fn record_quality_at(config: &Config, record: &SequenceRecord, idx: usize) -> usize {
    record
        .qualities
        .as_ref()
        .and_then(|qualities| qualities.get(idx))
        .map_or(config.fake_quality as usize, |quality| {
            quality.saturating_sub(33) as usize
        })
}

fn increment_base_content_hist(hist: &mut [BaseCounts], record: &SequenceRecord) {
    for (idx, base) in record.bases.iter().copied().enumerate().take(hist.len()) {
        let counts = &mut hist[idx];
        match base {
            b'A' | b'a' => counts.a += 1,
            b'C' | b'c' => counts.c += 1,
            b'G' | b'g' => counts.g += 1,
            b'T' | b't' | b'U' | b'u' => counts.t += 1,
            _ => counts.n += 1,
        }
    }
}

fn increment_entropy_hist(config: &Config, hist: &mut [u64], record: &SequenceRecord) {
    if record.is_empty() {
        return;
    }
    if let Some(entropy) = read_entropy(config, &record.bases) {
        let bins = hist.len().saturating_sub(1);
        let idx = ((entropy * hist.len() as f64) as usize).min(bins);
        hist[idx] += 1;
    }
}

fn increment_sequence_identity_hist(hist: &mut ReadDepthHistogram, record: &SequenceRecord) {
    let idx = hist.reads.len().saturating_sub(1);
    hist.reads[idx] += 1;
    hist.bases[idx] += record.len() as u64;
}

fn increment_barcode_stats(
    barcodes: &mut BTreeMap<String, u64>,
    record: &SequenceRecord,
    paired: bool,
) {
    let barcode = header_to_barcode(&record.id).unwrap_or("NONE");
    let count = if paired { 2 } else { 1 };
    *barcodes.entry(barcode.to_string()).or_insert(0) += count;
}

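/// Extracts the barcode from an Illumina-style header: the text after the
/// last ':' is used only when that ':' falls after the first ' ' or '/'
/// (i.e. inside the comment field), and it is cut at the next space or tab.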
fn header_to_barcode(id: &str) -> Option<&str> {
    let loc = id.rfind(':')?;
    let loc2 = id
        .find(' ')
        .map(|idx| idx as isize)
        .unwrap_or(-1)
        .max(id.find('/').map(|idx| idx as isize).unwrap_or(-1));
    if (loc as isize) <= loc2 || loc >= id.len().saturating_sub(1) {
        return None;
    }
    let start = loc + 1;
    let stop = id[start..]
        .find([' ', '\t'])
        .map_or(id.len(), |offset| start + offset);
    Some(&id[start..stop])
}

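// Illustrative check added during editing (not part of the original suite),
// using a made-up but representative Illumina-style header.
#[test]
fn header_to_barcode_reads_comment_field() {
    assert_eq!(
        header_to_barcode("M00100:123:ABCDE:1:1101:15589:1332 1:N:0:ACGTACGT"),
        Some("ACGTACGT")
    );
    // The last ':' must come after the first space or '/'.
    assert_eq!(header_to_barcode("read1 extra"), None);
}
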
fn increment_alignment_fallback_hists(
    config: &Config,
    hist: &mut AlignmentFallbackHistograms,
    record: &SequenceRecord,
    second: bool,
) {
    hist.read_count += 1;
    hist.base_count += record.len() as u64;
    if second {
        hist.paired = true;
        hist.pair_count += 1;
    }

    let match_hist = if second {
        &mut hist.second_match
    } else {
        &mut hist.first_match
    };
    for (idx, base) in record
        .bases
        .iter()
        .copied()
        .enumerate()
        .take(match_hist.len())
    {
        if is_acgt(base) {
            match_hist[idx].matches += 1;
        } else {
            match_hist[idx].n += 1;
        }
    }

    for idx in 0..record.len() {
        let quality =
            record_quality_at(config, record, idx).min(hist.quality_match.len().saturating_sub(1));
        hist.quality_match[quality] += 1;
    }
}

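/// Mean normalized k-mer entropy over sliding windows of the read, in [0, 1].
/// Uses `entropy_k` (clamped to 1..=15) over windows of `entropy_window`
/// bases; reads shorter than k fall back to single-base entropy, and `None`
/// means no window contained a usable k-mer.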
fn read_entropy(config: &Config, bases: &[u8]) -> Option<f64> {
    let k = config.entropy_k.clamp(1, 15);
    if bases.len() < k {
        return base_entropy(config, bases);
    }

    let window = config.entropy_window.max(k).min(bases.len());
    let mut sum = 0.0;
    let mut count = 0usize;
    for start in 0..=bases.len() - window {
        if let Some(entropy) = window_kmer_entropy(config, &bases[start..start + window], k) {
            sum += entropy;
            count += 1;
        }
    }

    if count == 0 {
        None
    } else {
        Some((sum / count as f64).clamp(0.0, 1.0))
    }
}

fn window_kmer_entropy(config: &Config, window: &[u8], k: usize) -> Option<f64> {
    if window.len() < k {
        return base_entropy(config, window);
    }

    let mut counts: FxHashMap<Vec<u8>, u64> = FxHashMap::default();
    let mut total = 0u64;
    for kmer in window.windows(k) {
        if !config.allow_entropy_ns && kmer.iter().any(|base| !is_acgt(*base)) {
            continue;
        }
        let key: Vec<u8> = kmer
            .iter()
            .copied()
            .map(|base| match base.to_ascii_uppercase() {
                b'A' | b'C' | b'G' | b'T' => base.to_ascii_uppercase(),
                _ => b'N',
            })
            .collect();
        *counts.entry(key).or_insert(0) += 1;
        total += 1;
    }

    if total == 0 {
        return None;
    }
    let entropy = shannon_entropy(counts.values().copied(), total);
    let max_entropy = (total as f64).ln();
    Some(if max_entropy > 0.0 {
        entropy / max_entropy
    } else {
        0.0
    })
}

fn base_entropy(config: &Config, bases: &[u8]) -> Option<f64> {
    let mut counts = [0u64; 5];
    let mut total = 0u64;
    for base in bases {
        let idx = match base.to_ascii_uppercase() {
            b'A' => Some(0),
            b'C' => Some(1),
            b'G' => Some(2),
            b'T' | b'U' => Some(3),
            _ if config.allow_entropy_ns => Some(4),
            _ => None,
        };
        if let Some(idx) = idx {
            counts[idx] += 1;
            total += 1;
        }
    }
    if total == 0 {
        return None;
    }
    let entropy = shannon_entropy(counts, total);
    let nonzero = counts.into_iter().filter(|count| *count > 0).count();
    let max_entropy = (nonzero.max(1) as f64).ln();
    Some(if max_entropy > 0.0 {
        entropy / max_entropy
    } else {
        0.0
    })
}

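/// Shannon entropy, in nats, of a count distribution with the given total;
/// zero counts are skipped so they contribute nothing.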
fn shannon_entropy(counts: impl IntoIterator<Item = u64>, total: u64) -> f64 {
    let total = total as f64;
    counts
        .into_iter()
        .filter(|count| *count > 0)
        .map(|count| {
            let p = count as f64 / total;
            -p * p.ln()
        })
        .sum()
}

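// Illustrative check added during editing (not part of the original suite).
#[test]
fn shannon_entropy_of_uniform_counts_is_ln_n() {
    // Two equally frequent symbols carry ln(2) nats; the zero count is skipped
    // rather than producing a NaN.
    let h = shannon_entropy([2u64, 2, 0], 4);
    assert!((h - 2f64.ln()).abs() < 1e-12);
}
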
fn is_acgt(base: u8) -> bool {
    matches!(base, b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't')
}

fn analyze_pair(
    config: &Config,
    counts: &dyn CountLookup,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
) -> PairAnalysis {
    let (read1, read2) = match r2 {
        Some(record) if r1.len() + record.len() >= PAIRED_ANALYSIS_JOIN_MIN_BASES => {
            let (read1, read2) = rayon::join(
                || analyze_read(config, counts, r1),
                || analyze_read(config, counts, record),
            );
            (read1, Some(read2))
        }
        Some(record) => (
            analyze_read(config, counts, r1),
            Some(analyze_read(config, counts, record)),
        ),
        None => (analyze_read(config, counts, r1), None),
    };
    pair_analysis_from_reads(config, read1, read2)
}

fn analyze_pair_for_two_configs(
    config: &Config,
    other_config: &Config,
    counts: &dyn CountLookup,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
) -> (PairAnalysis, PairAnalysis) {
    if !can_share_read_coverage(config, other_config) {
        return (
            analyze_pair(config, counts, r1, r2),
            analyze_pair(other_config, counts, r1, r2),
        );
    }

    let ((read1, other_read1), read2_pair) = match r2 {
        Some(record) if r1.len() + record.len() >= PAIRED_ANALYSIS_JOIN_MIN_BASES => {
            let (first, second) = rayon::join(
                || analyze_read_for_two_configs(config, other_config, counts, r1),
                || analyze_read_for_two_configs(config, other_config, counts, record),
            );
            (first, Some(second))
        }
        Some(record) => (
            analyze_read_for_two_configs(config, other_config, counts, r1),
            Some(analyze_read_for_two_configs(
                config,
                other_config,
                counts,
                record,
            )),
        ),
        None => (
            analyze_read_for_two_configs(config, other_config, counts, r1),
            None,
        ),
    };
    let (read2, other_read2) = read2_pair
        .map(|(read, other_read)| (Some(read), Some(other_read)))
        .unwrap_or((None, None));
    (
        pair_analysis_from_reads(config, read1, read2),
        pair_analysis_from_reads(other_config, other_read1, other_read2),
    )
}

fn can_share_read_coverage(config: &Config, other_config: &Config) -> bool {
    config.k == other_config.k
        && (config.canonical || config.k <= 31) == (other_config.canonical || other_config.k <= 31)
        && config.fix_spikes == other_config.fix_spikes
}

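/// Combines per-read analyses into the pair-level inputs for the toss
/// decision: the depth proxy takes the lower read when `use_lower_depth` is
/// set (the stricter choice) and the higher otherwise, true depth always
/// takes the maximum, and the k-mer tallies are summed across mates.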
fn pair_analysis_from_reads(
    config: &Config,
    read1: ReadAnalysis,
    read2: Option<ReadAnalysis>,
) -> PairAnalysis {
    let depth_proxy_al = match (&read2, config.use_lower_depth) {
        (Some(read2), true) => min_option(read1.depth_al, read2.depth_al),
        (Some(read2), false) => max_option(read1.depth_al, read2.depth_al),
        (None, _) => read1.depth_al,
    };
    let max_true_depth = match &read2 {
        Some(read2) => max_option(read1.true_depth, read2.true_depth),
        None => read1.true_depth,
    };
    let low_kmer_count =
        read1.low_kmer_count + read2.as_ref().map(|read| read.low_kmer_count).unwrap_or(0);
    let total_kmer_count = read1.total_kmer_count
        + read2
            .as_ref()
            .map(|read| read.total_kmer_count)
            .unwrap_or(0);
    PairAnalysis {
        error1: read1.error,
        error2: read2.as_ref().is_some_and(|read| read.error),
        read1,
        read2,
        depth_proxy_al,
        max_true_depth,
        low_kmer_count,
        total_kmer_count,
    }
}

fn analyze_read(
    config: &Config,
    counts: &dyn CountLookup,
    record: &SequenceRecord,
) -> ReadAnalysis {
    let coverage = read_coverage_desc(config, counts, record);
    analyze_read_from_coverage(config, coverage.coverage_desc, coverage.had_kmer_windows)
}

fn analyze_read_for_two_configs(
    config: &Config,
    other_config: &Config,
    counts: &dyn CountLookup,
    record: &SequenceRecord,
) -> (ReadAnalysis, ReadAnalysis) {
    let coverage = read_coverage_desc(config, counts, record);
    let other_coverage = coverage.coverage_desc.clone();
    (
        analyze_read_from_coverage(config, coverage.coverage_desc, coverage.had_kmer_windows),
        analyze_read_from_coverage(other_config, other_coverage, coverage.had_kmer_windows),
    )
}

struct ReadCoverageDesc {
    coverage_desc: Vec<i64>,
    had_kmer_windows: bool,
}

fn read_coverage_desc(
    config: &Config,
    counts: &dyn CountLookup,
    record: &SequenceRecord,
) -> ReadCoverageDesc {
    let windows = unfiltered_kmer_windows_for_record(record, config);
    let mut coverage: Vec<i64> = windows
        .iter()
        .map(|window| match window {
            Some(kmer) => u64_to_i64_saturating(counts.depth(kmer)),
            None => -1,
        })
        .collect();
    if coverage.is_empty() {
        return ReadCoverageDesc {
            coverage_desc: coverage,
            had_kmer_windows: record.len() >= config.k,
        };
    }
    if config.fix_spikes {
        fix_spikes(&mut coverage, &windows, counts, config.k);
    }
    if coverage.len() >= COVERAGE_PAR_SORT_MIN_WINDOWS {
        coverage.par_sort_unstable_by(|a, b| b.cmp(a));
    } else {
        coverage.sort_unstable_by(|a, b| b.cmp(a));
    }
    ReadCoverageDesc {
        coverage_desc: coverage,
        had_kmer_windows: true,
    }
}

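/// Derives depth statistics from a read's descending-sorted coverage vector:
/// `high`, `low`, and `true_depth` come from the configured percentiles, the
/// depth proxy is reported only when enough windows clear
/// `max(min_depth, high / error_detect_ratio)`, and the error flag fires on
/// uniformly low coverage, a simultaneous high/low split, or a large
/// high-to-low ratio.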
fn analyze_read_from_coverage(
    config: &Config,
    coverage: Vec<i64>,
    had_kmer_windows: bool,
) -> ReadAnalysis {
    if coverage.is_empty() {
        return ReadAnalysis {
            had_kmer_windows,
            ..ReadAnalysis::default()
        };
    }
    let cov_last = coverage.len() - 1;
    let high = coverage[percentile_index(cov_last, config.high_percentile)];
    let low = coverage[percentile_index(cov_last, config.low_percentile)];
    let true_depth = coverage[percentile_index(cov_last, config.depth_percentile)];
    let min_true_depth = low;
    let min_depth = u64_to_i64_saturating(config.min_depth)
        .max(high / u64_to_i64_saturating(config.error_detect_ratio));

    let mut above_limit = cov_last as isize;
    while above_limit >= 0 && coverage[above_limit as usize] < min_depth {
        above_limit -= 1;
    }

    let depth_al = if above_limit >= 0
        && ((above_limit as usize + 1) >= config.min_kmers_over_min_depth
            || config.min_kmers_over_min_depth > coverage.len())
    {
        let idx = ((above_limit as f64) * (1.0 - config.depth_percentile)) as usize;
        non_negative_depth(coverage[idx])
    } else {
        None
    };

    let low_thresh = u64_to_i64_saturating(config.low_thresh);
    let high_thresh = u64_to_i64_saturating(config.high_thresh);
    let error_detect_ratio = u64_to_i64_saturating(config.error_detect_ratio);
    let error = high <= low_thresh
        || (high >= high_thresh && low <= low_thresh)
        || high >= low.saturating_mul(error_detect_ratio);
    let low_kmer_count =
        low_kmer_count(&coverage, low_thresh, high_thresh, high, error_detect_ratio);

    ReadAnalysis {
        depth_al,
        true_depth: non_negative_depth(true_depth),
        min_true_depth: non_negative_depth(min_true_depth),
        low_kmer_count,
        total_kmer_count: coverage.len(),
        error,
        had_kmer_windows: true,
        coverage_desc: coverage,
    }
}

fn low_kmer_count(
    coverage_desc: &[i64],
    low_thresh: i64,
    high_thresh: i64,
    high_depth: i64,
    error_detect_ratio: i64,
) -> usize {
    if coverage_desc.is_empty() {
        return 0;
    }
    if coverage_desc[0] <= low_thresh {
        return coverage_desc.len();
    }
    if high_depth < high_thresh {
        return 0;
    }
    let limit = low_thresh.min(high_depth / error_detect_ratio.max(1));
    coverage_desc
        .iter()
        .rev()
        .take_while(|&&depth| depth <= limit)
        .count()
}

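/// Error-corrects one pair: optionally reconciles overlapping bases between
/// the mates first (skipped when only marking errors), then applies k-mer
/// correction to each read, accumulating corrected/marked counts and the
/// uncorrectable flag across all stages.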
fn correct_pair_errors(
    config: &Config,
    counts: &dyn CountLookup,
    r1: &mut SequenceRecord,
    r2: Option<&mut SequenceRecord>,
) -> CorrectionResult {
    let mut result = CorrectionResult::default();
    let mut r2 = r2;
    if config.overlap_error_correct
        && !config.mark_errors_only
        && let Some(mate) = r2.as_deref_mut()
    {
        let overlap = correct_pair_by_overlap(config, r1, mate);
        result.corrected += overlap.corrected;
        result.marked += overlap.marked;
        result.uncorrectable |= overlap.uncorrectable;
    }

    let read_result = correct_read_errors(config, counts, r1);
    result.corrected += read_result.corrected;
    result.marked += read_result.marked;
    result.uncorrectable |= read_result.uncorrectable;
    if let Some(mate) = r2 {
        let mate_result = correct_read_errors(config, counts, mate);
        result.corrected += mate_result.corrected;
        result.marked += mate_result.marked;
        result.uncorrectable |= mate_result.uncorrectable;
    }
    result
}

fn correct_pair_errors_with_rollback(
    config: &Config,
    counts: &dyn CountLookup,
    r1: &mut SequenceRecord,
    mut r2: Option<&mut SequenceRecord>,
) -> CorrectionResult {
    let rollback =
        (!config.mark_uncorrectable_errors).then(|| (r1.clone(), r2.as_deref().cloned()));
    let correction = correct_pair_errors(config, counts, r1, r2.as_deref_mut());
    if correction.uncorrectable
        && let Some((original_r1, original_r2)) = rollback
    {
        *r1 = original_r1;
        if let (Some(mate), Some(original)) = (r2, original_r2) {
            *mate = original;
        }
    }
    correction
}

fn correct_pair_by_overlap(
    config: &Config,
    r1: &mut SequenceRecord,
    r2: &mut SequenceRecord,
) -> CorrectionResult {
    let Some(overlap) = best_pair_overlap(r1, r2) else {
        return CorrectionResult::default();
    };
    if overlap_expected_mismatch_rejects(r1, r2, &overlap) {
        return CorrectionResult::default();
    }
    if overlap_probability_rejects(r1, r2, &overlap) {
        return CorrectionResult::default();
    }
    let mut corrected = 0usize;

    for pair in overlap.pairs {
        let b1 = r1.bases[pair.r1_index].to_ascii_uppercase();
        let b2 = complement_base(r2.bases[pair.r2_index]).to_ascii_uppercase();
        let q1 = base_quality(r1, pair.r1_index);
        let q2 = base_quality(r2, pair.r2_index);
        let Some((merged_base, merged_quality)) =
            overlap_consensus_base_and_quality(config, b1, b2, q1, q2)
        else {
            continue;
        };

        let merged_r2_base = complement_base(merged_base);
        if r1.bases[pair.r1_index] != merged_base || r2.bases[pair.r2_index] != merged_r2_base {
            corrected += 1;
        }

        r1.bases[pair.r1_index] = merged_base;
        r2.bases[pair.r2_index] = merged_r2_base;

        if config.change_quality
            && let (Some(r1_qualities), Some(r2_qualities)) =
                (r1.qualities.as_mut(), r2.qualities.as_mut())
        {
            let merged_ascii = merged_quality.saturating_add(33);
            r1_qualities[pair.r1_index] = merged_ascii;
            r2_qualities[pair.r2_index] = merged_ascii;
        }
    }

    CorrectionResult {
        corrected,
        ..CorrectionResult::default()
    }
}

fn overlap_expected_mismatch_rejects(
    r1: &SequenceRecord,
    r2: &SequenceRecord,
    overlap: &PairOverlap,
) -> bool {
    let (Some(q1), Some(q2)) = (r1.qualities.as_ref(), r2.qualities.as_ref()) else {
        return false;
    };

    let mut expected = 0.0f64;
    for pair in &overlap.pairs {
        let b1 = r1.bases[pair.r1_index].to_ascii_uppercase();
        let b2 = complement_base(r2.bases[pair.r2_index]).to_ascii_uppercase();
        if !is_defined_base(b1) || !is_defined_base(b2) {
            continue;
        }
        let p1 = 1.0 - phred_error_probability(q1[pair.r1_index].saturating_sub(33));
        let p2 = 1.0 - phred_error_probability(q2[pair.r2_index].saturating_sub(33));
        expected += 1.0 - (p1 * p2);
    }

    (expected + 0.05) * 4.0 < overlap.mismatches as f64
}

fn overlap_probability_rejects(
    r1: &SequenceRecord,
    r2: &SequenceRecord,
    overlap: &PairOverlap,
) -> bool {
    const MIN_PROBABILITY: f64 = 0.0008;

    let (Some(q1), Some(q2)) = (r1.qualities.as_ref(), r2.qualities.as_ref()) else {
        return false;
    };

    let mut ln_actual = 0.0f64;
    let mut ln_common = 0.0f64;
    let mut measured = 0usize;

    for pair in &overlap.pairs {
        let b1 = r1.bases[pair.r1_index].to_ascii_uppercase();
        let b2 = complement_base(r2.bases[pair.r2_index]).to_ascii_uppercase();
        if !is_defined_base(b1) || !is_defined_base(b2) {
            continue;
        }

        let prob_correct = overlap_correctness_probability_v4(q1[pair.r1_index])
            * overlap_correctness_probability_v4(q2[pair.r2_index]);
        let prob_match = prob_correct + (1.0 - prob_correct) * 0.25;
        let prob_error = 1.0 - prob_match;

        ln_common += prob_match.max(prob_error).ln();
        ln_actual += if b1 == b2 { prob_match } else { prob_error }.ln();
        measured += 1;
    }

    if measured == 0 {
        return false;
    }

    0.5 * (ln_actual - ln_common) < MIN_PROBABILITY.ln()
}

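/// Consensus call for one overlapped base pair. Agreement keeps the base and
/// boosts the quality to `max + min/4`, capped at 50; a quality tie between
/// disagreeing bases yields `N` at quality 0; otherwise the higher-quality
/// base wins with the quality difference, unless the losing base already
/// exceeds `max_quality_to_correct`, in which case no call is made.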
fn overlap_consensus_base_and_quality(
    config: &Config,
    r1_base: u8,
    r2_base: u8,
    q1: u8,
    q2: u8,
) -> Option<(u8, u8)> {
    const MAX_MERGE_QUALITY: u8 = 50;

    if !is_defined_base(r1_base) && !is_defined_base(r2_base) {
        return None;
    }
    if !is_defined_base(r1_base) {
        return Some((r2_base, q2));
    }
    if !is_defined_base(r2_base) {
        return Some((r1_base, q1));
    }
    if r1_base == r2_base {
        let merged_quality = q1
            .max(q2)
            .saturating_add(q1.min(q2) / 4)
            .min(MAX_MERGE_QUALITY);
        return Some((r1_base, merged_quality));
    }
    if q1 == q2 {
        return Some((b'N', 0));
    }
    if q1 > q2 {
        if q2 > config.max_quality_to_correct {
            return None;
        }
        return Some((r1_base, q1.saturating_sub(q2)));
    }
    if q1 > config.max_quality_to_correct {
        return None;
    }
    Some((r2_base, q2.saturating_sub(q1)))
}

fn overlap_entropy_min_overlap(bases: &[u8]) -> usize {
    overlap_entropy_min_overlap_side(bases.iter().copied()).max(overlap_entropy_min_overlap_side(
        bases.iter().rev().copied(),
    ))
}

fn overlap_entropy_min_overlap_side(bases: impl IntoIterator<Item = u8>) -> usize {
    const K: usize = 3;
    const MASK: usize = (1 << (2 * K)) - 1;
    const MIN_SCORE: usize = 42;

    let mut counts = [0u16; 1 << (2 * K)];
    let mut kmer = 0usize;
    let mut len = 0usize;
    let mut ones = 0usize;
    let mut twos = 0usize;
    let mut seen = 0usize;

    for base in bases {
        let Some(bits) = base_to_two_bit(base) else {
            len = 0;
            kmer = 0;
            seen += 1;
            continue;
        };
        len += 1;
        kmer = ((kmer << 2) | bits) & MASK;
        if len >= K {
            counts[kmer] = counts[kmer].saturating_add(1);
            if counts[kmer] == 1 {
                ones += 1;
            } else if counts[kmer] == 2 {
                twos += 1;
            }
            if ones * 4 + twos >= MIN_SCORE {
                return seen;
            }
        }
        seen += 1;
    }

    seen + 1
}

fn base_to_two_bit(base: u8) -> Option<usize> {
    match base.to_ascii_uppercase() {
        b'A' => Some(0),
        b'C' => Some(1),
        b'G' => Some(2),
        b'T' => Some(3),
        _ => None,
    }
}

#[derive(Debug, Clone, Copy)]
struct OverlapBasePair {
    r1_index: usize,
    r2_index: usize,
}

#[derive(Debug, Clone)]
struct PairOverlap {
    pairs: Vec<OverlapBasePair>,
    mismatches: usize,
}

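// Overlap-detection tuning constants. OVERLAP_PROB_CORRECT4 is indexed by
// Phred score (ASCII quality minus 33) and appears to tabulate roughly
// 1 - 10^(-q/10), the probability that a base call is correct, with the
// lowest scores adjusted upward (q = 1 maps to ~0.25, a uniform guess).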
const OVERLAP_MAX_RATIO: f64 = 0.075;
const OVERLAP_MIN_SECOND_RATIO: f64 = 0.12;
const OVERLAP_RATIO_MARGIN: f64 = 7.5;
const OVERLAP_RATIO_OFFSET: f64 = 0.55;
const OVERLAP_PROB_CORRECT4: &[f64] = &[
    0.0000, 0.2501, 0.3690, 0.4988, 0.6019, 0.6838, 0.7488, 0.8005, 0.8415, 0.8741, 0.9000, 0.9206,
    0.9369, 0.9499, 0.9602, 0.9684, 0.9749, 0.9800, 0.9842, 0.9874, 0.9900, 0.9921, 0.9937, 0.9950,
    0.9960, 0.9968, 0.9975, 0.9980, 0.9984, 0.9987, 0.9990, 0.9992, 0.9994, 0.9995, 0.9996, 0.9997,
    0.9997, 0.9998, 0.9998, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999,
    0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999,
];

fn best_pair_overlap(r1: &SequenceRecord, r2: &SequenceRecord) -> Option<PairOverlap> {
    best_pair_overlap_without_qualities(&r1.bases, &r2.bases)
}

fn overlap_correctness_probability_v4(quality_ascii: u8) -> f64 {
    let phred = quality_ascii.saturating_sub(33) as usize;
    OVERLAP_PROB_CORRECT4[phred.min(OVERLAP_PROB_CORRECT4.len() - 1)]
}

fn best_pair_overlap_without_qualities(r1: &[u8], r2: &[u8]) -> Option<PairOverlap> {
    if r1.is_empty() || r2.is_empty() {
        return None;
    }
    if r1.len().min(r2.len()) < 35 {
        return None;
    }
    let min_overlap = 11usize
        .max(overlap_entropy_min_overlap(r1))
        .max(overlap_entropy_min_overlap(r2));
    let min_length = r1.len().min(r2.len());
    if min_overlap > min_length {
        return None;
    }

    let best_ratio_cap = find_best_overlap_ratio_without_qualities(r1, r2, min_overlap);
    if best_ratio_cap > OVERLAP_MAX_RATIO {
        return None;
    }
    let max_ratio = best_ratio_cap.min(OVERLAP_MAX_RATIO);
    let margin2 = (OVERLAP_RATIO_MARGIN + OVERLAP_RATIO_OFFSET) / min_length as f64;
    let mut best_insert: Option<usize> = None;
    let mut best_overlap = 0usize;
    let mut best_bad = min_length as f64;
    let mut best_good = 0.0f64;
    let mut best_ratio = 1.0f64;
    let mut best_mismatches = 0usize;
    let mut second_best_ratio = 1.0f64;
    let mut ambig = false;

    let largest_insert_to_test = r1.len() + r2.len() - 5;
    for insert in (25..=largest_insert_to_test).rev() {
        let start1 = if insert <= r2.len() {
            0
        } else {
            insert - r2.len()
        };
        let start2 = if insert >= r2.len() {
            0
        } else {
            r2.len() - insert
        };
        let overlap = (r1.len() - start1).min(r2.len() - start2).min(insert);
        if overlap < 5 {
            continue;
        }

        let bad_limit =
            1.2 * best_ratio.min(max_ratio) * OVERLAP_RATIO_MARGIN * overlap as f64 + 1.0;
        let mut good = 0.0f64;
        let mut bad = 0.0f64;
        let mut mismatches = 0usize;

        for step in 0..overlap {
            let r1_index = start1 + step;
            let r2_rc_index = start2 + step;
            let r2_index = r2.len() - 1 - r2_rc_index;
            let b1 = r1[r1_index].to_ascii_uppercase();
            let b2 = complement_base(r2[r2_index]).to_ascii_uppercase();
            if b1 == b2 {
                if b1 != b'N' {
                    good += 0.95;
                }
            } else {
                bad += 0.95;
                mismatches += 1;
                if bad > bad_limit {
                    break;
                }
            }
        }

        if bad > bad_limit {
            continue;
        }
        if bad == 0.0 && good > 5.0 && good < min_overlap as f64 {
            return None;
        }

        let ratio = (bad + OVERLAP_RATIO_OFFSET) / overlap as f64;
        if ratio < best_ratio * OVERLAP_RATIO_MARGIN {
            ambig = ratio * OVERLAP_RATIO_MARGIN >= best_ratio || good < min_overlap as f64;

            if ratio < best_ratio {
                second_best_ratio = best_ratio;
                best_insert = Some(insert);
                best_overlap = overlap;
                best_bad = bad;
                best_good = good;
                best_ratio = ratio;
                best_mismatches = mismatches;
            } else if ratio < second_best_ratio {
                second_best_ratio = ratio;
            }

            if (ambig && best_ratio < margin2) || second_best_ratio < OVERLAP_MIN_SECOND_RATIO {
                return None;
            }
        }
    }

    if second_best_ratio < OVERLAP_MIN_SECOND_RATIO {
        ambig = true;
    }
    if !ambig && best_ratio > max_ratio {
        return None;
    }

    let insert = best_insert?;
    let start1 = if insert <= r2.len() {
        0
    } else {
        insert - r2.len()
    };
    let start2 = if insert >= r2.len() {
        0
    } else {
        r2.len() - insert
    };
    let mut pairs = Vec::with_capacity(best_overlap);
    for step in 0..best_overlap {
        let r1_index = start1 + step;
        let r2_rc_index = start2 + step;
        let r2_index = r2.len() - 1 - r2_rc_index;
        pairs.push(OverlapBasePair { r1_index, r2_index });
    }

    let _ = (best_bad, best_good);
    Some(PairOverlap {
        pairs,
        mismatches: best_mismatches,
    })
}

fn find_best_overlap_ratio_without_qualities(r1: &[u8], r2: &[u8], min_overlap: usize) -> f64 {
    let mut best_ratio = OVERLAP_MAX_RATIO + 0.0001;
    let largest_insert_to_test = r1.len() + r2.len() - min_overlap;

    for insert in (35..=largest_insert_to_test).rev() {
        let start1 = if insert <= r2.len() {
            0
        } else {
            insert - r2.len()
        };
        let start2 = if insert >= r2.len() {
            0
        } else {
            r2.len() - insert
        };
        let overlap = (r1.len() - start1).min(r2.len() - start2).min(insert);
        if overlap < min_overlap {
            continue;
        }

        let mut good = 0.0f64;
        let mut bad = 0.0f64;
        let bad_limit = best_ratio * overlap as f64 + 1.0;

        for step in 0..overlap {
            let r1_index = start1 + step;
            let r2_rc_index = start2 + step;
            let r2_index = r2.len() - 1 - r2_rc_index;
            let b1 = r1[r1_index].to_ascii_uppercase();
            let b2 = complement_base(r2[r2_index]).to_ascii_uppercase();
            if b1 == b2 {
                if b1 != b'N' {
                    good += 0.95;
                }
            } else {
                bad += 0.95;
                if bad > bad_limit {
                    break;
                }
            }
        }

        if bad > bad_limit {
            continue;
        }
        if bad == 0.0 && good > 5.0 && good < min_overlap as f64 {
            return 100.0;
        }
        let ratio = (bad + OVERLAP_RATIO_OFFSET) / overlap as f64;
        if ratio < best_ratio {
            best_ratio = ratio;
            if good >= min_overlap as f64 && ratio < OVERLAP_MAX_RATIO * 0.5 {
                return best_ratio;
            }
        }
    }

    best_ratio
}

fn correct_read_errors(
    config: &Config,
    counts: &dyn CountLookup,
    record: &mut SequenceRecord,
) -> CorrectionResult {
    if config.max_errors_to_correct == 0 || record.len() < config.k || config.k > 31 {
        return CorrectionResult::default();
    }
    let mut coverage = coverage_windows_for_record(config, counts, record);
    if coverage.len() <= config.prefix_len.max(1) {
        return CorrectionResult::default();
    }
    if !has_error_discontinuity(config, &coverage) {
        return CorrectionResult::default();
    }

    if config.mark_errors_only {
        return mark_read_errors(config, record, &coverage);
    }

    let original_bases = record.bases.clone();
    let original_qualities = record.qualities.clone();
    let mut result = CorrectionResult::default();
    let mut remaining = config.max_errors_to_correct;

    if config.correct_from_left {
        let left = correct_errors_from_left(config, counts, record, &mut coverage, remaining);
        if left.uncorrectable {
            record.bases = original_bases;
            record.qualities = original_qualities;
            if config.mark_uncorrectable_errors {
                result.marked += mark_read_errors(config, record, &coverage).marked;
            }
            result.uncorrectable = true;
            return result;
        }
        remaining = remaining.saturating_sub(left.corrected);
        result.corrected += left.corrected;
    }

    if config.correct_from_right && remaining > 0 {
        let checkpoint_bases = record.bases.clone();
        let checkpoint_qualities = record.qualities.clone();
        let right = correct_errors_from_right(config, counts, record, &mut coverage, remaining);
        if right.uncorrectable {
            record.bases = checkpoint_bases;
            record.qualities = checkpoint_qualities;
            if config.mark_uncorrectable_errors {
                result.marked += mark_read_errors(config, record, &coverage).marked;
            }
            result.uncorrectable = true;
            return result;
        }
        result.corrected += right.corrected;
    }

    result
}

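/// Scans left to right for coverage discontinuities (minimum of the previous
/// `prefix_len` windows versus the current window) and attempts a single-base
/// substitution at the last base of the offending k-mer; the substitution's
/// support must land in `[max(high, a / 2), 2a]` and coverage is recomputed
/// after every accepted fix. Exceeding the budget, hitting a base above
/// `max_quality_to_correct`, or finding no qualifying substitution aborts the
/// read as uncorrectable.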
fn correct_errors_from_left(
    config: &Config,
    counts: &dyn CountLookup,
    record: &mut SequenceRecord,
    coverage: &mut Vec<i64>,
    max_to_correct: usize,
) -> CorrectionResult {
    let mut found = 0usize;
    let mut corrected = 0usize;
    let low = u64_to_i64_saturating(config.error_correct_low_thresh);
    let high = u64_to_i64_saturating(config.error_correct_high_thresh);
    let mult = u64_to_i64_saturating(config.error_correct_ratio);

    for i in config.prefix_len..coverage.len() {
        let a = min_coverage(&coverage[i - config.prefix_len..i]);
        let b = coverage[i];
        if !is_correction_discontinuity(a, b, low, high, mult) {
            continue;
        }
        found += 1;
        let loc = i + config.k - 1;
        if found > max_to_correct || base_quality(record, loc) > config.max_quality_to_correct {
            return CorrectionResult {
                corrected,
                uncorrectable: true,
                ..CorrectionResult::default()
            };
        }
        let target_lower = high.max(a / 2);
        let target_upper = a.saturating_mul(2);
        let target = CorrectionTarget {
            low,
            lower_bound: target_lower,
            upper_bound: target_upper,
            mult,
        };
        if try_correct_base(config, counts, record, loc, target) {
            corrected += 1;
            *coverage = coverage_windows_for_record(config, counts, record);
        } else {
            return CorrectionResult {
                corrected,
                uncorrectable: true,
                ..CorrectionResult::default()
            };
        }
    }

    CorrectionResult {
        corrected,
        ..CorrectionResult::default()
    }
}

fn correct_errors_from_right(
    config: &Config,
    counts: &dyn CountLookup,
    record: &mut SequenceRecord,
    coverage: &mut Vec<i64>,
    max_to_correct: usize,
) -> CorrectionResult {
    if coverage.len() <= config.prefix_len {
        return CorrectionResult::default();
    }
    let mut found = 0usize;
    let mut corrected = 0usize;
    let low = u64_to_i64_saturating(config.error_correct_low_thresh);
    let high = u64_to_i64_saturating(config.error_correct_high_thresh);
    let mult = u64_to_i64_saturating(config.error_correct_ratio);
    let start = coverage.len() - config.prefix_len - 1;

    for i in (0..=start).rev() {
        let a = min_coverage(&coverage[i + 1..=i + config.prefix_len]);
        let b = coverage[i];
        if !is_correction_discontinuity(a, b, low, high, mult) {
            continue;
        }
        found += 1;
        let loc = i;
        if found > max_to_correct || base_quality(record, loc) > config.max_quality_to_correct {
            return CorrectionResult {
                corrected,
                uncorrectable: true,
                ..CorrectionResult::default()
            };
        }
        let target_lower = high.max(a / 2);
        let target_upper = a.saturating_mul(2);
        let target = CorrectionTarget {
            low,
            lower_bound: target_lower,
            upper_bound: target_upper,
            mult,
        };
        if try_correct_base(config, counts, record, loc, target) {
            corrected += 1;
            *coverage = coverage_windows_for_record(config, counts, record);
        } else {
            return CorrectionResult {
                corrected,
                uncorrectable: true,
                ..CorrectionResult::default()
            };
        }
    }

    CorrectionResult {
        corrected,
        ..CorrectionResult::default()
    }
}

fn try_correct_base(
    config: &Config,
    counts: &dyn CountLookup,
    record: &mut SequenceRecord,
    loc: usize,
    target: CorrectionTarget,
) -> bool {
    let original = record.bases[loc];
    let mut candidates = [(b'A', 0i64), (b'C', 0), (b'G', 0), (b'T', 0)];
    for (base, support) in &mut candidates {
        *support = substitution_support(config, counts, record, loc, *base);
    }
    candidates.sort_by(|left, right| right.1.cmp(&left.1));
    let (best_base, best_support) = candidates[0];
    let second_best = candidates[1].1;
    if best_base == original.to_ascii_uppercase() {
        return false;
    }
    if best_support < target.lower_bound || best_support > target.upper_bound {
        return false;
    }
    if !(second_best <= target.low || second_best.saturating_mul(target.mult) <= best_support) {
        return false;
    }

    record.bases[loc] = best_base;
    if !is_defined_base(original)
        && let Some(qualities) = record.qualities.as_mut()
    {
        qualities[loc] = 20u8.saturating_add(33);
    }
    true
}

fn substitution_support(
    config: &Config,
    counts: &dyn CountLookup,
    record: &SequenceRecord,
    loc: usize,
    base: u8,
) -> i64 {
    let mut candidate = record.clone();
    candidate.bases[loc] = base;
    let windows = unfiltered_kmer_windows_for_record(&candidate, config);
    if windows.is_empty() {
        return 0;
    }
    let first = (loc + 1).saturating_sub(config.k);
    let last = loc.min(windows.len() - 1);
    let mut support = i64::MAX;
    for window in windows.iter().take(last + 1).skip(first) {
        let depth = window
            .as_ref()
            .map(|kmer| u64_to_i64_saturating(counts.depth(kmer)))
            .unwrap_or(0);
        support = support.min(depth);
    }
    if support == i64::MAX { 0 } else { support }
}

fn mark_read_errors(
    config: &Config,
    record: &mut SequenceRecord,
    coverage: &[i64],
) -> CorrectionResult {
    let low = u64_to_i64_saturating(config.error_correct_low_thresh);
    let high = u64_to_i64_saturating(config.error_correct_high_thresh);
    let mult = u64_to_i64_saturating(config.error_correct_ratio);
    let mut marked = 0usize;
    let mut marks = Vec::new();

    if config.correct_from_left {
        for i in config.prefix_len..coverage.len() {
            let a = min_coverage(&coverage[i - config.prefix_len..i]);
            let b = coverage[i];
            if is_correction_discontinuity(a, b, low, high, mult) {
                marks.push(i + config.k - 1);
            }
        }
    }
    if config.correct_from_right && coverage.len() > config.prefix_len {
        let start = coverage.len() - config.prefix_len - 1;
        for i in (0..=start).rev() {
            let a = min_coverage(&coverage[i + 1..=i + config.prefix_len]);
            let b = coverage[i];
            if is_correction_discontinuity(a, b, low, high, mult) {
                marks.push(i);
            }
        }
    }

    marks.sort_unstable();
    marks.dedup();
    for loc in marks {
        if let Some(qualities) = record.qualities.as_mut() {
            let phred = qualities[loc].saturating_sub(33);
            if phred == 0 {
                continue;
            }
            let new_phred = if config.mark_with_one {
                1
            } else {
                (phred / 2).saturating_sub(3).max(1)
            };
            qualities[loc] = new_phred.saturating_add(33);
        } else {
            record.bases[loc] = b'N';
        }
        marked += 1;
    }

    CorrectionResult {
        marked,
        ..CorrectionResult::default()
    }
}

fn coverage_windows_for_record(
    config: &Config,
    counts: &dyn CountLookup,
    record: &SequenceRecord,
) -> Vec<i64> {
    unfiltered_kmer_windows_for_record(record, config)
        .iter()
        .map(|window| {
            window
                .as_ref()
                .map(|kmer| u64_to_i64_saturating(counts.depth(kmer)))
                .unwrap_or(0)
        })
        .collect()
}

fn has_error_discontinuity(config: &Config, coverage: &[i64]) -> bool {
    let low = u64_to_i64_saturating(config.error_correct_low_thresh);
    let high = u64_to_i64_saturating(config.error_correct_high_thresh);
    let mult = u64_to_i64_saturating(config.error_correct_ratio);
    if coverage.len() <= config.prefix_len {
        return false;
    }
    for i in config.prefix_len..coverage.len() {
        if is_correction_discontinuity(
            min_coverage(&coverage[i - config.prefix_len..i]),
            coverage[i],
            low,
            high,
            mult,
        ) {
            return true;
        }
    }
    let start = coverage.len() - config.prefix_len - 1;
    for i in (0..=start).rev() {
        if is_correction_discontinuity(
            min_coverage(&coverage[i + 1..=i + config.prefix_len]),
            coverage[i],
            low,
            high,
            mult,
        ) {
            return true;
        }
    }
    false
}

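/// A discontinuity worth correcting: the neighboring coverage `a` reaches at
/// least `high` while the current window `b` is at or below `low`, or is
/// smaller than `a` by at least a factor of `mult`.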
fn is_correction_discontinuity(a: i64, b: i64, low: i64, high: i64, mult: i64) -> bool {
    a >= high && (b <= low || a >= b.saturating_mul(mult))
}

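// Illustrative check added during editing (not part of the original suite).
#[test]
fn correction_discontinuity_requires_high_neighbor_coverage() {
    // A drop from depth 40 to 2 with low=3, high=20, mult=8 is flagged; the
    // same drop from depth 10 is not, because `a` never reaches `high`.
    assert!(is_correction_discontinuity(40, 2, 3, 20, 8));
    assert!(!is_correction_discontinuity(10, 2, 3, 20, 8));
}
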
fn min_coverage(values: &[i64]) -> i64 {
    values.iter().copied().min().unwrap_or(0)
}

fn base_quality(record: &SequenceRecord, loc: usize) -> u8 {
    record
        .qualities
        .as_ref()
        .and_then(|qualities| qualities.get(loc))
        .copied()
        .map(|quality| quality.saturating_sub(33))
        .unwrap_or(10)
}

fn is_defined_base(base: u8) -> bool {
    matches!(base.to_ascii_uppercase(), b'A' | b'C' | b'G' | b'T')
}

fn complement_base(base: u8) -> u8 {
    match base.to_ascii_uppercase() {
        b'A' => b'T',
        b'C' => b'G',
        b'G' => b'C',
        b'T' => b'A',
        _ => b'N',
    }
}

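/// Smooths single-window coverage spikes, which typically indicate count
/// artifacts rather than real depth: an interior window strictly higher than
/// both neighbors (or a first/last window jumping by more than 1) is
/// re-estimated from exact neighbor-extension counts instead of its own.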
9192fn fix_spikes(
9193 coverage: &mut [i64],
9194 windows: &[Option<KmerKey>],
9195 counts: &dyn CountLookup,
9196 k: usize,
9197) {
9198 if k == 0 || coverage.len() < 3 {
9199 return;
9200 }
9201 if coverage[1] - coverage[0] > 1 {
9202 coverage[0] = precise_kmer_count(windows[0].as_ref(), counts, k);
9203 }
9204
9205 let last = coverage.len() - 1;
9206 if coverage[last] - coverage[last - 1] > 1 {
9207 coverage[last] = precise_kmer_count(windows[last].as_ref(), counts, k);
9208 }
9209
9210 for i in 1..last {
9211 let b = coverage[i];
9212 if b <= 1 {
9213 continue;
9214 }
9215 let a = coverage[i - 1].max(1);
9216 let c = coverage[i + 1].max(1);
9217 if b > a && b > c && (b < 6 || b > a + 1 || b > c + 1) {
9218 coverage[i] = precise_min_kmer_count(windows[i].as_ref(), counts, k);
9219 }
9220 }
9221}
9222
9223fn precise_kmer_count(window: Option<&KmerKey>, counts: &dyn CountLookup, k: usize) -> i64 {
9224 let Some(window) = window else {
9225 return 0;
9226 };
9227 let key = raw_kmer_key(window);
9228 let b = kmer_count(window, key, counts, k);
9229 if b < 1 {
9230 return b;
9231 }
9232 let a = left_kmer_count(window, key, counts, k);
9233 if a >= b {
9234 return b;
9235 }
9236 let c = right_kmer_count(window, key, counts, k);
9237 if c >= b {
9238 return b;
9239 }
9240 (a + c) / 2
9241}
9242
9243fn precise_min_kmer_count(window: Option<&KmerKey>, counts: &dyn CountLookup, k: usize) -> i64 {
9244 let Some(window) = window else {
9245 return 0;
9246 };
9247 let key = raw_kmer_key(window);
9248 let b = kmer_count(window, key, counts, k);
9249 if b < 1 {
9250 return b;
9251 }
9252 let a = left_kmer_count(window, key, counts, k);
9253 if a < 1 {
9254 return a;
9255 }
9256 let c = right_kmer_count(window, key, counts, k);
9257 a.min(b).min(c)
9258}
9259
9260fn raw_kmer_key(window: &KmerKey) -> u64 {
9261 match window {
9262 KmerKey::Short(key) | KmerKey::LongHash(key) => *key,
9263 }
9264}
9265
fn kmer_count(template: &KmerKey, raw_key: u64, counts: &dyn CountLookup, k: usize) -> i64 {
    let key = match template {
        KmerKey::Short(_) => KmerKey::Short(canonical_short_code(raw_key, k)),
        KmerKey::LongHash(_) => KmerKey::LongHash(java_canonical_long_key(raw_key, k)),
    };
    u64_to_i64_saturating(counts.depth(&key))
}

fn left_kmer_count(template: &KmerKey, key: u64, counts: &dyn CountLookup, k: usize) -> i64 {
    let key2 = key >> 2;
    let shift = ((2 * (k - 1)) & 63) as u32;
    (0..4)
        .map(|base| kmer_count(template, key2 | (base << shift), counts, k))
        .fold(0i64, i64::saturating_add)
}

fn right_kmer_count(template: &KmerKey, key: u64, counts: &dyn CountLookup, k: usize) -> i64 {
    let mask = if k >= 32 {
        u64::MAX
    } else {
        (1u64 << (2 * k)) - 1
    };
    let key2 = (key << 2) & mask;
    (0..4)
        .map(|base| kmer_count(template, key2 | base, counts, k))
        .fold(0i64, i64::saturating_add)
}

fn java_canonical_long_key(key: u64, k: usize) -> u64 {
    let reverse = java_reverse_complement_binary_fast(key, k);
    key.max(reverse)
}

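/// Reverse complement of a 2-bit-packed k-mer, mirroring the Java-style
/// bit cascade: `!key` complements every base (A=00 <-> T=11, C=01 <-> G=10),
/// the masked swaps plus the 32-bit rotate reverse all 32 two-bit groups,
/// and the final shift discards the 2*(32-k) bits left over when k < 32.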
fn java_reverse_complement_binary_fast(key: u64, k: usize) -> u64 {
    let mut x = !key;
    x = ((x & 0x3333_3333_3333_3333) << 2) | ((x & 0xCCCC_CCCC_CCCC_CCCC) >> 2);
    x = ((x & 0x0F0F_0F0F_0F0F_0F0F) << 4) | ((x & 0xF0F0_F0F0_F0F0_F0F0) >> 4);
    x = ((x & 0x00FF_00FF_00FF_00FF) << 8) | ((x & 0xFF00_FF00_FF00_FF00) >> 8);
    x = ((x & 0x0000_FFFF_0000_FFFF) << 16) | ((x & 0xFFFF_0000_FFFF_0000) >> 16);
    x = x.rotate_right(32);

    let shift = (2usize.wrapping_mul(32usize.wrapping_sub(k)) & 63) as u32;
    x >> shift
}

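// Sanity checks for the reverse-complement bit cascade above; a minimal
// sketch assuming the usual 2-bit packing (A=00, C=01, G=10, T=11, with the
// first base in the most significant pair of the low 2k bits).
#[cfg(test)]
mod revcomp_sanity_tests {
    use super::java_reverse_complement_binary_fast;

    #[test]
    fn reverse_complement_is_an_involution() {
        // "AC" (k = 2) packs to 0b0001; its reverse complement "GT" is 0b1011.
        assert_eq!(java_reverse_complement_binary_fast(0b0001, 2), 0b1011);
        // Applying the operation twice must return the original k-mer.
        for key in 0u64..64 {
            let rc = java_reverse_complement_binary_fast(key, 3);
            assert_eq!(java_reverse_complement_binary_fast(rc, 3), key);
        }
    }
}
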
fn decide_pair(
    config: &Config,
    input_counts: &dyn CountLookup,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
    rand: Option<f64>,
) -> PairDecision {
    let analysis = analyze_pair(config, input_counts, r1, r2);
    decide_pair_from_analysis(config, r1, r2, analysis, rand)
}

fn decide_pair_from_analysis(
    config: &Config,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
    analysis: PairAnalysis,
    rand: Option<f64>,
) -> PairDecision {
    let (target_depth, max_depth) = dynamic_depth_limits(config, &analysis);
    let mut toss = false;

    match analysis.depth_proxy_al {
        Some(depth) => {
            if depth > max_depth && (analysis.error1 || analysis.error2 || !config.discard_bad_only)
            {
                let coin = deterministic_coin(rand, depth);
                if coin > target_depth {
                    toss = true;
                }
            }
        }
        None => toss = true,
    }

    if r1.len() < config.min_length || r2.is_some_and(|mate| mate.len() < config.min_length) {
        toss = true;
    }

    if config.toss_error_reads && (analysis.error1 || analysis.error2) {
        let save_rare = config.save_rare_reads
            && analysis
                .depth_proxy_al
                .is_some_and(|depth| depth <= target_depth && depth >= config.high_thresh);
        if !save_rare
            && (!config.require_both_bad || r2.is_none() || (analysis.error1 && analysis.error2))
        {
            toss = true;
        }
    }

    if config.toss_by_low_true_depth && !config.save_rare_reads {
        let low_enough = analysis
            .max_true_depth
            .is_some_and(|depth| depth < config.min_depth);
        let required_bad = !config.require_both_bad
            || r2.is_none()
            || (depth_below_min(analysis.read1.min_true_depth, config.min_depth)
                && analysis
                    .read2
                    .as_ref()
                    .is_some_and(|read| depth_below_min(read.min_true_depth, config.min_depth)));
        if low_enough && required_bad {
            toss = true;
        }
    }

    if config.keep_all {
        toss = false;
    }

    PairDecision { toss, analysis }
}

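/// Derives the effective depth limits for one pair. With no low-depth
/// k-mers the configured (target, max) pair is used unchanged; otherwise
/// the target is interpolated between the low/high bad-percent bounds,
/// weighted by the squared fraction of good k-mers, and the same adjusted
/// value is returned for both the target and the max.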
fn dynamic_depth_limits(config: &Config, analysis: &PairAnalysis) -> (u64, u64) {
    let default_max_depth = config.max_depth.unwrap_or(config.target_depth);
    if analysis.low_kmer_count == 0 || analysis.total_kmer_count == 0 {
        return (config.target_depth, default_max_depth);
    }

    let low_target = ((config.target_depth as f64) * config.target_bad_percent_low)
        .round()
        .max(1.0);
    let high_target = ((config.target_depth as f64) * config.target_bad_percent_high)
        .round()
        .max(low_target)
        .min(config.target_depth as f64);
    let fraction_good = (analysis.total_kmer_count - analysis.low_kmer_count) as f64
        / analysis.total_kmer_count as f64;
    let adjusted = low_target + (high_target - low_target) * (fraction_good * fraction_good);
    let target = adjusted as u64;
    (target.max(1), target.max(1))
}

fn maybe_rename_pair(
    config: &Config,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
    analysis: &PairAnalysis,
) -> (SequenceRecord, Option<SequenceRecord>) {
    if !config.rename_reads {
        return (r1.clone(), r2.cloned());
    }
    let d1 = depth_label(analysis.read1.depth_al);
    let out1 = match r2 {
        Some(_) => {
            let mut id = format!(
                "id={},d1={},d2={}",
                r1.numeric_id,
                d1,
                depth_label(analysis.read2.as_ref().and_then(|a| a.depth_al))
            );
            if config.error_correct {
                id.push_str(",e1=0,e2=0");
            }
            id.push_str(" /1");
            r1.renamed(id)
        }
        None => {
            let mut id = format!("id={},d1={}", r1.numeric_id, d1);
            if config.error_correct {
                id.push_str(",e1=0");
            }
            r1.renamed(id)
        }
    };
    let out2 = r2.map(|mate| {
        let mut id = format!(
            "id={},d1={},d2={}",
            r1.numeric_id,
            d1,
            depth_label(analysis.read2.as_ref().and_then(|a| a.depth_al))
        );
        if config.error_correct {
            id.push_str(",e1=0,e2=0");
        }
        id.push_str(" /2");
        mate.renamed(id)
    });
    (out1, out2)
}

fn depth_label(depth: Option<u64>) -> String {
    depth
        .map(|value| value.to_string())
        .unwrap_or_else(|| "-1".to_string())
}

fn increment_pair_counts(
    config: &Config,
    counts: &mut CountMap,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
) {
    increment_pair_counts_with_prefilter(config, counts, r1, r2, None);
}

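/// Counts every k-mer of the pair into `counts`. Within-pair duplicate
/// k-mers are collapsed only in the packed-key regime (k <= 31); an
/// optional prefilter gate decides which keys reach the main table at all.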
fn increment_pair_counts_with_prefilter(
    config: &Config,
    counts: &mut CountMap,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
    prefilter: Option<PrefilterGate<'_>>,
) {
    if config.remove_duplicate_kmers && config.k <= 31 {
        for kmer in unique_pair_kmers(config, r1, r2) {
            if prefilter.is_none_or(|gate| gate.should_count_in_main(&kmer)) {
                *counts.entry(kmer).or_insert(0) += 1;
            }
        }
    } else {
        for_each_kmer_for_record(r1, config, |kmer| {
            if prefilter.is_none_or(|gate| gate.should_count_in_main(&kmer)) {
                *counts.entry(kmer).or_insert(0) += 1;
            }
        });
        if let Some(mate) = r2 {
            for_each_kmer_for_record(mate, config, |kmer| {
                if prefilter.is_none_or(|gate| gate.should_count_in_main(&kmer)) {
                    *counts.entry(kmer).or_insert(0) += 1;
                }
            });
        }
    }
}

fn increment_counts_from_pair_chunk(
    config: &Config,
    counts: &mut CountMap,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
) {
    let chunk_counts = pairs
        .par_iter()
        .fold(
            || count_chunk_local_map(config, pairs),
            |mut local_counts, (r1, r2)| {
                increment_pair_counts(config, &mut local_counts, r1, r2.as_ref());
                local_counts
            },
        )
        .reduce(CountMap::default, |mut left, right| {
            merge_count_maps(&mut left, right);
            left
        });
    merge_count_maps(counts, chunk_counts);
}

fn increment_sketch_from_pair_chunk(
    config: &Config,
    sketch: &mut PackedCountMinSketch,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
    prefilter: Option<PrefilterGate<'_>>,
) {
    if config.deterministic && sketch.update_mode == CountMinUpdateMode::Conservative {
        increment_sketch_from_pair_chunk_sorted_replay(config, sketch, pairs, prefilter);
        return;
    }
    let chunk_counts = pairs
        .par_iter()
        .fold(
            || count_chunk_local_map(config, pairs),
            |mut local_counts, (r1, r2)| {
                increment_pair_counts_with_prefilter(
                    config,
                    &mut local_counts,
                    r1,
                    r2.as_ref(),
                    prefilter,
                );
                local_counts
            },
        )
        .reduce(CountMap::default, |mut left, right| {
            merge_count_maps(&mut left, right);
            left
        });
    let key_increments = chunk_counts.values().copied().sum();
    sketch.add_key_counts(&chunk_counts);
    sketch.add_key_increments(key_increments);
}

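/// Deterministic chunk update: per-thread count maps are flattened into one
/// vector, sorted by key, and replayed into the sketch sequentially.
/// Conservative count-min updates are order-sensitive, so a fixed replay
/// order makes the result independent of thread scheduling.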
fn increment_sketch_from_pair_chunk_sorted_replay(
    config: &Config,
    sketch: &mut PackedCountMinSketch,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
    prefilter: Option<PrefilterGate<'_>>,
) {
    let mut entries = pairs
        .par_iter()
        .fold(
            || count_chunk_local_map(config, pairs),
            |mut local_counts, (r1, r2)| {
                increment_pair_counts_with_prefilter(
                    config,
                    &mut local_counts,
                    r1,
                    r2.as_ref(),
                    prefilter,
                );
                local_counts
            },
        )
        .map(|counts| counts.into_iter().collect::<Vec<_>>())
        .reduce(Vec::new, |mut left, mut right| {
            left.append(&mut right);
            left
        });
    entries.par_sort_unstable_by(|(left, _), (right, _)| left.cmp(right));

    let mut key_increments = 0u64;
    let mut iter = entries.into_iter();
    let Some((mut current_key, mut current_count)) = iter.next() else {
        return;
    };
    for (key, count) in iter {
        if key == current_key {
            current_count = current_count.saturating_add(count);
        } else {
            key_increments = key_increments.saturating_add(current_count);
            sketch.add_key_count(&current_key, current_count);
            current_key = key;
            current_count = count;
        }
    }
    key_increments = key_increments.saturating_add(current_count);
    sketch.add_key_count(&current_key, current_count);
    sketch.add_key_increments(key_increments);
}

fn increment_atomic_packed_sketch_from_pair_chunk(
    config: &Config,
    sketch: &AtomicPackedCountMinSketch,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
) {
    let (key_increments, newly_occupied) = pairs
        .par_iter()
        .map(|(r1, r2)| increment_pair_atomic_packed_sketch(config, sketch, r1, r2.as_ref()))
        .reduce(
            || (0u64, 0usize),
            |left, right| {
                (
                    left.0.saturating_add(right.0),
                    left.1.saturating_add(right.1),
                )
            },
        );
    sketch.add_key_increments(key_increments);
    sketch.add_occupied_slots(newly_occupied);
}

fn increment_pair_atomic_packed_sketch(
    config: &Config,
    sketch: &AtomicPackedCountMinSketch,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
) -> (u64, usize) {
    if config.remove_duplicate_kmers && config.k <= 31 {
        let keys = unique_pair_kmers(config, r1, r2);
        let mut newly_occupied = 0usize;
        for key in &keys {
            newly_occupied += sketch.add_key_count_counting_newly_occupied(key, 1);
        }
        return (keys.len() as u64, newly_occupied);
    }
    let mut key_increments = 0u64;
    let mut newly_occupied = 0usize;
    for_each_kmer_for_record(r1, config, |kmer| {
        newly_occupied += sketch.add_key_count_counting_newly_occupied(&kmer, 1);
        key_increments += 1;
    });
    if let Some(mate) = r2 {
        for_each_kmer_for_record(mate, config, |kmer| {
            newly_occupied += sketch.add_key_count_counting_newly_occupied(&kmer, 1);
            key_increments += 1;
        });
    }
    (key_increments, newly_occupied)
}

fn increment_atomic_sketch_from_pair_chunk(
    config: &Config,
    sketch: &AtomicCountMinSketch,
    pairs: &[(SequenceRecord, Option<SequenceRecord>)],
    prefilter: Option<PrefilterGate<'_>>,
) {
    if !config.deterministic {
        let (key_increments, newly_occupied) = pairs
            .par_iter()
            .map(|(r1, r2)| {
                increment_pair_atomic_sketch_direct(config, sketch, r1, r2.as_ref(), prefilter)
            })
            .reduce(
                || (0u64, 0usize),
                |left, right| {
                    (
                        left.0.saturating_add(right.0),
                        left.1.saturating_add(right.1),
                    )
                },
            );
        sketch.add_key_increments(key_increments);
        sketch.add_occupied_slots(newly_occupied);
        return;
    }

    let mut entries = pairs
        .par_iter()
        .fold(
            || count_chunk_local_map(config, pairs),
            |mut local_counts, (r1, r2)| {
                increment_pair_counts_with_prefilter(
                    config,
                    &mut local_counts,
                    r1,
                    r2.as_ref(),
                    prefilter,
                );
                local_counts
            },
        )
        .map(|counts| counts.into_iter().collect::<Vec<_>>())
        .reduce(Vec::new, |mut left, mut right| {
            left.append(&mut right);
            left
        });
    entries.par_sort_unstable_by(|(left, _), (right, _)| left.cmp(right));

    let mut key_increments = 0u64;
    let mut iter = entries.into_iter();
    let Some((mut current_key, mut current_count)) = iter.next() else {
        return;
    };
    for (key, count) in iter {
        if key == current_key {
            current_count = current_count.saturating_add(count);
        } else {
            key_increments = key_increments.saturating_add(current_count);
            sketch.add_key_count(&current_key, current_count);
            current_key = key;
            current_count = count;
        }
    }
    key_increments = key_increments.saturating_add(current_count);
    sketch.add_key_count(&current_key, current_count);
    sketch.add_key_increments(key_increments);
}

fn increment_pair_atomic_sketch_direct(
    config: &Config,
    sketch: &AtomicCountMinSketch,
    r1: &SequenceRecord,
    r2: Option<&SequenceRecord>,
    prefilter: Option<PrefilterGate<'_>>,
) -> (u64, usize) {
    if config.remove_duplicate_kmers && config.k <= 31 {
        let keys = unique_pair_kmers(config, r1, r2);
        let mut key_increments = 0u64;
        let mut newly_occupied = 0usize;
        for key in &keys {
            if prefilter.is_none_or(|gate| gate.should_count_in_main(key)) {
                newly_occupied += sketch.add_key_count_counting_newly_occupied(key, 1);
                key_increments += 1;
            }
        }
        return (key_increments, newly_occupied);
    }

    let mut key_increments = 0u64;
    let mut newly_occupied = 0usize;
    for_each_kmer_for_record(r1, config, |kmer| {
        if prefilter.is_none_or(|gate| gate.should_count_in_main(&kmer)) {
            newly_occupied += sketch.add_key_count_counting_newly_occupied(&kmer, 1);
            key_increments += 1;
        }
    });
    if let Some(mate) = r2 {
        for_each_kmer_for_record(mate, config, |kmer| {
            if prefilter.is_none_or(|gate| gate.should_count_in_main(&kmer)) {
                newly_occupied += sketch.add_key_count_counting_newly_occupied(&kmer, 1);
                key_increments += 1;
            }
        });
    }
    (key_increments, newly_occupied)
}

#[cfg(test)]
fn retain_prefilter_saturated_counts(counts: &mut CountMap, prefilter: Option<PrefilterGate<'_>>) {
    let Some(prefilter) = prefilter else {
        return;
    };
    counts.retain(|key, _| prefilter.should_count_in_main(key));
}

fn merge_count_maps(counts: &mut CountMap, source: CountMap) {
    for (kmer, count) in source {
        *counts.entry(kmer).or_insert(0) += count;
    }
}

fn trim_pair(config: &Config, r1: &mut SequenceRecord, r2: Option<&mut SequenceRecord>) {
    if !config.trim_left && !config.trim_right {
        return;
    }
    trim_record(config, r1);
    if let Some(mate) = r2 {
        trim_record(config, mate);
    }
}

fn trim_record(config: &Config, record: &mut SequenceRecord) {
    if record.is_empty() {
        return;
    }
    let (left0, right0) = if config.trim_optimal {
        optimal_trim_amounts(record, config)
    } else if config.trim_window {
        (0, window_trim_right_amount(record, config))
    } else {
        simple_trim_amounts(record, config)
    };
    let left = if config.trim_left { left0 } else { 0 };
    let right = if config.trim_right { right0 } else { 0 };
    trim_by_amount(record, left, right, 1);
}

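/// Kadane-style optimal trim: each base scores `avg_error_rate -
/// prob_error`, so the maximum-scoring run is the stretch whose average
/// error rate beats the threshold, and everything outside it is trimmed.
/// Without qualities this falls back to trimming flanking N runs.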
fn optimal_trim_amounts(record: &SequenceRecord, config: &Config) -> (usize, usize) {
    let avg_error_rate = config
        .trim_optimal_bias
        .unwrap_or_else(|| phred_to_prob_error(config.trim_quality));
    if let Some(qualities) = record.qualities.as_deref() {
        let nprob = (avg_error_rate * 1.1).clamp(0.75, 1.0);
        let mut max_score = 0.0f64;
        let mut score = 0.0f64;
        let mut max_loc = 0usize;
        let mut max_count = 0usize;
        let mut count = 0usize;

        for (idx, (&base, &quality)) in record.bases.iter().zip(qualities).enumerate() {
            let phred = quality.saturating_sub(33);
            let prob_error = if base == b'N' || phred < 1 {
                nprob
            } else {
                phred_to_prob_error(f64::from(phred))
            };
            score += avg_error_rate - prob_error;
            if score > 0.0 {
                count += 1;
                if score > max_score || (score == max_score && count > max_count) {
                    max_score = score;
                    max_count = count;
                    max_loc = idx;
                }
            } else {
                score = 0.0;
                count = 0;
            }
        }

        if max_score > 0.0 {
            (max_loc + 1 - max_count, record.len() - max_loc - 1)
        } else {
            (0, record.len())
        }
    } else if avg_error_rate >= 1.0 {
        (0, 0)
    } else {
        (
            test_left_n(&record.bases, config.trim_min_good_interval),
            test_right_n(&record.bases, config.trim_min_good_interval),
        )
    }
}

fn simple_trim_amounts(record: &SequenceRecord, config: &Config) -> (usize, usize) {
    let trimq = config.trim_quality as u8;
    if let Some(qualities) = record.qualities.as_deref() {
        (
            test_left_quality(qualities, trimq, config.trim_min_good_interval),
            test_right_quality(qualities, trimq, config.trim_min_good_interval),
        )
    } else {
        (
            test_left_n(&record.bases, config.trim_min_good_interval),
            test_right_n(&record.bases, config.trim_min_good_interval),
        )
    }
}

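/// Sliding-window quality trim from the right: keeps a rolling sum of the
/// last `trim_window_length` Phred scores and, at the first window whose
/// sum drops below `trim_window_length * trimq`, trims from that window's
/// start through the end of the read.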
fn window_trim_right_amount(record: &SequenceRecord, config: &Config) -> usize {
    let trimq = config.trim_quality as i32;
    let Some(qualities) = record.qualities.as_deref() else {
        return if trimq > 0 {
            0
        } else {
            test_right_n(&record.bases, config.trim_min_good_interval)
        };
    };
    if qualities.len() < config.trim_window_length {
        return if trimq > 0 {
            0
        } else {
            test_right_n(&record.bases, config.trim_min_good_interval)
        };
    }

    let Ok(window) = isize::try_from(config.trim_window_length) else {
        return 0;
    };
    let threshold = (config.trim_window_length as i32 * trimq).max(1);
    let mut sum = 0i32;
    for (idx, &quality) in qualities.iter().enumerate() {
        let Ok(idx) = isize::try_from(idx) else {
            return 0;
        };
        let j = idx - window;
        sum += i32::from(quality.saturating_sub(33));
        if j >= -1 {
            if j >= 0 {
                sum -= i32::from(qualities[j as usize].saturating_sub(33));
            }
            if sum < threshold {
                // `j` can be -1 on the first full window; compute the trim in
                // signed space to avoid casting -1 to usize and underflowing.
                return (qualities.len() as isize - j - 1) as usize;
            }
        }
    }
    0
}

fn test_left_quality(qualities: &[u8], trimq: u8, min_good_interval: usize) -> usize {
    let mut good = 0usize;
    let mut last_bad = None;
    for (idx, &quality) in qualities.iter().enumerate() {
        if good >= min_good_interval {
            break;
        }
        if quality.saturating_sub(33) > trimq {
            good += 1;
        } else {
            good = 0;
            last_bad = Some(idx);
        }
    }
    last_bad.map_or(0, |idx| idx + 1)
}

fn test_right_quality(qualities: &[u8], trimq: u8, min_good_interval: usize) -> usize {
    let mut good = 0usize;
    let mut last_bad = qualities.len();
    for (idx, &quality) in qualities.iter().enumerate().rev() {
        if good >= min_good_interval {
            break;
        }
        if quality.saturating_sub(33) > trimq {
            good += 1;
        } else {
            good = 0;
            last_bad = idx;
        }
    }
    qualities.len() - last_bad
}

fn test_left_n(bases: &[u8], min_good_interval: usize) -> usize {
    let mut good = 0usize;
    let mut last_bad = None;
    for (idx, &base) in bases.iter().enumerate() {
        if good >= min_good_interval {
            break;
        }
        if base != b'N' {
            good += 1;
        } else {
            good = 0;
            last_bad = Some(idx);
        }
    }
    last_bad.map_or(0, |idx| idx + 1)
}

fn test_right_n(bases: &[u8], min_good_interval: usize) -> usize {
    let mut good = 0usize;
    let mut last_bad = bases.len();
    for (idx, &base) in bases.iter().enumerate().rev() {
        if good >= min_good_interval {
            break;
        }
        if base != b'N' {
            good += 1;
        } else {
            good = 0;
            last_bad = idx;
        }
    }
    bases.len() - last_bad
}

fn trim_by_amount(
    record: &mut SequenceRecord,
    mut left_trim: usize,
    mut right_trim: usize,
    min_resulting_length: usize,
) -> usize {
    let len = record.len();
    if len == 0 {
        return 0;
    }
    let min_resulting_length = min_resulting_length.min(len);
    if left_trim + right_trim + min_resulting_length > len {
        right_trim = 1usize.max(len.saturating_sub(min_resulting_length));
        left_trim = 0;
    }
    let total = left_trim + right_trim;
    if total > 0 {
        record.bases = record.bases[left_trim..len - right_trim].to_vec();
        if let Some(qualities) = record.qualities.take() {
            let qlen = qualities.len();
            record.qualities = if total >= qlen {
                None
            } else {
                Some(qualities[left_trim..qlen - right_trim].to_vec())
            };
        }
    }
    total
}

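/// Converts a Phred score to an error probability: 0.75 at Q0 (a uniformly
/// random base), a linear ramp down to 0.70 at Q1, then the standard
/// 10^(-Q/10) curve capped at 0.7 (so Q20 -> 0.01).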
fn phred_to_prob_error(q: f64) -> f64 {
    if q <= 0.0 {
        0.75
    } else if q <= 1.0 {
        0.75 - q * 0.05
    } else {
        0.7_f64.min(10_f64.powf(-0.1 * q))
    }
}

fn increment_sparse_hist_from_analysis(
    hist: &mut SparseHist,
    analysis: &ReadAnalysis,
    hist_len: usize,
) {
    for depth in &analysis.coverage_desc {
        if *depth < 0 {
            continue;
        }
        let idx = (*depth as usize).min(hist_len - 1);
        *hist.entry(idx).or_insert(0) += 1;
    }
}

#[cfg(test)]
fn increment_hist_from_pair_chunk(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    hist: &mut [u64],
    pairs: &[AnalysisPair],
) {
    let chunk_hist = sparse_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, pairs);
    merge_sparse_hist_into_dense(hist, chunk_hist);
}

fn sparse_hist_from_pair_chunk(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    pairs: &[AnalysisPair],
) -> SparseHist {
    pairs
        .par_iter()
        .fold(SparseHist::default, |mut local_hist, (r1, r2, rand)| {
            if let Some(input_counts) = keep_filter_counts {
                let decision = decide_pair(config, input_counts, r1, r2.as_ref(), *rand);
                if decision.toss {
                    return local_hist;
                }
            }

            let analysis = analyze_pair(config, hist_counts, r1, r2.as_ref());
            increment_sparse_hist_from_analysis(&mut local_hist, &analysis.read1, config.hist_len);
            if let Some(read2) = &analysis.read2 {
                increment_sparse_hist_from_analysis(&mut local_hist, read2, config.hist_len);
            }
            local_hist
        })
        .reduce(SparseHist::default, |mut left, right| {
            merge_sparse_hist(&mut left, right);
            left
        })
}

fn merge_sparse_hist(target: &mut SparseHist, source: SparseHist) {
    for (idx, count) in source {
        *target.entry(idx).or_insert(0) += count;
    }
}

#[cfg(test)]
fn merge_sparse_hist_into_dense(target: &mut [u64], source: SparseHist) {
    for (idx, count) in source {
        target[idx] += count;
    }
}

fn increment_sparse_read_hist(
    hist: &mut SparseReadDepthHist,
    analysis: &ReadAnalysis,
    read_len: usize,
    hist_len: usize,
) {
    if !analysis.had_kmer_windows {
        return;
    }
    let depth = analysis.depth_al.or(analysis.true_depth).unwrap_or(0);
    let idx = (depth as usize).min(hist_len - 1);
    let entry = hist.entry(idx).or_insert((0, 0));
    entry.0 += 1;
    entry.1 += read_len as u64;
}

#[cfg(test)]
fn increment_read_hist_from_pair_chunk(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    hist: &mut ReadDepthHistogram,
    pairs: &[AnalysisPair],
) {
    let chunk_hist =
        sparse_read_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, pairs);
    merge_sparse_read_depth_hist_into_dense(hist, chunk_hist);
}

fn sparse_read_hist_from_pair_chunk(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    pairs: &[AnalysisPair],
) -> SparseReadDepthHist {
    pairs
        .par_iter()
        .fold(
            SparseReadDepthHist::default,
            |mut local_hist, (r1, r2, rand)| {
                if let Some(input_counts) = keep_filter_counts {
                    let decision = decide_pair(config, input_counts, r1, r2.as_ref(), *rand);
                    if decision.toss {
                        return local_hist;
                    }
                }

                let analysis = analyze_pair(config, hist_counts, r1, r2.as_ref());
                increment_sparse_read_hist(
                    &mut local_hist,
                    &analysis.read1,
                    r1.len(),
                    config.hist_len,
                );
                if let (Some(read2_analysis), Some(read2)) = (&analysis.read2, r2.as_ref()) {
                    increment_sparse_read_hist(
                        &mut local_hist,
                        read2_analysis,
                        read2.len(),
                        config.hist_len,
                    );
                }
                local_hist
            },
        )
        .reduce(SparseReadDepthHist::default, |mut left, right| {
            merge_sparse_read_depth_hist(&mut left, right);
            left
        })
}

#[cfg(test)]
fn increment_hist_and_read_hist_from_pair_chunk(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    depth_hist: &mut [u64],
    read_hist: &mut ReadDepthHistogram,
    pairs: &[AnalysisPair],
) {
    let (chunk_depth_hist, chunk_read_hist) =
        sparse_hist_and_read_hist_from_pair_chunk(config, hist_counts, keep_filter_counts, pairs);
    merge_sparse_hist_into_dense(depth_hist, chunk_depth_hist);
    merge_sparse_read_depth_hist_into_dense(read_hist, chunk_read_hist);
}

fn sparse_hist_and_read_hist_from_pair_chunk(
    config: &Config,
    hist_counts: &dyn CountLookup,
    keep_filter_counts: Option<&dyn CountLookup>,
    pairs: &[AnalysisPair],
) -> (SparseHist, SparseReadDepthHist) {
    pairs
        .par_iter()
        .fold(
            || (SparseHist::default(), SparseReadDepthHist::default()),
            |mut local, (r1, r2, rand)| {
                if let Some(input_counts) = keep_filter_counts {
                    let decision = decide_pair(config, input_counts, r1, r2.as_ref(), *rand);
                    if decision.toss {
                        return local;
                    }
                }

                let analysis = analyze_pair(config, hist_counts, r1, r2.as_ref());
                increment_sparse_hist_from_analysis(&mut local.0, &analysis.read1, config.hist_len);
                increment_sparse_read_hist(
                    &mut local.1,
                    &analysis.read1,
                    r1.len(),
                    config.hist_len,
                );
                if let Some(read2_analysis) = &analysis.read2 {
                    increment_sparse_hist_from_analysis(
                        &mut local.0,
                        read2_analysis,
                        config.hist_len,
                    );
                    if let Some(read2) = r2.as_ref() {
                        increment_sparse_read_hist(
                            &mut local.1,
                            read2_analysis,
                            read2.len(),
                            config.hist_len,
                        );
                    }
                }
                local
            },
        )
        .reduce(
            || (SparseHist::default(), SparseReadDepthHist::default()),
            |mut left, right| {
                merge_sparse_hist(&mut left.0, right.0);
                merge_sparse_read_depth_hist(&mut left.1, right.1);
                left
            },
        )
}

fn merge_sparse_read_depth_hist(target: &mut SparseReadDepthHist, source: SparseReadDepthHist) {
    for (idx, (reads, bases)) in source {
        let entry = target.entry(idx).or_insert((0, 0));
        entry.0 += reads;
        entry.1 += bases;
    }
}

#[cfg(test)]
fn merge_sparse_read_depth_hist_into_dense(
    target: &mut ReadDepthHistogram,
    source: SparseReadDepthHist,
) {
    for (idx, (reads, bases)) in source {
        target.reads[idx] += reads;
        target.bases[idx] += bases;
    }
}

#[cfg(test)]
fn write_depth_hist(path: &Path, raw_hist: &[u64], config: &Config) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating histogram {}", path.display()))?;
    match config.hist_columns {
        1 => writeln!(writer, "#Unique_Kmers")?,
        2 => writeln!(writer, "#Depth\tUnique_Kmers")?,
        3 => writeln!(writer, "#Depth\tRaw_Count\tUnique_Kmers")?,
        _ => unreachable!("validated hist column count"),
    }

    let total_raw = raw_hist.iter().copied().fold(0u64, u64::saturating_add);
    let mut seen_raw = 0u64;
    let lim = raw_hist.len().saturating_sub(1);
    for depth in 0..lim {
        let raw = adjusted_depth_hist_raw(raw_hist, config.zero_bin, depth);
        seen_raw = seen_raw.saturating_add(raw);
        let unique = unique_from_raw(depth, raw);
        if config.print_zero_coverage || unique > 0 || config.hist_columns == 1 {
            write_hist_row(&mut writer, config.hist_columns, depth, raw, unique)?;
        }
        if seen_raw >= total_raw {
            break;
        }
    }

    let overflow_raw = (lim..raw_hist.len())
        .map(|depth| adjusted_depth_hist_raw(raw_hist, config.zero_bin, depth))
        .fold(0u64, u64::saturating_add);
    if overflow_raw > 0 {
        write_hist_row(
            &mut writer,
            config.hist_columns,
            lim,
            overflow_raw,
            unique_from_raw(lim, overflow_raw),
        )?;
    }
    writer.flush()?;
    Ok(())
}

fn write_sparse_depth_hist(
    path: &Path,
    raw_hist: &SparseHist,
    hist_len: usize,
    config: &Config,
) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating histogram {}", path.display()))?;
    match config.hist_columns {
        1 => writeln!(writer, "#Unique_Kmers")?,
        2 => writeln!(writer, "#Depth\tUnique_Kmers")?,
        3 => writeln!(writer, "#Depth\tRaw_Count\tUnique_Kmers")?,
        _ => unreachable!("validated hist column count"),
    }

    let hist_len = hist_len.max(1);
    let lim = hist_len.saturating_sub(1);
    let total_raw = raw_hist.values().copied().fold(0u64, u64::saturating_add);
    let mut seen_raw = 0u64;

    if config.print_zero_coverage || config.hist_columns == 1 {
        for depth in 0..lim {
            let raw = adjusted_sparse_depth_hist_raw(raw_hist, hist_len, config.zero_bin, depth);
            seen_raw = seen_raw.saturating_add(raw);
            write_hist_row(
                &mut writer,
                config.hist_columns,
                depth,
                raw,
                unique_from_raw(depth, raw),
            )?;
            if seen_raw >= total_raw {
                break;
            }
        }
    } else {
        let mut depths: Vec<usize> = raw_hist
            .iter()
            .filter_map(|(&depth, &raw)| {
                let mapped_depth = if !config.zero_bin && hist_len > 1 && depth == 0 {
                    1
                } else {
                    depth
                };
                (mapped_depth < lim && raw > 0).then_some(mapped_depth)
            })
            .collect();
        depths.sort_unstable();
        depths.dedup();
        for depth in depths {
            let raw = adjusted_sparse_depth_hist_raw(raw_hist, hist_len, config.zero_bin, depth);
            seen_raw = seen_raw.saturating_add(raw);
            let unique = unique_from_raw(depth, raw);
            if unique > 0 {
                write_hist_row(&mut writer, config.hist_columns, depth, raw, unique)?;
            }
            if seen_raw >= total_raw {
                break;
            }
        }
    }

    let mut overflow_depths: Vec<usize> = raw_hist
        .keys()
        .copied()
        .filter_map(|depth| {
            let mapped_depth = if !config.zero_bin && hist_len > 1 && depth == 0 {
                1
            } else {
                depth
            };
            (mapped_depth >= lim).then_some(mapped_depth)
        })
        .collect();
    overflow_depths.sort_unstable();
    overflow_depths.dedup();
    let overflow_raw = overflow_depths.into_iter().fold(0u64, |sum, depth| {
        sum.saturating_add(adjusted_sparse_depth_hist_raw(
            raw_hist,
            hist_len,
            config.zero_bin,
            depth,
        ))
    });
    if overflow_raw > 0 {
        write_hist_row(
            &mut writer,
            config.hist_columns,
            lim,
            overflow_raw,
            unique_from_raw(lim, overflow_raw),
        )?;
    }
    writer.flush()?;
    Ok(())
}

#[cfg(test)]
fn adjusted_depth_hist_raw(raw_hist: &[u64], zero_bin: bool, depth: usize) -> u64 {
    let raw = raw_hist.get(depth).copied().unwrap_or(0);
    if zero_bin || raw_hist.len() <= 1 {
        return raw;
    }
    match depth {
        0 => 0,
        1 => raw.saturating_add(raw_hist[0]),
        _ => raw,
    }
}

fn adjusted_sparse_depth_hist_raw(
    raw_hist: &SparseHist,
    hist_len: usize,
    zero_bin: bool,
    depth: usize,
) -> u64 {
    let raw = raw_hist.get(&depth).copied().unwrap_or(0);
    if zero_bin || hist_len <= 1 {
        return raw;
    }
    match depth {
        0 => 0,
        1 => raw.saturating_add(raw_hist.get(&0).copied().unwrap_or(0)),
        _ => raw,
    }
}

#[cfg(test)]
fn sparse_hist_to_dense(raw_hist: &SparseHist, hist_len: usize) -> Vec<u64> {
    let mut dense = vec![0u64; hist_len.max(1)];
    for (&depth, &raw) in raw_hist {
        let idx = depth.min(dense.len() - 1);
        dense[idx] = dense[idx].saturating_add(raw);
    }
    dense
}

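/// Densifies a sparse depth histogram for peak calling, truncating shortly
/// past the last nonzero bin (plus a `PEAK_COMPACT_ZERO_TAIL` margin) so
/// peak detection does not have to walk empty high-depth bins.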
fn sparse_hist_to_peak_dense(raw_hist: &SparseHist, hist_len: usize) -> Vec<u64> {
    let hist_len = hist_len.max(1);
    let last_index = hist_len - 1;
    let last_nonzero = raw_hist
        .iter()
        .filter_map(|(&depth, &raw)| (raw > 0).then_some(depth.min(last_index)))
        .max()
        .unwrap_or(0);
    let dense_len = hist_len.min(
        last_nonzero
            .saturating_add(PEAK_COMPACT_ZERO_TAIL)
            .saturating_add(1),
    );
    let mut dense = vec![0u64; dense_len.max(1)];
    for (&depth, &raw) in raw_hist {
        if raw == 0 {
            continue;
        }
        let idx = depth.min(last_index);
        if idx < dense.len() {
            dense[idx] = dense[idx].saturating_add(raw);
        } else {
            dense.resize(idx + 1, 0);
            dense[idx] = dense[idx].saturating_add(raw);
        }
    }
    dense
}

fn write_hist_row(
    writer: &mut Box<dyn Write>,
    columns: u8,
    depth: usize,
    raw: u64,
    unique: u64,
) -> Result<()> {
    match columns {
        1 => writeln!(writer, "{unique}")?,
        2 => writeln!(writer, "{depth}\t{unique}")?,
        3 => writeln!(writer, "{depth}\t{raw}\t{unique}")?,
        _ => unreachable!("validated hist column count"),
    }
    Ok(())
}

#[cfg(test)]
fn write_read_depth_hist(path: &Path, hist: &ReadDepthHistogram, config: &Config) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating read histogram {}", path.display()))?;
    writeln!(writer, "#Depth\tReads\tBases")?;

    let total_reads: u64 = hist.reads.iter().sum();
    let mut seen_reads = 0u64;
    let lim = hist.reads.len().saturating_sub(1);

    for depth in 0..lim {
        let reads = hist.reads[depth];
        let bases = hist.bases[depth];
        seen_reads += reads;
        if config.print_zero_coverage || bases > 0 {
            writeln!(writer, "{depth}\t{reads}\t{bases}")?;
        }
        if seen_reads >= total_reads {
            break;
        }
    }

    let overflow_reads: u64 = hist.reads.iter().skip(lim).sum();
    let overflow_bases: u64 = hist.bases.iter().skip(lim).sum();
    if overflow_reads > 0 || overflow_bases > 0 {
        writeln!(writer, "{lim}\t{overflow_reads}\t{overflow_bases}")?;
    }
    writer.flush()?;
    Ok(())
}

fn write_sparse_read_depth_hist(
    path: &Path,
    hist: &SparseReadDepthHist,
    hist_len: usize,
    config: &Config,
) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating read histogram {}", path.display()))?;
    writeln!(writer, "#Depth\tReads\tBases")?;

    let hist_len = hist_len.max(1);
    let lim = hist_len.saturating_sub(1);
    let total_reads = hist
        .values()
        .map(|(reads, _)| *reads)
        .fold(0u64, u64::saturating_add);
    let mut seen_reads = 0u64;

    if config.print_zero_coverage {
        for depth in 0..lim {
            let (reads, bases) = hist.get(&depth).copied().unwrap_or_default();
            seen_reads = seen_reads.saturating_add(reads);
            writeln!(writer, "{depth}\t{reads}\t{bases}")?;
            if seen_reads >= total_reads {
                break;
            }
        }
    } else {
        let mut depths: Vec<usize> = hist.keys().copied().filter(|depth| *depth < lim).collect();
        depths.sort_unstable();
        for depth in depths {
            let (reads, bases) = hist.get(&depth).copied().unwrap_or_default();
            seen_reads = seen_reads.saturating_add(reads);
            if bases > 0 {
                writeln!(writer, "{depth}\t{reads}\t{bases}")?;
            }
            if seen_reads >= total_reads {
                break;
            }
        }
    }

    let (overflow_reads, overflow_bases) = hist.iter().filter(|(depth, _)| **depth >= lim).fold(
        (0u64, 0u64),
        |(read_sum, base_sum), (_, (reads, bases))| {
            (
                read_sum.saturating_add(*reads),
                base_sum.saturating_add(*bases),
            )
        },
    );
    if overflow_reads > 0 || overflow_bases > 0 {
        writeln!(writer, "{lim}\t{overflow_reads}\t{overflow_bases}")?;
    }
    writer.flush()?;
    Ok(())
}

fn write_quality_hist(path: &Path, hist: &[u64], config: &Config) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating quality histogram {}", path.display()))?;
    writeln!(writer, "#Quality\tBases")?;

    let total_bases: u64 = hist.iter().sum();
    let mut seen_bases = 0u64;
    let lim = hist.len().saturating_sub(1);

    for (quality, bases) in hist.iter().copied().enumerate().take(lim) {
        seen_bases += bases;
        if config.print_zero_coverage || bases > 0 {
            writeln!(writer, "{quality}\t{bases}")?;
        }
        if seen_bases >= total_bases {
            break;
        }
    }

    let overflow_bases: u64 = hist.iter().skip(lim).sum();
    if overflow_bases > 0 {
        writeln!(writer, "{lim}\t{overflow_bases}")?;
    }
    writer.flush()?;
    Ok(())
}

fn write_quality_count_hist(
    path: &Path,
    first: &[u64],
    second: &[u64],
    paired: bool,
    config: &Config,
) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating quality-count histogram {}", path.display()))?;
    writeln!(
        writer,
        "#Quality\tcount1\tfraction1{}",
        if paired { "\tcount2\tfraction2" } else { "" }
    )?;
    write_paired_quality_count_rows(&mut writer, first, second, paired, config)?;
    writer.flush()?;
    Ok(())
}

fn write_average_quality_hist(
    path: &Path,
    first: &[u64],
    second: &[u64],
    paired: bool,
    config: &Config,
) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating average-quality histogram {}", path.display()))?;
    writeln!(
        writer,
        "#Quality\tcount1\tfraction1{}",
        if paired { "\tcount2\tfraction2" } else { "" }
    )?;
    write_paired_quality_count_rows(&mut writer, first, second, paired, config)?;
    writer.flush()?;
    Ok(())
}

fn write_paired_quality_count_rows(
    writer: &mut Box<dyn Write>,
    first: &[u64],
    second: &[u64],
    paired: bool,
    config: &Config,
) -> Result<()> {
    let total1: u64 = first.iter().sum();
    let total2: u64 = second.iter().sum();
    let mut remaining = total1 + if paired { total2 } else { 0 };
    let denom1 = total1.max(1) as f64;
    let denom2 = total2.max(1) as f64;

    for (quality, count1) in first.iter().copied().enumerate() {
        let count2 = second.get(quality).copied().unwrap_or(0);
        if count1 > 0 || (paired && count2 > 0) || config.print_zero_coverage {
            write!(writer, "{quality}\t{count1}\t{:.5}", count1 as f64 / denom1)?;
            if paired {
                write!(writer, "\t{count2}\t{:.5}", count2 as f64 / denom2)?;
            }
            writeln!(writer)?;
        }
        remaining = remaining.saturating_sub(count1 + if paired { count2 } else { 0 });
        if remaining == 0 && !config.print_zero_coverage {
            break;
        }
    }
    Ok(())
}

fn write_overall_base_quality_hist(path: &Path, hist: &[u64], config: &Config) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating overall base-quality histogram {}", path.display()))?;
    let median = percentile_histogram(hist, 0.5);
    let mean = average_histogram(hist);
    let stdev = stdev_histogram(hist, mean, 0);
    let mean30 = average_histogram_min(hist, 30);
    let stdev30 = stdev_histogram(hist, mean30, 30);
    writeln!(writer, "#Median\t{median}")?;
    writeln!(writer, "#Mean\t{mean:.3}")?;
    writeln!(writer, "#STDev\t{stdev:.3}")?;
    writeln!(writer, "#Mean_30\t{mean30:.3}")?;
    writeln!(writer, "#STDev_30\t{stdev30:.3}")?;
    writeln!(writer, "#Quality\tbases\tfraction")?;

    let total: u64 = hist.iter().sum();
    let denom = total.max(1) as f64;
    let mut remaining = total;
    for (quality, bases) in hist.iter().copied().enumerate() {
        if bases > 0 || config.print_zero_coverage {
            writeln!(writer, "{quality}\t{bases}\t{:.5}", bases as f64 / denom)?;
        }
        remaining = remaining.saturating_sub(bases);
        if remaining == 0 && !config.print_zero_coverage {
            break;
        }
    }
    writer.flush()?;
    Ok(())
}

fn write_base_quality_hist(
    path: &Path,
    hist: &QualitySideHistograms,
    config: &Config,
) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating base-quality histogram {}", path.display()))?;
    write!(
        writer,
        "#BaseNum\tcount_1\tmin_1\tmax_1\tmean_1\tQ1_1\tmed_1\tQ3_1\tLW_1\tRW_1"
    )?;
    if hist.paired {
        write!(
            writer,
            "\tcount_2\tmin_2\tmax_2\tmean_2\tQ1_2\tmed_2\tQ3_2\tLW_2\tRW_2"
        )?;
    }
    writeln!(writer)?;

    for pos in 0..hist.first_by_pos.len() {
        let sum1: u64 = hist.first_by_pos[pos].iter().sum();
        let sum2: u64 = hist.second_by_pos[pos].iter().sum();
        if sum1 == 0 && sum2 == 0 && !config.print_zero_coverage {
            break;
        }
        write!(writer, "{pos}")?;
        write_base_quality_summary(&mut writer, &hist.first_by_pos[pos])?;
        if hist.paired {
            write_base_quality_summary(&mut writer, &hist.second_by_pos[pos])?;
        }
        writeln!(writer)?;
    }
    writer.flush()?;
    Ok(())
}

fn write_base_quality_summary(writer: &mut Box<dyn Write>, hist: &[u64]) -> Result<()> {
    let count: u64 = hist.iter().sum();
    let min = min_histogram(hist);
    let max = max_histogram(hist);
    let mean = average_histogram(hist);
    let q1 = percentile_histogram(hist, 0.25);
    let med = percentile_histogram(hist, 0.5);
    let q3 = percentile_histogram(hist, 0.75);
    let left_whisker = percentile_histogram(hist, 0.02);
    let right_whisker = percentile_histogram(hist, 0.98);
    write!(
        writer,
        "\t{count}\t{min}\t{max}\t{mean:.2}\t{q1}\t{med}\t{q3}\t{left_whisker}\t{right_whisker}"
    )?;
    Ok(())
}

fn min_histogram(hist: &[u64]) -> usize {
    hist.iter().position(|count| *count > 0).unwrap_or_default()
}

fn max_histogram(hist: &[u64]) -> usize {
    hist.iter()
        .rposition(|count| *count > 0)
        .unwrap_or_default()
}

fn mode_histogram(hist: &[u64]) -> usize {
    hist.iter()
        .copied()
        .enumerate()
        .max_by_key(|(_, count)| *count)
        .map_or(0, |(idx, _)| idx)
}

fn percentile_histogram(hist: &[u64], percentile: f64) -> usize {
    let total: u64 = hist.iter().sum();
    if total == 0 {
        return 0;
    }
    let threshold = ((total as f64) * percentile).ceil().max(1.0) as u64;
    let mut seen = 0u64;
    for (idx, count) in hist.iter().copied().enumerate() {
        seen += count;
        if seen >= threshold {
            return idx;
        }
    }
    hist.len().saturating_sub(1)
}

fn average_histogram(hist: &[u64]) -> f64 {
    average_histogram_min(hist, 0)
}

fn average_histogram_min(hist: &[u64], min_quality: usize) -> f64 {
    let mut count = 0u64;
    let mut sum = 0u64;
    for (quality, bases) in hist.iter().copied().enumerate().skip(min_quality) {
        count += bases;
        sum += quality as u64 * bases;
    }
    if count == 0 {
        0.0
    } else {
        sum as f64 / count as f64
    }
}

fn stdev_histogram(hist: &[u64], mean: f64, min_quality: usize) -> f64 {
    let mut count = 0u64;
    let mut sum = 0.0;
    for (quality, bases) in hist.iter().copied().enumerate().skip(min_quality) {
        count += bases;
        let delta = quality as f64 - mean;
        sum += delta * delta * bases as f64;
    }
    if count == 0 {
        0.0
    } else {
        (sum / count as f64).sqrt()
    }
}

fn write_length_hist(path: &Path, hist: &ReadDepthHistogram, config: &Config) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating length histogram {}", path.display()))?;
    writeln!(writer, "#Length\tReads\tBases")?;

    let total_reads: u64 = hist.reads.iter().sum();
    let mut seen_reads = 0u64;
    let lim = hist.reads.len().saturating_sub(1);

    for len in 0..lim {
        let reads = hist.reads[len];
        let bases = hist.bases[len];
        seen_reads += reads;
        if config.print_zero_coverage || reads > 0 {
            writeln!(writer, "{len}\t{reads}\t{bases}")?;
        }
        if seen_reads >= total_reads {
            break;
        }
    }

    let overflow_reads: u64 = hist.reads.iter().skip(lim).sum();
    let overflow_bases: u64 = hist.bases.iter().skip(lim).sum();
    if overflow_reads > 0 || overflow_bases > 0 {
        writeln!(writer, "{lim}\t{overflow_reads}\t{overflow_bases}")?;
    }
    writer.flush()?;
    Ok(())
}

fn write_gc_hist(path: &Path, hist: &ReadDepthHistogram, config: &Config) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating GC histogram {}", path.display()))?;
    writeln!(writer, "#GC_Bin\tReads\tBases")?;

    let total_reads: u64 = hist.reads.iter().sum();
    let mut seen_reads = 0u64;
    for (bin, reads) in hist.reads.iter().copied().enumerate() {
        let bases = hist.bases[bin];
        seen_reads += reads;
        if config.print_zero_coverage || reads > 0 {
            writeln!(writer, "{bin}\t{reads}\t{bases}")?;
        }
        if seen_reads >= total_reads {
            break;
        }
    }
    writer.flush()?;
    Ok(())
}

fn write_base_content_hist(
    path: &Path,
    hist: &BaseContentHistogram,
    config: &Config,
) -> Result<()> {
    let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating base-content histogram {}", path.display()))?;
    writeln!(writer, "#Pos\tA\tC\tG\tT\tN")?;
    let first_rows = write_base_content_rows(&mut writer, &hist.first, 0, config)?;
    write_base_content_rows(&mut writer, &hist.second, first_rows, config)?;
    writer.flush()?;
    Ok(())
}

fn write_base_content_rows(
    writer: &mut Box<dyn Write>,
    hist: &[BaseCounts],
    offset: usize,
    config: &Config,
) -> Result<usize> {
    let rows = if config.print_zero_coverage {
        hist.len()
    } else {
        hist.iter()
            .rposition(|counts| counts.total() > 0)
            .map_or(0, |idx| idx + 1)
    };

    for (pos, counts) in hist.iter().copied().enumerate().take(rows) {
        let total = counts.total() as f64;
        let fraction = |value: u64| {
            if total == 0.0 {
                0.0
            } else {
                value as f64 / total
            }
        };
        writeln!(
            writer,
            "{}\t{:.5}\t{:.5}\t{:.5}\t{:.5}\t{:.5}",
            pos + offset,
            fraction(counts.a),
            fraction(counts.c),
            fraction(counts.g),
            fraction(counts.t),
            fraction(counts.n)
        )?;
    }
    Ok(rows)
}

10895fn write_entropy_hist(path: &Path, hist: &[u64], config: &Config) -> Result<()> {
10896 let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
10897 .with_context(|| format!("creating entropy histogram {}", path.display()))?;
10898 let bins = hist.len().saturating_sub(1).max(1);
10899 let mult = 1.0 / bins as f64;
10900 let mean = average_histogram(hist) * mult;
10901 let median = percentile_histogram(hist, 0.5) as f64 * mult;
10902 let mode = mode_histogram(hist) as f64 * mult;
10903 let stdev = stdev_histogram(hist, average_histogram(hist), 0) * mult;
10904
10905 writeln!(writer, "#Mean\t{mean:.6}")?;
10906 writeln!(writer, "#Median\t{median:.6}")?;
10907 writeln!(writer, "#Mode\t{mode:.6}")?;
10908 writeln!(writer, "#STDev\t{stdev:.6}")?;
10909 writeln!(writer, "#Value\tCount")?;
10910
10911 for (idx, count) in hist.iter().copied().enumerate() {
10912 if config.print_zero_coverage || count > 0 {
10913 writeln!(writer, "{:.4}\t{count}", idx as f64 * mult)?;
10914 }
10915 }
10916 writer.flush()?;
10917 Ok(())
10918}
10919
10920fn write_identity_hist(path: &Path, hist: &ReadDepthHistogram, config: &Config) -> Result<()> {
10921 let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
10922 .with_context(|| format!("creating identity histogram {}", path.display()))?;
10923 let bins = hist.reads.len().saturating_sub(1).max(1);
10924 let mult = 100.0 / bins as f64;
10925 let mean_reads = average_histogram(&hist.reads) * mult;
10926 let mean_bases = average_histogram(&hist.bases) * mult;
10927 let median_reads = percentile_histogram(&hist.reads, 0.5) as f64 * mult;
10928 let median_bases = percentile_histogram(&hist.bases, 0.5) as f64 * mult;
10929 let mode_reads = mode_histogram(&hist.reads) as f64 * mult;
10930 let mode_bases = mode_histogram(&hist.bases) as f64 * mult;
10931 let stdev_reads = stdev_histogram(&hist.reads, average_histogram(&hist.reads), 0) * mult;
10932 let stdev_bases = stdev_histogram(&hist.bases, average_histogram(&hist.bases), 0) * mult;
10933
10934 writeln!(writer, "#Mean_reads\t{mean_reads:.3}")?;
10935 writeln!(writer, "#Mean_bases\t{mean_bases:.3}")?;
10936 writeln!(writer, "#Median_reads\t{median_reads:.0}")?;
10937 writeln!(writer, "#Median_bases\t{median_bases:.0}")?;
10938 writeln!(writer, "#Mode_reads\t{mode_reads:.0}")?;
10939 writeln!(writer, "#Mode_bases\t{mode_bases:.0}")?;
10940 writeln!(writer, "#STDev_reads\t{stdev_reads:.3}")?;
10941 writeln!(writer, "#STDev_bases\t{stdev_bases:.3}")?;
10942 writeln!(writer, "#Identity\tReads\tBases")?;
10943
10944 for (idx, reads) in hist.reads.iter().copied().enumerate() {
10945 let bases = hist.bases[idx];
10946 if config.print_zero_coverage || reads > 0 || bases > 0 {
10947 writeln!(writer, "{:.1}\t{reads}\t{bases}", idx as f64 * mult)?;
10948 }
10949 }
10950 writer.flush()?;
10951 Ok(())
10952}
10953
10954fn emit_alignment_fallback_side_outputs(
10955 config: &Config,
10956 hist: &AlignmentFallbackHistograms,
10957) -> Result<()> {
10958 if let Some(path) = &config.match_hist_out {
10959 write_match_fallback_hist(path, hist, config)?;
10960 }
10961 if let Some(path) = &config.insert_hist_out {
10962 write_insert_fallback_hist(path, hist, config)?;
10963 }
10964 if let Some(path) = &config.quality_accuracy_hist_out {
10965 write_quality_accuracy_fallback_hist(path, hist, config)?;
10966 }
10967 if let Some(path) = &config.indel_hist_out {
10968 write_indel_fallback_hist(path, config)?;
10969 }
10970 if let Some(path) = &config.error_hist_out {
10971 write_error_fallback_hist(path, hist, config)?;
10972 }
10973 Ok(())
10974}
10975
10976fn write_match_fallback_hist(
10977 path: &Path,
10978 hist: &AlignmentFallbackHistograms,
10979 config: &Config,
10980) -> Result<()> {
10981 let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
10982 .with_context(|| format!("creating match histogram {}", path.display()))?;
10983 if hist.paired {
10984 writeln!(
10985 writer,
10986 "#BaseNum\tMatch1\tSub1\tDel1\tIns1\tN1\tOther1\tMatch2\tSub2\tDel2\tIns2\tN2\tOther2"
10987 )?;
10988 } else {
10989 writeln!(writer, "#BaseNum\tMatch1\tSub1\tDel1\tIns1\tN1\tOther1")?;
10990 }
10991
10992 for pos in 0..hist.first_match.len() {
10993 let first = hist.first_match[pos];
10994 let second = hist.second_match[pos];
10995 if first.matches + first.n + second.matches + second.n == 0 && !config.print_zero_coverage {
10996 break;
10997 }
10998 write!(writer, "{}", pos + 1)?;
10999 write_match_fallback_columns(&mut writer, first)?;
11000 if hist.paired {
11001 write_match_fallback_columns(&mut writer, second)?;
11002 }
11003 writeln!(writer)?;
11004 }
11005 writer.flush()?;
11006 Ok(())
11007}
11008
11009fn write_match_fallback_columns(writer: &mut Box<dyn Write>, counts: MatchCounts) -> Result<()> {
11010 let total = (counts.matches + counts.n).max(1) as f64;
11011 write!(
11012 writer,
11013 "\t{:.5}\t0.00000\t0.00000\t0.00000\t{:.5}\t0.00000",
11014 counts.matches as f64 / total,
11015 counts.n as f64 / total
11016 )?;
11017 Ok(())
11018}
11019
11020fn write_insert_fallback_hist(
11021 path: &Path,
11022 hist: &AlignmentFallbackHistograms,
11023 config: &Config,
11024) -> Result<()> {
11025 let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
11026 .with_context(|| format!("creating insert-size histogram {}", path.display()))?;
11027 let percent = if hist.read_count == 0 {
11028 0.0
11029 } else {
11030 (hist.pair_count * 2) as f64 * 100.0 / hist.read_count as f64
11031 };
11032 writeln!(writer, "#Mean\t0.000")?;
11033 writeln!(writer, "#Median\t0")?;
11034 writeln!(writer, "#Mode\t0")?;
11035 writeln!(writer, "#STDev\t0.000")?;
11036 writeln!(writer, "#PercentOfPairs\t{percent:.3}")?;
11037 writeln!(writer, "#InsertSize\tCount")?;
11038 writer.flush()?;
11039 Ok(())
11040}
11041
11042fn write_quality_accuracy_fallback_hist(
11043 path: &Path,
11044 hist: &AlignmentFallbackHistograms,
11045 config: &Config,
11046) -> Result<()> {
11047 let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
11048 .with_context(|| format!("creating quality-accuracy histogram {}", path.display()))?;
11049 writeln!(writer, "#Deviation\t0.000")?;
11050 writeln!(writer, "#DeviationSub\t0.000")?;
11051 writeln!(writer, "#Avg_STDev\t0.000")?;
11052 writeln!(writer, "#Diversity\t0.000")?;
11053 writeln!(writer, "#Entropy\t0.000")?;
11054 writeln!(
11055 writer,
11056 "#Quality\tMatch\tSub\tIns\tDel\tTrueQuality\tTrueQualitySub"
11057 )?;
11058
11059 let mut remaining: u64 = hist.quality_match.iter().sum();
11060 for (quality, matches) in hist.quality_match.iter().copied().enumerate() {
11061 if matches > 0 || config.print_zero_coverage {
11062 writeln!(writer, "{quality}\t{matches}\t0\t0\t0\t\t")?;
11063 }
11064 remaining = remaining.saturating_sub(matches);
11065 if remaining == 0 && !config.print_zero_coverage {
11066 break;
11067 }
11068 }
11069 writer.flush()?;
11070 Ok(())
11071}
11072
11073fn write_indel_fallback_hist(path: &Path, config: &Config) -> Result<()> {
11074 let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
11075 .with_context(|| format!("creating indel histogram {}", path.display()))?;
11076 writeln!(writer, "#Length\tDeletions\tInsertions")?;
11077 if config.print_zero_coverage {
11078 writeln!(writer, "0\t0\t0")?;
11079 }
11080 writer.flush()?;
11081 Ok(())
11082}
11083
11084fn write_error_fallback_hist(
11085 path: &Path,
11086 hist: &AlignmentFallbackHistograms,
11087 config: &Config,
11088) -> Result<()> {
11089 let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
11090 .with_context(|| format!("creating error histogram {}", path.display()))?;
11091 writeln!(writer, "#Errors\tCount")?;
11092 if hist.read_count > 0 || config.print_zero_coverage {
11093 writeln!(writer, "0\t{}", hist.read_count)?;
11094 }
11095 writer.flush()?;
11096 Ok(())
11097}
11098
11099fn write_barcode_stats(
11100 path: &Path,
11101 barcodes: &BTreeMap<String, u64>,
11102 config: &Config,
11103) -> Result<()> {
11104 let mut writer = crate::seqio::create_output(path, config.overwrite || config.append)
11105 .with_context(|| format!("creating barcode stats {}", path.display()))?;
11106 let total: u64 = barcodes.values().copied().sum();
11107 writeln!(writer, "#Reads\t{total}")?;
11108 writeln!(writer, "#Barcodes\t{}", barcodes.len())?;
11109
11110 let mut sorted: Vec<_> = barcodes.iter().collect();
11111 sorted.sort_by(|(left_name, left_count), (right_name, right_count)| {
11112 right_count
11113 .cmp(left_count)
11114 .then_with(|| left_name.cmp(right_name))
11115 });
11116 for (barcode, count) in sorted {
11117 writeln!(writer, "{barcode}\t{count}")?;
11118 }
11119 writer.flush()?;
11120 Ok(())
11121}
11122
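/// Estimates unique k-mers from a raw occurrence count using rounding
/// division by `depth`; a depth of zero passes `raw` through unchanged to
/// avoid dividing by zero.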
11123fn unique_from_raw(depth: usize, raw: u64) -> u64 {
11124 if depth == 0 {
11125 raw
11126 } else {
11127 (raw + (depth as u64 / 2)) / depth as u64
11128 }
11129}
11130
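/// Truncated index at `cov_last * (1.0 - percentile)`; for example,
/// `cov_last = 100` with `percentile = 0.9` yields index 10.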
11131fn percentile_index(cov_last: usize, percentile: f64) -> usize {
11132 ((cov_last as f64) * (1.0 - percentile)) as usize
11133}
11134
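/// Maps a precomputed uniform draw in `[0, 1)` onto the 1-based buckets
/// `1..=depth`, so identical draws always land in identical buckets; `None`
/// selects bucket 1, and the final clamp keeps a draw of exactly 1.0 in
/// range.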
11135fn deterministic_coin(rand: Option<f64>, depth: u64) -> u64 {
11136 debug_assert!(depth > 0);
11137 (((rand.unwrap_or(0.0) * depth as f64) as u64) + 1).min(depth)
11138}
11139
11140fn non_negative_depth(depth: i64) -> Option<u64> {
11141 u64::try_from(depth).ok()
11142}
11143
11144fn depth_below_min(depth: Option<u64>, min_depth: u64) -> bool {
11145 depth.is_none_or(|depth| depth < min_depth)
11146}
11147
11148fn u64_to_i64_saturating(value: u64) -> i64 {
11149 i64::try_from(value).unwrap_or(i64::MAX)
11150}
11151
11152fn min_option(a: Option<u64>, b: Option<u64>) -> Option<u64> {
11153 match (a, b) {
11154 (Some(a), Some(b)) => Some(a.min(b)),
11155 (Some(a), None) => Some(a),
11156 (None, Some(b)) => Some(b),
11157 (None, None) => None,
11158 }
11159}
11160
11161fn max_option(a: Option<u64>, b: Option<u64>) -> Option<u64> {
11162 match (a, b) {
11163 (Some(a), Some(b)) => Some(a.max(b)),
11164 (Some(a), None) => Some(a),
11165 (None, Some(b)) => Some(b),
11166 (None, None) => None,
11167 }
11168}
11169
11170fn limit_reached(limit: Option<u64>, reads_seen: u64) -> bool {
11171 limit.is_some_and(|limit| reads_seen >= limit)
11172}
11173
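/// Interprets `in1` (and `in2`, when present) as comma-separated input file
/// lists. Paths that exist on disk are never split, so filenames containing
/// literal commas keep working, and interleaved mode or single-entry lists
/// fall back to normal handling.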
11174fn primary_input_lists(config: &Config) -> Option<InputLists> {
11175 if config.interleaved {
11176 return None;
11177 }
11178 let input = config.in1.as_ref()?;
11179 if input.exists() {
11180 return None;
11181 }
11182 let text = input.to_string_lossy();
11183 if !text.contains(',') {
11184 return None;
11185 }
11186 let first = split_path_list(&text);
11187 if first.len() <= 1 {
11188 return None;
11189 }
11190 let second = config.in2.as_ref().map(|path| {
11191 let text = path.to_string_lossy();
11192 split_path_list(&text)
11193 });
11194 Some(InputLists { first, second })
11195}
11196
11197fn split_path_list(value: &str) -> Vec<PathBuf> {
11198 value
11199 .split(',')
11200 .filter_map(|part| {
11201 let trimmed = part.trim();
11202 (!trimmed.is_empty()).then(|| PathBuf::from(trimmed))
11203 })
11204 .collect()
11205}
11206
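/// Bundles the CLI's base-normalization and quality-handling flags into a
/// single `SequenceSettings` value.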
11207fn sequence_settings(config: &Config) -> SequenceSettings {
11208 SequenceSettings {
11209 bases: BaseSettings {
11210 u_to_t: config.u_to_t,
11211 to_upper_case: config.to_upper_case,
11212 lower_case_to_n: config.lower_case_to_n,
11213 dot_dash_x_to_n: config.dot_dash_x_to_n,
11214 iupac_to_n: config.iupac_to_n,
11215 fix_junk_and_iupac: config.fix_junk_and_iupac,
11216 junk_mode: config.junk_mode,
11217 },
11218 qualities: QualitySettings {
11219 input_offset: config.quality_in_offset,
11220 min_called: config.min_called_quality,
11221 max_called: config.max_called_quality,
11222 change_quality: config.change_quality,
11223 },
11224 }
11225}
11226
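/// Opens an optional sequence writer: an absent path maps to `Ok(None)`,
/// while any open failure for a present path is propagated.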
11227fn open_sequence_writer(
11228 path: Option<&Path>,
11229 overwrite: bool,
11230 append: bool,
11231 quality_out_offset: u8,
11232 fake_quality: u8,
11233 fasta_wrap: usize,
11234 gzip_threads: Option<usize>,
11235) -> Result<Option<SequenceWriter>> {
11236 path.map(|path| {
11237 SequenceWriter::from_path_with_append_and_gzip_threads(
11238 path,
11239 overwrite,
11240 append,
11241 quality_out_offset,
11242 fake_quality,
11243 fasta_wrap,
11244 gzip_threads,
11245 )
11246 })
11247 .transpose()
11248}
11249
11250#[cfg(test)]
11251mod tests {
11252 use super::*;
11253 use crate::kmer::kmers_for_record;
11254 use crate::seqio::SequenceRecord;
11255 use std::fs;
11256
11257 fn record(id: &str, bases: &[u8]) -> SequenceRecord {
11258 SequenceRecord {
11259 id: id.to_string(),
11260 numeric_id: 0,
11261 bases: bases.to_vec(),
11262 qualities: Some(vec![b'I'; bases.len()]),
11263 }
11264 }
11265
11266 fn quality_record(id: &str, bases: &[u8], qualities: &[u8]) -> SequenceRecord {
11267 SequenceRecord {
11268 id: id.to_string(),
11269 numeric_id: 0,
11270 bases: bases.to_vec(),
11271 qualities: Some(qualities.to_vec()),
11272 }
11273 }
11274
11275 #[test]
11276 fn gzip_threads_are_split_across_concurrent_gzip_streams() {
11277 assert_eq!(gzip_threads_for_streams(None, 2), None);
11278 assert_eq!(gzip_threads_for_streams(Some(1), 2), Some(1));
11279 assert_eq!(gzip_threads_for_streams(Some(8), 0), Some(8));
11280 assert_eq!(gzip_threads_for_streams(Some(8), 1), Some(8));
11281 assert_eq!(gzip_threads_for_streams(Some(8), 2), Some(4));
11282 assert_eq!(gzip_threads_for_streams(Some(8), 3), Some(2));
11283 assert_eq!(gzip_threads_for_streams(Some(2), 4), Some(1));
11284
11285 assert_eq!(
11286 gzip_threads_for_paths(
11287 Some(8),
11288 [
11289 Some(Path::new("reads_R1.fq.gz")),
11290 Some(Path::new("reads_R2.fq.gz")),
11291 ],
11292 ),
11293 Some(4)
11294 );
11295 assert_eq!(
11296 gzip_threads_for_paths(
11297 Some(8),
11298 [
11299 Some(Path::new("reads_R1.fq")),
11300 Some(Path::new("reads_R2.fq.gz")),
11301 ],
11302 ),
11303 Some(8)
11304 );
11305 }
11306
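    // Illustrative checks for the small Option helpers defined above; each
    // assertion follows directly from the corresponding match arm or
    // combinator.
    #[test]
    fn option_depth_helpers_combine_and_compare_as_documented() {
        assert_eq!(min_option(Some(3), Some(5)), Some(3));
        assert_eq!(min_option(None, Some(5)), Some(5));
        assert_eq!(max_option(Some(3), None), Some(3));
        assert_eq!(max_option(None, None), None);
        assert!(depth_below_min(None, 1));
        assert!(depth_below_min(Some(2), 3));
        assert!(!depth_below_min(Some(3), 3));
        assert!(limit_reached(Some(10), 10));
        assert!(!limit_reached(None, u64::MAX));
        assert_eq!(non_negative_depth(-1), None);
        assert_eq!(non_negative_depth(4), Some(4));
        assert_eq!(u64_to_i64_saturating(u64::MAX), i64::MAX);
    }
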
11307 #[test]
11308 fn write_depth_hist_folds_zero_bin_without_cloning_input_hist() {
11309 let dir = tempfile::tempdir().unwrap();
11310 let path = dir.path().join("hist.tsv");
11311 let hist = vec![5, 7, 4];
11312 let config = Config {
11313 overwrite: true,
11314 ..Config::default()
11315 };
11316
11317 write_depth_hist(&path, &hist, &config).unwrap();
11318
11319 assert_eq!(hist, vec![5, 7, 4]);
11320 assert_eq!(
11321 fs::read_to_string(path).unwrap(),
11322 "#Depth\tRaw_Count\tUnique_Kmers\n1\t12\t12\n2\t4\t2\n"
11323 );
11324 }
11325
11326 #[test]
11327 fn write_depth_hist_preserves_zero_bin_when_requested() {
11328 let dir = tempfile::tempdir().unwrap();
11329 let path = dir.path().join("hist.tsv");
11330 let hist = vec![5, 7, 4];
11331 let config = Config {
11332 overwrite: true,
11333 zero_bin: true,
11334 ..Config::default()
11335 };
11336
11337 write_depth_hist(&path, &hist, &config).unwrap();
11338
11339 assert_eq!(
11340 fs::read_to_string(path).unwrap(),
11341 "#Depth\tRaw_Count\tUnique_Kmers\n0\t5\t5\n1\t7\t7\n2\t4\t2\n"
11342 );
11343 }
11344
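    // Worked example for `unique_from_raw`: the raw count is divided by the
    // depth with rounding, so 7 observations at depth 2 report 4 unique
    // k-mers, and a depth of zero passes the raw count through.
    #[test]
    fn unique_from_raw_divides_with_rounding() {
        assert_eq!(unique_from_raw(0, 9), 9);
        assert_eq!(unique_from_raw(1, 9), 9);
        assert_eq!(unique_from_raw(2, 7), 4);
        assert_eq!(unique_from_raw(3, 7), 2);
    }
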
11345 #[test]
11346 fn write_sparse_depth_hist_matches_dense_output() {
11347 let dir = tempfile::tempdir().unwrap();
11348 let dense_path = dir.path().join("dense.hist.tsv");
11349 let sparse_path = dir.path().join("sparse.hist.tsv");
11350 let hist = vec![5, 7, 4];
11351 let sparse = SparseHist::from_iter([(0, 5), (1, 7), (2, 4)]);
11352 let config = Config {
11353 overwrite: true,
11354 ..Config::default()
11355 };
11356
11357 write_depth_hist(&dense_path, &hist, &config).unwrap();
11358 write_sparse_depth_hist(&sparse_path, &sparse, hist.len(), &config).unwrap();
11359
11360 assert_eq!(
11361 fs::read_to_string(sparse_path).unwrap(),
11362 fs::read_to_string(dense_path).unwrap()
11363 );
11364 }
11365
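    // Worked example for `deterministic_coin`: exact unit-interval draws map
    // onto the 1-based buckets 1..=depth, with the clamp catching 1.0.
    #[test]
    fn deterministic_coin_buckets_unit_interval_draws() {
        assert_eq!(deterministic_coin(None, 5), 1);
        assert_eq!(deterministic_coin(Some(0.0), 5), 1);
        assert_eq!(deterministic_coin(Some(0.5), 5), 3);
        assert_eq!(deterministic_coin(Some(1.0), 5), 5);
    }
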
11366 #[test]
11367 fn write_sparse_depth_hist_matches_dense_zero_coverage_columns_one() {
11368 let dir = tempfile::tempdir().unwrap();
11369 let dense_path = dir.path().join("dense.hist.tsv");
11370 let sparse_path = dir.path().join("sparse.hist.tsv");
11371 let hist = vec![0, 0, 6, 0, 4];
11372 let sparse = SparseHist::from_iter([(2, 6), (4, 4)]);
11373 let config = Config {
11374 overwrite: true,
11375 hist_columns: 1,
11376 print_zero_coverage: true,
11377 ..Config::default()
11378 };
11379
11380 write_depth_hist(&dense_path, &hist, &config).unwrap();
11381 write_sparse_depth_hist(&sparse_path, &sparse, hist.len(), &config).unwrap();
11382
11383 assert_eq!(
11384 fs::read_to_string(sparse_path).unwrap(),
11385 fs::read_to_string(dense_path).unwrap()
11386 );
11387 }
11388
11389 #[test]
11390 fn output_counts_sparse_depth_hist_matches_dense_hist() {
11391 let hist_len = 5;
11392 let mut exact = CountMap::default();
11393 exact.insert(KmerKey::Short(1), 1);
11394 exact.insert(KmerKey::Short(2), 3);
11395 exact.insert(KmerKey::Short(3), 9);
11396 let exact = OutputCounts::Exact(exact);
11397 assert_eq!(
11398 sparse_hist_to_dense(&exact.sparse_depth_hist(hist_len), hist_len),
11399 exact.depth_hist(hist_len)
11400 );
11401
11402 let mut packed = PackedCountMinSketch::new(8, 1, 4).unwrap();
11403 packed.set_cell(0, 1);
11404 packed.set_cell(1, 2);
11405 packed.set_cell(2, 9);
11406 let packed = OutputCounts::Sketch(packed);
11407 assert_eq!(
11408 sparse_hist_to_dense(&packed.sparse_depth_hist(hist_len), hist_len),
11409 packed.depth_hist(hist_len)
11410 );
11411
11412 let atomic = AtomicCountMinSketch::new(64, 1).unwrap();
11413 atomic.add_key_count(&KmerKey::Short(7), 2);
11414 atomic.add_key_count(&KmerKey::Short(11), 4);
11415 atomic.add_key_count(&KmerKey::Short(13), 9);
11416 let atomic = OutputCounts::AtomicSketch(atomic);
11417 assert_eq!(
11418 sparse_hist_to_dense(&atomic.sparse_depth_hist(hist_len), hist_len),
11419 atomic.depth_hist(hist_len)
11420 );
11421 }
11422
11423 #[test]
11424 fn sparse_peak_dense_trims_trailing_zero_histlen_without_changing_peaks() {
11425 let dir = tempfile::tempdir().unwrap();
11426 let dense_path = dir.path().join("dense.peaks.tsv");
11427 let compact_path = dir.path().join("compact.peaks.tsv");
11428 let hist_len = 10_000;
11429 let mut dense = vec![0u64; hist_len];
11430 dense[18] = 180;
11431 dense[19] = 380;
11432 dense[20] = 720;
11433 dense[21] = 380;
11434 dense[22] = 180;
11435 let sparse = SparseHist::from_iter(
11436 dense
11437 .iter()
11438 .copied()
11439 .enumerate()
11440 .filter_map(|(depth, raw)| (raw > 0).then_some((depth, raw))),
11441 );
11442 let compact = sparse_hist_to_peak_dense(&sparse, hist_len);
11443 let config = Config {
11444 overwrite: true,
11445 k: 5,
11446 peak_min_height: 1,
11447 peak_min_volume: 1,
11448 peak_min_width: 1,
11449 peak_min_peak: 1,
11450 peak_max_peak: 100,
11451 peak_max_count: 8,
11452 ..Config::default()
11453 };
11454
11455 assert!(compact.len() < 128);
11456 write_peaks(&dense_path, &dense, &config).unwrap();
11457 write_peaks(&compact_path, &compact, &config).unwrap();
11458
11459 assert_eq!(
11460 fs::read_to_string(compact_path).unwrap(),
11461 fs::read_to_string(dense_path).unwrap()
11462 );
11463 }
11464
11465 #[test]
11466 fn write_sparse_read_depth_hist_matches_dense_output() {
11467 let dir = tempfile::tempdir().unwrap();
11468 let dense_path = dir.path().join("dense.rhist.tsv");
11469 let sparse_path = dir.path().join("sparse.rhist.tsv");
11470 let mut dense = ReadDepthHistogram::new(4);
11471 dense.reads[0] = 5;
11472 dense.bases[0] = 500;
11473 dense.reads[1] = 7;
11474 dense.bases[1] = 700;
11475 dense.reads[3] = 4;
11476 dense.bases[3] = 400;
11477 let mut sparse = SparseReadDepthHist::default();
11478 sparse.insert(0, (5, 500));
11479 sparse.insert(1, (7, 700));
11480 sparse.insert(3, (4, 400));
11481 let config = Config {
11482 overwrite: true,
11483 ..Config::default()
11484 };
11485
11486 write_read_depth_hist(&dense_path, &dense, &config).unwrap();
11487 write_sparse_read_depth_hist(&sparse_path, &sparse, 4, &config).unwrap();
11488
11489 assert_eq!(
11490 fs::read_to_string(sparse_path).unwrap(),
11491 fs::read_to_string(dense_path).unwrap()
11492 );
11493 }
11494
11495 #[test]
11496 fn write_sparse_read_depth_hist_streams_zero_coverage_without_dense_histogram() {
11497 let dir = tempfile::tempdir().unwrap();
11498 let path = dir.path().join("sparse.rhist.tsv");
11499 let mut sparse = SparseReadDepthHist::default();
11500 sparse.insert(2, (1, 8));
11501 let config = Config {
11502 overwrite: true,
11503 print_zero_coverage: true,
11504 ..Config::default()
11505 };
11506
11507 write_sparse_read_depth_hist(&path, &sparse, 8, &config).unwrap();
11508
11509 assert_eq!(
11510 fs::read_to_string(path).unwrap(),
11511 "#Depth\tReads\tBases\n0\t0\t0\n1\t0\t0\n2\t1\t8\n"
11512 );
11513 }
11514
11515 #[test]
11516 fn output_gzip_threads_are_split_across_all_active_output_streams() {
11517 fn plan(first: Option<&str>, second: Option<&str>) -> OutputPathPlan {
11518 OutputPathPlan {
11519 pairs: vec![OutputPathPair {
11520 first: first.map(PathBuf::from),
11521 second: second.map(PathBuf::from),
11522 }],
11523 fanout: false,
11524 }
11525 }
11526
11527 let keep = plan(Some("keep1.fq.gz"), Some("keep2.fq.gz"));
11528 let toss = plan(Some("toss1.fq.gz"), Some("toss2.fq.gz"));
11529 let low = plan(Some("low.fq.gz"), None);
11530 let mid = plan(Some("mid.fq"), None);
11531 let high = plan(None, None);
11532 let uncorrected = plan(Some("uncorrected1.fq.gz"), Some("uncorrected2.fq.gz"));
11533
11534 assert_eq!(
11535 output_gzip_threads_for_plans(
11536 Some(8),
11537 [&keep, &toss, &low, &mid, &high, &uncorrected],
11538 0
11539 )
11540 .unwrap(),
11541 Some(1)
11542 );
11543
11544 assert_eq!(
11545 output_gzip_threads_for_plans(Some(8), [&keep, &toss], 0).unwrap(),
11546 Some(2)
11547 );
11548 }
11549
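    // Illustrative checks for the comma-list input handling above. The
    // nonexistent file names are assumptions of this sketch: `in1` only
    // splits when the combined path does not exist on disk.
    #[test]
    fn split_path_list_trims_entries_and_drops_empties() {
        assert_eq!(
            split_path_list(" a.fq ,b.fq,, c.fq"),
            vec![
                PathBuf::from("a.fq"),
                PathBuf::from("b.fq"),
                PathBuf::from("c.fq")
            ]
        );
        assert!(split_path_list(" , ").is_empty());
    }

    #[test]
    fn primary_input_lists_splits_missing_comma_separated_in1() {
        let config = Config {
            in1: Some(PathBuf::from("missing_r1_a.fq,missing_r1_b.fq")),
            ..Config::default()
        };
        let lists = primary_input_lists(&config).unwrap();
        assert_eq!(
            lists.first,
            vec![
                PathBuf::from("missing_r1_a.fq"),
                PathBuf::from("missing_r1_b.fq")
            ]
        );
        assert!(lists.second.is_none());
    }
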
11550 fn write_fastq(path: &Path, records: &[(&str, &[u8], &[u8])]) {
11551 let mut text = Vec::new();
11552 for (id, bases, qualities) in records {
11553 text.extend_from_slice(b"@");
11554 text.extend_from_slice(id.as_bytes());
11555 text.extend_from_slice(b"\n");
11556 text.extend_from_slice(bases);
11557 text.extend_from_slice(b"\n+\n");
11558 text.extend_from_slice(qualities);
11559 text.extend_from_slice(b"\n");
11560 }
11561 fs::write(path, text).unwrap();
11562 }
11563
11564 fn write_repeated_fastq(
11565 path: &Path,
11566 prefix: &str,
11567 bases: &[u8],
11568 qualities: &[u8],
11569 count: usize,
11570 ) {
11571 let mut text = Vec::new();
11572 for index in 1..=count {
11573 text.extend_from_slice(b"@");
11574 text.extend_from_slice(format!("{prefix}{index}").as_bytes());
11575 text.extend_from_slice(b"\n");
11576 text.extend_from_slice(bases);
11577 text.extend_from_slice(b"\n+\n");
11578 text.extend_from_slice(qualities);
11579 text.extend_from_slice(b"\n");
11580 }
11581 fs::write(path, text).unwrap();
11582 }
11583
11584 #[test]
11585 fn exact_counts_remove_duplicate_kmers_per_read() {
11586 let config = Config {
11587 k: 3,
11588 min_quality: 0,
11589 min_prob: 0.0,
11590 ..Config::default()
11591 };
11592 let mut counts = CountMap::default();
11593 increment_pair_counts(&config, &mut counts, &record("r1", b"AAAAAA"), None);
11594 assert_eq!(counts.values().copied().sum::<u64>(), 1);
11595 }
11596
11597 #[test]
11598 fn exact_counts_keep_duplicate_long_kmers_like_java_bbnorm() {
11599 let config = Config {
11600 k: 40,
11601 min_quality: 0,
11602 min_prob: 0.0,
11603 ..Config::default()
11604 };
11605 let mut counts = CountMap::default();
11606 let record = record("r1", b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA");
11607 let kmers = kmers_for_record(&record, &config);
11608 assert!(kmers.len() > 1);
11609 assert!(kmers.windows(2).all(|pair| pair[0] == pair[1]));
11610
11611 increment_pair_counts(&config, &mut counts, &record, None);
11612
11613 assert_eq!(counts.len(), 1);
11614 assert_eq!(counts.values().copied().sum::<u64>(), kmers.len() as u64);
11615 }
11616
11617 #[test]
11618 fn constrained_count_min_inflates_colliding_counts() {
11619 let config = Config {
11620 count_min: crate::cli::CountMinSettings {
11621 cells: Some(1),
11622 hashes: Some(2),
11623 bits: Some(8),
11624 memory_bytes: None,
11625 },
11626 ..Config::default()
11627 };
11628 let mut counts = CountMap::default();
11629 counts.insert(KmerKey::Short(7), 2);
11630 counts.insert(KmerKey::Short(11), 5);
11631
11632 apply_count_min_collision_estimates(&config, &mut counts);
11633
11634 assert_eq!(counts.get(&KmerKey::Short(7)), Some(&7));
11635 assert_eq!(counts.get(&KmerKey::Short(11)), Some(&7));
11636 }
11637
11638 #[test]
11639 fn constrained_count_min_honors_cell_bit_saturation() {
11640 let config = Config {
11641 count_min: crate::cli::CountMinSettings {
11642 cells: Some(1),
11643 hashes: Some(1),
11644 bits: Some(2),
11645 memory_bytes: None,
11646 },
11647 ..Config::default()
11648 };
11649 let mut counts = CountMap::default();
11650 counts.insert(KmerKey::Short(7), 2);
11651 counts.insert(KmerKey::Short(11), 5);
11652
11653 apply_count_min_collision_estimates(&config, &mut counts);
11654
11655 assert_eq!(counts.get(&KmerKey::Short(7)), Some(&3));
11656 assert_eq!(counts.get(&KmerKey::Short(11)), Some(&3));
11657 }
11658
11659 #[test]
11660 fn constrained_count_min_caps_wide_cells_like_kcountarray() {
11661 let config = Config {
11662 count_min: crate::cli::CountMinSettings {
11663 cells: Some(1),
11664 hashes: Some(1),
11665 bits: Some(32),
11666 memory_bytes: None,
11667 },
11668 ..Config::default()
11669 };
11670 let mut counts = CountMap::default();
11671 counts.insert(KmerKey::Short(7), i32::MAX as u64 + 10);
11672 counts.insert(KmerKey::Short(11), 1);
11673
11674 apply_count_min_collision_estimates(&config, &mut counts);
11675
11676 assert_eq!(counts.get(&KmerKey::Short(7)), Some(&(i32::MAX as u64)));
11677 assert_eq!(counts.get(&KmerKey::Short(11)), Some(&(i32::MAX as u64)));
11678 assert_eq!(count_min_max_count(31), i32::MAX as u64);
11679 assert_eq!(count_min_max_count(32), i32::MAX as u64);
11680 assert_eq!(count_min_max_count(64), i32::MAX as u64);
11681 }
11682
11683 #[test]
11684 fn count_min_budget_guard_rejects_tables_above_safe_memory() {
11685 let available = 1_000_000usize;
11686 let safe_budget = safe_explicit_count_min_bytes(available);
11687 let fitting_cells = safe_budget / 4;
11688 assert!(
11689 ensure_count_min_budget_fits_ceiling("main", fitting_cells, 32, safe_budget).is_ok()
11690 );
11691
11692 let oversized_cells = safe_budget.div_ceil(4) + 1;
11693 let err = ensure_count_min_budget_fits_ceiling("main", oversized_cells, 32, safe_budget)
11694 .unwrap_err()
11695 .to_string();
11696 assert!(
11697 err.contains("above safe memory budget"),
11698 "unexpected error: {err}"
11699 );
11700 }
11701
11702 #[test]
11703 fn count_min_budget_guard_respects_configured_memory_below_available_ram() {
11704 let configured = 1_000_000usize;
11705 let available = 10_000_000usize;
11706 let safe_budget = count_min_safe_budget_bytes(Some(configured), Some(available)).unwrap();
11707 assert_eq!(safe_budget, configured);
11708
11709 assert!(ensure_count_min_budget_fits_ceiling("main", 250_000, 32, safe_budget).is_ok());
11710
11711 let cells_that_fit_available_but_not_configured = 250_001usize;
11712 let err = ensure_count_min_budget_fits_ceiling(
11713 "main",
11714 cells_that_fit_available_but_not_configured,
11715 32,
11716 safe_budget,
11717 )
11718 .unwrap_err()
11719 .to_string();
11720 assert!(
11721 err.contains("above safe memory budget"),
11722 "unexpected configured-budget error: {err}"
11723 );
11724 }
11725
11726 #[test]
11727 fn count_min_budget_guard_rejects_size_overflow_before_prime_sizing() {
11728 let err = count_min_total_bytes(usize::MAX, 32)
11729 .unwrap_err()
11730 .to_string();
11731 assert!(
11732 err.contains("overflowed"),
11733 "unexpected overflow error: {err}"
11734 );
11735 }
11736
11737 #[test]
11738 fn count_min_hash_uses_bbtools_row_rotation_masks() {
11739 let key = KmerKey::Short(0x1234_5678_9abc_def0);
11740 let first = count_min_bucket(&key, 0, 1024);
11741 let second = count_min_bucket(&key, 1, 1024);
11742 let third = count_min_bucket(&key, 2, 1024);
11743
11744 assert!(first < 1024);
11745 assert!(second < 1024);
11746 assert!(third < 1024);
11747 assert_ne!(first, second);
11748 assert_ne!(second, third);
11749
11750 let row0 = bbtools_mask_hash(raw_kmer_key(&key), 0, BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED);
11751 let row1 = bbtools_mask_hash(
11752 row0.rotate_right(BBTOOLS_HASH_BITS),
11753 1,
11754 BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
11755 );
11756 assert_eq!(
11757 count_min_bucket(&key, 1, 1024),
11758 KCountArrayLayout::new(1024, 32).bucket(row1)
11759 );
11760
11761 let expected = [
11762 0x575a_4571_d954_c5e8,
11763 0x12bb_293c_ca33_0af3,
11764 0x0287_fcd8_b8b4_e1c9,
11765 0x2b62_7d06_2179_52bb,
11766 0x6bc1_463c_9db3_e422,
11767 0x710a_bca5_aeb9_5819,
11768 0x2487_597d_41ef_8ea1,
11769 0x653b_8694_aa03_bbf0,
11770 ];
11771 assert_eq!(
11772 &bbtools_hash_masks(BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED)[0][..8],
11773 expected.as_slice()
11774 );
11775
11776 for row in bbtools_hash_masks(BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED) {
11777 for &mask in row {
11778 assert_eq!((mask & 0xffff_ffff).count_ones(), 16);
11779 assert!((15..=16).contains(&(mask >> 32).count_ones()));
11780 assert_eq!(mask >> 63, 0);
11781 }
11782 }
11783 }
11784
11785 #[test]
11786 fn prefilter_and_main_sketches_use_independent_kcountarray_mask_seeds() {
11787 let config = Config {
11788 count_min: crate::cli::CountMinSettings {
11789 cells: Some(512),
11790 hashes: Some(2),
11791 bits: Some(32),
11792 memory_bytes: None,
11793 },
11794 prefilter: crate::cli::PrefilterSettings {
11795 enabled: true,
11796 force_disabled: false,
11797 ..Default::default()
11798 },
11799 ..Config::default()
11800 };
11801
11802 let prefilter = new_prefilter_count_min_sketch(&config).unwrap();
11803 let main = new_atomic_count_min_sketch_with_mask_seed(
11804 &config,
11805 BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED,
11806 )
11807 .unwrap();
11808 let key = KmerKey::Short(0x1234_5678_9abc_def0);
11809
11810 assert_eq!(
11811 prefilter.layout.mask_seed,
11812 BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED
11813 );
11814 assert_eq!(main.layout.mask_seed, BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED);
11815 assert_ne!(
11816 count_min_bucket_with_layout(&key, 0, prefilter.layout),
11817 count_min_bucket_with_layout(&key, 0, main.layout)
11818 );
11819 }
11820
11821 #[test]
11822 fn nondeterministic_input_prefilter_uses_atomic_packed_sketch() {
11823 let config = Config {
11824 deterministic: false,
11825 count_min: crate::cli::CountMinSettings {
11826 cells: Some(512),
11827 hashes: Some(3),
11828 bits: Some(32),
11829 memory_bytes: None,
11830 },
11831 prefilter: crate::cli::PrefilterSettings {
11832 enabled: true,
11833 force_disabled: false,
11834 cells: Some(256),
11835 hashes: Some(2),
11836 bits: Some(2),
11837 memory_bytes: None,
11838 memory_fraction_micros: None,
11839 },
11840 ..Config::default()
11841 };
11842
11843 let prefilter = new_input_prefilter_count_min_sketch(&config).unwrap();
11844 let layout = prefilter.layout_summary("input_prefilter", Some(prefilter.max_count()));
11845
11846 assert!(matches!(
11847 prefilter,
11848 PrefilterCountMinSketch::AtomicPacked(_)
11849 ));
11850 assert_eq!(layout.kind, "atomic_packed");
11851 assert_eq!(layout.bits, 2);
11852 assert_eq!(layout.hashes, 2);
11853 assert_eq!(layout.update_mode, "conservative");
11854 }
11855
11856 #[test]
11857 fn nondefault_kcountarray_mask_seeds_are_cached() {
11858 let seed = BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED + BBTOOLS_KCOUNT_ARRAY_MASK_SEED_STEP * 2;
11859 let first = bbtools_hash_masks(seed);
11860 let second = bbtools_hash_masks(seed);
11861 let third = bbtools_hash_masks(seed + BBTOOLS_KCOUNT_ARRAY_MASK_SEED_STEP);
11862
11863 assert!(std::ptr::eq(first, second));
11864 assert!(!std::ptr::eq(first, third));
11865 assert_ne!(first[0][0], third[0][0]);
11866 }
11867
11868 #[test]
11869 fn countup_prefilter_mask_seed_uses_dedicated_hot_cache() {
11870 let config = Config {
11871 count_up: true,
11872 prefilter: crate::cli::PrefilterSettings {
11873 enabled: true,
11874 force_disabled: false,
11875 ..Default::default()
11876 },
11877 count_min: crate::cli::CountMinSettings {
11878 cells: Some(10_000),
11879 bits: Some(32),
11880 ..Default::default()
11881 },
11882 ..Config::default()
11883 };
11884
11885 let seed = countup_output_mask_seed(&config);
11886 assert_eq!(seed, BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED);
11887 assert!(std::ptr::eq(
11888 bbtools_hash_masks(seed),
11889 bbtools_hash_masks(BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED)
11890 ));
11891 }
11892
11893 #[test]
11894 fn kcount_layout_carries_resolved_mask_table_for_bucket_fills() {
11895 let layout = KCountArrayLayout::new_with_min_arrays_and_mask_seed(
11896 4096,
11897 32,
11898 BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS,
11899 BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED,
11900 );
11901
11902 assert!(std::ptr::eq(
11903 layout.masks,
11904 bbtools_hash_masks(BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED)
11905 ));
11906 assert_eq!(layout.mask_seed, BBTOOLS_KCOUNT_ARRAY_THIRD_MASK_SEED);
11907 }
11908
11909 #[test]
11910 fn incremental_count_min_buckets_match_row_hash_replay() {
11911 let layout = KCountArrayLayout::new_with_min_arrays_and_mask_seed(
11912 4096,
11913 32,
11914 BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS,
11915 BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED,
11916 );
11917 for raw in [0, 1, 7, 31, 63, 255, 0x1234_5678_9abc_def0] {
11918 let key = KmerKey::Short(raw);
11919 let mut slots = [usize::MAX; 16];
11920 fill_count_min_buckets(&key, 8, layout, &mut slots);
11921
11922 for (hash_index, slot) in slots.iter().enumerate().take(8) {
11923 assert_eq!(
11924 *slot,
11925 count_min_bucket_with_layout(&key, hash_index, layout)
11926 );
11927 }
11928 }
11929 }
11930
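    // Searches the key space for two keys that share their row-0 bucket but
    // diverge on row 1, making conservative vs. independent update modes
    // observable in the assertions below.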
11931 fn find_partial_row_collision(
11932 cells: usize,
11933 bits: u8,
11934 ) -> (KmerKey, KmerKey, usize, usize, usize) {
11935 let layout = KCountArrayLayout::new(cells, bits);
11936 let mut seen: Vec<Option<(KmerKey, usize)>> = vec![None; cells];
11937 for raw in 0..100_000u64 {
11938 let key = KmerKey::Short(raw);
11939 let row0 = count_min_bucket_with_layout(&key, 0, layout);
11940 let row1 = count_min_bucket_with_layout(&key, 1, layout);
11941 if let Some((previous, previous_row1)) = &seen[row0] {
11942 if *previous_row1 != row1 {
11943 return (previous.clone(), key, row0, *previous_row1, row1);
11944 }
11945 } else {
11946 seen[row0] = Some((key, row1));
11947 }
11948 }
11949 panic!("expected to find a partial row collision for {cells} cells");
11950 }
11951
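    // Finds two partners for a base key, one colliding on row 0 only and one
    // on row 1 only, yielding the two-sided collision the estimate tests
    // need.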
11952 fn find_two_sided_partial_collisions(cells: usize, bits: u8) -> (KmerKey, KmerKey, KmerKey) {
11953 let layout = KCountArrayLayout::new(cells, bits);
11954 let base = KmerKey::Short(0);
11955 let base_row0 = count_min_bucket_with_layout(&base, 0, layout);
11956 let base_row1 = count_min_bucket_with_layout(&base, 1, layout);
11957 let mut row0_match = None;
11958 let mut row1_match = None;
11959 for raw in 1..200_000u64 {
11960 let key = KmerKey::Short(raw);
11961 let row0 = count_min_bucket_with_layout(&key, 0, layout);
11962 let row1 = count_min_bucket_with_layout(&key, 1, layout);
11963 if row0 == base_row0 && row1 != base_row1 && row0_match.is_none() {
11964 row0_match = Some(key.clone());
11965 }
11966 if row1 == base_row1 && row0 != base_row0 && row1_match.is_none() {
11967 row1_match = Some(key);
11968 }
11969 if let (Some(row0_match), Some(row1_match)) = (row0_match.clone(), row1_match.clone()) {
11970 return (base, row0_match, row1_match);
11971 }
11972 }
11973 panic!("expected to find two-sided partial row collisions for {cells} cells");
11974 }
11975
11976 #[test]
11977 fn prefilter_sketch_defaults_to_kcountarray_locked_updates() {
11978 let config = Config {
11979 prefilter: crate::cli::PrefilterSettings {
11980 enabled: true,
11981 force_disabled: false,
11982 cells: Some(128),
11983 hashes: Some(2),
11984 bits: Some(2),
11985 memory_bytes: None,
11986 memory_fraction_micros: None,
11987 },
11988 threads: Some(2),
11989 ..Config::default()
11990 };
11991 let mut prefilter = new_prefilter_count_min_sketch(&config).unwrap();
11992 assert_eq!(prefilter.update_mode, CountMinUpdateMode::Conservative);
11993 let (left, right, row0, _, _) = find_partial_row_collision(prefilter.cells, prefilter.bits);
11994
11995 prefilter.add_key_count(&left, 2);
11996 prefilter.add_key_count(&right, 1);
11997
11998 assert_eq!(prefilter.cell(row0), 2);
11999 }
12000
12001 #[test]
12002 fn lockedincrement_false_uses_independent_row_increments() {
12003 let config = Config {
12004 prefilter: crate::cli::PrefilterSettings {
12005 enabled: true,
12006 force_disabled: false,
12007 cells: Some(128),
12008 hashes: Some(2),
12009 bits: Some(2),
12010 memory_bytes: None,
12011 memory_fraction_micros: None,
12012 },
12013 locked_increment: Some(false),
12014 threads: Some(2),
12015 ..Config::default()
12016 };
12017 let mut unlocked = new_prefilter_count_min_sketch(&config).unwrap();
12018 assert_eq!(unlocked.update_mode, CountMinUpdateMode::Independent);
12019 let (left, right, row0, row1_left, row1_right) =
12020 find_partial_row_collision(unlocked.cells, unlocked.bits);
12021
12022 let mut locked =
12023 PackedCountMinSketch::new(unlocked.cells, unlocked.hashes, unlocked.bits).unwrap();
12024 locked.add_key_count(&left, 2);
12025 locked.add_key_count(&right, 1);
12026 unlocked.add_key_count(&left, 2);
12027 unlocked.add_key_count(&right, 1);
12028
12029 assert_eq!(locked.cell(row0), 2);
12030 assert_eq!(unlocked.cell(row0), 3);
12031 assert_eq!(unlocked.cell(row1_left), 2);
12032 assert_eq!(unlocked.cell(row1_right), 1);
12033 }
12034
12035 #[test]
12036 fn atomic_count_min_honors_unlocked_independent_updates() {
12037 let config = Config {
12038 count_min: crate::cli::CountMinSettings {
12039 cells: Some(128),
12040 hashes: Some(2),
12041 bits: Some(32),
12042 memory_bytes: None,
12043 },
12044 locked_increment: Some(false),
12045 threads: Some(2),
12046 ..Config::default()
12047 };
12048 let unlocked = new_atomic_count_min_sketch(&config).unwrap();
12049 assert_eq!(unlocked.update_mode, CountMinUpdateMode::Independent);
12050 let (left, right, row0, row1_left, row1_right) =
12051 find_partial_row_collision(unlocked.cells, 32);
12052
12053 let locked = AtomicCountMinSketch::new(unlocked.cells, unlocked.hashes).unwrap();
12054 locked.add_key_count(&left, 2);
12055 locked.add_key_count(&right, 1);
12056 unlocked.add_key_count(&left, 2);
12057 unlocked.add_key_count(&right, 1);
12058
12059 assert_eq!(locked.cells_by_hash[row0].load(Ordering::Relaxed), 2);
12060 assert_eq!(unlocked.cells_by_hash[row0].load(Ordering::Relaxed), 3);
12061 assert_eq!(unlocked.cells_by_hash[row1_left].load(Ordering::Relaxed), 2);
12062 assert_eq!(
12063 unlocked.cells_by_hash[row1_right].load(Ordering::Relaxed),
12064 1
12065 );
12066 }
12067
12068 #[test]
12069 fn atomic_count_min_allocates_locks_only_for_conservative_updates() {
12070 let conservative = new_atomic_count_min_sketch(&Config {
12071 count_min: crate::cli::CountMinSettings {
12072 cells: Some(128),
12073 hashes: Some(2),
12074 bits: Some(32),
12075 memory_bytes: None,
12076 },
12077 ..Config::default()
12078 })
12079 .unwrap();
12080 let independent = new_atomic_count_min_sketch(&Config {
12081 count_min: crate::cli::CountMinSettings {
12082 cells: Some(128),
12083 hashes: Some(2),
12084 bits: Some(32),
12085 memory_bytes: None,
12086 },
12087 locked_increment: Some(false),
12088 ..Config::default()
12089 })
12090 .unwrap();
12091
12092 assert_eq!(conservative.locks.len(), BBTOOLS_KCOUNT_ARRAY_LOCKS);
12093 assert!(independent.locks.is_empty());
12094 }
12095
12096 #[test]
12097 fn atomic_count_min_parallel_replay_requires_nondeterministic_mode() {
12098 let deterministic = new_atomic_count_min_sketch(&Config {
12099 count_min: crate::cli::CountMinSettings {
12100 cells: Some(128),
12101 hashes: Some(2),
12102 bits: Some(32),
12103 memory_bytes: None,
12104 },
12105 deterministic: true,
12106 ..Config::default()
12107 })
12108 .unwrap();
12109 let nondeterministic = new_atomic_count_min_sketch(&Config {
12110 count_min: crate::cli::CountMinSettings {
12111 cells: Some(128),
12112 hashes: Some(2),
12113 bits: Some(32),
12114 memory_bytes: None,
12115 },
12116 deterministic: false,
12117 ..Config::default()
12118 })
12119 .unwrap();
12120
12121 assert!(!deterministic.parallel_replay);
12122 assert!(nondeterministic.parallel_replay);
12123 }
12124
12125 #[test]
12126 fn packed_count_min_increment_returns_previous_min_like_kcountarray() {
12127 let key = KmerKey::Short(7);
12128 let mut sketch = PackedCountMinSketch::new(128, 2, 4).unwrap();
12129
12130 assert_eq!(sketch.increment_and_return_unincremented(&key, 1), 0);
12131 assert_eq!(sketch.depth(&key), 1);
12132 assert_eq!(sketch.increment_and_return_unincremented(&key, 3), 1);
12133 assert_eq!(sketch.depth(&key), 4);
12134 }
12135
12136 #[test]
12137 fn packed_count_min_increment_return_saturates_at_cell_max() {
12138 let key = KmerKey::Short(11);
12139 let mut sketch = PackedCountMinSketch::new(1, 2, 2).unwrap();
12140
12141 assert_eq!(sketch.increment_and_return_unincremented(&key, 10), 0);
12142 assert_eq!(sketch.depth(&key), 3);
12143 assert_eq!(sketch.increment_and_return_unincremented(&key, 1), 3);
12144 assert_eq!(sketch.depth(&key), 3);
12145 }
12146
12147 #[test]
12148 fn atomic_count_min_increment_returns_previous_min_like_kcountarray() {
12149 let key = KmerKey::Short(13);
12150 let sketch = AtomicCountMinSketch::new(128, 2).unwrap();
12151
12152 assert_eq!(sketch.increment_and_return_unincremented(&key, 1), 0);
12153 assert_eq!(sketch.depth(&key), 1);
12154 assert_eq!(sketch.increment_and_return_unincremented(&key, 3), 1);
12155 assert_eq!(sketch.depth(&key), 4);
12156 }
12157
12158 #[test]
12159 fn atomic_packed_count_min_matches_packed_sequential_updates() {
12160 let keys = [
12161 (KmerKey::Short(13), 1),
12162 (KmerKey::Short(29), 2),
12163 (KmerKey::Short(13), 1),
12164 (KmerKey::Short(47), 3),
12165 ];
12166 let mut packed = PackedCountMinSketch::new_with_min_arrays_and_mask_seed(
12167 4099,
12168 3,
12169 2,
12170 BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS,
12171 BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
12172 )
12173 .unwrap();
12174 let atomic = AtomicPackedCountMinSketch::new_with_min_arrays_and_update_mode(
12175 4099,
12176 3,
12177 2,
12178 BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS,
12179 CountMinUpdateMode::Conservative,
12180 BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
12181 )
12182 .unwrap();
12183
12184 for (key, count) in &keys {
12185 packed.add_key_count(key, *count);
12186 atomic.add_key_count(key, *count);
12187 }
12188 let key_increments = keys.iter().map(|(_, count)| *count).sum();
12189 packed.add_key_increments(key_increments);
12190 atomic.add_key_increments(key_increments);
12191
12192 for slot in 0..packed.cells {
12193 assert_eq!(atomic.cell(slot), packed.cell(slot));
12194 }
12195 let occupied = (0..packed.cells)
12196 .filter(|&slot| packed.cell(slot) > 0)
12197 .count();
12198 assert_eq!(atomic.occupied_slots_at_least(1), occupied);
12199 assert_eq!(atomic.unique_kmers(), packed.unique_kmers());
12200 }
12201
12202 #[test]
12203 fn atomic_count_min_conservative_updates_are_key_locked_like_kcountarray() {
12204 let key = KmerKey::Short(13);
12205 let pool = rayon::ThreadPoolBuilder::new()
12206 .num_threads(4)
12207 .build()
12208 .unwrap();
12209
12210 pool.install(|| {
12211 let sketch = AtomicCountMinSketch::new(128, 3).unwrap();
12212
12213 (0..10_000u64)
12214 .into_par_iter()
12215 .for_each(|_| sketch.add_key_count(&key, 1));
12216
12217 assert_eq!(sketch.depth(&key), 10_000);
12218 });
12219 }
12220
12221 #[test]
12222 fn atomic_count_min_bulk_replay_matches_locked_sequential_updates() {
12223 let mut counts = CountMap::default();
12224 counts.insert(KmerKey::Short(13), 17);
12225 counts.insert(KmerKey::Short(29), 3);
12226 counts.insert(KmerKey::Short(31), 9);
12227 let locked = AtomicCountMinSketch::new(128, 3).unwrap();
12228 let bulk = AtomicCountMinSketch::new(128, 3).unwrap();
12229
12230 for (key, count) in &counts {
12231 locked.add_key_count(key, *count);
12232 }
12233 bulk.add_key_counts(&counts);
12234
12235 for slot in 0..locked.cells {
12236 assert_eq!(
12237 locked.cells_by_hash[slot].load(Ordering::Relaxed),
12238 bulk.cells_by_hash[slot].load(Ordering::Relaxed)
12239 );
12240 }
12241 }
12242
12243 #[test]
12244 fn packed_count_min_reduced_sorted_replay_matches_individual_kmer_updates() {
12245 let keys = [
12246 KmerKey::Short(13),
12247 KmerKey::Short(29),
12248 KmerKey::Short(13),
12249 KmerKey::Short(31),
12250 KmerKey::Short(29),
12251 KmerKey::Short(29),
12252 KmerKey::Short(47),
12253 ];
12254 let mut individual = PackedCountMinSketch::new(4099, 3, 16).unwrap();
12255 let mut reduced = PackedCountMinSketch::new(4099, 3, 16).unwrap();
12256
12257 for key in &keys {
12258 individual.increment(key);
12259 }
12260 for (key, count) in sorted_reduced_test_runs(keys) {
12261 reduced.add_key_count(&key, count);
12262 reduced.add_key_increments(count);
12263 }
12264
12265 assert_eq!(reduced.increments, individual.increments);
12266 assert_eq!(reduced.occupied_slots, individual.occupied_slots);
12267 assert_eq!(reduced.words, individual.words);
12268 }
12269
12270 #[test]
12271 #[ignore = "microbenchmark for packed 16-bit/3-hash sketch kernel"]
12272 fn bench_packed_count_min_16bit_3hash_short_kernel() {
12273 let mut sketch = PackedCountMinSketch::new_with_min_arrays_and_mask_seed(
12274 67_108_859,
12275 3,
12276 16,
12277 BBTOOLS_KCOUNT_ARRAY_MIN_ARRAYS,
12278 BBTOOLS_KCOUNT_ARRAY_FIRST_MASK_SEED,
12279 )
12280 .unwrap();
12281 let keys = (0..1_000_000u64)
12282 .map(|i| KmerKey::Short(i.wrapping_mul(0x9e37_79b9_7f4a_7c15)))
12283 .collect::<Vec<_>>();
12284
12285 let start = Instant::now();
12286 let mut checksum = 0u64;
12287 for key in &keys {
12288 checksum ^= std::hint::black_box(
12289 sketch.increment_16bit_3hash_conservative_and_return_unincremented(key, 1),
12290 );
12291 }
12292 let elapsed = start.elapsed();
12293 eprintln!(
12294 "packed_16bit_3hash_short_kernel\tupdates={}\telapsed_seconds={:.6}\tchecksum={}",
12295 keys.len(),
12296 elapsed.as_secs_f64(),
12297 checksum
12298 );
12299 std::hint::black_box(sketch);
12300 }
12301
12302 #[test]
12303 fn atomic_count_min_reduced_sorted_replay_matches_individual_kmer_updates() {
12304 let keys = [
12305 KmerKey::Short(13),
12306 KmerKey::Short(29),
12307 KmerKey::Short(13),
12308 KmerKey::Short(31),
12309 KmerKey::Short(29),
12310 KmerKey::Short(29),
12311 KmerKey::Short(47),
12312 ];
12313 let individual = AtomicCountMinSketch::new(4099, 3).unwrap();
12314 let reduced = AtomicCountMinSketch::new(4099, 3).unwrap();
12315
12316 for key in &keys {
12317 individual.increment_key(key);
12318 individual.add_key_increments(1);
12319 }
12320 for (key, count) in sorted_reduced_test_runs(keys) {
12321 reduced.add_key_count(&key, count);
12322 reduced.add_key_increments(count);
12323 }
12324
12325 assert_eq!(
12326 reduced.increments.load(Ordering::Relaxed),
12327 individual.increments.load(Ordering::Relaxed)
12328 );
12329 assert_eq!(
12330 reduced.occupied_slots.load(Ordering::Relaxed),
12331 individual.occupied_slots.load(Ordering::Relaxed)
12332 );
12333 for slot in 0..individual.cells {
12334 assert_eq!(
12335 reduced.cells_by_hash[slot].load(Ordering::Relaxed),
12336 individual.cells_by_hash[slot].load(Ordering::Relaxed)
12337 );
12338 }
12339 }
12340
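    // Collapses a sorted key list into (key, run-length) pairs, mirroring the
    // reduced replay inputs used by the tests above.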
12341 fn sorted_reduced_test_runs<const N: usize>(keys: [KmerKey; N]) -> Vec<(KmerKey, u64)> {
12342 let mut keys = keys;
12343 keys.sort_unstable();
12344 let mut runs = Vec::new();
12345 for key in keys {
12346 if let Some((last_key, count)) = runs.last_mut()
12347 && last_key == &key
12348 {
12349 *count += 1;
12350 continue;
12351 }
12352 runs.push((key, 1));
12353 }
12354 runs
12355 }
12356
12357 #[test]
12358 fn exact_collision_estimates_follow_lockedincrement_mode() {
12359 let mut config = Config {
12360 count_min: crate::cli::CountMinSettings {
12361 cells: Some(128),
12362 hashes: Some(2),
12363 bits: Some(8),
12364 memory_bytes: None,
12365 },
12366 threads: Some(2),
12367 ..Config::default()
12368 };
12369 let cells = count_min_table_cells_from_total_bits(128, 8);
12370 let (left, right0, right1) = find_two_sided_partial_collisions(cells, 8);
12371 let mut locked = CountMap::default();
12372 locked.insert(left.clone(), 2);
12373 locked.insert(right0, 1);
12374 locked.insert(right1, 1);
12375 let mut unlocked = locked.clone();
12376
12377 apply_count_min_collision_estimates(&config, &mut locked);
12378 config.locked_increment = Some(false);
12379 apply_count_min_collision_estimates(&config, &mut unlocked);
12380
12381 assert_eq!(locked.get(&left), Some(&2));
12382 assert_eq!(unlocked.get(&left), Some(&3));
12383 }
12384
12385 #[test]
12386 fn prefilter_exact_estimates_follow_lockedincrement_mode() {
12387 let mut config = Config {
12388 prefilter: crate::cli::PrefilterSettings {
12389 enabled: true,
12390 force_disabled: false,
12391 cells: Some(128),
12392 hashes: Some(2),
12393 bits: Some(8),
12394 memory_bytes: None,
12395 memory_fraction_micros: None,
12396 },
12397 threads: Some(2),
12398 ..Config::default()
12399 };
12400 let cells = count_min_table_cells_from_total_bits(128, 8);
12401 let (left, right0, right1) = find_two_sided_partial_collisions(cells, 8);
12402 let mut locked = CountMap::default();
12403 locked.insert(left.clone(), 2);
12404 locked.insert(right0, 1);
12405 locked.insert(right1, 1);
12406 let mut unlocked = locked.clone();
12407
12408 apply_prefilter_collision_estimates(&config, &mut locked);
12409 config.locked_increment = Some(false);
12410 apply_prefilter_collision_estimates(&config, &mut unlocked);
12411
12412 assert_eq!(locked.get(&left), Some(&2));
12413 assert_eq!(unlocked.get(&left), Some(&3));
12414 }
12415
12416 #[test]
12417 fn prefilter_sketch_saturates_with_independent_row_increments_when_unlocked() {
12418 let config = Config {
12419 prefilter: crate::cli::PrefilterSettings {
12420 enabled: true,
12421 force_disabled: false,
12422 cells: Some(128),
12423 hashes: Some(2),
12424 bits: Some(2),
12425 memory_bytes: None,
12426 memory_fraction_micros: None,
12427 },
12428 locked_increment: Some(false),
12429 threads: Some(2),
12430 ..Config::default()
12431 };
12432 let mut prefilter = new_prefilter_count_min_sketch(&config).unwrap();
12433 let (left, right, row0, row1_left, row1_right) =
12434 find_partial_row_collision(prefilter.cells, prefilter.bits);
12435
12436 let mut conservative =
12437 PackedCountMinSketch::new(prefilter.cells, prefilter.hashes, prefilter.bits).unwrap();
12438 conservative.add_key_count(&left, 2);
12439 conservative.add_key_count(&right, 1);
12440 prefilter.add_key_count(&left, 2);
12441 prefilter.add_key_count(&right, 1);
12442
12443 assert_eq!(conservative.cell(row0), 2);
12444 assert_eq!(prefilter.cell(row0), 3);
12445 assert_eq!(prefilter.cell(row1_left), 2);
12446 assert_eq!(prefilter.cell(row1_right), 1);
12447 }
12448
12449 #[test]
12450 fn packed_count_min_sketch_uses_fixed_saturating_cells() {
12451 let mut sketch = PackedCountMinSketch::new(1, 2, 3).unwrap();
12452 for _ in 0..10 {
12453 sketch.increment(&KmerKey::Short(7));
12454 }
12455
12456 assert_eq!(sketch.words.len(), 1);
12457 assert_eq!(sketch.depth(&KmerKey::Short(7)), 7);
12458 assert_eq!(sketch.depth(&KmerKey::Short(11)), 7);
12459 assert_eq!(sketch.unique_kmers(), 10);
12460 }
12461
12462 #[test]
12463 fn packed_count_min_depth_hist_uses_raw_depth_counts() {
12464 let mut sketch = PackedCountMinSketch::new(8, 2, 4).unwrap();
12465 sketch.set_cell(0, 1);
12466 sketch.set_cell(1, 2);
12467 sketch.set_cell(2, 2);
12468 sketch.set_cell(3, 5);
12469
12470 assert_eq!(sketch.occupied_slots_at_least(1), 4);
12471 assert_eq!(sketch.tracked_slots.as_ref().unwrap().len(), 4);
12472 assert_eq!(sketch.depth_hist(4), vec![0, 1, 4, 5]);
12473 }
12474
12475 #[test]
12476 fn packed_count_min_tracks_occupied_slots_without_duplicates() {
12477 let key = KmerKey::Short(17);
12478 let mut sketch = PackedCountMinSketch::new(128, 1, 4).unwrap();
12479
12480 sketch.add_key_count(&key, 1);
12481 sketch.add_key_count(&key, 2);
12482
12483 assert_eq!(sketch.occupied_slots_at_least(1), 1);
12484 assert_eq!(sketch.occupied_slots_at_least(3), 1);
12485 assert_eq!(sketch.tracked_slots.as_ref().unwrap().len(), 1);
12486 assert_eq!(sketch.depth_hist(5), vec![0, 0, 0, 3, 0]);
12487 }
12488
12489 #[test]
12490 fn packed_count_min_disables_slot_tracking_for_large_tables() {
12491 let sketch = PackedCountMinSketch::new(PACKED_SKETCH_TRACKED_SLOT_LIMIT + 1, 1, 1).unwrap();
12492
12493 assert!(sketch.tracked_slots.is_none());
12494 assert_eq!(sketch.tracked_slot_memory_bytes(), 0);
12495 assert_eq!(
12496 sketch.layout_summary("large", None).memory_bytes,
12497 sketch.words.len() * std::mem::size_of::<u64>()
12498 );
12499 }
12500
12501 #[test]
12502 fn packed_count_min_layout_reports_tracked_slot_memory() {
12503 let key = KmerKey::Short(17);
12504 let mut sketch = PackedCountMinSketch::new(128, 1, 4).unwrap();
12505
12506 sketch.add_key_count(&key, 1);
12507
12508 let backing_bytes = sketch.words.len() * std::mem::size_of::<u64>();
12509 assert!(sketch.tracked_slot_memory_bytes() >= std::mem::size_of::<usize>());
12510 assert_eq!(
12511 sketch.layout_summary("small", None).memory_bytes,
12512 backing_bytes + sketch.tracked_slot_memory_bytes()
12513 );
12514 }
12515
12516 #[test]
12517 fn packed_count_min_depth_hist_uses_compact_cell_bound_but_returns_requested_len() {
12518 let mut sketch = PackedCountMinSketch::new(16, 1, 4).unwrap();
12519 sketch.set_cell(0, 1);
12520 sketch.set_cell(1, 15);
12521
12522 let hist = sketch.depth_hist(1024);
12523
12524 assert_eq!(hist.len(), 1024);
12525 assert_eq!(hist[1], 1);
12526 assert_eq!(hist[15], 15);
12527 assert!(hist[16..].iter().all(|&value| value == 0));
12528 }
12529
12530 #[test]
12531 fn packed_count_min_untracked_depth_hist_uses_compact_reducers() {
12532 let mut sketch = PackedCountMinSketch::new(16, 1, 4).unwrap();
12533 sketch.tracked_slots = None;
12534 sketch.set_cell(0, 1);
12535 sketch.set_cell(1, 15);
12536
12537 let hist = sketch.depth_hist(1024);
12538
12539 assert_eq!(hist.len(), 1024);
12540 assert_eq!(hist[1], 1);
12541 assert_eq!(hist[15], 15);
12542 assert!(hist[16..].iter().all(|&value| value == 0));
12543 }
12544
12545 #[test]
12546 fn packed_count_min_depth_hist_uses_dynamic_reducers_for_wide_cells() {
12547 let mut sketch = PackedCountMinSketch::new(16, 1, 32).unwrap();
12548 sketch.set_cell(0, 1);
12549 sketch.set_cell(1, 4096);
12550
12551 let hist = sketch.depth_hist(8192);
12552
12553 assert_eq!(hist.len(), 8192);
12554 assert_eq!(hist[1], 1);
12555 assert_eq!(hist[4096], 4096);
12556 assert!(hist[4097..].iter().all(|&value| value == 0));
12557 }
12558
12559 #[test]
12560 fn packed_count_min_untracked_depth_hist_uses_dynamic_reducers_for_wide_cells() {
12561 let mut sketch = PackedCountMinSketch::new(16, 1, 32).unwrap();
12562 sketch.tracked_slots = None;
12563 sketch.set_cell(0, 2);
12564 sketch.set_cell(1, 4096);
12565
12566 let hist = sketch.depth_hist(8192);
12567
12568 assert_eq!(hist.len(), 8192);
12569 assert_eq!(hist[2], 2);
12570 assert_eq!(hist[4096], 4096);
12571 assert!(hist[4097..].iter().all(|&value| value == 0));
12572 }
12573
12574 #[test]
12575 fn atomic_count_min_depth_hist_uses_raw_depth_counts() {
12576 let sketch = AtomicCountMinSketch::new(8, 2).unwrap();
12577 sketch.cells_by_hash[0].store(1, Ordering::Relaxed);
12578 sketch.cells_by_hash[1].store(2, Ordering::Relaxed);
12579 sketch.cells_by_hash[2].store(2, Ordering::Relaxed);
12580 sketch.cells_by_hash[3].store(5, Ordering::Relaxed);
12581
12582 assert_eq!(sketch.depth_hist(4), vec![0, 1, 4, 5]);
12583 }
12584
12585 #[test]
12586 fn atomic_count_min_depth_hist_uses_compact_dynamic_reducers() {
12587 let sketch = AtomicCountMinSketch::new(16, 2).unwrap();
12588 sketch.cells_by_hash[0].store(1, Ordering::Relaxed);
12589 sketch.cells_by_hash[1].store(7, Ordering::Relaxed);
12590
12591 let hist = sketch.depth_hist(8192);
12592
12593 assert_eq!(hist.len(), 8192);
12594 assert_eq!(hist[1], 1);
12595 assert_eq!(hist[7], 7);
12596 assert!(hist[8..].iter().all(|&value| value == 0));
12597 }
12598
12599 #[test]
12600 fn combined_primary_histograms_match_separate_collectors() {
12601 let dir = tempfile::tempdir().unwrap();
12602 let path = dir.path().join("reads.fq");
12603 write_fastq(
12604 &path,
12605 &[
12606 ("r1", b"ACGTACGT", b"IIIIIIII"),
12607 ("r2", b"ACGTTCGT", b"IIIIIIII"),
12608 ("r3", b"TTTTACGT", b"IIIIIIII"),
12609 ],
12610 );
12611 let config = Config {
12612 in1: Some(path.clone()),
12613 k: 3,
12614 min_quality: 0,
12615 min_prob: 0.0,
12616 ..Config::default()
12617 };
12618 let mut counts = CountMap::default();
12619 count_single_file(&config, &path, &mut counts, None).unwrap();
12620
12621 let separate_hist = collect_primary_hist(&config, &counts, None, 0).unwrap();
12622 let sparse_hist = collect_primary_sparse_hist(&config, &counts, None, 0).unwrap();
12623 let separate_rhist = collect_primary_read_hist(&config, &counts, None, 0).unwrap();
12624 let sparse_rhist = collect_primary_sparse_read_hist(&config, &counts, None, 0).unwrap();
12625 let (sparse_combined_hist, sparse_combined_rhist) =
12626 collect_primary_sparse_hist_and_read_hist(&config, &counts, None, 0).unwrap();
12627 let (combined_hist, combined_rhist) =
12628 collect_primary_hist_and_read_hist(&config, &counts, None, 0).unwrap();
12629
12630 assert_eq!(
12631 sparse_hist_to_dense(&sparse_hist, config.hist_len),
12632 separate_hist
12633 );
12634 assert_eq!(
12635 sparse_hist_to_dense(&sparse_combined_hist, config.hist_len),
12636 separate_hist
12637 );
12638 assert_eq!(combined_hist, separate_hist);
12639 assert_eq!(combined_rhist.reads, separate_rhist.reads);
12640 assert_eq!(combined_rhist.bases, separate_rhist.bases);
12641 let mut dense_sparse_rhist = ReadDepthHistogram::new(config.hist_len);
12642 merge_sparse_read_depth_hist_into_dense(&mut dense_sparse_rhist, sparse_rhist);
12643 assert_eq!(dense_sparse_rhist.reads, separate_rhist.reads);
12644 assert_eq!(dense_sparse_rhist.bases, separate_rhist.bases);
12645 let mut dense_sparse_combined_rhist = ReadDepthHistogram::new(config.hist_len);
12646 merge_sparse_read_depth_hist_into_dense(
12647 &mut dense_sparse_combined_rhist,
12648 sparse_combined_rhist,
12649 );
12650 assert_eq!(dense_sparse_combined_rhist.reads, separate_rhist.reads);
12651 assert_eq!(dense_sparse_combined_rhist.bases, separate_rhist.bases);
12652 }
12653
12654 #[test]
12655 fn countup_work_source_collects_input_histograms_like_separate_collectors() {
12656 let dir = tempfile::tempdir().unwrap();
12657 let path = dir.path().join("reads.fq");
12658 write_fastq(
12659 &path,
12660 &[
12661 ("r1", b"ACGTACGT", b"IIIIIIII"),
12662 ("r2", b"ACGTTCGT", b"IIIIIIII"),
12663 ("r3", b"TTTTACGT", b"IIIIIIII"),
12664 ],
12665 );
12666 let config = Config {
12667 in1: Some(path.clone()),
12668 count_up: true,
12669 k: 3,
12670 min_quality: 0,
12671 min_prob: 0.0,
12672 hist_len: 64,
12673 ..Config::default()
12674 };
12675 let mut counts = CountMap::default();
12676 count_single_file(&config, &path, &mut counts, None).unwrap();
12677
12678 let separate_hist = collect_primary_hist(&config, &counts, None, 0).unwrap();
12679 let separate_rhist = collect_primary_read_hist(&config, &counts, None, 0).unwrap();
12680 let build = collect_countup_work_source(&config, &counts, 0, true, true).unwrap();
12681
12682 assert_eq!(build.format1, SeqFormat::Fastq);
12683 assert_eq!(build.format2, None);
12684 assert_eq!(
12685 sparse_hist_to_dense(&build.input_hist.unwrap(), config.hist_len),
12686 separate_hist
12687 );
12688 let mut combined_rhist = ReadDepthHistogram::new(config.hist_len);
12689 merge_sparse_read_depth_hist_into_dense(
12690 &mut combined_rhist,
12691 build.input_read_hist.unwrap(),
12692 );
12693 assert_eq!(combined_rhist.reads, separate_rhist.reads);
12694 assert_eq!(combined_rhist.bases, separate_rhist.bases);
12695 }
12696
12697 #[test]
12698 fn combined_primary_histograms_with_keep_filter_match_separate_collectors() {
12699 let dir = tempfile::tempdir().unwrap();
12700 let path = dir.path().join("reads.fq");
12701 write_fastq(
12702 &path,
12703 &[
12704 ("r1", b"ACGTACGT", b"IIIIIIII"),
12705 ("r2", b"ACGTACGT", b"IIIIIIII"),
12706 ("r3", b"TTTTACGT", b"IIIIIIII"),
12707 ],
12708 );
12709 let config = Config {
12710 in1: Some(path.clone()),
12711 k: 3,
12712 min_quality: 0,
12713 min_prob: 0.0,
12714 ..Config::default()
12715 };
12716 let mut input_counts = CountMap::default();
12717 count_single_file(&config, &path, &mut input_counts, None).unwrap();
12718 let mut kept_counts = CountMap::default();
12719 increment_pair_counts(
12720 &config,
12721 &mut kept_counts,
12722 &record("kept", b"ACGTACGT"),
12723 None,
12724 );
12725
12726 let separate_hist =
12727 collect_primary_hist(&config, &kept_counts, Some(&input_counts), 17).unwrap();
12728 let sparse_hist =
12729 collect_primary_sparse_hist(&config, &kept_counts, Some(&input_counts), 17).unwrap();
12730 let separate_rhist =
12731 collect_primary_read_hist(&config, &kept_counts, Some(&input_counts), 17).unwrap();
12732 let sparse_rhist =
12733 collect_primary_sparse_read_hist(&config, &kept_counts, Some(&input_counts), 17)
12734 .unwrap();
12735 let (sparse_combined_hist, sparse_combined_rhist) =
12736 collect_primary_sparse_hist_and_read_hist(
12737 &config,
12738 &kept_counts,
12739 Some(&input_counts),
12740 17,
12741 )
12742 .unwrap();
12743 let (combined_hist, combined_rhist) =
12744 collect_primary_hist_and_read_hist(&config, &kept_counts, Some(&input_counts), 17)
12745 .unwrap();
12746
12747 assert_eq!(
12748 sparse_hist_to_dense(&sparse_hist, config.hist_len),
12749 separate_hist
12750 );
12751 assert_eq!(
12752 sparse_hist_to_dense(&sparse_combined_hist, config.hist_len),
12753 separate_hist
12754 );
12755 assert_eq!(combined_hist, separate_hist);
12756 assert_eq!(combined_rhist.reads, separate_rhist.reads);
12757 assert_eq!(combined_rhist.bases, separate_rhist.bases);
12758 let mut dense_sparse_rhist = ReadDepthHistogram::new(config.hist_len);
12759 merge_sparse_read_depth_hist_into_dense(&mut dense_sparse_rhist, sparse_rhist);
12760 assert_eq!(dense_sparse_rhist.reads, separate_rhist.reads);
12761 assert_eq!(dense_sparse_rhist.bases, separate_rhist.bases);
12762 let mut dense_sparse_combined_rhist = ReadDepthHistogram::new(config.hist_len);
12763 merge_sparse_read_depth_hist_into_dense(
12764 &mut dense_sparse_combined_rhist,
12765 sparse_combined_rhist,
12766 );
12767 assert_eq!(dense_sparse_combined_rhist.reads, separate_rhist.reads);
12768 assert_eq!(dense_sparse_combined_rhist.bases, separate_rhist.bases);
12769 }
12770
12771 #[test]
12772 fn packed_count_min_unique_kmers_uses_bbtools_hash_adjusted_estimate() {
12773 let mut sketch = PackedCountMinSketch::new(1024, 4, 8).unwrap();
12774 for bucket in 0..256 {
12775 sketch.set_cell(bucket, 1);
12776 }
12777 sketch.increments = 1_000;
12778
12779 let estimated = sketch.unique_kmers();
12780 assert!(
12781 (70..=80).contains(&estimated),
12782 "BBTools-style hash-adjusted estimate was {estimated}"
12783 );
12784 }
12785
    #[test]
    fn packed_count_min_unique_kmers_honors_min_depth_threshold() {
        let mut sketch = PackedCountMinSketch::new(1024, 4, 8).unwrap();
        for bucket in 0..256 {
            let depth = if bucket < 128 { 3 } else { 1 };
            sketch.set_cell(bucket, depth);
        }
        sketch.increments = 1_000;

        let total_estimated = sketch.unique_kmers();
        let high_depth_estimated = sketch.unique_kmers_at_least(2);

        assert!(
            (70..=80).contains(&total_estimated),
            "all-depth estimate was {total_estimated}"
        );
        assert!(
            (30..=40).contains(&high_depth_estimated),
            "thresholded estimate was {high_depth_estimated}"
        );
        assert_eq!(sketch.unique_kmers_at_least(9), 0);
    }

    #[test]
    fn atomic_count_min_unique_kmers_honors_min_depth_threshold() {
        let sketch = AtomicCountMinSketch::new(1024, 4).unwrap();
        for bucket in 0..256 {
            let depth = if bucket < 128 { 3 } else { 1 };
            sketch.cells_by_hash[bucket].store(depth, Ordering::Relaxed);
        }
        sketch.occupied_slots.store(256, Ordering::Relaxed);
        sketch.add_key_increments(1_000);

        let total_estimated = sketch.unique_kmers();
        let high_depth_estimated = sketch.unique_kmers_at_least(2);

        assert!(
            (70..=80).contains(&total_estimated),
            "all-depth estimate was {total_estimated}"
        );
        assert!(
            (30..=40).contains(&high_depth_estimated),
            "thresholded estimate was {high_depth_estimated}"
        );
        assert_eq!(sketch.occupied_slots_at_least(1), 256);
    }

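    // HyperLogLog's relative error is roughly 1.04 / sqrt(m); with m = 2048
    // registers that is about 2.3%, so the 900..=1_100 window around the true
    // 1_000 uniques leaves several standard deviations of slack.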
    #[test]
    fn cardinality_estimator_tracks_unique_keys_with_fixed_register_memory() {
        let config = Config {
            k: 31,
            cardinality: crate::cli::CardinalitySettings {
                input: true,
                buckets: 2048,
                seed: 42,
                ..Default::default()
            },
            ..Default::default()
        };
        let mut estimator = KmerCardinalityEstimator::from_config(&config);
        for key in 0..1_000 {
            estimator.observe_key(&KmerKey::Short(key));
            estimator.observe_key(&KmerKey::Short(key));
        }

        let estimate = estimator.estimate();
        assert_eq!(estimate.k, 31);
        assert_eq!(estimate.buckets, 2048);
        assert!(
            (900..=1_100).contains(&estimate.estimated_unique_kmers),
            "cardinality estimate was {}",
            estimate.estimated_unique_kmers
        );
        assert_eq!(estimator.registers.len(), 2048);
    }

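    // 17 cells x 5 bits = 85 bits, which cannot fit in a single 64-bit word,
    // so at least one cell straddles a word boundary; the stored values 0..=16
    // all fit in 5 bits (max 31), making an exact round-trip possible.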
    #[test]
    fn packed_count_min_sketch_packs_cells_across_word_boundaries() {
        let mut sketch = PackedCountMinSketch::new(17, 1, 5).unwrap();
        for slot in 0..17 {
            sketch.set_cell(slot, slot as u64);
        }

        for slot in 0..17 {
            assert_eq!(sketch.cell(slot), slot as u64);
        }
    }

    #[test]
    fn bounded_input_counts_builds_direct_sketch_when_cells_are_constrained() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("reads.fq");
        write_fastq(
            &path,
            &[
                ("r1", b"ACGTACGT", b"IIIIIIII"),
                ("r2", b"ACGTTCGT", b"IIIIIIII"),
            ],
        );
        let config = Config {
            in1: Some(path),
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            count_min: crate::cli::CountMinSettings {
                cells: Some(4),
                hashes: Some(2),
                bits: Some(4),
                memory_bytes: None,
            },
            ..Config::default()
        };
        let probe = kmers_for_record(&record("probe", b"ACGTACGT"), &config)
            .into_iter()
            .next()
            .unwrap();

        let counts = build_input_counts(&config).unwrap();

        let InputCounts::Sketch(sketch) = counts else {
            panic!("cells= should build a bounded packed count-min sketch");
        };
        assert_eq!(sketch.words.len(), 1);
        assert!(sketch.depth(&probe) > 0);
    }

    #[test]
    fn auto_count_min_uses_sketch_when_input_metadata_exceeds_threshold() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("reads.fq");
        write_fastq(
            &path,
            &[
                ("r1", b"ACGTACGT", b"IIIIIIII"),
                ("r2", b"ACGTTCGT", b"IIIIIIII"),
            ],
        );
        let config = Config {
            in1: Some(path),
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            auto_count_min_input_bytes: 1,
            auto_count_min_memory_bytes: Some(4096),
            ..Config::default()
        };

        let counts = build_input_counts(&config).unwrap();

        match counts {
            InputCounts::AtomicSketch(sketch) => {
                assert!(sketch.cells > 0);
                assert!(sketch.increments.load(Ordering::Relaxed) > 0);
            }
            InputCounts::AtomicPackedSketch(sketch) => {
                assert!(sketch.cells > 0);
                assert!(sketch.increments.load(Ordering::Relaxed) > 0);
            }
            InputCounts::Sketch(sketch) => {
                assert!(sketch.cells > 0);
                assert!(sketch.increments > 0);
            }
            InputCounts::PrefilteredSketch { .. } => {}
            InputCounts::Exact(_) => {
                panic!("large-input auto count-min should build a bounded sketch");
            }
        }
    }

    #[test]
    fn force_exact_counts_overrides_auto_and_explicit_sketch_settings() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("reads.fq");
        write_fastq(
            &path,
            &[
                ("r1", b"ACGTACGT", b"IIIIIIII"),
                ("r2", b"ACGTTCGT", b"IIIIIIII"),
            ],
        );
        let config = Config {
            in1: Some(path),
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            force_exact_counts: true,
            auto_count_min_input_bytes: 1,
            count_min: crate::cli::CountMinSettings {
                cells: Some(1),
                hashes: Some(2),
                bits: Some(4),
                memory_bytes: Some(1024),
            },
            ..Config::default()
        };

        let counts = build_input_counts(&config).unwrap();

        let InputCounts::Exact(counts) = counts else {
            panic!("force_exact_counts should override automatic and explicit sketch settings");
        };
        assert!(counts.len() > 1);
    }

    #[test]
    fn bounded_sketch_chunked_parallel_is_deterministic_and_conservative() {
        let config = Config {
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            count_min: crate::cli::CountMinSettings {
                cells: Some(32),
                hashes: Some(3),
                bits: Some(8),
                memory_bytes: None,
            },
            ..Config::default()
        };
        let pairs = vec![
            (
                record("r1/1", b"ACGTACGT"),
                Some(record("r1/2", b"TCGTACGA")),
            ),
            (record("r2/1", b"AAAAACCC"), None),
            (
                record("r3/1", b"GGGGTTTT"),
                Some(record("r3/2", b"CCCCAAAA")),
            ),
        ];
        let mut exact = CountMap::default();
        for (r1, r2) in &pairs {
            increment_pair_counts(&config, &mut exact, r1, r2.as_ref());
        }
        let mut chunked_a = new_bounded_count_min_sketch(&config).unwrap();
        let mut chunked_b = new_bounded_count_min_sketch(&config).unwrap();

        increment_sketch_from_pair_chunk(&config, &mut chunked_a, &pairs, None);
        increment_sketch_from_pair_chunk(&config, &mut chunked_b, &pairs, None);

        assert_eq!(chunked_a.words, chunked_b.words);
        assert_eq!(chunked_a.increments, exact.values().copied().sum::<u64>());
        for (key, exact_depth) in exact {
            assert!(chunked_a.depth(&key) >= exact_depth.min(chunked_a.max_count));
        }
    }

    #[test]
    fn atomic_count_min_chunked_parallel_matches_sequential_conservative_bits32() {
        let config = Config {
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            count_min: crate::cli::CountMinSettings {
                cells: Some(64),
                hashes: Some(3),
                bits: Some(32),
                memory_bytes: None,
            },
            ..Config::default()
        };
        let pairs = vec![
            (
                record("r1/1", b"ACGTACGT"),
                Some(record("r1/2", b"TCGTACGA")),
            ),
            (record("r2/1", b"AAAAACCC"), None),
            (
                record("r3/1", b"GGGGTTTT"),
                Some(record("r3/2", b"CCCCAAAA")),
            ),
        ];
        let sequential = new_atomic_count_min_sketch(&config).unwrap();
        let mut merged_counts = CountMap::default();
        for (r1, r2) in &pairs {
            let mut pair_counts = CountMap::default();
            increment_pair_counts(&config, &mut pair_counts, r1, r2.as_ref());
            merge_count_maps(&mut merged_counts, pair_counts);
        }
        let mut entries = merged_counts.into_iter().collect::<Vec<_>>();
        entries.sort_unstable_by(|(left, _), (right, _)| left.cmp(right));
        let key_increments = entries.iter().map(|(_, count)| *count).sum();
        for (key, count) in entries {
            sequential.add_key_count(&key, count);
        }
        sequential.add_key_increments(key_increments);
        let chunked = new_atomic_count_min_sketch(&config).unwrap();

        increment_atomic_sketch_from_pair_chunk(&config, &chunked, &pairs, None);

        assert_eq!(
            chunked.increments.load(Ordering::Relaxed),
            sequential.increments.load(Ordering::Relaxed)
        );
        assert_eq!(
            chunked.occupied_slots.load(Ordering::Relaxed),
            sequential.occupied_slots.load(Ordering::Relaxed)
        );
        for slot in 0..sequential.cells {
            assert_eq!(
                u64::from(chunked.cells_by_hash[slot].load(Ordering::Relaxed)),
                u64::from(sequential.cells_by_hash[slot].load(Ordering::Relaxed))
            );
        }
    }

    #[test]
    fn nondeterministic_atomic_count_min_direct_path_matches_sequential_without_collisions() {
        let config = Config {
            k: 5,
            min_quality: 0,
            min_prob: 0.0,
            deterministic: false,
            count_min: crate::cli::CountMinSettings {
                cells: Some(8192),
                hashes: Some(1),
                bits: Some(32),
                memory_bytes: None,
            },
            ..Config::default()
        };
        let pairs = vec![
            (
                record("r1/1", b"ACGTACGTAC"),
                Some(record("r1/2", b"TCGTACGAAA")),
            ),
            (record("r2/1", b"AAAAACCCCC"), None),
            (
                record("r3/1", b"GGGGTTTTAA"),
                Some(record("r3/2", b"CCCCAAAAGG")),
            ),
        ];
        let sequential = new_atomic_count_min_sketch(&Config {
            deterministic: true,
            ..config.clone()
        })
        .unwrap();
        let mut merged_counts = CountMap::default();
        for (r1, r2) in &pairs {
            increment_pair_counts(&config, &mut merged_counts, r1, r2.as_ref());
        }
        let key_increments = merged_counts.values().copied().sum();
        sequential.add_key_counts(&merged_counts);
        sequential.add_key_increments(key_increments);

        let direct = new_atomic_count_min_sketch(&config).unwrap();
        increment_atomic_sketch_from_pair_chunk(&config, &direct, &pairs, None);

        assert_eq!(
            direct.increments.load(Ordering::Relaxed),
            sequential.increments.load(Ordering::Relaxed)
        );
        assert_eq!(
            direct.occupied_slots.load(Ordering::Relaxed),
            sequential.occupied_slots.load(Ordering::Relaxed)
        );
        for slot in 0..sequential.cells {
            assert_eq!(
                u64::from(direct.cells_by_hash[slot].load(Ordering::Relaxed)),
                u64::from(sequential.cells_by_hash[slot].load(Ordering::Relaxed))
            );
        }
    }

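    // Worst case for conservative update: with a single cell, every hash row
    // of every key lands in the same slot, so a non-conservative update could
    // add once per hash row (tripling each insert here). Conservative update
    // raises only the minimum, so both keys read back the combined total
    // 5 + 1 = 6 and nothing more.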
    #[test]
    fn atomic_count_min_conservative_update_reduces_collision_inflation() {
        let config = Config {
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            count_min: crate::cli::CountMinSettings {
                cells: Some(1),
                hashes: Some(3),
                bits: Some(32),
                memory_bytes: None,
            },
            ..Config::default()
        };
        let key_a = KmerKey::Short(1);
        let key_b = KmerKey::Short(2);
        let sketch = new_atomic_count_min_sketch(&config).unwrap();

        sketch.add_key_count(&key_a, 5);
        sketch.add_key_count(&key_b, 1);

        assert_eq!(sketch.depth(&key_a), 6);
        assert_eq!(sketch.depth(&key_b), 6);
    }

    #[test]
    fn bounded_output_counts_uses_sketch_for_kept_kmers_when_cells_are_constrained() {
        let config = Config {
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            count_min: crate::cli::CountMinSettings {
                cells: Some(4),
                hashes: Some(2),
                bits: Some(4),
                memory_bytes: None,
            },
            ..Config::default()
        };
        let r1 = record("r1", b"ACGTACGT");
        let probe = kmers_for_record(&r1, &config).into_iter().next().unwrap();
        let pair = NormalizedPair {
            input_list_index: 0,
            r1: r1.clone(),
            r2: None,
            out_r1: r1,
            out_r2: None,
            decision: PairDecision::default(),
            uncorrectable: false,
            read_count: 1,
            base_count: 8,
        };
        let mut counts = new_output_counts(&config).unwrap();

        increment_output_counts_from_normalized_chunk(&config, &mut counts, &[pair]);

        let OutputCounts::Sketch(sketch) = counts else {
            panic!("cells= should use a bounded output sketch for kept-kmer side counts");
        };
        assert_eq!(sketch.words.len(), 1);
        assert!(sketch.depth(&probe) > 0);
    }

    #[test]
    fn nondeterministic_atomic_output_counts_direct_path_matches_sequential_without_collisions() {
        let config = Config {
            k: 5,
            min_quality: 0,
            min_prob: 0.0,
            deterministic: false,
            count_min: crate::cli::CountMinSettings {
                cells: Some(8192),
                hashes: Some(1),
                bits: Some(32),
                memory_bytes: None,
            },
            ..Config::default()
        };
        let kept_a = record("r1", b"ACGTACGTAC");
        let kept_b = record("r2", b"TTTTCCCCAA");
        let tossed = record("r3", b"GGGGAAAACC");
        let pairs = vec![
            NormalizedPair {
                input_list_index: 0,
                r1: kept_a.clone(),
                r2: None,
                out_r1: kept_a,
                out_r2: None,
                decision: PairDecision::default(),
                uncorrectable: false,
                read_count: 1,
                base_count: 10,
            },
            NormalizedPair {
                input_list_index: 0,
                r1: kept_b.clone(),
                r2: None,
                out_r1: kept_b,
                out_r2: None,
                decision: PairDecision::default(),
                uncorrectable: false,
                read_count: 1,
                base_count: 10,
            },
            NormalizedPair {
                input_list_index: 0,
                r1: tossed.clone(),
                r2: None,
                out_r1: tossed,
                out_r2: None,
                decision: PairDecision {
                    toss: true,
                    ..PairDecision::default()
                },
                uncorrectable: false,
                read_count: 1,
                base_count: 10,
            },
        ];
        let sequential_config = Config {
            deterministic: true,
            ..config.clone()
        };
        let mut sequential = new_output_counts(&sequential_config).unwrap();
        let mut direct = new_output_counts(&config).unwrap();

        increment_output_counts_from_normalized_chunk(&sequential_config, &mut sequential, &pairs);
        increment_output_counts_from_normalized_chunk(&config, &mut direct, &pairs);

        let (OutputCounts::AtomicSketch(sequential), OutputCounts::AtomicSketch(direct)) =
            (sequential, direct)
        else {
            panic!("bits=32 output counts should use atomic sketches");
        };
        assert_eq!(
            direct.increments.load(Ordering::Relaxed),
            sequential.increments.load(Ordering::Relaxed)
        );
        assert_eq!(
            direct.occupied_slots.load(Ordering::Relaxed),
            sequential.occupied_slots.load(Ordering::Relaxed)
        );
        for slot in 0..sequential.cells {
            assert_eq!(
                u64::from(direct.cells_by_hash[slot].load(Ordering::Relaxed)),
                u64::from(sequential.cells_by_hash[slot].load(Ordering::Relaxed))
            );
        }
    }

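    // Expected sizing, assuming 8-bit cells and KCountArray-style prime
    // adjustment: 1000 bytes = 1000 cells; split over two arrays (threads=2)
    // gives 500 per array, rounded down to the prime 499, i.e. 998 cells
    // total, and 998 * 8 bits / 64 = 124.75 rounds up to 125 words.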
    #[test]
    fn bounded_sketch_memory_budget_derives_cell_count() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: None,
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: Some(1000),
            },
            threads: Some(2),
            ..Config::default()
        };

        let sketch = new_bounded_count_min_sketch(&config).unwrap();

        assert_eq!(sketch.cells, 998);
        assert_eq!(sketch.words.len(), 125);
    }

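    // The totals below follow the same prime adjustment, e.g. 1000 cells
    // becomes 2 * 499 = 998 because 499 is the largest prime <= 500.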
    #[test]
    fn count_min_table_sizing_prime_adjusts_like_kcountarray() {
        assert_eq!(count_min_table_cells_from_total(1, 3), 1);
        assert_eq!(count_min_table_cells_from_total(9, 3), 7);
        assert_eq!(count_min_table_cells_from_total(64, 3), 62);
        assert_eq!(count_min_table_cells_from_total(1000, 2), 998);
    }

    #[test]
    fn non_prefiltered_short_kmer_sketch_caps_cells_to_kmer_space_like_bbnorm() {
        let config = Config {
            k: 3,
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: None,
            },
            ..Config::default()
        };

        assert_eq!(short_kmer_space_cells(3), Some(64));
        assert_eq!(main_count_min_total_cells(&config, 8), 64);

        let sketch = new_bounded_count_min_sketch(&config).unwrap();
        assert!(sketch.cells <= 64);
    }

    #[test]
    fn prefiltered_short_kmer_sketch_preserves_requested_cells_like_bbnorm() {
        let config = Config {
            k: 3,
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                cells: Some(128),
                hashes: Some(2),
                bits: Some(2),
                ..Default::default()
            },
            ..Config::default()
        };

        assert_eq!(main_count_min_total_cells(&config, 8), 10_000);
    }

    #[test]
    fn kcount_array_min_arrays_rounds_threads_like_bbtools() {
        assert_eq!(kcount_array_min_arrays_for_threads(1), 2);
        assert_eq!(kcount_array_min_arrays_for_threads(2), 2);
        assert_eq!(kcount_array_min_arrays_for_threads(3), 4);
        assert_eq!(kcount_array_min_arrays_for_threads(8), 8);
        assert_eq!(kcount_array_min_arrays_for_threads(9), 16);
    }

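    // Expected layout, assuming threads round up to a power-of-two array
    // count: 8 threads -> 8 arrays, 1000 / 8 = 125 cells per array, rounded
    // down to the prime 113, so 8 * 113 = 904 cells total and
    // 904 * 8 bits / 64 = 113 words.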
    #[test]
    fn bounded_sketch_sizing_uses_configured_threads_for_kcount_arrays() {
        let config = Config {
            threads: Some(8),
            count_min: crate::cli::CountMinSettings {
                cells: Some(1000),
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: None,
            },
            ..Config::default()
        };

        let sketch = new_bounded_count_min_sketch(&config).unwrap();

        assert_eq!(sketch.cells, 904);
        assert_eq!(sketch.words.len(), 113);
        assert_eq!(sketch.layout.array_mask, 7);
        assert_eq!(sketch.layout.array_bits, 3);
        assert_eq!(sketch.layout.cells_per_array, 113);
    }

    #[test]
    fn bounded_sketch_sizing_uses_active_rayon_threads_for_auto_threads() {
        let pool = rayon::ThreadPoolBuilder::new()
            .num_threads(3)
            .build()
            .unwrap();
        pool.install(|| {
            let config = Config {
                threads: None,
                count_min: crate::cli::CountMinSettings {
                    cells: Some(1000),
                    hashes: Some(2),
                    bits: Some(8),
                    memory_bytes: None,
                },
                ..Config::default()
            };

            let sketch = new_bounded_count_min_sketch(&config).unwrap();

            assert_eq!(kcount_array_min_arrays(&config), 4);
            assert_eq!(sketch.cells, 964);
            assert_eq!(sketch.words.len(), 121);
            assert_eq!(sketch.layout.array_mask, 3);
            assert_eq!(sketch.layout.array_bits, 2);
            assert_eq!(sketch.layout.cells_per_array, 241);
        });
    }

    #[test]
    fn explicit_count_min_cells_are_total_budget_like_bbtools() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(9),
                hashes: Some(3),
                bits: Some(8),
                memory_bytes: None,
            },
            ..Config::default()
        };

        let packed = new_bounded_count_min_sketch(&config).unwrap();
        let atomic = new_atomic_count_min_sketch(&Config {
            count_min: crate::cli::CountMinSettings {
                bits: Some(32),
                ..config.count_min
            },
            ..Config::default()
        })
        .unwrap();

        assert_eq!(packed.cells, 7);
        assert_eq!(packed.words.len(), 1);
        assert_eq!(atomic.cells, 7);
        assert_eq!(atomic.cells_by_hash.len(), 7);
    }

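    // The exact constant below depends on the BBTools sizing inputs (the
    // histogram length, thread count, and build passes each shave overhead
    // off the 1 GB budget); it is pinned here to catch accidental changes to
    // the formula rather than re-derived from first principles.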
    #[test]
    fn automatic_memory_budget_uses_bbtools_sizing_formula() {
        let config = Config {
            hist_in: Some(PathBuf::from("hist.tsv")),
            hist_len: 1000,
            threads: Some(3),
            build_passes: 2,
            ..Config::default()
        };

        let usable = bbtools_usable_table_memory_bytes(&config, 1_000_000_000);

        assert_eq!(usable, 329_944_000);
    }

    #[test]
    fn countup_auto_memory_budget_halves_filter_bytes_like_bbnorm() {
        let config = Config {
            auto_count_min_memory_bytes: Some(1_000_000_000),
            table_reads: Some(1_000_000),
            ..Config::default()
        };
        let countup_config = Config {
            count_up: true,
            ..config.clone()
        };

        assert_eq!(automatic_count_min_memory_bytes(&config), Some(659_920_000));
        assert_eq!(
            automatic_count_min_memory_bytes(&countup_config),
            Some(329_960_000)
        );
    }

    #[test]
    fn automatic_output_counts_use_side_budget_and_next_mask_seed() {
        let config = Config {
            auto_count_min_memory_bytes: Some(1_000_000_000),
            table_reads: Some(1_000_000),
            threads: Some(8),
            deterministic: false,
            ..Config::default()
        };

        assert_eq!(automatic_count_min_memory_bytes(&config), Some(659_920_000));
        assert_eq!(
            output_count_min_memory_bytes(&config, 32),
            Some(164_980_000)
        );

        let main = new_atomic_count_min_sketch(&config).unwrap();
        let output = new_output_counts(&config).unwrap();
        let OutputCounts::AtomicSketch(output) = output else {
            panic!("automatic bits=32 output counts should use atomic sketches");
        };
        let main_layout = main.layout_summary("input_main", None);
        let output_layout = output.layout_summary("output_kept", None);

        assert_eq!(
            output_layout.mask_seed,
            BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED
        );
        assert!(output_layout.memory_bytes < main_layout.memory_bytes / 2);
        assert!(output_layout.memory_bytes >= OUTPUT_COUNT_MIN_AUTO_MIN_MEMORY_BYTES);
    }

    #[test]
    fn explicit_output_count_memory_preserves_requested_budget() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: None,
                hashes: Some(3),
                bits: Some(32),
                memory_bytes: Some(128 * 1024 * 1024),
            },
            threads: Some(4),
            ..Config::default()
        };

        assert_eq!(
            output_count_min_memory_bytes(&config, 32),
            Some(128 * 1024 * 1024)
        );
        let main = new_atomic_count_min_sketch(&config).unwrap();
        let output = new_output_counts(&config).unwrap();
        let OutputCounts::AtomicSketch(output) = output else {
            panic!("explicit bits=32 output counts should use atomic sketches");
        };

        assert_eq!(output.cells, main.cells);
        assert_eq!(
            output.layout.mask_seed,
            BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED
        );
    }

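    // With a single prefilter cell both keys collide, so while the cell is
    // unsaturated each count is inflated to the collided total 2 + 5 = 7; the
    // companion test below shows that once the 2-bit cell saturates, the
    // original exact counts are kept instead.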
    #[test]
    fn constrained_prefilter_inflates_unsaturated_colliding_counts() {
        let config = Config {
            prefilter: crate::cli::PrefilterSettings {
                enabled: false,
                force_disabled: false,
                cells: Some(1),
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: None,
                memory_fraction_micros: None,
            },
            ..Config::default()
        };
        let mut counts = CountMap::default();
        counts.insert(KmerKey::Short(7), 2);
        counts.insert(KmerKey::Short(11), 5);

        apply_prefilter_collision_estimates(&config, &mut counts);

        assert_eq!(counts.get(&KmerKey::Short(7)), Some(&7));
        assert_eq!(counts.get(&KmerKey::Short(11)), Some(&7));
    }

    #[test]
    fn constrained_prefilter_keeps_exact_counts_after_saturation() {
        let config = Config {
            prefilter: crate::cli::PrefilterSettings {
                enabled: false,
                force_disabled: false,
                cells: Some(1),
                hashes: Some(1),
                bits: Some(2),
                memory_bytes: None,
                memory_fraction_micros: None,
            },
            ..Config::default()
        };
        let mut counts = CountMap::default();
        counts.insert(KmerKey::Short(7), 2);
        counts.insert(KmerKey::Short(11), 5);

        apply_prefilter_collision_estimates(&config, &mut counts);

        assert_eq!(counts.get(&KmerKey::Short(7)), Some(&2));
        assert_eq!(counts.get(&KmerKey::Short(11)), Some(&5));
    }

    #[test]
    fn prefilter_memory_budget_derives_prime_table_cells() {
        let config = Config {
            prefilter: crate::cli::PrefilterSettings {
                enabled: false,
                force_disabled: false,
                cells: None,
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: Some(1000),
                memory_fraction_micros: None,
            },
            ..Config::default()
        };
        let mut counts = CountMap::default();
        counts.insert(KmerKey::Short(7), 2);
        counts.insert(KmerKey::Short(11), 5);

        let bits = config.prefilter.bits.unwrap();
        let total_cells = count_min_cells_from_memory(config.prefilter.memory_bytes, bits);
        let table_cells = count_min_table_cells_from_total_bits(total_cells, bits);

        assert_eq!(total_cells, 1000);
        assert_eq!(table_cells, 998);

        apply_prefilter_collision_estimates(&config, &mut counts);

        assert_eq!(counts.get(&KmerKey::Short(7)), Some(&2));
        assert_eq!(counts.get(&KmerKey::Short(11)), Some(&5));
    }

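    // Budget arithmetic, assuming the fraction is expressed in micros:
    // 0.35 * 10_000 bytes = 3_500 one-byte cells, prime-adjusted per table to
    // 2 * 1_747 = 3_494 (1_747 is the largest prime <= 1_750).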
    #[test]
    fn prefilter_fraction_derives_memory_from_table_budget() {
        let config = Config {
            auto_count_min_memory_bytes: Some(10_000),
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                cells: None,
                hashes: Some(2),
                bits: Some(8),
                memory_bytes: None,
                memory_fraction_micros: Some(350_000),
            },
            ..Config::default()
        };
        let mut counts = CountMap::default();
        counts.insert(KmerKey::Short(7), 2);
        counts.insert(KmerKey::Short(11), 5);

        let total_cells = prefilter_total_cells(&config, config.prefilter.bits.unwrap());
        let table_cells =
            count_min_table_cells_from_total_bits(total_cells, config.prefilter.bits.unwrap());

        assert_eq!(total_cells, 3500);
        assert_eq!(table_cells, 3494);

        apply_prefilter_collision_estimates(&config, &mut counts);

        assert_eq!(counts.get(&KmerKey::Short(7)), Some(&2));
        assert_eq!(counts.get(&KmerKey::Short(11)), Some(&5));
    }

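    // Partition arithmetic: 25% of the 1_000-cell budget moves to the
    // prefilter, leaving 750 main cells; the reserved 250 cells of 32-bit
    // budget, re-expressed as 2-bit cells, become 250 * (32 / 2) = 4_000.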
    #[test]
    fn prefilter_fraction_partitions_main_cell_budget() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(1000),
                hashes: Some(1),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                cells: None,
                hashes: Some(1),
                bits: Some(2),
                memory_bytes: None,
                memory_fraction_micros: Some(250_000),
            },
            threads: Some(2),
            ..Config::default()
        };

        assert_eq!(main_count_min_total_cells(&config, 32), 750);
        assert_eq!(prefilter_total_cells(&config, 2), 4000);

        let main = new_atomic_count_min_sketch(&config).unwrap();
        let prefilter = new_prefilter_count_min_sketch(&config).unwrap();

        assert_eq!(main.cells, count_min_table_cells_from_total_bits(750, 32));
        assert_eq!(
            prefilter.cells,
            count_min_table_cells_from_total_bits(4000, 2)
        );
        assert_eq!(prefilter.max_count, 3);
    }

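    // Default-fraction check, assuming BBTools' 0.35 prefilter split:
    // main keeps 10_000 * 0.65 = 6_500 cells and the prefilter gets
    // 3_500 * (32 / 2) = 56_000 two-bit cells.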
    #[test]
    fn prefilter_flag_uses_bbtools_default_fraction_on_bounded_count_min_paths() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(2),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                cells: None,
                hashes: Some(2),
                bits: Some(2),
                memory_bytes: None,
                memory_fraction_micros: None,
            },
            ..Config::default()
        };

        assert!(use_prefilter_collision_estimates(&config));
        assert_eq!(main_count_min_total_cells(&config, 32), 6500);
        assert_eq!(prefilter_total_cells(&config, 2), 56_000);
    }

    #[test]
    fn zero_prefilter_fraction_does_not_force_prefilter_sketch() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                bits: Some(32),
                ..Default::default()
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: false,
                force_disabled: false,
                memory_fraction_micros: Some(0),
                ..Default::default()
            },
            ..Config::default()
        };

        assert!(!use_prefilter_collision_estimates(&config));
        assert_eq!(main_count_min_total_cells(&config, 32), 10_000);
    }

    #[test]
    fn forced_off_prefilter_ignores_lingering_controls_like_bbnorm() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(3),
                bits: Some(32),
                ..Default::default()
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: false,
                force_disabled: true,
                cells: Some(1_000),
                hashes: Some(1),
                bits: Some(2),
                memory_bytes: None,
                memory_fraction_micros: Some(DEFAULT_PREFILTER_FRACTION_MICROS),
            },
            ..Config::default()
        };

        assert!(!use_prefilter_collision_estimates(&config));
        assert_eq!(prefilter_memory_fraction_micros(&config), None);
        assert_eq!(main_count_min_total_cells(&config, 32), 10_000);
    }

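    // BBNorm derives the default prefilter hash count from the main sketch
    // (half of the main hashes: 8 -> 4 here); an explicit prefilter hashes=
    // value still wins, which the second half of the test checks.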
    #[test]
    fn prefilter_default_hashes_track_main_hashes_like_bbnorm() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(8),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                bits: Some(2),
                ..Default::default()
            },
            ..Config::default()
        };

        let prefilter = new_prefilter_count_min_sketch(&config).unwrap();

        assert_eq!(default_prefilter_hashes(&config), 4);
        assert_eq!(prefilter.hashes, 4);

        let explicit = Config {
            prefilter: crate::cli::PrefilterSettings {
                hashes: Some(1),
                ..config.prefilter
            },
            ..config
        };
        let prefilter = new_prefilter_count_min_sketch(&explicit).unwrap();
        assert_eq!(prefilter.hashes, 1);
    }

    #[test]
    fn explicit_prefilter_hashes_enable_default_partition_like_bbnorm() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(3),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                hashes: Some(1),
                bits: Some(2),
                ..Default::default()
            },
            ..Config::default()
        };

        assert_eq!(
            prefilter_memory_fraction_micros(&config),
            Some(DEFAULT_PREFILTER_FRACTION_MICROS)
        );
        assert_eq!(main_count_min_total_cells(&config, 32), 6500);
        assert_eq!(prefilter_total_cells(&config, 2), 56_000);
    }

    #[test]
    fn prefilter_flag_alone_keeps_small_exact_inputs_on_exact_path() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("reads.fq");
        write_fastq(&path, &[("r1", b"ACGTACGT", b"IIIIIIII")]);
        let config = Config {
            in1: Some(path),
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                ..Default::default()
            },
            ..Config::default()
        };

        let counts = build_input_counts(&config).unwrap();
        assert!(matches!(counts, InputCounts::Exact(_)));
    }

    #[test]
    fn prefilter_flag_builds_two_stage_sketch_when_count_min_is_bounded() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("reads.fq");
        write_fastq(
            &path,
            &[
                ("r1", b"ACGTACGT", b"IIIIIIII"),
                ("r2", b"ACGTACGT", b"IIIIIIII"),
                ("r3", b"ACGTACGT", b"IIIIIIII"),
            ],
        );
        let config = Config {
            in1: Some(path),
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            count_min: crate::cli::CountMinSettings {
                cells: Some(512),
                hashes: Some(2),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                ..Default::default()
            },
            ..Config::default()
        };

        let counts = build_input_counts(&config).unwrap();
        let InputCounts::PrefilteredSketch {
            prefilter,
            limit,
            main,
        } = counts
        else {
            panic!("prefilter=t plus bounded count-min should build a two-stage sketch");
        };
        assert_eq!(prefilter.bits(), DEFAULT_PREFILTER_BITS);
        assert_eq!(limit, prefilter.max_count());
        assert_eq!(prefilter_total_cells(&config, DEFAULT_PREFILTER_BITS), 2867);
        assert_eq!(main_count_min_total_cells(&config, 32), 332);
        assert!(matches!(*main, InputCounts::AtomicSketch(_)));
    }

    #[test]
    fn explicit_prefilter_memory_does_not_shrink_main_table_budget() {
        let config = Config {
            count_min: crate::cli::CountMinSettings {
                cells: Some(1000),
                hashes: Some(1),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                cells: None,
                hashes: Some(1),
                bits: Some(2),
                memory_bytes: Some(100),
                memory_fraction_micros: Some(250_000),
            },
            ..Config::default()
        };

        assert_eq!(main_count_min_total_cells(&config, 32), 1000);
        assert_eq!(prefilter_total_cells(&config, 2), 400);
    }

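    // Two-stage lookup: depths strictly below the prefilter's 2-bit ceiling
    // (max_count = 3) are answered by the prefilter alone (low -> 2), while a
    // saturated prefilter cell defers to the main sketch (high -> 5).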
    #[test]
    fn prefiltered_input_counts_use_prefilter_until_saturation() {
        let low = KmerKey::Short(1);
        let high = KmerKey::Short(2);
        let mut prefilter = PackedCountMinSketch::new(4099, 2, 2).unwrap();
        prefilter.add_key_count(&low, 2);
        prefilter.add_key_count(&high, 3);

        let main = AtomicCountMinSketch::new(4099, 2).unwrap();
        main.add_key_count(&low, 99);
        main.add_key_count(&high, 5);

        let counts = InputCounts::PrefilteredSketch {
            limit: prefilter.max_count,
            prefilter: PrefilterCountMinSketch::Packed(prefilter),
            main: Box::new(InputCounts::AtomicSketch(main)),
        };

        assert_eq!(counts.depth(&low), 2);
        assert_eq!(counts.depth(&high), 5);
    }

    #[test]
    fn prefiltered_input_counts_honor_explicit_lower_prefilter_limit() {
        let key = KmerKey::Short(7);
        let mut prefilter = PackedCountMinSketch::new(4099, 2, 2).unwrap();
        prefilter.add_key_count(&key, 2);

        let main = AtomicCountMinSketch::new(4099, 2).unwrap();
        main.add_key_count(&key, 11);

        let counts = InputCounts::PrefilteredSketch {
            limit: 2,
            prefilter: PrefilterCountMinSketch::Packed(prefilter),
            main: Box::new(InputCounts::AtomicSketch(main)),
        };

        assert_eq!(counts.depth(&key), 11);
    }

    #[test]
    fn input_count_layout_summary_reports_prefilter_and_main_tables() {
        let prefilter =
            PackedCountMinSketch::new_with_min_arrays_and_mask_seed(4099, 2, 2, 4, 0).unwrap();
        let main = AtomicCountMinSketch::new_with_min_arrays_and_update_mode(
            8191,
            3,
            4,
            CountMinUpdateMode::Conservative,
            7,
        )
        .unwrap();
        let counts = InputCounts::PrefilteredSketch {
            limit: prefilter.max_count,
            prefilter: PrefilterCountMinSketch::Packed(prefilter),
            main: Box::new(InputCounts::AtomicSketch(main)),
        };

        let layouts = counts.sketch_layouts();

        assert_eq!(layouts.len(), 2);
        assert_eq!(layouts[0].table, "input_prefilter");
        assert_eq!(layouts[0].kind, "packed");
        assert_eq!(layouts[0].bits, 2);
        assert_eq!(layouts[0].hashes, 2);
        assert_eq!(layouts[0].mask_seed, 0);
        assert_eq!(layouts[0].update_mode, "conservative");
        assert_eq!(layouts[0].prefilter_limit, Some(3));
        assert!(layouts[0].memory_bytes > 0);
        assert_eq!(layouts[1].table, "input_main");
        assert_eq!(layouts[1].kind, "atomic");
        assert_eq!(layouts[1].bits, 32);
        assert_eq!(layouts[1].hashes, 3);
        assert_eq!(layouts[1].mask_seed, 7);
        assert_eq!(layouts[1].prefilter_limit, None);
        assert!(layouts[1].arrays >= 4);
        assert!(layouts[1].memory_bytes >= layouts[1].cells * std::mem::size_of::<AtomicU32>());
    }

    #[test]
    fn prefilter_gate_uses_explicit_limit_for_main_counts() {
        let below = KmerKey::Short(1);
        let at_limit = KmerKey::Short(2);
        let above = KmerKey::Short(3);
        let mut prefilter = PackedCountMinSketch::new(4099, 2, 2).unwrap();
        prefilter.add_key_count(&below, 1);
        prefilter.add_key_count(&at_limit, 2);
        prefilter.add_key_count(&above, 3);

        let mut counts = CountMap::default();
        counts.insert(below.clone(), 10);
        counts.insert(at_limit.clone(), 20);
        counts.insert(above.clone(), 30);

        let prefilter = PrefilterCountMinSketch::Packed(prefilter);
        retain_prefilter_saturated_counts(&mut counts, Some(PrefilterGate::new(&prefilter, 2)));

        assert!(!counts.contains_key(&below));
        assert_eq!(counts.get(&at_limit), Some(&20));
        assert_eq!(counts.get(&above), Some(&30));
    }

    #[test]
    fn prefilter_gate_during_collection_matches_post_retain() {
        let r1 = record("r1", b"ACGTACGTACGT");
        let r2 = record("r2", b"TGCATGCATGCA");

        for remove_duplicate_kmers in [false, true] {
            let config = Config {
                k: 3,
                min_quality: 0,
                min_prob: 0.0,
                remove_duplicate_kmers,
                ..Config::default()
            };
            let mut prefilter = PackedCountMinSketch::new(4099, 2, 2).unwrap();
            let keys = unique_pair_kmers(&config, &r1, Some(&r2));
            for key in keys.iter().step_by(2) {
                prefilter.add_key_count(key, prefilter.max_count);
            }
            let prefilter = PrefilterCountMinSketch::Packed(prefilter);
            let gate = PrefilterGate::new(&prefilter, prefilter.max_count());
            assert!(
                keys.iter().any(|key| !gate.should_count_in_main(key)),
                "fixture should include at least one prefilter-rejected k-mer"
            );

            let mut post_retain = CountMap::default();
            increment_pair_counts(&config, &mut post_retain, &r1, Some(&r2));
            retain_prefilter_saturated_counts(&mut post_retain, Some(gate));

            let mut during_collection = CountMap::default();
            increment_pair_counts_with_prefilter(
                &config,
                &mut during_collection,
                &r1,
                Some(&r2),
                Some(gate),
            );

            assert_eq!(during_collection, post_retain);
        }
    }

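    // Occupancy math as in the single-sketch tests: 256 of 1024 prefilter
    // cells set -> roughly 74 total uniques; only the 128 saturated prefilter
    // cells (and the 128 occupied main cells) survive a depth threshold,
    // giving roughly 34 on each thresholded path.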
    #[test]
    fn prefiltered_input_counts_use_thresholded_main_unique_estimates_above_prefilter_max() {
        let mut prefilter = PackedCountMinSketch::new(1024, 4, 2).unwrap();
        for bucket in 0..256 {
            let depth = if bucket < 128 { prefilter.max_count } else { 1 };
            prefilter.set_cell(bucket, depth);
        }
        prefilter.increments = 1_000;

        let main = AtomicCountMinSketch::new(1024, 4).unwrap();
        for bucket in 0..128 {
            main.cells_by_hash[bucket].store(4, Ordering::Relaxed);
        }
        main.add_key_increments(1_000);

        let counts = InputCounts::PrefilteredSketch {
            limit: prefilter.max_count,
            prefilter: PrefilterCountMinSketch::Packed(prefilter),
            main: Box::new(InputCounts::AtomicSketch(main)),
        };

        let all_depth_estimated = counts.unique_kmers();
        let saturated_prefilter_estimated = counts.unique_kmers_at_least(2);
        let high_depth_estimated = counts.unique_kmers_at_least(4);
        let split = counts.unique_kmer_estimate_split().unwrap();

        assert!(
            (70..=80).contains(&all_depth_estimated),
            "prefilter all-depth estimate was {all_depth_estimated}"
        );
        assert!(
            (30..=40).contains(&saturated_prefilter_estimated),
            "prefilter threshold estimate was {saturated_prefilter_estimated}"
        );
        assert!(
            (30..=40).contains(&high_depth_estimated),
            "main high-depth estimate was {high_depth_estimated}"
        );
        assert_eq!(split.low_depth_max, 3);
        assert_eq!(split.high_depth_min, 4);
        assert_eq!(split.high_depth_kmers, high_depth_estimated);
        assert_eq!(
            split.low_depth_kmers,
            all_depth_estimated.saturating_sub(high_depth_estimated)
        );
        assert!(
            (30..=50).contains(&split.low_depth_kmers),
            "prefilter low-depth split estimate was {}",
            split.low_depth_kmers
        );
    }

    #[test]
    fn bounded_input_counts_builds_two_stage_prefiltered_sketch() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("reads.fq");
        write_fastq(
            &path,
            &[
                ("r1", b"ACGTACGT", b"IIIIIIII"),
                ("r2", b"ACGTACGT", b"IIIIIIII"),
                ("r3", b"ACGTACGT", b"IIIIIIII"),
            ],
        );
        let config = Config {
            in1: Some(path),
            k: 3,
            min_quality: 0,
            min_prob: 0.0,
            count_min: crate::cli::CountMinSettings {
                cells: Some(128),
                hashes: Some(2),
                bits: Some(32),
                memory_bytes: None,
            },
            prefilter: crate::cli::PrefilterSettings {
                enabled: false,
                force_disabled: false,
                cells: None,
                hashes: Some(2),
                bits: None,
                memory_bytes: Some(1024),
                memory_fraction_micros: None,
            },
            ..Config::default()
        };

        let counts = build_input_counts(&config).unwrap();
        let InputCounts::PrefilteredSketch {
            prefilter,
            limit,
            main,
        } = counts
        else {
            panic!("prefilter memory plus bounded count-min should build a two-stage sketch");
        };
        assert_eq!(prefilter.bits(), DEFAULT_PREFILTER_BITS);
        assert_eq!(prefilter.max_count(), 3);
        assert_eq!(limit, prefilter.max_count());
        assert_eq!(prefilter.update_mode(), CountMinUpdateMode::Conservative);
        assert!(matches!(*main, InputCounts::AtomicSketch(_)));
    }

    #[test]
    fn trusted_build_pass_filter_reduces_non_singleton_depths() {
        let config = Config {
            build_passes: 2,
            ..Config::default()
        };
        let mut counts = CountMap::default();
        counts.insert(KmerKey::Short(7), 1);
        counts.insert(KmerKey::Short(11), 2);
        counts.insert(KmerKey::Short(13), 3);

        apply_trusted_build_pass_filter(&config, &mut counts);

        assert_eq!(counts.get(&KmerKey::Short(7)), Some(&1));
        assert_eq!(counts.get(&KmerKey::Short(11)), Some(&1));
        assert_eq!(counts.get(&KmerKey::Short(13)), Some(&2));
    }

    #[test]
    fn ecco_auto_disables_overlap_repair_when_java_style_sample_is_empty() {
        let dir = tempfile::tempdir().unwrap();
        let r1_path = dir.path().join("r1.fq");
        let r2_path = dir.path().join("r2.fq");
        let r1 = b"TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGAC";
        let r2 = b"GTCGCATATTTCAAGCACTAATTCGCTGCGGCACAACTAA";
        let q = b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII";
        write_fastq(
            &r1_path,
            &[
                ("overlap1/1", r1, q),
                ("overlap2/1", r1, q),
                ("overlap3/1", r1, q),
                ("overlap4/1", r1, q),
            ],
        );
        write_fastq(
            &r2_path,
            &[
                ("overlap1/2", r2, q),
                ("overlap2/2", r2, q),
                ("overlap3/2", r2, q),
                ("overlap4/2", r2, q),
            ],
        );
        let config = Config {
            in1: Some(r1_path),
            in2: Some(r2_path),
            error_correct: true,
            error_correct_first: true,
            error_correct_final: true,
            overlap_error_correct_auto: true,
            ..Config::default()
        };

        let resolved = resolve_overlap_error_correct_auto(&config).unwrap();

        assert!(!resolved.overlap_error_correct_auto);
        assert!(!resolved.overlap_error_correct);
    }

    #[test]
    fn ecco_auto_enables_overlap_repair_for_sampled_mergeable_pairs() {
        let dir = tempfile::tempdir().unwrap();
        let r1_path = dir.path().join("r1.fq");
        let r2_path = dir.path().join("r2.fq");
        let r1 = b"TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGAC";
        let r2 = b"GTCGCATATTTCAAGCACTACTTCGCTGCGGCACAACTAA";
        let q = b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII";
        write_repeated_fastq(&r1_path, "overlap/1_", r1, q, 200);
        write_repeated_fastq(&r2_path, "overlap/2_", r2, q, 200);
        let config = Config {
            in1: Some(r1_path),
            in2: Some(r2_path),
            error_correct: true,
            error_correct_first: true,
            error_correct_final: true,
            overlap_error_correct_auto: true,
            ..Config::default()
        };

        let resolved = resolve_overlap_error_correct_auto(&config).unwrap();

        assert!(!resolved.overlap_error_correct_auto);
        assert!(resolved.overlap_error_correct);
    }

    #[test]
    fn countup_abrc_controls_tossed_read_table_updates() {
        let keys = vec![KmerKey::Short(7), KmerKey::Short(11)];
        let mut input_counts = CountMap::default();
        input_counts.insert(keys[0].clone(), 3);
        input_counts.insert(keys[1].clone(), 3);

        let base_config = Config {
            min_depth: 1,
            ..Config::default()
        };
        let mut kept_counts = OutputCounts::Exact(CountMap::default());
        update_countup_kept_counts_for_decision(
            &base_config,
            &mut kept_counts,
            &input_counts,
            &keys,
            true,
        );
        assert_eq!(kept_counts.unique_kmers(), 0);

        let add_bad_config = Config {
            add_bad_reads_countup: true,
            ..base_config.clone()
        };
        update_countup_kept_counts_for_decision(
            &add_bad_config,
            &mut kept_counts,
            &input_counts,
            &keys,
            true,
        );
        assert_eq!(kept_counts.depth(&keys[0]), 1);
        assert_eq!(kept_counts.depth(&keys[1]), 1);

        update_countup_kept_counts_for_decision(
            &base_config,
            &mut kept_counts,
            &input_counts,
            &keys,
            false,
        );
        assert_eq!(kept_counts.depth(&keys[0]), 2);
        assert_eq!(kept_counts.depth(&keys[1]), 2);
    }

    #[test]
    fn countup_decision_plan_reuses_input_depth_gate_for_kept_updates() {
        let keys = vec![KmerKey::Short(7), KmerKey::Short(11), KmerKey::Short(13)];
        let mut input_counts = CountMap::default();
        input_counts.insert(keys[0].clone(), 0);
        input_counts.insert(keys[1].clone(), 3);
        input_counts.insert(keys[2].clone(), 4);
        let kept_counts = CountMap::default();
        let config = Config {
            min_depth: 2,
            min_kmers_over_min_depth: 1,
            target_depth: 10,
            add_bad_reads_countup: true,
            ..Config::default()
        };

        let plan = countup_decision_plan(&config, &input_counts, &kept_counts, &keys, 10);
        assert_eq!(
            plan.toss,
            decide_countup_pair(&config, &input_counts, &kept_counts, &keys, 10)
        );
        assert_eq!(plan.eligible_key_indices, vec![1, 2]);

        let mut planned_counts = OutputCounts::Exact(CountMap::default());
        update_countup_kept_counts_for_plan(&config, &mut planned_counts, &keys, &plan);

        let mut replayed_counts = OutputCounts::Exact(CountMap::default());
        update_countup_kept_counts_for_decision(
            &config,
            &mut replayed_counts,
            &input_counts,
            &keys,
            plan.toss,
        );

        assert_eq!(
            planned_counts.unique_kmers(),
            replayed_counts.unique_kmers()
        );
        assert_eq!(planned_counts.depth(&keys[0]), 0);
        assert_eq!(planned_counts.depth(&keys[1]), 1);
        assert_eq!(planned_counts.depth(&keys[2]), 1);
    }

    #[test]
    fn countup_bounded_kept_counts_use_sketch_when_cells_are_constrained() {
        let keys = vec![KmerKey::Short(7), KmerKey::Short(11)];
        let mut input_counts = CountMap::default();
        input_counts.insert(keys[0].clone(), 3);
        input_counts.insert(keys[1].clone(), 3);
        let config = Config {
            min_depth: 1,
            count_min: crate::cli::CountMinSettings {
                cells: Some(1),
                hashes: Some(2),
                bits: Some(3),
                memory_bytes: None,
            },
            ..Config::default()
        };
        let mut kept_counts = new_output_counts(&config).unwrap();

        update_countup_kept_counts_for_decision(
            &config,
            &mut kept_counts,
            &input_counts,
            &keys,
            false,
        );

        let OutputCounts::Sketch(sketch) = kept_counts else {
            panic!("countup cells= should use a bounded kept-count sketch");
        };
        assert_eq!(sketch.words.len(), 1);
        assert_eq!(sketch.depth(&keys[0]), 2);
        assert_eq!(sketch.depth(&keys[1]), 2);
    }

    #[test]
    fn countup_kept_count_sketch_uses_java_target_sized_cells() {
        let config = Config {
            count_up: true,
            target_depth: 100,
            threads: Some(1),
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(8),
                bits: Some(32),
                memory_bytes: None,
            },
            ..Config::default()
        };

        let kept_counts = new_output_counts(&config).unwrap();

        let OutputCounts::Sketch(sketch) = kept_counts else {
            panic!("countup kept-count table should use a packed sketch");
        };
        assert_eq!(sketch.bits, 8);
        assert_eq!(sketch.hashes, 3);
        assert_eq!(sketch.cells, 9_998);
        assert_eq!(
            sketch.layout.mask_seed,
            BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED
        );
    }

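    // The boundaries are consistent with choosing the smallest cell width
    // whose max count (2^bits - 1) covers round(0.95 * target): round(15.2) =
    // 15 still fits 4 bits while round(16.15) = 16 does not, and round(254.6)
    // = 255 fits 8 bits while round(255.55) = 256 forces 16.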
    #[test]
    fn countup_kept_count_bits_use_adjusted_target_boundaries_like_bbnorm() {
        assert_eq!(
            countup_output_count_bits(&Config {
                count_up: true,
                target_depth: 16,
                ..Config::default()
            }),
            4
        );
        assert_eq!(
            countup_output_count_bits(&Config {
                count_up: true,
                target_depth: 17,
                ..Config::default()
            }),
            8
        );
        assert_eq!(
            countup_output_count_bits(&Config {
                count_up: true,
                target_depth: 268,
                ..Config::default()
            }),
            8
        );
        assert_eq!(
            countup_output_count_bits(&Config {
                count_up: true,
                target_depth: 269,
                ..Config::default()
            }),
            16
        );
    }

    #[test]
    fn output_pair_analysis_is_only_required_for_rename_or_depth_bins() {
        assert!(!needs_output_pair_analysis(&Config::default()));
        assert!(needs_output_pair_analysis(&Config {
            rename_reads: true,
            ..Config::default()
        }));
        assert!(needs_output_pair_analysis(&Config {
            out_low1: Some(PathBuf::from("low.fq")),
            ..Config::default()
        }));
        assert!(needs_output_pair_analysis(&Config {
            out_high2: Some(PathBuf::from("high2.fq")),
            ..Config::default()
        }));
    }

    #[test]
    fn countup_kept_count_sketch_uses_next_mask_seed_after_prefilter_and_main() {
        let config = Config {
            count_up: true,
            target_depth: 100,
            threads: Some(1),
            prefilter: crate::cli::PrefilterSettings {
                enabled: true,
                force_disabled: false,
                ..Default::default()
            },
            count_min: crate::cli::CountMinSettings {
                cells: Some(10_000),
                hashes: Some(3),
                bits: Some(32),
                memory_bytes: None,
            },
            ..Config::default()
        };

        let kept_counts = new_output_counts(&config).unwrap();

        let OutputCounts::Sketch(sketch) = kept_counts else {
            panic!("countup kept-count table should use a packed sketch");
        };
        assert_eq!(
            sketch.layout.mask_seed,
            BBTOOLS_KCOUNT_ARRAY_SECOND_MASK_SEED + BBTOOLS_KCOUNT_ARRAY_MASK_SEED_STEP
        );
    }

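    // BBNorm caps cell width at 16 bits when running multiple passes so the
    // intermediate tables stay small; explicitly narrower widths are kept and
    // single-pass runs are left untouched.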
    #[test]
    fn multipass_caps_wide_count_min_bits_like_bbnorm() {
        let mut default_bits = Config {
            passes: 2,
            ..Config::default()
        };
        apply_bbtools_multipass_cell_bits_cap(&mut default_bits);
        assert_eq!(default_bits.count_min.bits, Some(16));

        let mut explicit_wide_bits = Config {
            passes: 2,
            count_min: crate::cli::CountMinSettings {
                bits: Some(32),
                ..Default::default()
            },
            ..Config::default()
        };
        apply_bbtools_multipass_cell_bits_cap(&mut explicit_wide_bits);
        assert_eq!(explicit_wide_bits.count_min.bits, Some(16));

        let mut explicit_narrow_bits = Config {
            passes: 2,
            count_min: crate::cli::CountMinSettings {
                bits: Some(8),
                ..Default::default()
            },
            ..Config::default()
        };
        apply_bbtools_multipass_cell_bits_cap(&mut explicit_narrow_bits);
        assert_eq!(explicit_narrow_bits.count_min.bits, Some(8));

        let mut single_pass = Config {
            passes: 1,
            ..Config::default()
        };
        apply_bbtools_multipass_cell_bits_cap(&mut single_pass);
        assert_eq!(single_pass.count_min.bits, None);
    }

    #[test]
    fn multipass_intermediate_pass_uses_bits1_like_bbnorm() {
        let config = Config {
            passes: 2,
            count_min_bits_first: Some(8),
            count_min: crate::cli::CountMinSettings {
                bits: Some(16),
                ..Default::default()
            },
            ..Config::default()
        };

        let pass_config = pass_config_for_intermediate(
            &config,
            1,
            Path::new("in1.fq"),
            None,
            false,
            PathBuf::from("out1.fq"),
            None,
            None,
            None,
        );

        assert_eq!(pass_config.count_min.bits, Some(8));
        assert_eq!(config.count_min.bits, Some(16));
    }

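    // Prealloc arithmetic, assuming the usual ~100 bp read-length heuristic:
    // 10 paired reads = 20 reads * (100 - 31 + 1) = 1_400 estimated k-mers,
    // scaled by the prealloc fraction (0.5 -> 700) and compared against any
    // explicit initialsize=, taking the larger of the two.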
    #[test]
    fn count_map_capacity_hint_uses_initialsize_and_prealloc() {
        let explicit = Config {
            table_initial_size: Some(1234),
            ..Config::default()
        };
        assert_eq!(count_map_capacity_hint(&explicit), Some(1234));

        let paired_prealloc = Config {
            table_prealloc_fraction: Some(0.5),
            table_reads: Some(10),
            in2: Some(PathBuf::from("mate.fq")),
            k: 31,
            ..Config::default()
        };
        assert_eq!(preallocation_capacity_hint(&paired_prealloc), Some(700));

        let larger_prealloc = Config {
            table_initial_size: Some(100),
            table_prealloc_fraction: Some(1.0),
            table_reads: Some(10),
            in2: Some(PathBuf::from("mate.fq")),
            k: 31,
            ..Config::default()
        };
        assert_eq!(count_map_capacity_hint(&larger_prealloc), Some(1400));
    }

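    // The presort orders pairs by detected errors ascending, then longer
    // reads first, with the record id as a tie-break; this test and the two
    // that follow pin each stage of that comparison.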
14592 #[test]
14593 fn countup_presort_prefers_low_error_reads_like_java() {
14594 let config = Config {
14595 k: 3,
14596 min_depth: 1,
14597 low_thresh: 1,
14598 high_thresh: 3,
14599 error_detect_ratio: 2,
14600 low_percentile: 0.20,
14601 ..Config::default()
14602 };
14603 let clean = SequenceRecord {
14604 id: "clean".to_string(),
14605 numeric_id: 2,
14606 bases: b"AAAAAAAAAA".to_vec(),
14607 qualities: Some(vec![b'I'; 10]),
14608 };
14609 let noisy = SequenceRecord {
14610 id: "noisy".to_string(),
14611 numeric_id: 1,
14612 bases: b"AAAAACCCCC".to_vec(),
14613 qualities: Some(vec![b'I'; 10]),
14614 };
14615 let mut input_counts = CountMap::default();
14616 for key in kmers_for_record(&clean, &config) {
14617 input_counts.insert(key, 10);
14618 }
14619
14620 let mut pairs = [
14621 CountupWorkPair {
14622 input_list_index: 0,
14623 sort_key: countup_sort_key(&config, &input_counts, &noisy, None, 0),
14624 r1: noisy,
14625 r2: None,
14626 },
14627 CountupWorkPair {
14628 input_list_index: 0,
14629 sort_key: countup_sort_key(&config, &input_counts, &clean, None, 1),
14630 r1: clean,
14631 r2: None,
14632 },
14633 ];
14634 pairs.sort_by(compare_countup_work_pairs);
14635
14636 assert_eq!(pairs[0].r1.id, "clean");
14637 assert_eq!(pairs[0].sort_key.errors, 0);
14638 assert!(pairs[1].sort_key.errors > pairs[0].sort_key.errors);
14639 }
14640
14641 #[test]
14642 fn countup_presort_tie_breaks_by_record_id_without_duplicate_key_id() {
14643 fn tied_pair(id: &str, original_index: usize) -> CountupWorkPair {
14644 CountupWorkPair {
14645 input_list_index: 0,
14646 sort_key: CountupSortKey {
14647 errors: 0,
14648 total_len: 8,
14649 expected_errors: 0.0,
14650 numeric_id: 0,
14651 original_index,
14652 },
14653 r1: record(id, b"ACGTACGT"),
14654 r2: None,
14655 }
14656 }
14657
14658 let mut pairs = [tied_pair("read_b", 0), tied_pair("read_a", 1)];
14659 pairs.sort_by(compare_countup_work_pairs);
14660
14661 assert_eq!(pairs[0].r1.id, "read_a");
14662 assert_eq!(pairs[1].r1.id, "read_b");
14663 }
14664
    #[test]
    fn countup_spilled_runs_merge_like_in_memory_sort() {
        fn work_pair(
            id: &str,
            errors: usize,
            len: usize,
            original_index: usize,
        ) -> CountupWorkPair {
            CountupWorkPair {
                input_list_index: 0,
                sort_key: CountupSortKey {
                    errors,
                    total_len: len,
                    expected_errors: errors as f64,
                    numeric_id: original_index as u64,
                    original_index,
                },
                r1: record(id, b"ACGTACGT"),
                r2: None,
            }
        }

        let config = Config::default();
        let mut temp_dir = None;
        let mut run_paths = Vec::new();
        let mut spill_summary = CountupSpillSummary::default();
        let mut first_run = vec![work_pair("worse", 2, 8, 2), work_pair("best", 0, 8, 0)];
        let mut second_run = vec![work_pair("longer", 1, 12, 1), work_pair("shorter", 1, 8, 3)];
        let mut expected = first_run.clone();
        expected.extend(second_run.clone());
        expected.sort_by(compare_countup_work_pairs);

        spill_countup_run(
            &config,
            &mut temp_dir,
            &mut run_paths,
            &mut spill_summary,
            &mut first_run,
        )
        .unwrap();
        spill_countup_run(
            &config,
            &mut temp_dir,
            &mut run_paths,
            &mut spill_summary,
            &mut second_run,
        )
        .unwrap();
        spill_summary.final_runs = run_paths.len();
        let source = CountupWorkSource {
            temp_dir,
            inner: CountupWorkSourceInner::Spilled(run_paths),
        };
        let mut iter = source.into_iter().unwrap();
        let mut actual_ids = Vec::new();
        while let Some(pair) = iter.next_pair().unwrap() {
            actual_ids.push(pair.r1.id);
        }
        let expected_ids: Vec<_> = expected.into_iter().map(|pair| pair.r1.id).collect();

        assert_eq!(actual_ids, expected_ids);
        assert_eq!(actual_ids, ["best", "longer", "shorter", "worse"]);
        assert_eq!(spill_summary.initial_runs, 2);
        assert_eq!(spill_summary.merge_runs, 0);
        assert_eq!(spill_summary.final_runs, 2);
        assert!(spill_summary.bytes_written > 0);
        assert_eq!(
            spill_summary.peak_live_bytes,
            spill_summary.final_live_bytes
        );
    }

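    // The next three tests pin down the spill guard rails: each configured
    // limit set to zero must abort the initial run with an error message that
    // names the offending flag, while the summary still records the attempt.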
    #[test]
    fn countup_spill_live_limit_aborts_initial_run() {
        let config = Config {
            max_countup_spill_live_bytes: Some(0),
            ..Config::default()
        };
        let mut temp_dir = None;
        let mut run_paths = Vec::new();
        let mut spill_summary = CountupSpillSummary::default();
        let mut run = vec![CountupWorkPair {
            input_list_index: 0,
            sort_key: CountupSortKey {
                errors: 0,
                total_len: 8,
                expected_errors: 0.0,
                numeric_id: 0,
                original_index: 0,
            },
            r1: record("read", b"ACGTACGT"),
            r2: None,
        }];

        let err = spill_countup_run(
            &config,
            &mut temp_dir,
            &mut run_paths,
            &mut spill_summary,
            &mut run,
        )
        .unwrap_err()
        .to_string();

        assert!(err.contains("maxcountupspilllivebytes"), "{err}");
        assert_eq!(spill_summary.initial_runs, 1);
        assert!(spill_summary.peak_live_bytes > 0);
        assert_eq!(run_paths.len(), 1);
    }

    #[test]
    fn countup_spill_final_live_limit_aborts_initial_run() {
        let config = Config {
            max_countup_spill_final_live_bytes: Some(0),
            ..Config::default()
        };
        let mut temp_dir = None;
        let mut run_paths = Vec::new();
        let mut spill_summary = CountupSpillSummary::default();
        let mut run = vec![CountupWorkPair {
            input_list_index: 0,
            sort_key: CountupSortKey {
                errors: 0,
                total_len: 8,
                expected_errors: 0.0,
                numeric_id: 0,
                original_index: 0,
            },
            r1: record("read", b"ACGTACGT"),
            r2: None,
        }];

        let err = spill_countup_run(
            &config,
            &mut temp_dir,
            &mut run_paths,
            &mut spill_summary,
            &mut run,
        )
        .unwrap_err()
        .to_string();

        assert!(err.contains("maxcountupspillfinallivebytes"), "{err}");
        assert_eq!(spill_summary.initial_runs, 1);
        assert!(spill_summary.final_live_bytes > 0);
        assert_eq!(run_paths.len(), 1);
    }

    #[test]
    fn countup_spill_initial_run_limit_aborts_initial_run() {
        let config = Config {
            max_countup_spill_initial_runs: Some(0),
            ..Config::default()
        };
        let mut temp_dir = None;
        let mut run_paths = Vec::new();
        let mut spill_summary = CountupSpillSummary::default();
        let mut run = vec![CountupWorkPair {
            input_list_index: 0,
            sort_key: CountupSortKey {
                errors: 0,
                total_len: 8,
                expected_errors: 0.0,
                numeric_id: 0,
                original_index: 0,
            },
            r1: record("read", b"ACGTACGT"),
            r2: None,
        }];

        let err = spill_countup_run(
            &config,
            &mut temp_dir,
            &mut run_paths,
            &mut spill_summary,
            &mut run,
        )
        .unwrap_err()
        .to_string();

        assert!(err.contains("maxcountupspillinitialruns"), "{err}");
        assert_eq!(spill_summary.initial_runs, 1);
    }

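    // Merging a group of sorted run files must yield one file whose pairs
    // replay in globally sorted order, and must report exactly the number of
    // bytes it wrote to the merged file.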
    #[test]
    fn countup_compacted_run_group_preserves_sorted_order() {
        fn work_pair(
            id: &str,
            errors: usize,
            len: usize,
            original_index: usize,
        ) -> CountupWorkPair {
            CountupWorkPair {
                input_list_index: 0,
                sort_key: CountupSortKey {
                    errors,
                    total_len: len,
                    expected_errors: errors as f64,
                    numeric_id: original_index as u64,
                    original_index,
                },
                r1: record(id, b"ACGTACGT"),
                r2: None,
            }
        }

        let dir = tempfile::tempdir().unwrap();
        let mut all_pairs = Vec::new();
        let mut paths = Vec::new();
        for (run_index, mut run) in [
            vec![work_pair("c", 3, 8, 3), work_pair("a", 0, 8, 0)],
            vec![work_pair("d", 4, 8, 4), work_pair("b", 1, 8, 1)],
            vec![work_pair("e", 5, 8, 5), work_pair("aa", 1, 12, 2)],
        ]
        .into_iter()
        .enumerate()
        {
            all_pairs.extend(run.clone());
            run.sort_by(compare_countup_work_pairs);
            let path = dir.path().join(format!("run-{run_index}.bin"));
            write_countup_run(&path, &run).unwrap();
            paths.push(path);
        }
        all_pairs.sort_by(compare_countup_work_pairs);

        let merged = dir.path().join("merged.bin");
        let merged_bytes = merge_countup_run_group(&paths, &merged).unwrap();
        let mut reader = CountupRunReader::open(&merged).unwrap();
        let mut actual_ids = Vec::new();
        while let Some(pair) = reader.next_pair().unwrap() {
            actual_ids.push(pair.r1.id);
        }
        let expected_ids: Vec<_> = all_pairs.into_iter().map(|pair| pair.r1.id).collect();

        assert_eq!(actual_ids, expected_ids);
        assert_eq!(merged_bytes, merged.metadata().unwrap().len());
    }

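    // With one more run than the merge fan-in, compaction has to merge, and
    // the summary's live-byte accounting must agree with the run files that
    // remain on disk afterwards.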
    #[test]
    fn countup_compaction_tracks_peak_and_final_temp_bytes() {
        let dir = tempfile::tempdir().unwrap();
        let mut paths = Vec::new();
        let mut spill_summary = CountupSpillSummary::default();

        for run_index in 0..=COUNTUP_SORT_MERGE_FANIN {
            let path = dir.path().join(format!("run-{run_index}.bin"));
            let pair = CountupWorkPair {
                input_list_index: 0,
                sort_key: CountupSortKey {
                    errors: run_index,
                    total_len: 8,
                    expected_errors: run_index as f64,
                    numeric_id: run_index as u64,
                    original_index: run_index,
                },
                r1: record(&format!("read-{run_index}"), b"ACGTACGT"),
                r2: None,
            };
            let bytes = write_countup_run(&path, &[pair]).unwrap();
            spill_summary.note_initial_run(bytes);
            paths.push(path);
        }

        let initial_live_bytes = spill_summary.final_live_bytes;
        compact_countup_runs(&Config::default(), &mut paths, &mut spill_summary).unwrap();
        spill_summary.final_runs = paths.len();
        let final_live_from_files: u64 = paths
            .iter()
            .map(|path| path.metadata().unwrap().len())
            .sum();

        assert_eq!(spill_summary.initial_runs, COUNTUP_SORT_MERGE_FANIN + 1);
        assert_eq!(spill_summary.merge_runs, 2);
        assert_eq!(spill_summary.final_runs, 2);
        assert_eq!(spill_summary.final_live_bytes, final_live_from_files);
        assert!(spill_summary.bytes_written > initial_live_bytes);
        assert!(spill_summary.peak_live_bytes >= initial_live_bytes);
    }

    #[test]
    fn countup_spill_write_limit_aborts_compaction() {
        let dir = tempfile::tempdir().unwrap();
        let mut paths = Vec::new();
        let mut spill_summary = CountupSpillSummary::default();

        for run_index in 0..=COUNTUP_SORT_MERGE_FANIN {
            let path = dir.path().join(format!("run-{run_index}.bin"));
            let pair = CountupWorkPair {
                input_list_index: 0,
                sort_key: CountupSortKey {
                    errors: run_index,
                    total_len: 8,
                    expected_errors: run_index as f64,
                    numeric_id: run_index as u64,
                    original_index: run_index,
                },
                r1: record(&format!("read-{run_index}"), b"ACGTACGT"),
                r2: None,
            };
            let bytes = write_countup_run(&path, &[pair]).unwrap();
            spill_summary.note_initial_run(bytes);
            paths.push(path);
        }
        let config = Config {
            max_countup_spill_write_bytes: Some(spill_summary.bytes_written),
            ..Config::default()
        };

        let err = compact_countup_runs(&config, &mut paths, &mut spill_summary)
            .unwrap_err()
            .to_string();

        assert!(err.contains("maxcountupspillwritebytes"), "{err}");
        assert!(spill_summary.merge_runs > 0);
        assert!(spill_summary.bytes_written > config.max_countup_spill_write_bytes.unwrap());
    }

    #[test]
    fn countup_spill_run_limits_abort_compaction() {
        let dir = tempfile::tempdir().unwrap();
        let mut paths = Vec::new();
        let mut spill_summary = CountupSpillSummary::default();

        for run_index in 0..=COUNTUP_SORT_MERGE_FANIN {
            let path = dir.path().join(format!("run-{run_index}.bin"));
            let pair = CountupWorkPair {
                input_list_index: 0,
                sort_key: CountupSortKey {
                    errors: run_index,
                    total_len: 8,
                    expected_errors: run_index as f64,
                    numeric_id: run_index as u64,
                    original_index: run_index,
                },
                r1: record(&format!("read-{run_index}"), b"ACGTACGT"),
                r2: None,
            };
            let bytes = write_countup_run(&path, &[pair]).unwrap();
            spill_summary.note_initial_run(bytes);
            paths.push(path);
        }
        let merge_limited = Config {
            max_countup_spill_merge_runs: Some(0),
            ..Config::default()
        };
        let mut merge_limited_paths = paths.clone();
        let err =
            compact_countup_runs(&merge_limited, &mut merge_limited_paths, &mut spill_summary)
                .unwrap_err()
                .to_string();
        assert!(err.contains("maxcountupspillmergeruns"), "{err}");

        let mut spill_summary = CountupSpillSummary::default();
        let mut paths = Vec::new();
        for run_index in 0..=COUNTUP_SORT_MERGE_FANIN {
            let path = dir.path().join(format!("final-run-{run_index}.bin"));
            let pair = CountupWorkPair {
                input_list_index: 0,
                sort_key: CountupSortKey {
                    errors: run_index,
                    total_len: 8,
                    expected_errors: run_index as f64,
                    numeric_id: run_index as u64,
                    original_index: run_index,
                },
                r1: record(&format!("final-read-{run_index}"), b"ACGTACGT"),
                r2: None,
            };
            let bytes = write_countup_run(&path, &[pair]).unwrap();
            spill_summary.note_initial_run(bytes);
            paths.push(path);
        }
        let final_limited = Config {
            max_countup_spill_final_runs: Some(1),
            ..Config::default()
        };
        let err = compact_countup_runs(&final_limited, &mut paths, &mut spill_summary)
            .unwrap_err()
            .to_string();
        assert!(err.contains("maxcountupspillfinalruns"), "{err}");
    }

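    // Spill replay is sequential I/O, so the run reader should open its file
    // with the large COUNTUP_RUN_IO_BUFFER_CAPACITY buffer rather than the
    // default BufReader capacity.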
    #[test]
    fn countup_run_reader_uses_large_spill_buffer() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("run.bin");
        let pair = CountupWorkPair {
            input_list_index: 0,
            sort_key: CountupSortKey {
                errors: 0,
                total_len: 8,
                expected_errors: 0.0,
                numeric_id: 0,
                original_index: 0,
            },
            r1: record("read", b"ACGTACGT"),
            r2: None,
        };

        write_countup_run(&path, &[pair]).unwrap();
        let reader = CountupRunReader::open(&path).unwrap();

        assert_eq!(reader.reader.capacity(), COUNTUP_RUN_IO_BUFFER_CAPACITY);
    }

    #[test]
    fn countup_work_pair_memory_hint_tracks_payload_size() {
        let small = CountupWorkPair {
            input_list_index: 0,
            sort_key: CountupSortKey {
                errors: 0,
                total_len: 4,
                expected_errors: 0.0,
                numeric_id: 0,
                original_index: 0,
            },
            r1: record("small", b"ACGT"),
            r2: None,
        };
        let large = CountupWorkPair {
            input_list_index: 0,
            sort_key: CountupSortKey {
                errors: 0,
                total_len: 400,
                expected_errors: 0.0,
                numeric_id: 1,
                original_index: 1,
            },
            r1: record("large", &vec![b'A'; 400]),
            r2: Some(record("large/2", &vec![b'C'; 400])),
        };

        assert!(countup_work_pair_memory_hint(&large) > countup_work_pair_memory_hint(&small));
    }

    #[test]
    fn countup_work_candidate_memory_hint_tracks_payload_size() {
        let small = CountupWorkCandidate {
            input_list_index: 0,
            original_index: 0,
            rand: 0.0,
            r1: record("small", b"ACGT"),
            r2: None,
        };
        let large = CountupWorkCandidate {
            input_list_index: 0,
            original_index: 1,
            rand: 0.0,
            r1: record("large", &vec![b'A'; 400]),
            r2: Some(record("large/2", &vec![b'C'; 400])),
        };

        assert!(
            countup_work_candidate_memory_hint(&large) > countup_work_candidate_memory_hint(&small)
        );
    }

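    // A prepass chunk is dispatched once either the pair-count or the
    // byte-size limit is reached; staying strictly below both keeps
    // accumulating.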
    #[test]
    fn countup_prepass_chunk_ready_respects_pair_and_byte_limits() {
        assert!(!countup_prepass_chunk_ready(
            COUNTUP_PREPASS_CHUNK_PAIR_LIMIT - 1,
            COUNTUP_PREPASS_CHUNK_BYTE_LIMIT - 1
        ));
        assert!(countup_prepass_chunk_ready(
            COUNTUP_PREPASS_CHUNK_PAIR_LIMIT,
            0
        ));
        assert!(countup_prepass_chunk_ready(
            1,
            COUNTUP_PREPASS_CHUNK_BYTE_LIMIT
        ));
    }

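    // When the carry flag (the ABRC path) is set, a pair that fails the
    // prepass filters is still carried through instead of being tossed;
    // without it the same pair is excluded.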
    #[test]
    fn countup_prepass_carries_tossed_reads_only_with_abrc() {
        let config = Config {
            k: 3,
            min_length: 11,
            target_depth: 2,
            min_depth: 1,
            min_kmers_over_min_depth: 3,
            ..Config::default()
        };
        let prepass = countup_prepass_config(&config);
        assert_eq!(prepass.target_depth, 8);
        assert_eq!(prepass.min_depth, 0);
        assert_eq!(prepass.min_kmers_over_min_depth, 1);

        let input_counts = CountMap::default();
        let mut filtered = record("short", b"AAAAAAAAAA");
        assert!(
            !countup_prepass_pair(&prepass, false, &input_counts, &mut filtered, None, 0.0)
                .include
        );

        let mut carried = record("short", b"AAAAAAAAAA");
        assert!(
            countup_prepass_pair(&prepass, true, &input_counts, &mut carried, None, 0.0).include
        );
    }

    #[test]
    fn countup_prepass_requires_both_mates_bad_like_java() {
        let config = Config {
            count_up: true,
            toss_error_reads: true,
            require_both_bad: false,
            k: 3,
            target_depth: 100,
            max_depth: Some(1000),
            min_depth: 1,
            min_kmers_over_min_depth: 1,
            error_detect_ratio: 2,
            high_thresh: 2,
            low_thresh: 1,
            ..Config::default()
        };
        let prepass = countup_prepass_config(&config);
        assert!(!config.require_both_bad);
        assert!(prepass.require_both_bad);

        let mut bad_mate = record("bad", b"AAACCC");
        let mut good_mate = record("good", b"GGGGGG");
        let mut input_counts = CountMap::default();
        let bad_keys = kmers_for_record(&bad_mate, &prepass);
        for key in &bad_keys {
            input_counts.insert(key.clone(), 10);
        }
        input_counts.insert(bad_keys[1].clone(), 1);
        input_counts.insert(bad_keys[2].clone(), 1);
        for key in kmers_for_record(&good_mate, &prepass) {
            input_counts.insert(key, 10);
        }

        assert!(analyze_pair(&prepass, &input_counts, &bad_mate, None).error1);
        assert!(!analyze_pair(&prepass, &input_counts, &good_mate, None).error1);
        assert!(
            countup_prepass_pair(
                &prepass,
                false,
                &input_counts,
                &mut bad_mate,
                Some(&mut good_mate),
                0.0,
            )
            .include
        );
    }

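    // The prepass already analyzes each pair to decide inclusion; building
    // the sort key from that stored analysis must match recomputing it from
    // scratch, field for field.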
    #[test]
    fn countup_prepass_reuses_decision_analysis_for_sort_key_without_ecc() {
        let config = Config {
            count_up: true,
            k: 3,
            min_depth: 1,
            min_kmers_over_min_depth: 1,
            target_depth: 100,
            max_depth: Some(1000),
            ..Config::default()
        };
        let prepass = countup_prepass_config(&config);
        let mut read = record("read42", b"ACGTACGT");
        let mut input_counts = CountMap::default();
        for key in kmers_for_record(&read, &prepass) {
            input_counts.insert(key, 10);
        }

        let result = countup_prepass_pair(&prepass, false, &input_counts, &mut read, None, 0.0);
        let reused_key =
            countup_sort_key_from_analysis(&read, None, 42, result.sort_analysis.as_ref().unwrap());
        let replayed_key = countup_sort_key(&prepass, &input_counts, &read, None, 42);

        assert!(result.include);
        assert_eq!(reused_key.errors, replayed_key.errors);
        assert_eq!(reused_key.total_len, replayed_key.total_len);
        assert_eq!(reused_key.numeric_id, replayed_key.numeric_id);
        assert_eq!(reused_key.original_index, replayed_key.original_index);
        assert_eq!(reused_key.expected_errors, replayed_key.expected_errors);
    }

    #[test]
    fn countup_work_candidates_match_sequential_prepass_sort_keys() {
        let config = Config {
            count_up: true,
            k: 3,
            min_depth: 1,
            min_kmers_over_min_depth: 1,
            target_depth: 100,
            max_depth: Some(1000),
            ..Config::default()
        };
        let prepass = countup_prepass_config(&config);
        let clean = record("clean", b"ACGTACGT");
        let noisy = record("noisy", b"AAAACCCC");
        let mut input_counts = CountMap::default();
        for key in kmers_for_record(&clean, &prepass) {
            input_counts.insert(key, 10);
        }
        let candidates = vec![
            CountupWorkCandidate {
                input_list_index: 0,
                original_index: 0,
                rand: 0.0,
                r1: noisy.clone(),
                r2: None,
            },
            CountupWorkCandidate {
                input_list_index: 0,
                original_index: 1,
                rand: 0.0,
                r1: clean.clone(),
                r2: None,
            },
        ];
        let mut actual =
            process_countup_work_candidates(&config, &prepass, &input_counts, candidates);
        let mut expected = vec![
            CountupWorkPair {
                input_list_index: 0,
                sort_key: countup_sort_key(&prepass, &input_counts, &noisy, None, 0),
                r1: noisy,
                r2: None,
            },
            CountupWorkPair {
                input_list_index: 0,
                sort_key: countup_sort_key(&prepass, &input_counts, &clean, None, 1),
                r1: clean,
                r2: None,
            },
        ];
        actual.sort_by(compare_countup_work_pairs);
        expected.sort_by(compare_countup_work_pairs);

        let actual_ids: Vec<_> = actual.iter().map(|pair| pair.r1.id.as_str()).collect();
        let expected_ids: Vec<_> = expected.iter().map(|pair| pair.r1.id.as_str()).collect();
        assert_eq!(actual_ids, expected_ids);
        for (actual, expected) in actual.iter().zip(&expected) {
            assert_eq!(actual.sort_key.errors, expected.sort_key.errors);
            assert_eq!(actual.sort_key.total_len, expected.sort_key.total_len);
            assert_eq!(
                actual.sort_key.original_index,
                expected.sort_key.original_index
            );
        }
    }

    #[test]
    fn countup_length_filter_respects_keepall_override() {
        let read = record("short", b"ACGT");
        let filter_config = Config {
            min_length: 5,
            ..Config::default()
        };
        assert!(countup_length_toss(&filter_config, &read, None));

        let keepall_config = Config {
            keep_all: true,
            ..filter_config
        };
        assert!(!countup_length_toss(&keepall_config, &read, None));
    }

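    // Java's tossbadreads rule: a pair whose input counts show an error spike
    // (a run of low-depth k-mers next to high-depth ones) is tossed only when
    // tosserrorreads is set, and keepall overrides the toss again.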
    #[test]
    fn countup_tossbadreads_applies_java_error_spike_rules() {
        let keys: Vec<_> = (0..20).map(KmerKey::Short).collect();
        let mut input_counts = CountMap::default();
        let mut kept_counts = CountMap::default();
        for (index, key) in keys.iter().enumerate() {
            let input_depth = if index < 8 { 1 } else { 10 };
            let kept_depth = if index < 8 { 0 } else { 10 };
            input_counts.insert(key.clone(), input_depth);
            kept_counts.insert(key.clone(), kept_depth);
        }
        let base_config = Config {
            min_depth: 1,
            min_kmers_over_min_depth: 1,
            target_depth: 10,
            low_thresh: 1,
            high_thresh: 10,
            error_detect_ratio: 2,
            ..Config::default()
        };

        assert!(!decide_countup_pair(
            &base_config,
            &input_counts,
            &kept_counts,
            &keys,
            10,
        ));

        let toss_errors_config = Config {
            toss_error_reads: true,
            ..base_config.clone()
        };
        assert!(decide_countup_pair(
            &toss_errors_config,
            &input_counts,
            &kept_counts,
            &keys,
            10,
        ));

        let keepall_config = Config {
            keep_all: true,
            ..toss_errors_config
        };
        assert!(!decide_countup_pair(
            &keepall_config,
            &input_counts,
            &kept_counts,
            &keys,
            10,
        ));
    }

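    // Seeded with 0, the JavaXoshiro port must reproduce the first doubles
    // the Java implementation emits, within f64 epsilon.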
    #[test]
    fn java_rng_matches_known_first_doubles() {
        let mut rng = JavaXoshiro::new(0);
        let values = [
            rng.next_double(),
            rng.next_double(),
            rng.next_double(),
            rng.next_double(),
        ];
        let expected = [
            0.02774461029305808,
            0.9419058303890074,
            0.3687890049137593,
            0.8390756877056451,
        ];
        for (actual, expected) in values.into_iter().zip(expected) {
            assert!((actual - expected).abs() < f64::EPSILON);
        }
    }

    #[test]
    fn nondeterministic_seed_varies_between_requests() {
        let first = nondeterministic_seed();
        let second = nondeterministic_seed();
        assert_ne!(first, second);
    }

    #[test]
    fn deterministic_coin_uses_java_read_rand_shape() {
        assert_eq!(deterministic_coin(Some(0.0), 7), 1);
        assert_eq!(deterministic_coin(Some(0.5), 7), 4);
        assert_eq!(deterministic_coin(Some(0.999_999), 7), 7);
    }

    #[test]
    fn qtrim_right_uses_java_optimal_quality_scoring() {
        let config = Config {
            trim_right: true,
            trim_quality: 10.0,
            ..Config::default()
        };
        let mut read = quality_record("r1", b"ACGTACGT", b"IIII!!!!");

        trim_record(&config, &mut read);

        assert_eq!(read.bases, b"ACGT");
        assert_eq!(read.qualities.as_deref(), Some(&b"IIII"[..]));
    }

    #[test]
    fn qtrim_left_uses_java_optimal_quality_scoring() {
        let config = Config {
            trim_left: true,
            trim_quality: 10.0,
            ..Config::default()
        };
        let mut read = quality_record("r1", b"ACGTACGT", b"!!!!IIII");

        trim_record(&config, &mut read);

        assert_eq!(read.bases, b"ACGT");
        assert_eq!(read.qualities.as_deref(), Some(&b"IIII"[..]));
    }

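    // Error correction: a single substitution in an otherwise well-covered
    // read should be repaired from exact k-mer counts. The tests that follow
    // vary the quality gates, marking modes, and pair rollback around the
    // same mutated fixture.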
    #[test]
    fn ecc_corrects_single_substitution_from_exact_counts() {
        let clean = b"ACGTTGCATGTCAGTACCGTAACGTTGCA";
        let mut mutant = clean.to_vec();
        mutant[14] = b'A';
        assert_ne!(mutant, clean);

        let config = Config {
            k: 7,
            min_quality: 0,
            min_prob: 0.0,
            error_correct: true,
            passes: 1,
            ..Config::default()
        };
        let mut counts = CountMap::default();
        for i in 0..30 {
            increment_pair_counts(
                &config,
                &mut counts,
                &record(&format!("clean{i}"), clean),
                None,
            );
        }
        increment_pair_counts(&config, &mut counts, &record("mutant", &mutant), None);

        let mut read = record("mutant", &mutant);
        let result = correct_read_errors(&config, &counts, &mut read);

        assert_eq!(result.corrected, 1);
        assert!(!result.uncorrectable);
        assert_eq!(read.bases, clean);
    }

    #[test]
    fn ecc_flags_high_quality_suspect_error_as_uncorrectable() {
        let clean = b"ACGTTGCATGTCAGTACCGTAACGTTGCA";
        let mut mutant = clean.to_vec();
        mutant[14] = b'A';
        let config = Config {
            k: 7,
            min_quality: 0,
            min_prob: 0.0,
            error_correct: true,
            max_quality_to_correct: 0,
            passes: 1,
            ..Config::default()
        };
        let mut counts = CountMap::default();
        for i in 0..30 {
            increment_pair_counts(
                &config,
                &mut counts,
                &record(&format!("clean{i}"), clean),
                None,
            );
        }
        increment_pair_counts(&config, &mut counts, &record("mutant", &mutant), None);

        let mut read = record("mutant", &mutant);
        let result = correct_read_errors(&config, &counts, &mut read);

        assert_eq!(result.corrected, 0);
        assert!(result.uncorrectable);
        assert_eq!(read.bases, mutant);
    }

    #[test]
    fn ecc_pair_rollback_restores_corrected_mate_when_partner_is_uncorrectable() {
        let clean = b"ACGTTGCATGTCAGTACCGTAACGTTGCA";
        let mut mutant = clean.to_vec();
        mutant[14] = b'A';
        let config = Config {
            k: 7,
            min_quality: 0,
            min_prob: 0.0,
            error_correct: true,
            max_quality_to_correct: 20,
            passes: 1,
            ..Config::default()
        };
        let mut counts = CountMap::default();
        for i in 0..30 {
            increment_pair_counts(
                &config,
                &mut counts,
                &record(&format!("clean{i}"), clean),
                None,
            );
        }
        increment_pair_counts(&config, &mut counts, &record("mutant", &mutant), None);

        let low_quality = vec![b'!'; mutant.len()];
        let high_quality = vec![b'I'; mutant.len()];
        let mut correctable = quality_record("lowq", &mutant, &low_quality);
        let mut uncorrectable = quality_record("highq", &mutant, &high_quality);
        let original_correctable = correctable.clone();
        let original_uncorrectable = uncorrectable.clone();

        let result = correct_pair_errors_with_rollback(
            &config,
            &counts,
            &mut correctable,
            Some(&mut uncorrectable),
        );

        assert!(result.corrected > 0);
        assert!(result.uncorrectable);
        assert_eq!(correctable.bases, original_correctable.bases);
        assert_eq!(
            correctable.qualities.as_deref(),
            original_correctable.qualities.as_deref()
        );
        assert_eq!(uncorrectable.bases, original_uncorrectable.bases);
    }

    #[test]
    fn ecc_marks_uncorrectable_errors_when_requested() {
        let clean = b"ACGTTGCATGTCAGTACCGTAACGTTGCA";
        let mut mutant = clean.to_vec();
        mutant[14] = b'A';
        let config = Config {
            k: 7,
            min_quality: 0,
            min_prob: 0.0,
            error_correct: true,
            max_quality_to_correct: 0,
            mark_uncorrectable_errors: true,
            passes: 1,
            ..Config::default()
        };
        let mut counts = CountMap::default();
        for i in 0..30 {
            increment_pair_counts(
                &config,
                &mut counts,
                &record(&format!("clean{i}"), clean),
                None,
            );
        }
        increment_pair_counts(&config, &mut counts, &record("mutant", &mutant), None);

        let mut read = record("mutant", &mutant);
        let result = correct_read_errors(&config, &counts, &mut read);

        assert_eq!(result.corrected, 0);
        assert_eq!(result.marked, 1);
        assert!(result.uncorrectable);
        assert_eq!(read.bases, mutant);
        assert_eq!(read.qualities.as_ref().unwrap()[14], b'2');
    }

    #[test]
    fn ecc_mark_only_reduces_suspect_base_quality() {
        let clean = b"ACGTTGCATGTCAGTACCGTAACGTTGCA";
        let mut mutant = clean.to_vec();
        mutant[14] = b'A';
        let config = Config {
            k: 7,
            min_quality: 0,
            min_prob: 0.0,
            error_correct: true,
            mark_errors_only: true,
            passes: 1,
            ..Config::default()
        };
        let mut counts = CountMap::default();
        for i in 0..30 {
            increment_pair_counts(
                &config,
                &mut counts,
                &record(&format!("clean{i}"), clean),
                None,
            );
        }
        increment_pair_counts(&config, &mut counts, &record("mutant", &mutant), None);

        let mut read = record("mutant", &mutant);
        let result = correct_read_errors(&config, &counts, &mut read);

        assert_eq!(result.marked, 1);
        assert_eq!(read.bases, mutant);
        assert_eq!(read.qualities.as_ref().unwrap()[14], b'2');
    }

    #[test]
    fn ecc_mark_only_marks_all_detected_sites_even_when_ecclimit_is_low() {
        let config = Config {
            k: 7,
            prefix_len: 2,
            max_errors_to_correct: 1,
            correct_from_right: false,
            ..Config::default()
        };
        let mut read = quality_record("marked", b"ACGTTGCATGTC", b"IIIIIIIIIIII");
        let coverage = vec![30, 30, 0, 30, 30, 0];

        let result = mark_read_errors(&config, &mut read, &coverage);

        assert_eq!(result.marked, 2);
        let qualities = read.qualities.as_deref().unwrap();
        assert_eq!(qualities[8], b'2');
        assert_eq!(qualities[11], b'2');
    }

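    // Overlap-based correction: when the mates overlap, a mismatch is
    // resolved in favor of the higher-quality base, and both quality strings
    // are re-scored after the repair.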
    #[test]
    fn overlap_ecc_repairs_lower_quality_mate_base() {
        let r1_bases = b"TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGAC";
        let r2_clean = b"GTCGCATATTTCAAGCACTACTTCGCTGCGGCACAACTAA";
        let mut r2_bases = r2_clean.to_vec();
        r2_bases[20] = b'A';
        let mut r1 = quality_record("r1", r1_bases, b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII");
        let mut r2 = quality_record("r2", &r2_bases, b"IIIIIIIIIIIIIIIIIIII#IIIIIIIIIIIIIIIIIII");
        let config = Config {
            overlap_error_correct: true,
            max_quality_to_correct: 20,
            ..Config::default()
        };

        let result = correct_pair_by_overlap(&config, &mut r1, &mut r2);

        assert_eq!(result.corrected, 1);
        assert_eq!(r1.bases, r1_bases);
        assert_eq!(r2.bases, r2_clean);
        assert_eq!(
            r1.qualities.as_deref(),
            Some(&b"SSSSSSSSSSSSSSSSSSSGSSSSSSSSSSSSSSSSSSSS"[..])
        );
        assert_eq!(
            r2.qualities.as_deref(),
            Some(&b"SSSSSSSSSSSSSSSSSSSSGSSSSSSSSSSSSSSSSSSS"[..])
        );
    }

    #[test]
    fn overlap_ecc_skips_short_pairs_like_java_strict_mode() {
        let r1_bases = b"ACGTTGCATGTCAGTA";
        let r2_clean = b"TACTGACATGCAACGT";
        let mut r2_bases = r2_clean.to_vec();
        r2_bases[9] = b'T';
        let mut r1 = quality_record("r1", r1_bases, b"IIIIIIIIIIIIIIII");
        let mut r2 = quality_record("r2", &r2_bases, b"IIIIIIIII!IIIIII");
        let config = Config {
            overlap_error_correct: true,
            max_quality_to_correct: 20,
            ..Config::default()
        };

        let result = correct_pair_by_overlap(&config, &mut r1, &mut r2);

        assert_eq!(result.corrected, 0);
        assert_eq!(r1.bases, r1_bases);
        assert_eq!(r2.bases, r2_bases);
    }

    #[test]
    fn overlap_ecc_skips_ambiguous_repetitive_pairs_like_java_strict_mode() {
        let r1_bases = b"ACGTTGCATGTCAGTAACGTTGCATGTCAGTAACGTTGCA";
        let r2_clean = b"TGCAACGTTACTGACATGCAACGTTACTGACATGCAACGT";
        let mut r2_bases = r2_clean.to_vec();
        r2_bases[20] = b'C';
        let mut r1 = quality_record("r1", r1_bases, b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII");
        let mut r2 = quality_record("r2", &r2_bases, b"IIIIIIIIIIIIIIIIIIII!IIIIIIIIIIIIIIIIIII");
        let config = Config {
            overlap_error_correct: true,
            max_quality_to_correct: 20,
            ..Config::default()
        };

        let result = correct_pair_by_overlap(&config, &mut r1, &mut r2);

        assert_eq!(result.corrected, 0);
        assert_eq!(r1.bases, r1_bases);
        assert_eq!(r2.bases, r2_bases);
    }

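    // The entropy gate mirrors Java's strict mode: a high-complexity read
    // keeps the floor of 12 required overlap bases, while a low-complexity
    // read raises the requirement to 32.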
    #[test]
    fn overlap_entropy_gate_keeps_java_strict_floor_for_high_entropy_fixture() {
        let bases = b"TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGAC";
        assert_eq!(overlap_entropy_min_overlap(bases), 12);
    }

    #[test]
    fn overlap_entropy_gate_raises_min_overlap_for_low_complexity_reads() {
        let bases = b"AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT";
        assert_eq!(overlap_entropy_min_overlap(bases), 32);
    }

    #[test]
    fn overlap_ecc_rejects_high_confidence_mismatch_like_java_expected_filter() {
        let r1_bases = b"TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGAC";
        let mut r2_bases = b"GTCGCATATTTCAAGCACTACTTCGCTGCGGCACAACTAA".to_vec();
        r2_bases[20] = b'A';
        let mut r1 = quality_record("r1", r1_bases, b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII");
        let mut r2 = quality_record("r2", &r2_bases, b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII");
        let config = Config {
            overlap_error_correct: true,
            max_quality_to_correct: 41,
            ..Config::default()
        };

        let result = correct_pair_by_overlap(&config, &mut r1, &mut r2);

        assert_eq!(result.corrected, 0);
        assert_eq!(r1.bases, r1_bases);
        assert_eq!(r2.bases, r2_bases);
        assert_eq!(
            r1.qualities.as_deref(),
            Some(&b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII"[..])
        );
        assert_eq!(
            r2.qualities.as_deref(),
            Some(&b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII"[..])
        );
    }

    #[test]
    fn overlap_ecc_rejects_low_confidence_tie_under_java_strict_mode() {
        let r1_bases = b"TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGAC";
        let mut r2_bases = b"GTCGCATATTTCAAGCACTACTTCGCTGCGGCACAACTAA".to_vec();
        r2_bases[20] = b'A';
        let mut r1 = quality_record("r1", r1_bases, b"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
        let mut r2 = quality_record("r2", &r2_bases, b"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
        let config = Config {
            overlap_error_correct: true,
            max_quality_to_correct: 41,
            ..Config::default()
        };

        let result = correct_pair_by_overlap(&config, &mut r1, &mut r2);

        assert_eq!(result.corrected, 0);
        assert_eq!(r1.bases, r1_bases);
        assert_eq!(r2.bases, r2_bases);
        assert_eq!(
            r1.qualities.as_deref(),
            Some(&b"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"[..])
        );
        assert_eq!(
            r2.qualities.as_deref(),
            Some(&b"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"[..])
        );
    }

    #[test]
    fn overlap_ecc_rejects_quality_weighted_multimismatch_candidate_like_java() {
        let r1_bases = b"CAGTAACCAATGCCTGTTGAGATGCCAGACGCGTAACCAAAA";
        let r2_bases = b"TTTTGCTAACGCGTCTGGCATCTCAACAGGCATTGGTTAC";
        let mut r1 = quality_record(
            "r1",
            r1_bases,
            b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII",
        );
        let mut r2 = quality_record("r2", r2_bases, b"IIIII!I'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII");
        let original_r1 = r1.clone();
        let original_r2 = r2.clone();
        let config = Config {
            overlap_error_correct: true,
            max_quality_to_correct: 41,
            ..Config::default()
        };

        let result = correct_pair_by_overlap(&config, &mut r1, &mut r2);

        assert_eq!(result.corrected, 0);
        assert_eq!(r1.bases, original_r1.bases);
        assert_eq!(r2.bases, original_r2.bases);
    }

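    // With trim_after_marking set, quality trimming runs after ECC marking
    // rather than before it, so the right trim acts on the mark-adjusted
    // qualities.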
    #[test]
    fn trim_after_marking_defers_qtrim_until_after_ecc_marking() {
        let clean = b"ACGTTGCATGTCAGTACCGTAACGTTGCA";
        let mut mutant = clean.to_vec();
        mutant[26] = b'A';
        let config = Config {
            k: 7,
            min_quality: 0,
            min_prob: 0.0,
            error_correct: true,
            mark_errors_only: true,
            trim_after_marking: true,
            trim_right: true,
            trim_optimal: false,
            trim_quality: 20.0,
            keep_all: true,
            passes: 1,
            ..Config::default()
        };
        let mut counts = CountMap::default();
        for i in 0..30 {
            increment_pair_counts(
                &config,
                &mut counts,
                &record(&format!("clean{i}"), clean),
                None,
            );
        }
        increment_pair_counts(&config, &mut counts, &record("mutant", &mutant), None);

        let input = vec![(0, record("mutant", &mutant), None, 0.0)];
        let pairs = normalize_pair_chunk(&config, &counts, &input);

        assert_eq!(pairs[0].out_r1.bases, b"ACGTTGCATGTCAGTACCGTAACGTTAC");
        assert_eq!(
            pairs[0].out_r1.qualities.as_deref(),
            Some(&b"IIIIIIIIIIIIIIIIIIIIIIIIIIII"[..])
        );
    }

    #[test]
    fn bad_kmer_fraction_lowers_dynamic_toss_target_like_bbnorm() {
        let config = Config {
            target_depth: 100,
            max_depth: Some(125),
            target_bad_percent_low: 0.2,
            target_bad_percent_high: 0.8,
            ..Config::default()
        };
        let clean = PairAnalysis::default();
        assert_eq!(dynamic_depth_limits(&config, &clean), (100, 125));

        let noisy = PairAnalysis {
            low_kmer_count: 5,
            total_kmer_count: 10,
            ..PairAnalysis::default()
        };
        assert_eq!(dynamic_depth_limits(&config, &noisy), (35, 35));
    }

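    // Multipass shape from the Java pass schedule: with three passes and a
    // final target of 100, the first pass targets 400 and the second 200,
    // each with its own bad-depth window derived from that intermediate
    // target.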
    #[test]
    fn multipass_bad_depth_targets_match_java_pass_shape() {
        let config = Config {
            passes: 3,
            target_depth: 100,
            target_bad_percent_low: 0.2,
            target_bad_percent_high: 0.8,
            ..Config::default()
        };

        let first_target = intermediate_target_depth(&config, 1);
        assert_eq!(first_target, 400);
        assert_eq!(
            intermediate_bad_depth_targets(&config, 1, first_target),
            (30, 120)
        );

        let second_target = intermediate_target_depth(&config, 2);
        assert_eq!(second_target, 200);
        assert_eq!(
            intermediate_bad_depth_targets(&config, 2, second_target),
            (20, 80)
        );
    }

    #[test]
    fn qtrim_keeps_java_min_result_shape_for_all_bad_reads() {
        let config = Config {
            trim_right: true,
            trim_quality: 10.0,
            ..Config::default()
        };
        let mut read = quality_record("r1", b"ACGT", b"!!!!");

        trim_record(&config, &mut read);

        assert_eq!(read.bases, b"A");
        assert_eq!(read.qualities.as_deref(), Some(&b"!"[..]));
    }

    #[test]
    fn qtrim_window_uses_java_sliding_threshold() {
        let config = Config {
            trim_right: true,
            trim_quality: 10.0,
            trim_optimal: false,
            trim_window: true,
            trim_window_length: 4,
            ..Config::default()
        };
        let mut read = quality_record("r1", b"ACGTACGTACGT", b"IIIIIII!!!!!");

        trim_record(&config, &mut read);

        assert_eq!(read.bases, b"ACGTACG");
        assert_eq!(read.qualities.as_deref(), Some(&b"IIIIIII"[..]));
    }

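    // BBNorm's `#` pattern in an output path expands to the pair number: it
    // fills both mates when no explicit second output is given, defers to an
    // explicit out2, and still expands to `1` for single-ended output.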
    #[test]
    fn output_hash_patterns_match_bbnorm_pair_expansion() {
        let paths = prepare_output_paths(Some(Path::new("reads#.fq")), None, true);
        assert_eq!(paths.first, Some(PathBuf::from("reads1.fq")));
        assert_eq!(paths.second, Some(PathBuf::from("reads2.fq")));

        let paths = prepare_output_paths(
            Some(Path::new("reads#.fq")),
            Some(Path::new("mate.fq")),
            true,
        );
        assert_eq!(paths.first, Some(PathBuf::from("reads1.fq")));
        assert_eq!(paths.second, Some(PathBuf::from("mate.fq")));

        let paths = prepare_output_paths(Some(Path::new("single#.fq")), None, false);
        assert_eq!(paths.first, Some(PathBuf::from("single1.fq")));
        assert_eq!(paths.second, None);
    }
}