Skip to main content

rsomics_bam_fingerprint/
lib.rs

1//! ChIP-enrichment fingerprint, matching deeptools `plotFingerprint` default semantics.
2//!
3//! ## What the fingerprint is
4//!
5//! The genome is sampled at evenly spaced windows (bins). Each bin's value is
6//! the per-base read coverage summed over the bin. Sorting those per-bin values
7//! and plotting their normalised cumulative sum against the normalised bin rank
8//! gives a Lorenz curve: a diagonal means coverage is uniform (no enrichment),
9//! a sharp elbow near the right means a few bins hold most of the signal (strong
10//! ChIP enrichment).
11//!
12//! ## Deterministic sampling (source parity with deeptools)
13//!
14//! deeptools does **not** pick bins at random. With the whole genome it sets
15//! `stepSize = max(genomeSize / numberOfSamples, 1)` and walks each chromosome
16//! at `stepSize` intervals, taking one bin `[i, i + binSize)` per step. The walk
17//! happens inside genome "chunks" (`mapReduce` partitions) of length `chunkSize`,
18//! and a bin that would cross a chunk boundary (`i + binSize > chunkEnd`) is
19//! dropped — so chunk geometry, not just stepSize, decides which bins exist.
20//! `chunkSize` itself is derived from the BAM's mapped-read count:
21//!
22//! ```text
23//! reads_per_bp = max_mapped / genomeSize
24//! chunkSize    = floor(stepSize * 1000 / (reads_per_bp * nBams))
25//! chunkSize    = max(chunkSize, stepSize, binSize)
26//! ```
27//!
28//! Reproducing all of this makes our sampled-bin set identical to deeptools'
29//! when run with one BAM at `--numberOfProcessors 1` (no task shuffle).
30//!
31//! ## Per-bin coverage rule (`SumCoveragePerBin`)
32//!
33//! For each bin `[reg0, reg1)` (a single-tile region, `tileSize = binSize`):
34//! every read with `FLAG & 0x4 == 0` is decomposed into aligned blocks
35//! (`get_blocks()`: reference-consuming M/=/X runs, split by D and N). Each block
36//! contributes `min(blockEnd, reg1) - max(blockStart, reg0)` bases (clamped to
37//! `[0, binSize]`) to the bin. Default filters: skip unmapped only — no MAPQ
38//! floor, no FLAG include/exclude, duplicates kept, reads not extended
39//! (`extendReads=False` → `defaultFragmentLength == "read length"`).
40//!
41//! ## Output (`--out-raw-counts`)
42//!
43//! Mirrors `plotFingerprint --outRawCounts`: a `#plotFingerprint --outRawCounts`
44//! header, a quoted label line, then one integer per sampled bin (the per-base
45//! coverage, formatted `%d` exactly as deeptools casts the float column). Bins
46//! appear in genome order (chromosome header order, ascending position).
47//!
48//! ## Fingerprint + summary (`--out-fingerprint`, `--out-quality-metrics`)
49//!
50//! The fingerprint is the sorted, cumulative, max-normalised coverage vs the
51//! normalised rank — the data behind deeptools' PNG. The quality metrics
52//! (AUC, X-intercept, elbow) use deeptools' exact formulas over the sorted
53//! cumulative curve.
54//!
55//! ## Scope
56//!
57//! We emit the data tables (raw counts, fingerprint curve, summary metrics), not
58//! the PNG plot — the same split as `rsomics-bam-signal`, which emits bedGraph
59//! rather than bigWig. Synthetic/JSD/CHANCE columns (only produced with
60//! `--JSDsample` against a reference BAM) are out of scope for this
61//! single-input crate.
62
63#![allow(clippy::cast_precision_loss)]
64
65use std::io::{BufWriter, Write};
66use std::num::NonZero;
67use std::path::Path;
68
69use rsomics_bamio::raw::{self, RawRecord};
70use rsomics_common::{Result, RsomicsError};
71
72const CIGAR_MATCH: u8 = 0;
73const CIGAR_INSERTION: u8 = 1;
74const CIGAR_DELETION: u8 = 2;
75const CIGAR_SKIP: u8 = 3;
76const CIGAR_SOFT_CLIP: u8 = 4;
77const CIGAR_SEQ_MATCH: u8 = 7;
78const CIGAR_SEQ_MISMATCH: u8 = 8;
79
80#[derive(Debug, Clone)]
81pub struct FingerprintOpts {
82    /// Window size in bp (deeptools default: 500).
83    pub bin_size: u32,
84    /// Target number of sampled bins across the genome (deeptools default: 500000).
85    pub number_of_samples: u64,
86    /// Minimum mapping quality (deeptools default: 0 = no filter).
87    pub min_mapq: u8,
88    /// Skip reads whose FLAG has any of these bits set (deeptools `samFlagExclude`,
89    /// default None → 0). Unmapped (0x4) reads are always skipped regardless.
90    pub sam_flag_exclude: u16,
91    /// Keep only reads whose FLAG has all of these bits set (deeptools
92    /// `samFlagInclude`, default None → 0 = no requirement).
93    pub sam_flag_include: u16,
94}
95
96impl Default for FingerprintOpts {
97    fn default() -> Self {
98        Self {
99            bin_size: 500,
100            number_of_samples: 500_000,
101            min_mapq: 0,
102            sam_flag_exclude: 0,
103            sam_flag_include: 0,
104        }
105    }
106}
107
108struct ChromGeom {
109    tid: usize,
110    length: u64,
111}
112
113/// A coverage region as deeptools constructs it in `count_reads_in_region`.
114///
115/// When `stepSize != binSize` each region is one bin (`n_bins == 1`); when
116/// `stepSize == binSize` a whole genome chunk becomes one tiled region of
117/// `n_bins` contiguous `bin_size` tiles (the partial tail bin is dropped, as
118/// deeptools floor-divides). `out_lo` is this region's offset into the global
119/// genome-ordered counts vector.
120struct Region {
121    tid: usize,
122    start: u64,
123    /// Exclusive end of the region as deeptools' region tuple records it
124    /// (`reg[1]`): the bin end for a single-bin region, the chunk end for a
125    /// tiled region (which may exceed `start + n_bins * bin_size` when the
126    /// partial tail bin was floor-dropped).
127    end: u64,
128    n_bins: u64,
129    out_lo: usize,
130}
131
132/// The fingerprint result: per-bin per-base coverage in genome order.
133pub struct Fingerprint {
134    pub counts: Vec<u64>,
135}
136
137impl Fingerprint {
138    /// Sorted, cumulative, max-normalised curve (the y-axis of the plot).
139    /// Returned paired with the normalised rank (x-axis), both length N.
140    pub fn cumulative_curve(&self) -> Vec<(f64, f64)> {
141        let n = self.counts.len();
142        let mut sorted = self.counts.clone();
143        sorted.sort_unstable();
144        let total: u64 = sorted.iter().sum();
145        let mut acc: u64 = 0;
146        sorted
147            .iter()
148            .enumerate()
149            .map(|(i, &v)| {
150                acc += v;
151                let x = i as f64 / n as f64;
152                let y = if total == 0 {
153                    0.0
154                } else {
155                    acc as f64 / total as f64
156                };
157                (x, y)
158            })
159            .collect()
160    }
161
162    /// deeptools quality metrics over the sorted cumulative curve: (AUC,
163    /// X-intercept, elbow). Formulas mirror `plotFingerprint.main`.
164    pub fn quality_metrics(&self) -> QualityMetrics {
165        let n = self.counts.len();
166        let mut sorted = self.counts.clone();
167        sorted.sort_unstable();
168        let total: u64 = sorted.iter().sum();
169
170        let mut counts = Vec::with_capacity(n);
171        let mut acc: u64 = 0;
172        for &v in &sorted {
173            acc += v;
174            counts.push(if total == 0 {
175                0.0
176            } else {
177                acc as f64 / total as f64
178            });
179        }
180
181        let auc = counts.iter().sum::<f64>() / n as f64;
182
183        let x_int = counts
184            .iter()
185            .position(|&c| c > 0.0)
186            .map_or(0.0, |k| (k + 1) as f64 / n as f64);
187
188        let elbow = if n <= 1 {
189            0.0
190        } else {
191            let mut best_k = 0usize;
192            let mut best = f64::NEG_INFINITY;
193            for (i, &c) in counts.iter().enumerate() {
194                let line = i as f64 / (n as f64 - 1.0);
195                let diff = line - c;
196                if diff > best {
197                    best = diff;
198                    best_k = i;
199                }
200            }
201            (best_k + 1) as f64 / n as f64
202        };
203
204        QualityMetrics { auc, x_int, elbow }
205    }
206}
207
208pub struct QualityMetrics {
209    pub auc: f64,
210    pub x_int: f64,
211    pub elbow: f64,
212}
213
214/// Compute the per-bin per-base coverage fingerprint for `input`.
215pub fn fingerprint(
216    input: &Path,
217    opts: &FingerprintOpts,
218    workers: NonZero<usize>,
219) -> Result<Fingerprint> {
220    let chroms = read_chrom_geom(input, workers)?;
221    let genome_size: u64 = chroms.iter().map(|c| c.length).sum();
222    if genome_size == 0 {
223        return Err(RsomicsError::InvalidInput(
224            "BAM header declares zero genome length".into(),
225        ));
226    }
227
228    let mapped = count_mapped(input, opts, workers)?;
229    if mapped == 0 {
230        return Err(RsomicsError::InvalidInput(
231            "no mapped reads found — check MAPPING quality / flag filters".into(),
232        ));
233    }
234
235    let step_size = (genome_size / opts.number_of_samples).max(1);
236
237    let mean_chrom_len = genome_size as f64 / chroms.len() as f64;
238    if mean_chrom_len < step_size as f64 {
239        let min_samples = (genome_size as f64 / mean_chrom_len) as u64;
240        return Err(RsomicsError::InvalidInput(format!(
241            "--number-of-samples has to be bigger than {min_samples}"
242        )));
243    }
244
245    let chunk_size = chunk_length(step_size, opts.bin_size, mapped, genome_size, 1);
246    let bin_size = u64::from(opts.bin_size);
247
248    let (regions, n_bins) = layout_regions(&chroms, step_size, bin_size, chunk_size);
249    if n_bins == 0 {
250        return Err(RsomicsError::InvalidInput(
251            "no bins were sampled — decrease --bin-size or --number-of-samples".into(),
252        ));
253    }
254
255    let counts = accumulate_coverage(
256        input, opts, workers, &chroms, &regions, n_bins, step_size, bin_size,
257    )?;
258    Ok(Fingerprint { counts })
259}
260
261fn read_chrom_geom(input: &Path, workers: NonZero<usize>) -> Result<Vec<ChromGeom>> {
262    let mut reader = rsomics_bamio::open_with_workers(input, workers)?;
263    let header = reader.read_header().map_err(RsomicsError::Io)?;
264    Ok(header
265        .reference_sequences()
266        .iter()
267        .enumerate()
268        .map(|(tid, (_, seq))| ChromGeom {
269            tid,
270            length: usize::from(seq.length()) as u64,
271        })
272        .collect())
273}
274
275/// Reads with `FLAG & 0x4 == 0`, matching pysam `.mapped` (the idxstats mapped
276/// total deeptools uses to derive `chunkSize`). No MAPQ or include/exclude
277/// filtering enters this count — only the unmapped bit.
278fn count_mapped(input: &Path, _opts: &FingerprintOpts, workers: NonZero<usize>) -> Result<u64> {
279    let mut reader = rsomics_bamio::open_with_workers(input, workers)?;
280    reader.read_header().map_err(RsomicsError::Io)?;
281    let mut record = RawRecord::default();
282    let mut mapped: u64 = 0;
283    while raw::read_record(reader.get_mut(), &mut record)? != 0 {
284        if record.flags() & 0x4 == 0 && record.reference_sequence_id() >= 0 {
285            mapped += 1;
286        }
287    }
288    Ok(mapped)
289}
290
291/// deeptools `get_chunk_length` with `len(bamFilesHandles) == n_bams`.
292fn chunk_length(
293    step_size: u64,
294    bin_size: u32,
295    max_mapped: u64,
296    genome_size: u64,
297    n_bams: u64,
298) -> u64 {
299    let reads_per_bp = max_mapped as f64 / genome_size as f64;
300    let mut chunk = (step_size as f64 * 1e3 / (reads_per_bp * n_bams as f64)) as u64;
301    if chunk < step_size {
302        chunk = step_size;
303    }
304    if u64::from(bin_size) > 0 && chunk < u64::from(bin_size) {
305        chunk = u64::from(bin_size);
306    }
307    chunk
308}
309
310/// Builds deeptools' coverage regions in genome order. Mirrors the `mapReduce`
311/// chunk walk (`range(0, size, chunkSize)`) crossed with `count_reads_in_region`:
312///
313/// - `stepSize == binSize` → each chunk is one tiled region spanning
314///   `nBins = (chunkEnd - chunkStart) // binSize` contiguous tiles (partial tail
315///   dropped, deeptools floor-divides).
316/// - `stepSize != binSize` → one single-bin region per `range(chunkStart,
317///   chunkEnd, stepSize)` step, skipping a bin that crosses the chunk end.
318///
319/// Returns the regions plus the total bin count (the length of the output).
320fn layout_regions(
321    chroms: &[ChromGeom],
322    step_size: u64,
323    bin_size: u64,
324    chunk_size: u64,
325) -> (Vec<Region>, usize) {
326    let tiled = step_size == bin_size;
327    let mut regions = Vec::new();
328    let mut out_lo = 0usize;
329    for chrom in chroms {
330        let mut chunk_start = 0u64;
331        while chunk_start < chrom.length {
332            let chunk_end = (chunk_start + chunk_size).min(chrom.length);
333            if tiled {
334                let n_bins = (chunk_end - chunk_start) / bin_size;
335                if n_bins > 0 {
336                    regions.push(Region {
337                        tid: chrom.tid,
338                        start: chunk_start,
339                        end: chunk_end,
340                        n_bins,
341                        out_lo,
342                    });
343                    out_lo += n_bins as usize;
344                }
345            } else {
346                let mut i = chunk_start;
347                while i < chunk_end {
348                    if i + bin_size > chunk_end {
349                        break;
350                    }
351                    regions.push(Region {
352                        tid: chrom.tid,
353                        start: i,
354                        end: i + bin_size,
355                        n_bins: 1,
356                        out_lo,
357                    });
358                    out_lo += 1;
359                    i += step_size;
360                }
361            }
362            chunk_start += chunk_size;
363        }
364    }
365    (regions, out_lo)
366}
367
368/// Single streaming pass: every kept read is routed to its chromosome's regions
369/// and its aligned blocks are spread across the bins via deeptools'
370/// `SumCoveragePerBin.get_coverage_of_region` arithmetic.
371#[allow(clippy::too_many_arguments)]
372fn accumulate_coverage(
373    input: &Path,
374    opts: &FingerprintOpts,
375    workers: NonZero<usize>,
376    chroms: &[ChromGeom],
377    regions: &[Region],
378    n_bins: usize,
379    step_size: u64,
380    bin_size: u64,
381) -> Result<Vec<u64>> {
382    // Per-chromosome slice [lo, hi) into `regions`, indexed by tid. regions are
383    // grouped by chromosome in header order, ascending start within each.
384    let mut chrom_span: Vec<(usize, usize)> = vec![(0, 0); chroms.len()];
385    {
386        let mut idx = 0usize;
387        while idx < regions.len() {
388            let tid = regions[idx].tid;
389            let lo = idx;
390            while idx < regions.len() && regions[idx].tid == tid {
391                idx += 1;
392            }
393            chrom_span[tid] = (lo, idx);
394        }
395    }
396
397    let mut counts = vec![0i64; n_bins];
398
399    let mut reader = rsomics_bamio::open_with_workers(input, workers)?;
400    reader.read_header().map_err(RsomicsError::Io)?;
401    let mut record = RawRecord::default();
402    let mut blocks: Vec<(u64, u64)> = Vec::new();
403
404    while raw::read_record(reader.get_mut(), &mut record)? != 0 {
405        let flags = record.flags();
406        if flags & 0x4 != 0 {
407            continue;
408        }
409        let tid = record.reference_sequence_id();
410        if tid < 0 {
411            continue;
412        }
413        if opts.sam_flag_include != 0 && (flags & opts.sam_flag_include) != opts.sam_flag_include {
414            continue;
415        }
416        if opts.sam_flag_exclude != 0 && (flags & opts.sam_flag_exclude) != 0 {
417            continue;
418        }
419        if opts.min_mapq > 0 && record.mapping_quality() < opts.min_mapq {
420            continue;
421        }
422
423        let tid = tid as usize;
424        let (lo, hi) = match chrom_span.get(tid) {
425            Some(&span) if span.0 < span.1 => span,
426            _ => continue,
427        };
428        let chrom_regions = &regions[lo..hi];
429
430        let start0 = record.alignment_start() as u64;
431        aligned_blocks(start0, &record, &mut blocks);
432        add_read(&mut counts, chrom_regions, &blocks, step_size, bin_size);
433    }
434
435    Ok(counts.into_iter().map(|c| c.max(0) as u64).collect())
436}
437
438/// pysam `get_blocks()`: aligned blocks of reference-consuming CIGAR. M/=/X
439/// extend the current block; D, N **and I** all break it (pysam emits a fresh
440/// block at every insertion even though I consumes no reference, so the two
441/// blocks abut). Soft/hard clips and padding are ignored. Fills `out` (cleared
442/// first) so the caller can reuse one allocation across records.
443fn aligned_blocks(start0: u64, record: &RawRecord, out: &mut Vec<(u64, u64)>) {
444    blocks_from_cigar(start0, record.cigar_ops(), out);
445}
446
447fn blocks_from_cigar(
448    start0: u64,
449    cigar: impl Iterator<Item = (u8, u32)>,
450    out: &mut Vec<(u64, u64)>,
451) {
452    out.clear();
453    let mut pos = start0;
454    let mut block_start = start0;
455    let mut in_block = false;
456    for (kind, len) in cigar {
457        let len = u64::from(len);
458        match kind {
459            CIGAR_MATCH | CIGAR_SEQ_MATCH | CIGAR_SEQ_MISMATCH => {
460                if !in_block {
461                    block_start = pos;
462                    in_block = true;
463                }
464                pos += len;
465            }
466            // I, D and N all break the current block (pysam emits a fresh block
467            // at every insertion too, even though I consumes no reference).
468            CIGAR_INSERTION | CIGAR_DELETION | CIGAR_SKIP => {
469                if in_block {
470                    out.push((block_start, pos));
471                    in_block = false;
472                }
473                if kind != CIGAR_INSERTION {
474                    pos += len;
475                }
476            }
477            CIGAR_SOFT_CLIP => {}
478            _ => {}
479        }
480    }
481    if in_block && pos > block_start {
482        out.push((block_start, pos));
483    }
484}
485
486/// Routes one read's aligned blocks into its chromosome's regions, applying
487/// deeptools' `SumCoveragePerBin.get_coverage_of_region` arithmetic per region.
488/// `chrom_regions` is ascending by start; the read can touch the (single) tiled
489/// region of its chunk, or — in the sparse single-bin layout — at most a handful
490/// of consecutive single-bin regions.
491fn add_read(
492    counts: &mut [i64],
493    chrom_regions: &[Region],
494    blocks: &[(u64, u64)],
495    step_size: u64,
496    bin_size: u64,
497) {
498    let Some(&(read_start, _)) = blocks.first() else {
499        return;
500    };
501    let read_end = blocks.last().unwrap().1;
502
503    let _ = step_size;
504    // First region whose end is past read_start (deeptools fetches reads in
505    // [reg.start, reg.end); a read overlapping that span is processed).
506    let mut r = chrom_regions.partition_point(|reg| reg.end <= read_start);
507    while r < chrom_regions.len() {
508        let reg = &chrom_regions[r];
509        if reg.start >= read_end {
510            break;
511        }
512        cover_region(
513            &mut counts[reg.out_lo..reg.out_lo + reg.n_bins as usize],
514            reg,
515            blocks,
516            bin_size,
517        );
518        r += 1;
519    }
520}
521
522/// deeptools `SumCoveragePerBin.get_coverage_of_region` for one region. `cov` is
523/// the bin slice for this region (`tileSize == bin_size`, `nRegBins == n_bins`).
524/// Per-base coverage is spread across tiles with deeptools' exact integer
525/// arithmetic — including the `ceil`/`eIdx` clamp and `last_eIdx` block guard
526/// that the upstream tiled path uses (and that make wide reads over-count a tile
527/// rather than clipping to true per-base overlap; matched for byte parity).
528fn cover_region(cov: &mut [i64], reg: &Region, blocks: &[(u64, u64)], bin_size: u64) {
529    let reg0 = reg.start as i64;
530    let reg1 = reg.end as i64;
531    let tile = bin_size as i64;
532    let n_reg_bins = reg.n_bins as i64;
533    let len_cov = cov.len() as i64;
534    let cov_end = reg0 + len_cov * tile;
535
536    let mut last_eidx: Option<i64> = None;
537    for &(bs, be) in blocks {
538        let mut frag_start = bs as i64;
539        let mut frag_end = be as i64;
540        if frag_end - frag_start == 0 {
541            continue;
542        }
543        if frag_end <= reg0 || frag_start >= reg1 {
544            continue;
545        }
546        if frag_start < reg0 {
547            frag_start = reg0;
548        }
549        if frag_end > cov_end {
550            frag_end = cov_end;
551        }
552
553        let mut s_idx = ((frag_start - reg0) / tile).max(0);
554        let mut e_idx = (div_ceil_i64(frag_end - reg0, tile)).min(n_reg_bins);
555        if e_idx >= len_cov {
556            e_idx = len_cov - 1;
557        }
558        if let Some(last) = last_eidx {
559            s_idx = last.max(s_idx);
560            if s_idx >= e_idx {
561                continue;
562            }
563        }
564
565        // First bin: partial overlap from frag_start to the bin's end.
566        let first = if frag_end < reg0 + (s_idx + 1) * tile {
567            frag_end - frag_start
568        } else {
569            reg0 + (s_idx + 1) * tile - frag_start
570        };
571        cov[s_idx as usize] += first.min(tile);
572
573        let mut k = s_idx + 1;
574        while k < e_idx {
575            cov[k as usize] += tile;
576            k += 1;
577        }
578        while e_idx - s_idx >= n_reg_bins {
579            e_idx -= 1;
580        }
581        if e_idx > s_idx {
582            let mut last = frag_end - (reg0 + e_idx * tile);
583            if last > tile {
584                last = tile;
585            } else if last < 0 {
586                last = 0;
587            }
588            cov[e_idx as usize] += last;
589        }
590        last_eidx = Some(e_idx);
591    }
592}
593
594fn div_ceil_i64(a: i64, b: i64) -> i64 {
595    (a + b - 1) / b
596}
597
598/// Write the `--outRawCounts` table: deeptools header, quoted label, one `%d`
599/// per sampled bin in genome order.
600pub fn write_raw_counts(out: &mut dyn Write, label: &str, fp: &Fingerprint) -> Result<()> {
601    let mut w = BufWriter::new(out);
602    writeln!(w, "#plotFingerprint --outRawCounts").map_err(RsomicsError::Io)?;
603    writeln!(w, "'{label}'").map_err(RsomicsError::Io)?;
604    for &c in &fp.counts {
605        writeln!(w, "{c}").map_err(RsomicsError::Io)?;
606    }
607    w.flush().map_err(RsomicsError::Io)
608}
609
610/// Write the fingerprint curve: `rank<TAB>fraction`, one row per sampled bin.
611pub fn write_fingerprint(out: &mut dyn Write, fp: &Fingerprint) -> Result<()> {
612    let mut w = BufWriter::new(out);
613    writeln!(w, "#rank\tfraction").map_err(RsomicsError::Io)?;
614    for (x, y) in fp.cumulative_curve() {
615        writeln!(w, "{x}\t{y}").map_err(RsomicsError::Io)?;
616    }
617    w.flush().map_err(RsomicsError::Io)
618}
619
620/// Write the quality-metrics summary: deeptools' AUC / X-intercept / elbow.
621pub fn write_quality_metrics(out: &mut dyn Write, label: &str, fp: &Fingerprint) -> Result<()> {
622    let m = fp.quality_metrics();
623    let mut w = BufWriter::new(out);
624    writeln!(w, "Sample\tAUC\tX-intercept\tElbow Point").map_err(RsomicsError::Io)?;
625    writeln!(w, "{}\t{}\t{}\t{}", label, m.auc, m.x_int, m.elbow).map_err(RsomicsError::Io)?;
626    w.flush().map_err(RsomicsError::Io)
627}
628
629#[cfg(test)]
630mod tests {
631    use super::*;
632
633    fn blocks(record_blocks: &[(u64, u64)]) -> Vec<(u64, u64)> {
634        record_blocks.to_vec()
635    }
636
637    #[test]
638    fn chunk_length_floors_to_step_and_bin() {
639        // reads_per_bp = 1.0; chunk = step*1000 / (1*1) = 500000, dominates.
640        assert_eq!(chunk_length(500, 500, 1_000_000, 1_000_000, 1), 500_000);
641        // dense: reads_per_bp large → chunk floored to step.
642        let c = chunk_length(50, 50, 1_000_000, 1_000, 1);
643        assert!(c >= 50);
644    }
645
646    #[test]
647    fn div_ceil_matches_python_ceil() {
648        assert_eq!(div_ceil_i64(5, 200), 1);
649        assert_eq!(div_ceil_i64(200, 200), 1);
650        assert_eq!(div_ceil_i64(201, 200), 2);
651        assert_eq!(div_ceil_i64(1046, 200), 6);
652    }
653
654    #[test]
655    fn single_bin_region_is_clamped_per_base_overlap() {
656        // One bin [1000,1200), one read block [1096,1146): contributes 50 bp.
657        let reg = Region {
658            tid: 0,
659            start: 1000,
660            end: 1200,
661            n_bins: 1,
662            out_lo: 0,
663        };
664        let mut cov = vec![0i64];
665        cover_region(&mut cov, &reg, &blocks(&[(1096, 1146)]), 200);
666        assert_eq!(cov[0], 50);
667    }
668
669    #[test]
670    fn tiled_region_spreads_full_middle_tile() {
671        // deeptools tiled quirk: a read [959,1009) with sIdx=4 eIdx=6 dumps a
672        // full 200 into the middle tile (bin 5) even though it only reaches 1009.
673        let reg = Region {
674            tid: 0,
675            start: 0,
676            end: 10000,
677            n_bins: 50,
678            out_lo: 0,
679        };
680        let mut cov = vec![0i64; 50];
681        cover_region(&mut cov, &reg, &blocks(&[(959, 1009)]), 200);
682        assert_eq!(cov[5], 200);
683        assert_eq!(cov[4], 41); // 1000 - 959
684    }
685
686    #[test]
687    fn blocks_split_on_insertion_deletion_skip() {
688        let mut out = Vec::new();
689        // 30M5I25M from 1999 → pysam [(1999,2029),(2029,2054)] (split on I).
690        blocks_from_cigar(
691            1999,
692            [(CIGAR_MATCH, 30), (CIGAR_INSERTION, 5), (CIGAR_MATCH, 25)].into_iter(),
693            &mut out,
694        );
695        assert_eq!(out, vec![(1999, 2029), (2029, 2054)]);
696
697        // 40M5D20M from 1199 → [(1199,1239),(1244,1264)] (split on D, gap 5).
698        blocks_from_cigar(
699            1199,
700            [(CIGAR_MATCH, 40), (CIGAR_DELETION, 5), (CIGAR_MATCH, 20)].into_iter(),
701            &mut out,
702        );
703        assert_eq!(out, vec![(1199, 1239), (1244, 1264)]);
704
705        // 30M100N30M from 499 → [(499,529),(629,659)] (split on N, gap 100).
706        blocks_from_cigar(
707            499,
708            [(CIGAR_MATCH, 30), (CIGAR_SKIP, 100), (CIGAR_MATCH, 30)].into_iter(),
709            &mut out,
710        );
711        assert_eq!(out, vec![(499, 529), (629, 659)]);
712
713        // 5S55M from 1599 → soft-clip ignored, [(1599,1654)].
714        blocks_from_cigar(
715            1599,
716            [(CIGAR_SOFT_CLIP, 5), (CIGAR_MATCH, 55)].into_iter(),
717            &mut out,
718        );
719        assert_eq!(out, vec![(1599, 1654)]);
720    }
721
722    #[test]
723    fn quality_metrics_uniform_curve() {
724        // Uniform counts → AUC ≈ 0.5+, elbow near 1, x-int = 1/n.
725        let fp = Fingerprint {
726            counts: vec![10; 100],
727        };
728        let m = fp.quality_metrics();
729        assert!((m.x_int - 0.01).abs() < 1e-12);
730        assert!(m.auc > 0.0 && m.auc < 1.0);
731    }
732
733    #[test]
734    fn quality_metrics_all_zero_is_safe() {
735        let fp = Fingerprint {
736            counts: vec![0; 10],
737        };
738        let m = fp.quality_metrics();
739        assert_eq!(m.auc, 0.0);
740        assert_eq!(m.x_int, 0.0);
741    }
742}