1#![allow(clippy::cast_precision_loss)]
64
65use std::io::{BufWriter, Write};
66use std::num::NonZero;
67use std::path::Path;
68
69use rsomics_bamio::raw::{self, RawRecord};
70use rsomics_common::{Result, RsomicsError};
71
72const CIGAR_MATCH: u8 = 0;
73const CIGAR_INSERTION: u8 = 1;
74const CIGAR_DELETION: u8 = 2;
75const CIGAR_SKIP: u8 = 3;
76const CIGAR_SOFT_CLIP: u8 = 4;
77const CIGAR_SEQ_MATCH: u8 = 7;
78const CIGAR_SEQ_MISMATCH: u8 = 8;
79
80#[derive(Debug, Clone)]
81pub struct FingerprintOpts {
82 pub bin_size: u32,
84 pub number_of_samples: u64,
86 pub min_mapq: u8,
88 pub sam_flag_exclude: u16,
91 pub sam_flag_include: u16,
94}
95
96impl Default for FingerprintOpts {
97 fn default() -> Self {
98 Self {
99 bin_size: 500,
100 number_of_samples: 500_000,
101 min_mapq: 0,
102 sam_flag_exclude: 0,
103 sam_flag_include: 0,
104 }
105 }
106}
107
108struct ChromGeom {
109 tid: usize,
110 length: u64,
111}
112
113struct Region {
121 tid: usize,
122 start: u64,
123 end: u64,
128 n_bins: u64,
129 out_lo: usize,
130}
131
132pub struct Fingerprint {
134 pub counts: Vec<u64>,
135}
136
137impl Fingerprint {
138 pub fn cumulative_curve(&self) -> Vec<(f64, f64)> {
141 let n = self.counts.len();
142 let mut sorted = self.counts.clone();
143 sorted.sort_unstable();
144 let total: u64 = sorted.iter().sum();
145 let mut acc: u64 = 0;
146 sorted
147 .iter()
148 .enumerate()
149 .map(|(i, &v)| {
150 acc += v;
151 let x = i as f64 / n as f64;
152 let y = if total == 0 {
153 0.0
154 } else {
155 acc as f64 / total as f64
156 };
157 (x, y)
158 })
159 .collect()
160 }
161
162 pub fn quality_metrics(&self) -> QualityMetrics {
165 let n = self.counts.len();
166 let mut sorted = self.counts.clone();
167 sorted.sort_unstable();
168 let total: u64 = sorted.iter().sum();
169
170 let mut counts = Vec::with_capacity(n);
171 let mut acc: u64 = 0;
172 for &v in &sorted {
173 acc += v;
174 counts.push(if total == 0 {
175 0.0
176 } else {
177 acc as f64 / total as f64
178 });
179 }
180
181 let auc = counts.iter().sum::<f64>() / n as f64;
182
183 let x_int = counts
184 .iter()
185 .position(|&c| c > 0.0)
186 .map_or(0.0, |k| (k + 1) as f64 / n as f64);
187
188 let elbow = if n <= 1 {
189 0.0
190 } else {
191 let mut best_k = 0usize;
192 let mut best = f64::NEG_INFINITY;
193 for (i, &c) in counts.iter().enumerate() {
194 let line = i as f64 / (n as f64 - 1.0);
195 let diff = line - c;
196 if diff > best {
197 best = diff;
198 best_k = i;
199 }
200 }
201 (best_k + 1) as f64 / n as f64
202 };
203
204 QualityMetrics { auc, x_int, elbow }
205 }
206}
207
208pub struct QualityMetrics {
209 pub auc: f64,
210 pub x_int: f64,
211 pub elbow: f64,
212}
213
214pub fn fingerprint(
216 input: &Path,
217 opts: &FingerprintOpts,
218 workers: NonZero<usize>,
219) -> Result<Fingerprint> {
220 let chroms = read_chrom_geom(input, workers)?;
221 let genome_size: u64 = chroms.iter().map(|c| c.length).sum();
222 if genome_size == 0 {
223 return Err(RsomicsError::InvalidInput(
224 "BAM header declares zero genome length".into(),
225 ));
226 }
227
228 let mapped = count_mapped(input, opts, workers)?;
229 if mapped == 0 {
230 return Err(RsomicsError::InvalidInput(
231 "no mapped reads found — check MAPPING quality / flag filters".into(),
232 ));
233 }
234
235 let step_size = (genome_size / opts.number_of_samples).max(1);
236
237 let mean_chrom_len = genome_size as f64 / chroms.len() as f64;
238 if mean_chrom_len < step_size as f64 {
239 let min_samples = (genome_size as f64 / mean_chrom_len) as u64;
240 return Err(RsomicsError::InvalidInput(format!(
241 "--number-of-samples has to be bigger than {min_samples}"
242 )));
243 }
244
245 let chunk_size = chunk_length(step_size, opts.bin_size, mapped, genome_size, 1);
246 let bin_size = u64::from(opts.bin_size);
247
248 let (regions, n_bins) = layout_regions(&chroms, step_size, bin_size, chunk_size);
249 if n_bins == 0 {
250 return Err(RsomicsError::InvalidInput(
251 "no bins were sampled — decrease --bin-size or --number-of-samples".into(),
252 ));
253 }
254
255 let counts = accumulate_coverage(
256 input, opts, workers, &chroms, ®ions, n_bins, step_size, bin_size,
257 )?;
258 Ok(Fingerprint { counts })
259}
260
261fn read_chrom_geom(input: &Path, workers: NonZero<usize>) -> Result<Vec<ChromGeom>> {
262 let mut reader = rsomics_bamio::open_with_workers(input, workers)?;
263 let header = reader.read_header().map_err(RsomicsError::Io)?;
264 Ok(header
265 .reference_sequences()
266 .iter()
267 .enumerate()
268 .map(|(tid, (_, seq))| ChromGeom {
269 tid,
270 length: usize::from(seq.length()) as u64,
271 })
272 .collect())
273}
274
275fn count_mapped(input: &Path, _opts: &FingerprintOpts, workers: NonZero<usize>) -> Result<u64> {
279 let mut reader = rsomics_bamio::open_with_workers(input, workers)?;
280 reader.read_header().map_err(RsomicsError::Io)?;
281 let mut record = RawRecord::default();
282 let mut mapped: u64 = 0;
283 while raw::read_record(reader.get_mut(), &mut record)? != 0 {
284 if record.flags() & 0x4 == 0 && record.reference_sequence_id() >= 0 {
285 mapped += 1;
286 }
287 }
288 Ok(mapped)
289}
290
291fn chunk_length(
293 step_size: u64,
294 bin_size: u32,
295 max_mapped: u64,
296 genome_size: u64,
297 n_bams: u64,
298) -> u64 {
299 let reads_per_bp = max_mapped as f64 / genome_size as f64;
300 let mut chunk = (step_size as f64 * 1e3 / (reads_per_bp * n_bams as f64)) as u64;
301 if chunk < step_size {
302 chunk = step_size;
303 }
304 if u64::from(bin_size) > 0 && chunk < u64::from(bin_size) {
305 chunk = u64::from(bin_size);
306 }
307 chunk
308}
309
310fn layout_regions(
321 chroms: &[ChromGeom],
322 step_size: u64,
323 bin_size: u64,
324 chunk_size: u64,
325) -> (Vec<Region>, usize) {
326 let tiled = step_size == bin_size;
327 let mut regions = Vec::new();
328 let mut out_lo = 0usize;
329 for chrom in chroms {
330 let mut chunk_start = 0u64;
331 while chunk_start < chrom.length {
332 let chunk_end = (chunk_start + chunk_size).min(chrom.length);
333 if tiled {
334 let n_bins = (chunk_end - chunk_start) / bin_size;
335 if n_bins > 0 {
336 regions.push(Region {
337 tid: chrom.tid,
338 start: chunk_start,
339 end: chunk_end,
340 n_bins,
341 out_lo,
342 });
343 out_lo += n_bins as usize;
344 }
345 } else {
346 let mut i = chunk_start;
347 while i < chunk_end {
348 if i + bin_size > chunk_end {
349 break;
350 }
351 regions.push(Region {
352 tid: chrom.tid,
353 start: i,
354 end: i + bin_size,
355 n_bins: 1,
356 out_lo,
357 });
358 out_lo += 1;
359 i += step_size;
360 }
361 }
362 chunk_start += chunk_size;
363 }
364 }
365 (regions, out_lo)
366}
367
368#[allow(clippy::too_many_arguments)]
372fn accumulate_coverage(
373 input: &Path,
374 opts: &FingerprintOpts,
375 workers: NonZero<usize>,
376 chroms: &[ChromGeom],
377 regions: &[Region],
378 n_bins: usize,
379 step_size: u64,
380 bin_size: u64,
381) -> Result<Vec<u64>> {
382 let mut chrom_span: Vec<(usize, usize)> = vec![(0, 0); chroms.len()];
385 {
386 let mut idx = 0usize;
387 while idx < regions.len() {
388 let tid = regions[idx].tid;
389 let lo = idx;
390 while idx < regions.len() && regions[idx].tid == tid {
391 idx += 1;
392 }
393 chrom_span[tid] = (lo, idx);
394 }
395 }
396
397 let mut counts = vec![0i64; n_bins];
398
399 let mut reader = rsomics_bamio::open_with_workers(input, workers)?;
400 reader.read_header().map_err(RsomicsError::Io)?;
401 let mut record = RawRecord::default();
402 let mut blocks: Vec<(u64, u64)> = Vec::new();
403
404 while raw::read_record(reader.get_mut(), &mut record)? != 0 {
405 let flags = record.flags();
406 if flags & 0x4 != 0 {
407 continue;
408 }
409 let tid = record.reference_sequence_id();
410 if tid < 0 {
411 continue;
412 }
413 if opts.sam_flag_include != 0 && (flags & opts.sam_flag_include) != opts.sam_flag_include {
414 continue;
415 }
416 if opts.sam_flag_exclude != 0 && (flags & opts.sam_flag_exclude) != 0 {
417 continue;
418 }
419 if opts.min_mapq > 0 && record.mapping_quality() < opts.min_mapq {
420 continue;
421 }
422
423 let tid = tid as usize;
424 let (lo, hi) = match chrom_span.get(tid) {
425 Some(&span) if span.0 < span.1 => span,
426 _ => continue,
427 };
428 let chrom_regions = ®ions[lo..hi];
429
430 let start0 = record.alignment_start() as u64;
431 aligned_blocks(start0, &record, &mut blocks);
432 add_read(&mut counts, chrom_regions, &blocks, step_size, bin_size);
433 }
434
435 Ok(counts.into_iter().map(|c| c.max(0) as u64).collect())
436}
437
438fn aligned_blocks(start0: u64, record: &RawRecord, out: &mut Vec<(u64, u64)>) {
444 blocks_from_cigar(start0, record.cigar_ops(), out);
445}
446
447fn blocks_from_cigar(
448 start0: u64,
449 cigar: impl Iterator<Item = (u8, u32)>,
450 out: &mut Vec<(u64, u64)>,
451) {
452 out.clear();
453 let mut pos = start0;
454 let mut block_start = start0;
455 let mut in_block = false;
456 for (kind, len) in cigar {
457 let len = u64::from(len);
458 match kind {
459 CIGAR_MATCH | CIGAR_SEQ_MATCH | CIGAR_SEQ_MISMATCH => {
460 if !in_block {
461 block_start = pos;
462 in_block = true;
463 }
464 pos += len;
465 }
466 CIGAR_INSERTION | CIGAR_DELETION | CIGAR_SKIP => {
469 if in_block {
470 out.push((block_start, pos));
471 in_block = false;
472 }
473 if kind != CIGAR_INSERTION {
474 pos += len;
475 }
476 }
477 CIGAR_SOFT_CLIP => {}
478 _ => {}
479 }
480 }
481 if in_block && pos > block_start {
482 out.push((block_start, pos));
483 }
484}
485
486fn add_read(
492 counts: &mut [i64],
493 chrom_regions: &[Region],
494 blocks: &[(u64, u64)],
495 step_size: u64,
496 bin_size: u64,
497) {
498 let Some(&(read_start, _)) = blocks.first() else {
499 return;
500 };
501 let read_end = blocks.last().unwrap().1;
502
503 let _ = step_size;
504 let mut r = chrom_regions.partition_point(|reg| reg.end <= read_start);
507 while r < chrom_regions.len() {
508 let reg = &chrom_regions[r];
509 if reg.start >= read_end {
510 break;
511 }
512 cover_region(
513 &mut counts[reg.out_lo..reg.out_lo + reg.n_bins as usize],
514 reg,
515 blocks,
516 bin_size,
517 );
518 r += 1;
519 }
520}
521
522fn cover_region(cov: &mut [i64], reg: &Region, blocks: &[(u64, u64)], bin_size: u64) {
529 let reg0 = reg.start as i64;
530 let reg1 = reg.end as i64;
531 let tile = bin_size as i64;
532 let n_reg_bins = reg.n_bins as i64;
533 let len_cov = cov.len() as i64;
534 let cov_end = reg0 + len_cov * tile;
535
536 let mut last_eidx: Option<i64> = None;
537 for &(bs, be) in blocks {
538 let mut frag_start = bs as i64;
539 let mut frag_end = be as i64;
540 if frag_end - frag_start == 0 {
541 continue;
542 }
543 if frag_end <= reg0 || frag_start >= reg1 {
544 continue;
545 }
546 if frag_start < reg0 {
547 frag_start = reg0;
548 }
549 if frag_end > cov_end {
550 frag_end = cov_end;
551 }
552
553 let mut s_idx = ((frag_start - reg0) / tile).max(0);
554 let mut e_idx = (div_ceil_i64(frag_end - reg0, tile)).min(n_reg_bins);
555 if e_idx >= len_cov {
556 e_idx = len_cov - 1;
557 }
558 if let Some(last) = last_eidx {
559 s_idx = last.max(s_idx);
560 if s_idx >= e_idx {
561 continue;
562 }
563 }
564
565 let first = if frag_end < reg0 + (s_idx + 1) * tile {
567 frag_end - frag_start
568 } else {
569 reg0 + (s_idx + 1) * tile - frag_start
570 };
571 cov[s_idx as usize] += first.min(tile);
572
573 let mut k = s_idx + 1;
574 while k < e_idx {
575 cov[k as usize] += tile;
576 k += 1;
577 }
578 while e_idx - s_idx >= n_reg_bins {
579 e_idx -= 1;
580 }
581 if e_idx > s_idx {
582 let mut last = frag_end - (reg0 + e_idx * tile);
583 if last > tile {
584 last = tile;
585 } else if last < 0 {
586 last = 0;
587 }
588 cov[e_idx as usize] += last;
589 }
590 last_eidx = Some(e_idx);
591 }
592}
593
594fn div_ceil_i64(a: i64, b: i64) -> i64 {
595 (a + b - 1) / b
596}
597
598pub fn write_raw_counts(out: &mut dyn Write, label: &str, fp: &Fingerprint) -> Result<()> {
601 let mut w = BufWriter::new(out);
602 writeln!(w, "#plotFingerprint --outRawCounts").map_err(RsomicsError::Io)?;
603 writeln!(w, "'{label}'").map_err(RsomicsError::Io)?;
604 for &c in &fp.counts {
605 writeln!(w, "{c}").map_err(RsomicsError::Io)?;
606 }
607 w.flush().map_err(RsomicsError::Io)
608}
609
610pub fn write_fingerprint(out: &mut dyn Write, fp: &Fingerprint) -> Result<()> {
612 let mut w = BufWriter::new(out);
613 writeln!(w, "#rank\tfraction").map_err(RsomicsError::Io)?;
614 for (x, y) in fp.cumulative_curve() {
615 writeln!(w, "{x}\t{y}").map_err(RsomicsError::Io)?;
616 }
617 w.flush().map_err(RsomicsError::Io)
618}
619
620pub fn write_quality_metrics(out: &mut dyn Write, label: &str, fp: &Fingerprint) -> Result<()> {
622 let m = fp.quality_metrics();
623 let mut w = BufWriter::new(out);
624 writeln!(w, "Sample\tAUC\tX-intercept\tElbow Point").map_err(RsomicsError::Io)?;
625 writeln!(w, "{}\t{}\t{}\t{}", label, m.auc, m.x_int, m.elbow).map_err(RsomicsError::Io)?;
626 w.flush().map_err(RsomicsError::Io)
627}
628
629#[cfg(test)]
630mod tests {
631 use super::*;
632
633 fn blocks(record_blocks: &[(u64, u64)]) -> Vec<(u64, u64)> {
634 record_blocks.to_vec()
635 }
636
637 #[test]
638 fn chunk_length_floors_to_step_and_bin() {
639 assert_eq!(chunk_length(500, 500, 1_000_000, 1_000_000, 1), 500_000);
641 let c = chunk_length(50, 50, 1_000_000, 1_000, 1);
643 assert!(c >= 50);
644 }
645
646 #[test]
647 fn div_ceil_matches_python_ceil() {
648 assert_eq!(div_ceil_i64(5, 200), 1);
649 assert_eq!(div_ceil_i64(200, 200), 1);
650 assert_eq!(div_ceil_i64(201, 200), 2);
651 assert_eq!(div_ceil_i64(1046, 200), 6);
652 }
653
654 #[test]
655 fn single_bin_region_is_clamped_per_base_overlap() {
656 let reg = Region {
658 tid: 0,
659 start: 1000,
660 end: 1200,
661 n_bins: 1,
662 out_lo: 0,
663 };
664 let mut cov = vec![0i64];
665 cover_region(&mut cov, ®, &blocks(&[(1096, 1146)]), 200);
666 assert_eq!(cov[0], 50);
667 }
668
669 #[test]
670 fn tiled_region_spreads_full_middle_tile() {
671 let reg = Region {
674 tid: 0,
675 start: 0,
676 end: 10000,
677 n_bins: 50,
678 out_lo: 0,
679 };
680 let mut cov = vec![0i64; 50];
681 cover_region(&mut cov, ®, &blocks(&[(959, 1009)]), 200);
682 assert_eq!(cov[5], 200);
683 assert_eq!(cov[4], 41); }
685
686 #[test]
687 fn blocks_split_on_insertion_deletion_skip() {
688 let mut out = Vec::new();
689 blocks_from_cigar(
691 1999,
692 [(CIGAR_MATCH, 30), (CIGAR_INSERTION, 5), (CIGAR_MATCH, 25)].into_iter(),
693 &mut out,
694 );
695 assert_eq!(out, vec![(1999, 2029), (2029, 2054)]);
696
697 blocks_from_cigar(
699 1199,
700 [(CIGAR_MATCH, 40), (CIGAR_DELETION, 5), (CIGAR_MATCH, 20)].into_iter(),
701 &mut out,
702 );
703 assert_eq!(out, vec![(1199, 1239), (1244, 1264)]);
704
705 blocks_from_cigar(
707 499,
708 [(CIGAR_MATCH, 30), (CIGAR_SKIP, 100), (CIGAR_MATCH, 30)].into_iter(),
709 &mut out,
710 );
711 assert_eq!(out, vec![(499, 529), (629, 659)]);
712
713 blocks_from_cigar(
715 1599,
716 [(CIGAR_SOFT_CLIP, 5), (CIGAR_MATCH, 55)].into_iter(),
717 &mut out,
718 );
719 assert_eq!(out, vec![(1599, 1654)]);
720 }
721
722 #[test]
723 fn quality_metrics_uniform_curve() {
724 let fp = Fingerprint {
726 counts: vec![10; 100],
727 };
728 let m = fp.quality_metrics();
729 assert!((m.x_int - 0.01).abs() < 1e-12);
730 assert!(m.auc > 0.0 && m.auc < 1.0);
731 }
732
733 #[test]
734 fn quality_metrics_all_zero_is_safe() {
735 let fp = Fingerprint {
736 counts: vec![0; 10],
737 };
738 let m = fp.quality_metrics();
739 assert_eq!(m.auc, 0.0);
740 assert_eq!(m.x_int, 0.0);
741 }
742}