1mod error_model;
17
18use std::collections::HashMap;
19use std::io::{BufRead, Write};
20use std::path::{Path, PathBuf};
21
22use anyhow::{Context, Result};
23use noodles_bam as bam;
24use noodles_sam as sam;
25use noodles_sam::alignment::record::cigar::op::Kind;
26use noodles_sam::alignment::record::data::field::{Tag, Value};
27use noodles_sam::Header;
28use serde::Serialize;
29
30use error_model::{AlignmentModel, AlnOp, SharedAlignmentModel};
31use salmon_core::{
32 is_compatible, LibraryFormat, MateStatus, ReadOrientation, ReadStrandedness, ReadType,
33};
34use salmon_eqclass::{range_factorize_bins, EquivalenceClassBuilder, TranscriptGroup};
35use salmon_infer::{optimize, optimize_with_init, EmOptions};
36use salmon_model::FragmentLengthDistribution;
37
38const SALMON_VERSION: &str = env!("CARGO_PKG_VERSION");
39
40#[derive(Debug, Clone)]
42pub struct AlignQuantOptions {
43 pub bam: PathBuf,
45 pub output_dir: PathBuf,
47 pub lib_type: String,
49 pub em: EmOptions,
51 pub range_factorization_bins: u32,
53 pub score_exp: f64,
55 pub transcripts: Option<PathBuf>,
57 pub no_error_model: bool,
59 pub seq_bias: bool,
61 pub gc_bias: bool,
63 pub pos_bias: bool,
65 pub incompat_prior: f64,
68 pub fld_mean: f64,
71 pub fld_sd: f64,
72 pub fld_max: usize,
73 pub forgetting_factor: f64,
75 pub init_uniform: bool,
78 pub sig_digits: u32,
81 pub num_error_bins: usize,
84 pub discard_orphans: bool,
87 pub bias_speed_samp: usize,
90 pub num_aux_model_samples: u64,
93 pub no_bias_length_threshold: bool,
96 pub gc_bins: usize,
98 pub cond_gc_bins: usize,
99 pub skip_quant: bool,
102 pub num_pre_aux_model_samples: u64,
105 pub progress: Option<std::sync::Arc<salmon_core::ProgressCounters>>,
109}
110
111impl AlignQuantOptions {
112 pub fn new(bam: PathBuf, output_dir: PathBuf) -> Self {
113 Self {
114 bam,
115 output_dir,
116 lib_type: "IU".to_string(),
117 em: EmOptions::default(),
118 range_factorization_bins: 4,
119 score_exp: 1.0,
120 transcripts: None,
121 no_error_model: false,
122 seq_bias: false,
123 gc_bias: false,
124 pos_bias: false,
125 incompat_prior: 0.0,
126 fld_mean: 250.0,
127 fld_sd: 25.0,
128 fld_max: 1000,
129 forgetting_factor: 0.65,
130 init_uniform: false,
131 sig_digits: 3,
132 num_error_bins: 4,
133 discard_orphans: false,
134 bias_speed_samp: 5,
135 num_aux_model_samples: 5_000_000,
136 no_bias_length_threshold: false,
137 gc_bins: salmon_model::gcbias::DEFAULT_GC_BINS,
138 cond_gc_bins: salmon_model::gcbias::DEFAULT_COND_BINS,
139 skip_quant: false,
140 num_pre_aux_model_samples: 5_000,
141 progress: None,
142 }
143 }
144}
145
146#[derive(Debug, Clone)]
148pub struct AlignQuantResult {
149 pub names: Vec<String>,
150 pub lengths: Vec<u32>,
151 pub eff_lengths: Vec<f64>,
152 pub tpm: Vec<f64>,
153 pub counts: Vec<f64>,
154 pub num_processed: u64,
155 pub num_mapped: u64,
156 pub num_eq_classes: usize,
157 pub frag_len_mean: f64,
158 pub frag_len_sd: f64,
159 pub length_classes: Vec<u32>,
160 pub frag_len_dist: Vec<f64>,
161 pub start_time: String,
162 pub bias_dump: salmon_model::dumps::BiasDump,
163 pub ambig: (Vec<u32>, Vec<u32>),
165}
166
167fn asctime_now() -> String {
169 jiff::Zoned::now()
170 .strftime("%a %b %e %H:%M:%S %Y")
171 .to_string()
172}
173
174fn value_as_i32(v: &Value) -> Option<i32> {
176 match v {
177 Value::Int8(x) => Some(*x as i32),
178 Value::UInt8(x) => Some(*x as i32),
179 Value::Int16(x) => Some(*x as i32),
180 Value::UInt16(x) => Some(*x as i32),
181 Value::Int32(x) => Some(*x),
182 Value::UInt32(x) => Some(*x as i32),
183 _ => None,
184 }
185}
186
187struct Placement {
192 tid: u32,
193 idxs: Vec<usize>,
194}
195
196fn pair_records(recs: &[FragRecord]) -> Vec<Placement> {
211 let n = recs.len();
212 let mut used = vec![false; n];
213 let mut placements: Vec<Placement> = Vec::with_capacity(n);
214
215 for i in 0..n {
216 if used[i] || !recs[i].is_read1 {
217 continue;
218 }
219 let r1 = &recs[i];
220 let (Some(mtid), Some(mpos)) = (r1.mate_tid, r1.mate_pos) else {
223 continue;
224 };
225 if mtid != r1.tid {
226 continue;
227 }
228 let mut mate: Option<usize> = None;
229 for j in 0..n {
230 if used[j] || recs[j].is_read1 {
231 continue;
232 }
233 let r2 = &recs[j];
234 if r2.tid != mtid
235 || r2.pos != mpos
236 || r2.mate_tid != Some(r1.tid)
237 || r2.mate_pos != Some(r1.pos)
238 {
239 continue;
240 }
241 let hi_ok = matches!((r1.hi, r2.hi), (Some(a), Some(b)) if a == b)
243 || r1.hi.is_none()
244 || r2.hi.is_none();
245 if hi_ok {
246 mate = Some(j);
247 break;
248 }
249 if mate.is_none() {
250 mate = Some(j);
251 }
252 }
253 if let Some(j) = mate {
254 used[i] = true;
255 used[j] = true;
256 let mut idxs = vec![i, j];
257 idxs.sort_by_key(|&k| recs[k].pos);
258 placements.push(Placement { tid: r1.tid, idxs });
259 }
260 }
261 for i in 0..n {
262 if !used[i] {
263 placements.push(Placement {
264 tid: recs[i].tid,
265 idxs: vec![i],
266 });
267 }
268 }
269 placements
270}
271
272fn logsumexp(xs: &[f64]) -> f64 {
274 let m = xs.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
275 if m == f64::NEG_INFINITY {
276 return f64::NEG_INFINITY;
277 }
278 m + xs.iter().map(|&x| (x - m).exp()).sum::<f64>().ln()
279}
280
281fn frag_format(recs: &[FragRecord], idxs: &[usize]) -> (Option<LibraryFormat>, bool, MateStatus) {
294 if idxs.len() >= 2 {
295 let r1 = idxs
296 .iter()
297 .map(|&i| &recs[i])
298 .find(|r| r.is_read1)
299 .unwrap_or(&recs[idxs[0]]);
300 let r2 = idxs
301 .iter()
302 .map(|&i| &recs[i])
303 .find(|r| !r.is_read1)
304 .unwrap_or(&recs[idxs[1]]);
305 let (r1_fw, r2_fw) = (!r1.is_reverse, !r2.is_reverse);
306 let (orientation, strandedness) = if r1_fw != r2_fw {
307 let (fw, rc) = if r1_fw { (r1, r2) } else { (r2, r1) };
308 let orientation = if fw.pos <= rc.pos {
309 ReadOrientation::Toward
310 } else {
311 ReadOrientation::Away
312 };
313 let strandedness = if r1_fw {
314 ReadStrandedness::SA
315 } else {
316 ReadStrandedness::AS
317 };
318 (orientation, strandedness)
319 } else {
320 (
321 ReadOrientation::Same,
322 if r1_fw {
323 ReadStrandedness::S
324 } else {
325 ReadStrandedness::A
326 },
327 )
328 };
329 (
330 Some(LibraryFormat::new(
331 ReadType::PairedEnd,
332 orientation,
333 strandedness,
334 )),
335 r1_fw,
336 MateStatus::PairedEndPaired,
337 )
338 } else {
339 let r = &recs[idxs[0]];
340 let status = if r.is_read1 {
341 MateStatus::PairedEndLeft
342 } else {
343 MateStatus::PairedEndRight
344 };
345 (None, !r.is_reverse, status)
346 }
347}
348
349#[inline]
351fn base_2bit(b: u8) -> u8 {
352 match b {
353 b'A' | b'a' => 0,
354 b'C' | b'c' => 1,
355 b'G' | b'g' => 2,
356 b'T' | b't' => 3,
357 _ => 0,
358 }
359}
360
361fn load_ref_bytes(path: &Path, names: &[String]) -> Result<Vec<Vec<u8>>> {
366 let file = std::fs::File::open(path).with_context(|| format!("opening {}", path.display()))?;
367 let mut magic = [0u8; 2];
368 let is_gz = {
369 use std::io::Read;
370 let mut f = std::fs::File::open(path)?;
371 f.read_exact(&mut magic).is_ok() && magic == [0x1f, 0x8b]
372 };
373 let reader: Box<dyn BufRead> = if is_gz {
374 Box::new(std::io::BufReader::new(flate2::read::MultiGzDecoder::new(
375 file,
376 )))
377 } else {
378 Box::new(std::io::BufReader::new(file))
379 };
380
381 let mut by_name: HashMap<String, Vec<u8>> = HashMap::new();
382 let mut cur_name: Option<String> = None;
383 let mut cur_seq: Vec<u8> = Vec::new();
384 for line in reader.lines() {
385 let line = line?;
386 if let Some(stripped) = line.strip_prefix('>') {
387 if let Some(n) = cur_name.take() {
388 by_name.insert(n, std::mem::take(&mut cur_seq));
389 }
390 cur_name = Some(stripped.split_whitespace().next().unwrap_or("").to_string());
391 } else if cur_name.is_some() {
392 cur_seq.extend(line.trim_end().bytes());
393 }
394 }
395 if let Some(n) = cur_name.take() {
396 by_name.insert(n, cur_seq);
397 }
398 Ok(names
399 .iter()
400 .map(|n| by_name.remove(n).unwrap_or_default())
401 .collect())
402}
403
404struct FragRecord {
406 tid: u32,
407 pos: usize,
408 read_2bit: Vec<u8>,
409 ops: Vec<(AlnOp, usize)>,
410 #[allow(dead_code)]
414 score: i32,
415 frag_len: i32,
416 is_reverse: bool,
418 is_read1: bool,
420 ref_span: usize,
423 mate_tid: Option<u32>,
425 mate_pos: Option<usize>,
427 hi: Option<i32>,
430}
431
432impl FragRecord {
433 #[inline]
436 fn five_prime(&self) -> usize {
437 if self.is_reverse {
438 self.pos + self.ref_span.saturating_sub(1)
439 } else {
440 self.pos
441 }
442 }
443}
444
445#[inline]
449fn canonical_name(name: &[u8]) -> &[u8] {
450 let l = name.len();
451 if l > 2 && name[l - 2] == b'/' {
452 &name[..l - 2]
453 } else {
454 name
455 }
456}
457
458fn kind_to_op(k: Kind) -> AlnOp {
460 match k {
461 Kind::Match => AlnOp::Match,
462 Kind::SequenceMatch => AlnOp::SeqMatch,
463 Kind::SequenceMismatch => AlnOp::SeqMismatch,
464 Kind::Insertion => AlnOp::Ins,
465 Kind::Deletion => AlnOp::Del,
466 Kind::Skip => AlnOp::RefSkip,
467 Kind::SoftClip => AlnOp::SoftClip,
468 Kind::HardClip => AlnOp::HardClip,
469 Kind::Pad => AlnOp::Pad,
470 }
471}
472
473fn is_sam_path(p: &Path) -> bool {
475 let s = p.to_string_lossy().to_ascii_lowercase();
476 s.ends_with(".sam") || s.ends_with(".sam.gz")
477}
478
479fn open_sam_reader(path: &Path) -> Result<sam::io::Reader<Box<dyn BufRead + Send>>> {
482 let file = std::fs::File::open(path).with_context(|| format!("opening {}", path.display()))?;
483 let lower = path.to_string_lossy().to_ascii_lowercase();
484 let inner: Box<dyn BufRead + Send> = if lower.ends_with(".gz") {
485 Box::new(std::io::BufReader::new(flate2::read::MultiGzDecoder::new(
486 file,
487 )))
488 } else {
489 Box::new(std::io::BufReader::new(file))
490 };
491 Ok(sam::io::Reader::new(inner))
492}
493
494fn read_alignment_header(path: &Path) -> Result<Header> {
496 if is_sam_path(path) {
497 open_sam_reader(path)?
498 .read_header()
499 .context("reading SAM header")
500 } else {
501 let mut reader = bam::io::Reader::new(
502 std::fs::File::open(path).with_context(|| format!("opening {}", path.display()))?,
503 );
504 reader.read_header().context("reading BAM header")
505 }
506}
507
508fn record_to_frag<R: sam::alignment::Record>(
513 record: &R,
514 header: &Header,
515 need_seq: bool,
516) -> Option<(Vec<u8>, FragRecord)> {
517 let flags = record.flags().ok()?;
518 if flags.is_unmapped() {
519 return None;
520 }
521 let name = record.name()?;
522 let cname = canonical_name(name.as_ref()).to_vec();
523
524 let tid = record.reference_sequence_id(header)?.ok()?;
525 let pos = record
526 .alignment_start()
527 .and_then(|r| r.ok())
528 .map(|p| p.get() - 1)
529 .unwrap_or(0);
530 let read_2bit = if need_seq {
531 record.sequence().iter().map(base_2bit).collect()
532 } else {
533 Vec::new()
534 };
535 let ops: Vec<(AlnOp, usize)> = record
536 .cigar()
537 .iter()
538 .filter_map(|r| r.ok())
539 .map(|op| (kind_to_op(op.kind()), op.len()))
540 .collect();
541 let ref_span: usize = ops
542 .iter()
543 .filter(|(o, _)| {
544 matches!(
545 o,
546 AlnOp::Match | AlnOp::SeqMatch | AlnOp::SeqMismatch | AlnOp::Del | AlnOp::RefSkip
547 )
548 })
549 .map(|(_, l)| l)
550 .sum();
551 let ops = if need_seq { ops } else { Vec::new() };
552 let score = record
553 .data()
554 .get(&Tag::ALIGNMENT_SCORE)
555 .and_then(|r| r.ok())
556 .and_then(|v| value_as_i32(&v))
557 .unwrap_or(0);
558 let frag_len = record.template_length().map(|t| t.abs()).unwrap_or(0);
559 let is_reverse = flags.is_reverse_complemented();
560 let is_read1 = flags.is_first_segment();
561 let mate_tid = (!flags.is_mate_unmapped())
564 .then(|| {
565 record
566 .mate_reference_sequence_id(header)
567 .and_then(|r| r.ok())
568 })
569 .flatten()
570 .map(|t| t as u32);
571 let mate_pos = record
572 .mate_alignment_start()
573 .and_then(|r| r.ok())
574 .map(|p| p.get() - 1);
575 let hi = record
576 .data()
577 .get(&Tag::HIT_INDEX)
578 .and_then(|r| r.ok())
579 .and_then(|v| value_as_i32(&v));
580 Some((
581 cname,
582 FragRecord {
583 tid: tid as u32,
584 pos,
585 read_2bit,
586 ops,
587 score,
588 frag_len,
589 is_reverse,
590 is_read1,
591 ref_span,
592 mate_tid,
593 mate_pos,
594 hi,
595 },
596 ))
597}
598
599struct PassCfg<'a> {
602 online: Option<&'a salmon_infer::OnlineInference>,
603 fld: &'a FragmentLengthDistribution,
604 eq_builder: &'a EquivalenceClassBuilder,
605 ref_bytes: &'a [Vec<u8>],
606 lengths: &'a [u32],
607 gc_store: salmon_model::GcStore<'a>,
608 length_class: Option<&'a [usize]>,
609 expected_format: Option<LibraryFormat>,
610 ignore_incompat: bool,
611 incompat_prior: f64,
612 paired_lib: bool,
613 discard_orphans: bool,
615 pre_burnin: u64,
617 range_factorization_bins: u32,
618 use_error_model: bool,
619 error_bins: usize,
622 seq_bias: bool,
623 gc_bias: bool,
624 cond_gc_bins: usize,
626 gc_bins: usize,
627 pos_bias: bool,
628 need_seq: bool,
629 minibatch: usize,
630 nthreads: usize,
631 progress: Option<&'a salmon_core::ProgressCounters>,
633}
634
635struct PassAccum {
640 seq_obs: Option<(salmon_model::SBModel, salmon_model::SBModel)>,
641 gc_obs: Option<salmon_model::GcFragModel>,
642 pos_obs: Option<(
643 Vec<salmon_model::SimplePosBias>,
644 Vec<salmon_model::SimplePosBias>,
645 )>,
646 num_processed: u64,
647 num_mapped: u64,
648}
649
650fn run_online_pass<R, I>(
665 records: I,
666 header: &Header,
667 cfg: &PassCfg,
668 acc: &mut PassAccum,
669) -> Result<()>
670where
671 R: sam::alignment::Record + Send + Sync,
672 I: Iterator<Item = std::io::Result<R>> + Send,
673{
674 use crossbeam_channel::bounded;
675
676 const FLUSH_INTERVAL: usize = 64;
681
682 let shared_model = cfg
685 .use_error_model
686 .then(|| SharedAlignmentModel::new(1.0, cfg.error_bins));
687 let shared_model_ref = shared_model.as_ref();
688 let minibatch = cfg.minibatch;
689 let need_seq = cfg.need_seq;
690 let (tx, rx) = bounded::<(f64, Vec<Vec<R>>)>(2 * cfg.nthreads + 1);
694 let online_reader = cfg.online;
695
696 std::thread::scope(|scope| -> Result<()> {
697 let reader = scope.spawn(move || -> Result<()> {
699 let mut cur_name: Vec<u8> = Vec::new();
700 let mut have = false;
701 let mut group: Vec<R> = Vec::new();
702 let mut batch: Vec<Vec<R>> = Vec::with_capacity(minibatch);
703 for result in records {
704 let rec = result.context("reading alignment record")?;
705 if rec.flags().map(|f| f.is_unmapped()).unwrap_or(true) {
706 continue;
707 }
708 let Some(name) = rec.name() else { continue };
709 let cname = canonical_name(name.as_ref()).to_vec();
710 if !have {
711 cur_name = cname;
712 have = true;
713 } else if cname != cur_name {
714 batch.push(std::mem::take(&mut group));
715 cur_name = cname;
716 if batch.len() >= minibatch {
717 let fm = online_reader.map(|o| o.next_log_fm()).unwrap_or(0.0);
718 if tx.send((fm, std::mem::take(&mut batch))).is_err() {
719 return Ok(()); }
721 batch = Vec::with_capacity(minibatch);
722 }
723 }
724 group.push(rec);
725 }
726 if have && !group.is_empty() {
727 batch.push(group);
728 }
729 if !batch.is_empty() {
730 let fm = online_reader.map(|o| o.next_log_fm()).unwrap_or(0.0);
731 let _ = tx.send((fm, batch));
732 }
733 Ok(())
734 });
735
736 let mut workers = Vec::with_capacity(cfg.nthreads);
738 for _ in 0..cfg.nthreads {
739 let rx = rx.clone();
740 workers.push(scope.spawn(move || -> (Local, u64) {
741 let mut local = Local::new(
742 cfg.use_error_model,
743 cfg.error_bins,
744 cfg.seq_bias,
745 cfg.gc_bias,
746 cfg.cond_gc_bins,
747 cfg.gc_bins,
748 cfg.pos_bias,
749 );
750 let mut count = 0u64;
751 let mut frags: Vec<FragRecord> = Vec::new();
752 while let Ok((log_fm, raw_batch)) = rx.recv() {
753 let ctx = FragCtx {
754 model: shared_model_ref,
755 online: cfg.online,
756 fld: cfg.fld,
757 eq_builder: cfg.eq_builder,
758 ref_bytes: cfg.ref_bytes,
759 lengths: cfg.lengths,
760 gc_store: cfg.gc_store,
761 length_class: cfg.length_class,
762 expected_format: cfg.expected_format,
763 ignore_incompat: cfg.ignore_incompat,
764 incompat_prior: cfg.incompat_prior,
765 paired_lib: cfg.paired_lib,
766 discard_orphans: cfg.discard_orphans,
767 pre_burnin: cfg.pre_burnin,
768 range_factorization_bins: cfg.range_factorization_bins,
769 log_fm,
770 };
771 let mut since_flush = 0usize;
772 for raw_group in &raw_batch {
773 frags.clear();
774 for r in raw_group {
775 if let Some((_, f)) = record_to_frag(r, header, need_seq) {
776 frags.push(f);
777 }
778 }
779 process_fragment(&frags, &ctx, &mut local);
780 since_flush += 1;
787 if since_flush >= FLUSH_INTERVAL {
788 if let (Some(sm), Some(d)) = (shared_model_ref, local.model.as_mut()) {
789 sm.flush_from(d);
790 d.clear();
791 }
792 since_flush = 0;
793 }
794 }
795 if since_flush > 0 {
796 if let (Some(sm), Some(d)) = (shared_model_ref, local.model.as_mut()) {
797 sm.flush_from(d);
798 d.clear();
799 }
800 }
801 count += raw_batch.len() as u64;
802 if let Some(p) = cfg.progress {
805 let n = raw_batch.len() as u64;
806 p.processed
807 .fetch_add(n, std::sync::atomic::Ordering::Relaxed);
808 p.mapped.fetch_add(n, std::sync::atomic::Ordering::Relaxed);
809 }
810 }
811 (local, count)
812 }));
813 }
814 drop(rx); reader
817 .join()
818 .map_err(|_| anyhow::anyhow!("alignment reader thread panicked"))??;
819
820 let mut merged = Local::new(
822 cfg.use_error_model,
823 cfg.error_bins,
824 cfg.seq_bias,
825 cfg.gc_bias,
826 cfg.cond_gc_bins,
827 cfg.gc_bins,
828 cfg.pos_bias,
829 );
830 let mut total = 0u64;
831 for w in workers {
832 let (local, count) = w
833 .join()
834 .map_err(|_| anyhow::anyhow!("alignment worker thread panicked"))?;
835 merged = merged.merge(local);
836 total += count;
837 }
838 acc.seq_obs = merged.seq_obs;
839 acc.gc_obs = merged.gc_obs;
840 acc.pos_obs = merged.pos_obs;
841 acc.num_processed = total;
842 acc.num_mapped = total;
843 Ok(())
844 })
845}
846
847fn stream_online_pass(bam_path: &Path, cfg: &PassCfg, acc: &mut PassAccum) -> Result<()> {
851 if is_sam_path(bam_path) {
852 let mut reader = open_sam_reader(bam_path)?;
853 let header = reader.read_header().context("reading SAM header")?;
854 run_online_pass(reader.records(), &header, cfg, acc)
855 } else {
856 let file = std::fs::File::open(bam_path)
857 .with_context(|| format!("opening {}", bam_path.display()))?;
858 let workers = std::thread::available_parallelism()
859 .map(|n| n.get())
860 .unwrap_or(4)
861 .clamp(1, 4);
862 let workers = std::num::NonZeroUsize::new(workers).unwrap();
863 let decoder = noodles_bgzf::io::MultithreadedReader::with_worker_count(workers, file);
864 let mut reader = bam::io::Reader::from(decoder);
865 let header = reader.read_header().context("reading BAM header")?;
866 run_online_pass(reader.records(), &header, cfg, acc)
867 }
868}
869
870struct FragCtx<'a> {
874 model: Option<&'a SharedAlignmentModel>,
877 online: Option<&'a salmon_infer::OnlineInference>,
878 fld: &'a FragmentLengthDistribution,
879 eq_builder: &'a EquivalenceClassBuilder,
880 ref_bytes: &'a [Vec<u8>],
881 lengths: &'a [u32],
882 gc_store: salmon_model::GcStore<'a>,
883 length_class: Option<&'a [usize]>,
884 expected_format: Option<LibraryFormat>,
885 ignore_incompat: bool,
886 incompat_prior: f64,
887 paired_lib: bool,
888 discard_orphans: bool,
890 pre_burnin: u64,
892 range_factorization_bins: u32,
893 log_fm: f64,
895}
896
897struct Local {
902 model: Option<AlignmentModel>,
903 seq_obs: Option<(salmon_model::SBModel, salmon_model::SBModel)>,
904 gc_obs: Option<salmon_model::GcFragModel>,
905 pos_obs: Option<(
906 Vec<salmon_model::SimplePosBias>,
907 Vec<salmon_model::SimplePosBias>,
908 )>,
909}
910
911impl Local {
912 #[allow(clippy::too_many_arguments)]
913 fn new(
914 error_model: bool,
915 error_bins: usize,
916 seq_bias: bool,
917 gc_bias: bool,
918 cond_gc_bins: usize,
919 gc_bins: usize,
920 pos_bias: bool,
921 ) -> Self {
922 let mk = || {
923 (0..salmon_model::NUM_LENGTH_CLASSES)
924 .map(|_| salmon_model::SimplePosBias::default())
925 .collect::<Vec<_>>()
926 };
927 Self {
928 model: error_model.then(|| AlignmentModel::empty(error_bins)),
929 seq_obs: seq_bias.then(|| (salmon_model::SBModel::new(), salmon_model::SBModel::new())),
930 gc_obs: gc_bias.then(|| salmon_model::GcFragModel::new(cond_gc_bins, gc_bins)),
931 pos_obs: pos_bias.then(|| (mk(), mk())),
932 }
933 }
934
935 fn merge(mut self, other: Self) -> Self {
936 if let (Some(a), Some(b)) = (self.model.as_mut(), other.model.as_ref()) {
937 a.combine(b);
938 }
939 if let (Some(a), Some(b)) = (self.seq_obs.as_mut(), other.seq_obs.as_ref()) {
940 a.0.combine_counts(&b.0);
941 a.1.combine_counts(&b.1);
942 }
943 if let (Some(a), Some(b)) = (self.gc_obs.as_mut(), other.gc_obs.as_ref()) {
944 a.combine_counts(b);
945 }
946 if let (Some(a), Some(b)) = (self.pos_obs.as_mut(), other.pos_obs.as_ref()) {
947 for (x, y) in a.0.iter_mut().zip(&b.0) {
948 x.combine(y);
949 }
950 for (x, y) in a.1.iter_mut().zip(&b.1) {
951 x.combine(y);
952 }
953 }
954 self
955 }
956}
957
958fn process_fragment(recs: &[FragRecord], ctx: &FragCtx, local: &mut Local) {
965 use salmon_model::seqbias::{CONTEXT_LEFT, CONTEXT_LENGTH, CONTEXT_RIGHT};
966 const LOG_EPSILON: f64 = -23.998_158_637_57;
968
969 let placements = pair_records(recs);
972 let frag_len = recs.iter().map(|r| r.frag_len).max().unwrap_or(0);
973 if frag_len > 0 {
974 ctx.fld.add_val(frag_len as usize, 0.0);
975 }
976 let use_aux = ctx
977 .online
978 .is_none_or(|o| o.num_assigned() >= ctx.pre_burnin);
979
980 let mut sp_tid: Vec<u32> = Vec::with_capacity(placements.len());
983 let mut sp_eq: Vec<f64> = Vec::with_capacity(placements.len());
984 let mut sp_online: Vec<f64> = Vec::with_capacity(placements.len());
985 let mut sp_geom: Vec<(usize, usize, bool)> = Vec::with_capacity(placements.len());
986 let mut sp_pl: Vec<usize> = Vec::with_capacity(placements.len());
987 for (pi, pl) in placements.iter().enumerate() {
988 let tid = pl.tid;
989 let idxs = &pl.idxs;
990 if ctx.discard_orphans && ctx.paired_lib && idxs.len() < 2 {
993 continue;
994 }
995 let refseq = ctx
996 .ref_bytes
997 .get(tid as usize)
998 .map(|v| v.as_slice())
999 .unwrap_or(&[]);
1000 let basis = if let Some(m) = ctx.model {
1003 if refseq.is_empty() {
1004 0.0
1005 } else {
1006 let mut ll = 0.0;
1007 for (rank, &i) in idxs.iter().enumerate() {
1008 let r = &recs[i];
1009 let (fg, bg) = m.log_likelihood(&r.read_2bit, refseq, r.pos, &r.ops, rank == 0);
1010 ll += fg - bg;
1011 }
1012 ll
1013 }
1014 } else {
1015 0.0
1016 };
1017 let rl = ctx.lengths[tid as usize] as i32;
1018 let flen = idxs.iter().map(|&i| recs[i].frag_len).max().unwrap_or(0);
1019 let proper = idxs.len() >= 2 && flen > 0;
1020 let frag_start = recs[idxs[0]].pos;
1021 let frag_end = frag_start + (flen.max(1) as usize) - 1;
1022 let start_pos = if proper && flen <= rl {
1023 -(((rl - flen + 1) as f64).ln())
1024 } else {
1025 -((rl.max(1) as f64).ln())
1026 };
1027 let log_frag_prob = if proper {
1028 if use_aux {
1029 ctx.fld.pmf(flen as usize)
1030 } else {
1031 0.0
1032 }
1033 } else if ctx.paired_lib {
1034 LOG_EPSILON
1035 } else {
1036 0.0
1037 };
1038 let mut aux = basis + log_frag_prob;
1039 if let Some(exp) = ctx.expected_format {
1040 let (obs, is_fw, status) = frag_format(recs, idxs);
1041 if !is_compatible(exp, obs, is_fw, status) {
1042 if ctx.ignore_incompat {
1043 continue; }
1045 aux += ctx.incompat_prior.ln();
1046 }
1047 }
1048 sp_tid.push(tid);
1049 sp_eq.push(aux);
1050 sp_online.push(aux + start_pos);
1051 sp_geom.push((frag_start, frag_end, proper));
1052 sp_pl.push(pi);
1053 }
1054 if sp_tid.is_empty() {
1057 return;
1058 }
1059
1060 let mut agg: std::collections::BTreeMap<u32, Vec<usize>> = std::collections::BTreeMap::new();
1062 for (k, &t) in sp_tid.iter().enumerate() {
1063 agg.entry(t).or_default().push(k);
1064 }
1065 let tids: Vec<u32> = agg.keys().cloned().collect();
1066 let eq_log: Vec<f64> = agg
1067 .values()
1068 .map(|ks| logsumexp(&ks.iter().map(|&k| sp_eq[k]).collect::<Vec<_>>()))
1069 .collect();
1070 let online_log: Vec<f64> = agg
1071 .values()
1072 .map(|ks| logsumexp(&ks.iter().map(|&k| sp_online[k]).collect::<Vec<_>>()))
1073 .collect();
1074
1075 let maxe = eq_log.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
1077 let mut weights: Vec<f64> = eq_log.iter().map(|&l| (l - maxe).exp()).collect();
1078 let wsum: f64 = weights.iter().sum();
1079 if wsum > 0.0 {
1080 for w in &mut weights {
1081 *w /= wsum;
1082 }
1083 }
1084
1085 let post: Vec<f64> = match ctx.online {
1087 Some(o) => {
1088 let maps: Vec<(u32, f64)> = tids
1089 .iter()
1090 .cloned()
1091 .zip(online_log.iter().cloned())
1092 .collect();
1093 o.assign_fragment(&maps, ctx.log_fm)
1094 }
1095 None => weights.clone(),
1096 };
1097
1098 let collecting = ctx.online.is_none_or(|o| o.collecting());
1100 if collecting {
1101 for (ti, (tid, ks)) in agg.iter().enumerate() {
1102 let p_tid = post[ti];
1103 if p_tid <= 0.0 {
1104 continue;
1105 }
1106 let online_log_tid = online_log[ti];
1107 let refseq = ctx
1108 .ref_bytes
1109 .get(*tid as usize)
1110 .map(|v| v.as_slice())
1111 .unwrap_or(&[]);
1112 for &k in ks {
1113 let p = p_tid * (sp_online[k] - online_log_tid).exp();
1114 if p <= 0.0 {
1115 continue;
1116 }
1117 let idxs = &placements[sp_pl[k]].idxs;
1118 if let Some(m) = local.model.as_mut() {
1119 if !refseq.is_empty() {
1120 let lw = ctx.log_fm + p.ln();
1121 for (rank, &i) in idxs.iter().enumerate() {
1122 let r = &recs[i];
1123 m.update(&r.read_2bit, refseq, r.pos, &r.ops, rank == 0, lw);
1124 }
1125 }
1126 }
1127 if refseq.is_empty() {
1128 continue;
1129 }
1130 let (fs, fe, proper) = sp_geom[k];
1131 let rl = refseq.len();
1132 let (mut fwd_five, mut rev_five): (Option<usize>, Option<usize>) = (None, None);
1134 if idxs.len() == 1 {
1135 let r = &recs[idxs[0]];
1136 if r.is_reverse {
1137 rev_five = Some(r.five_prime());
1138 } else {
1139 fwd_five = Some(r.five_prime());
1140 }
1141 } else {
1142 let fwd = idxs.iter().map(|&i| &recs[i]).find(|r| !r.is_reverse);
1143 let rev = idxs.iter().map(|&i| &recs[i]).find(|r| r.is_reverse);
1144 if let (Some(fr), Some(rr)) = (fwd, rev) {
1145 let (fp, rp) = (fr.five_prime(), rr.five_prime());
1146 if fp < rp {
1147 fwd_five = Some(fp);
1148 rev_five = Some(rp);
1149 }
1150 }
1151 }
1152 if let Some(obs) = local.seq_obs.as_mut() {
1153 if let Some(five) = fwd_five {
1154 let s = five as i32 - CONTEXT_LEFT as i32;
1155 if s >= 0 && (s as usize + CONTEXT_LENGTH) <= rl {
1156 obs.0.add_context(
1157 &refseq[s as usize..s as usize + CONTEXT_LENGTH],
1158 false,
1159 p,
1160 );
1161 }
1162 }
1163 if let Some(five) = rev_five {
1164 let s = five as i32 - CONTEXT_RIGHT as i32;
1165 if s >= 0 && (s as usize + CONTEXT_LENGTH) <= rl {
1166 obs.1.add_context(
1167 &refseq[s as usize..s as usize + CONTEXT_LENGTH],
1168 true,
1169 p,
1170 );
1171 }
1172 }
1173 }
1174 if let (Some(gc), true) = (local.gc_obs.as_mut(), proper && fe < rl) {
1175 let view = ctx.gc_store.view(*tid as usize);
1176 if let Some((ff, cf)) = salmon_model::gc_desc(&view, fs as i32, fe as i32) {
1177 gc.inc(ff, cf, p);
1178 }
1179 }
1180 if let Some(pos) = local.pos_obs.as_mut() {
1181 let lc = ctx.length_class.unwrap()[*tid as usize];
1182 if let Some(five) = fwd_five {
1183 pos.0[lc].add_mass(five as i32, rl as i32, p.ln());
1184 }
1185 if let Some(five) = rev_five {
1186 pos.1[lc].add_mass(five as i32, rl as i32, p.ln());
1187 }
1188 }
1189 }
1190 }
1191 }
1192
1193 let group = if ctx.range_factorization_bins > 0 {
1194 let bins = range_factorize_bins(&weights, ctx.range_factorization_bins);
1195 TranscriptGroup::with_bins(tids, bins)
1196 } else {
1197 TranscriptGroup::from_sorted(tids)
1198 };
1199 ctx.eq_builder.add_group(group, weights, 1);
1200}
1201
1202fn coordinate_sorted_unusable(header: &noodles_sam::Header) -> bool {
1212 let Some(hd) = header.header() else {
1213 return false;
1214 };
1215 let mut so_coord = false;
1216 let mut go_query = false;
1217 for (tag, value) in hd.other_fields() {
1219 let t: &[u8; 2] = tag.as_ref();
1220 let v: &[u8] = value.as_ref();
1221 if t == b"SO" {
1222 so_coord = v == &b"coordinate"[..];
1223 } else if t == b"GO" {
1224 go_query = v == &b"query"[..];
1225 }
1226 }
1227 so_coord && !go_query
1228}
1229
1230pub fn quantify_alignments(opts: &AlignQuantOptions) -> Result<AlignQuantResult> {
1232 let start_time = asctime_now();
1233 let header = read_alignment_header(&opts.bam)?;
1234
1235 anyhow::ensure!(
1239 !coordinate_sorted_unusable(&header),
1240 "the input BAM/SAM appears to be coordinate-sorted (@HD SO:coordinate) and is not \
1241 grouped by read name (GO:query). Alignment-mode quantification requires that all \
1242 alignment records of a read (or read pair) are adjacent in the file. Please collate \
1243 it by read name first, e.g. `samtools collate` or `samtools sort -n`."
1244 );
1245
1246 let names: Vec<String> = header
1248 .reference_sequences()
1249 .keys()
1250 .map(|k| String::from_utf8_lossy(k.as_ref()).into_owned())
1251 .collect();
1252 let lengths: Vec<u32> = header
1253 .reference_sequences()
1254 .values()
1255 .map(|rs| rs.length().get() as u32)
1256 .collect();
1257 let num_refs = names.len();
1258 anyhow::ensure!(num_refs > 0, "BAM header has no reference sequences");
1259
1260 let eq_builder = EquivalenceClassBuilder::new();
1261 let mut fld =
1262 FragmentLengthDistribution::new(1.0, opts.fld_max, opts.fld_mean, opts.fld_sd, 4, 0.5, 1);
1263
1264 let use_error_model = opts.transcripts.is_some() && !opts.no_error_model;
1266 let bias_on = opts.seq_bias || opts.gc_bias || opts.pos_bias;
1267 anyhow::ensure!(
1268 !bias_on || opts.transcripts.is_some(),
1269 "--seqBias/--gcBias/--posBias in alignment mode require -t/--targets (the transcriptome FASTA)"
1270 );
1271 let ref_bytes: Vec<Vec<u8>> = if use_error_model || bias_on {
1272 load_ref_bytes(opts.transcripts.as_ref().unwrap(), &names)?
1273 } else {
1274 Vec::new()
1275 };
1276
1277 let ref_lens_u64: Vec<u64> = lengths.iter().map(|&l| l as u64).collect();
1281 let online = (use_error_model || bias_on).then(|| {
1282 salmon_infer::OnlineInference::new(
1283 &ref_lens_u64,
1284 0.05,
1285 opts.forgetting_factor,
1286 opts.num_aux_model_samples,
1287 )
1288 });
1289
1290 use salmon_model::seqbias::CONTEXT_LENGTH;
1293 let (gc_rank, gc_offsets): (Option<salmon_model::GcRank>, Vec<u64>) = if opts.gc_bias {
1298 let mut concat: Vec<u8> = Vec::new();
1299 let mut offs: Vec<u64> = Vec::with_capacity(ref_bytes.len() + 1);
1300 offs.push(0);
1301 for s in &ref_bytes {
1302 concat.extend_from_slice(s);
1303 offs.push(concat.len() as u64);
1304 }
1305 (Some(salmon_model::GcRank::new(&concat)), offs)
1306 } else {
1307 (None, Vec::new())
1308 };
1309 let gc_store = match &gc_rank {
1310 Some(r) => salmon_model::GcStore::Rank {
1311 rank: r,
1312 offsets: &gc_offsets,
1313 },
1314 None => salmon_model::GcStore::Dense(&[]),
1315 };
1316 let length_quantiles: Option<Vec<u32>> = opts.pos_bias.then(|| {
1317 salmon_model::compute_length_quantiles(&lengths, salmon_model::NUM_LENGTH_CLASSES)
1318 });
1319 let length_class: Option<Vec<usize>> = length_quantiles.as_ref().map(|q| {
1320 lengths
1321 .iter()
1322 .map(|&l| salmon_model::length_class_index(q, l))
1323 .collect()
1324 });
1325
1326 const MINIBATCH: usize = 1000;
1332 let paired_lib = !matches!(opts.lib_type.as_str(), "U" | "SF" | "SR" | "S");
1336 let expected_format = match opts.lib_type.as_str() {
1340 "A" => None,
1341 s => LibraryFormat::parse(s).ok(),
1342 };
1343 let ignore_incompat = opts.incompat_prior <= 0.0;
1344 let nthreads = rayon::current_num_threads().max(1);
1345
1346 let (seq_obs, gc_obs, pos_obs, num_processed, num_mapped) = {
1351 let cfg = PassCfg {
1352 online: online.as_ref(),
1353 fld: &fld,
1354 eq_builder: &eq_builder,
1355 ref_bytes: &ref_bytes,
1356 lengths: &lengths,
1357 gc_store,
1358 length_class: length_class.as_deref(),
1359 expected_format,
1360 ignore_incompat,
1361 incompat_prior: opts.incompat_prior,
1362 paired_lib,
1363 discard_orphans: opts.discard_orphans,
1364 pre_burnin: opts.num_pre_aux_model_samples,
1365 range_factorization_bins: opts.range_factorization_bins,
1366 use_error_model,
1367 error_bins: opts.num_error_bins,
1368 seq_bias: opts.seq_bias,
1369 gc_bias: opts.gc_bias,
1370 cond_gc_bins: opts.cond_gc_bins,
1371 gc_bins: opts.gc_bins,
1372 pos_bias: opts.pos_bias,
1373 need_seq: use_error_model || bias_on,
1374 minibatch: MINIBATCH,
1375 nthreads,
1376 progress: opts.progress.as_deref(),
1377 };
1378 let mut acc = PassAccum {
1379 seq_obs: None,
1380 gc_obs: None,
1381 pos_obs: None,
1382 num_processed: 0,
1383 num_mapped: 0,
1384 };
1385 stream_online_pass(&opts.bam, &cfg, &mut acc)?;
1386 (
1387 acc.seq_obs,
1388 acc.gc_obs,
1389 acc.pos_obs,
1390 acc.num_processed,
1391 acc.num_mapped,
1392 )
1393 };
1394
1395 fld.cache();
1397 let cond_means = fld.conditional_means();
1398 let mut eff_lengths = vec![0f64; num_refs];
1399 for (tid, len) in lengths.iter().enumerate() {
1400 eff_lengths[tid] = salmon_model::smoothed_effective_length(&cond_means, *len as usize);
1401 }
1402
1403 let mut collapsed = eq_builder.finish();
1404 collapsed.update_eff_lengths(&eff_lengths);
1405 let num_eq_classes = collapsed.len();
1406
1407 let init_alphas: Option<Vec<f64>> = online.as_ref().map(|o| {
1413 let masses: Vec<f64> = (0..num_refs).map(|t| o.mass_log(t).exp()).collect();
1414 let mass_sum: f64 = masses.iter().sum();
1415 let total_reads = num_mapped as f64;
1416 let projected: Vec<f64> = if mass_sum > 0.0 {
1418 masses.iter().map(|&m| m / mass_sum * total_reads).collect()
1419 } else {
1420 vec![0.0; num_refs]
1421 };
1422 let uniform_prior = if num_refs > 0 {
1423 total_reads / num_refs as f64
1424 } else {
1425 0.0
1426 };
1427 const NUM_REQUIRED_FRAGMENTS: f64 = 50_000_000.0;
1429 let frac_observed = (total_reads / NUM_REQUIRED_FRAGMENTS).min(0.999);
1430 projected
1431 .iter()
1432 .map(|&p| p * frac_observed + uniform_prior * (1.0 - frac_observed))
1433 .collect()
1434 });
1435 let mut em = if opts.skip_quant {
1438 salmon_infer::EmResult {
1441 alphas: vec![0.0; num_refs],
1442 iters: 0,
1443 converged: true,
1444 }
1445 } else if opts.init_uniform {
1446 optimize(&collapsed, num_refs, &opts.em, Some(&eff_lengths))
1447 } else {
1448 optimize_with_init(
1449 &collapsed,
1450 num_refs,
1451 &opts.em,
1452 init_alphas.as_deref(),
1453 Some(&eff_lengths),
1454 )
1455 };
1456
1457 let mut bias_dump = salmon_model::dumps::BiasDump::default();
1459 if bias_on && !opts.skip_quant {
1460 let log_pmf = fld.log_pmf();
1461 let pmf_lin: Vec<f64> = log_pmf.iter().map(|lp| lp.exp()).collect();
1462 let (fld_cdf, fld_low, fld_high) = salmon_model::seqbias::fld_cdf_and_bounds(&pmf_lin);
1463 let k = if opts.seq_bias { CONTEXT_LENGTH } else { 1 };
1464 let refseq_of = |t: usize| ref_bytes[t].as_slice();
1465
1466 let seq = seq_obs.map(|(mut of, mut or)| {
1467 of.normalize();
1468 or.normalize();
1469 let (ef, er) = salmon_model::build_expected(
1470 num_refs,
1471 refseq_of,
1472 &em.alphas,
1473 &eff_lengths,
1474 &fld_cdf,
1475 );
1476 (of, or, ef, er)
1477 });
1478 if let Some((of, or, ef, er)) = seq.as_ref() {
1479 bias_dump.obs5_seq = of.dump().to_vec();
1480 bias_dump.obs3_seq = or.dump().to_vec();
1481 bias_dump.exp5_seq = ef.dump().to_vec();
1482 bias_dump.exp3_seq = er.dump().to_vec();
1483 }
1484 let gc_ratio_model = if let Some(mut obs) = gc_obs {
1485 let mut exp = salmon_model::build_expected_gc(
1486 num_refs,
1487 refseq_of,
1488 |t| gc_store.view(t),
1489 &em.alphas,
1490 &eff_lengths,
1491 &fld_cdf,
1492 fld_low,
1493 fld_high,
1494 opts.cond_gc_bins,
1495 opts.gc_bins,
1496 k,
1497 opts.bias_speed_samp,
1498 );
1499 obs.normalize();
1500 exp.normalize();
1501 bias_dump.obs_gc = obs.dump().to_vec();
1502 bias_dump.exp_gc = exp.dump().to_vec();
1503 Some(salmon_model::gc_ratio(
1504 &mut obs,
1505 &mut exp,
1506 salmon_model::gcbias::GC_MAX_RATIO,
1507 ))
1508 } else {
1509 None
1510 };
1511 let pos_models = pos_obs.map(|(mut of, mut or)| {
1512 for x in of.iter_mut().chain(or.iter_mut()) {
1513 x.finalize();
1514 }
1515 let (ef, er) = salmon_model::build_expected_pos(
1516 num_refs,
1517 |t| lengths[t] as usize,
1518 &em.alphas,
1519 &eff_lengths,
1520 &fld_cdf,
1521 length_quantiles.as_ref().unwrap(),
1522 k,
1523 );
1524 (of, or, ef, er)
1525 });
1526 if let Some((ofw, orc, efw, erc)) = pos_models.as_ref() {
1527 let masses =
1528 |v: &[salmon_model::SimplePosBias]| v.iter().map(|m| m.masses().to_vec()).collect();
1529 bias_dump.obs5_pos = masses(ofw);
1530 bias_dump.obs3_pos = masses(orc);
1531 bias_dump.exp5_pos = masses(efw);
1532 bias_dump.exp3_pos = masses(erc);
1533 }
1534
1535 for tid in 0..num_refs {
1536 if em.alphas[tid] < 1e-8 {
1537 continue;
1538 }
1539 let s = ref_bytes[tid].as_slice();
1540 let pos_vecs: Option<(Vec<f64>, Vec<f64>)> =
1541 pos_models.as_ref().map(|(ofw, orc, efw, erc)| {
1542 let lc = length_class.as_ref().unwrap()[tid];
1543 let rl = s.len();
1544 let (mut o5, mut e5) = (vec![0.0; rl], vec![0.0; rl]);
1545 let (mut o3, mut e3) = (vec![0.0; rl], vec![0.0; rl]);
1546 ofw[lc].project_weights(&mut o5);
1547 efw[lc].project_weights(&mut e5);
1548 orc[lc].project_weights(&mut o3);
1549 erc[lc].project_weights(&mut e3);
1550 (
1551 salmon_model::positional_factor(&o5, &e5),
1552 salmon_model::positional_factor(&o3, &e3),
1553 )
1554 });
1555 let bias = salmon_model::BiasInputs {
1556 seq: seq.as_ref().map(|(of, or, ef, er)| (of, ef, or, er)),
1557 gc: gc_ratio_model.as_ref().map(|g| (g, gc_store.view(tid))),
1558 pos: pos_vecs
1559 .as_ref()
1560 .map(|(pf, pr)| (pf.as_slice(), pr.as_slice())),
1561 };
1562 eff_lengths[tid] = salmon_model::corrected_effective_length_full(
1563 s,
1564 &fld_cdf,
1565 fld_low,
1566 fld_high,
1567 &bias,
1568 eff_lengths[tid],
1569 opts.bias_speed_samp,
1570 opts.no_bias_length_threshold,
1571 );
1572 }
1573 collapsed.update_eff_lengths(&eff_lengths);
1574 em = optimize(&collapsed, num_refs, &opts.em, Some(&eff_lengths));
1575 }
1576 let counts = em.alphas;
1577
1578 let rates: Vec<f64> = (0..num_refs)
1579 .map(|i| {
1580 if eff_lengths[i] > 0.0 {
1581 counts[i] / eff_lengths[i]
1582 } else {
1583 0.0
1584 }
1585 })
1586 .collect();
1587 let rate_sum: f64 = rates.iter().sum();
1588 let tpm: Vec<f64> = rates
1589 .iter()
1590 .map(|r| {
1591 if rate_sum > 0.0 {
1592 r / rate_sum * 1e6
1593 } else {
1594 0.0
1595 }
1596 })
1597 .collect();
1598
1599 let length_classes =
1600 salmon_model::compute_length_quantiles(&lengths, salmon_model::NUM_LENGTH_CLASSES);
1601 let ambig = salmon_infer::ambiguity_counts(&salmon_infer::PackedEqClasses::from_collapsed(
1602 &collapsed, num_refs,
1603 ));
1604 let result = AlignQuantResult {
1605 names,
1606 lengths,
1607 eff_lengths,
1608 tpm,
1609 counts,
1610 num_processed,
1611 num_mapped,
1612 num_eq_classes,
1613 frag_len_mean: fld.mean(),
1614 frag_len_sd: fld.sd(),
1615 length_classes,
1616 frag_len_dist: fld.log_pmf().iter().map(|lp| lp.exp()).collect(),
1617 start_time,
1618 bias_dump,
1619 ambig,
1620 };
1621 write_outputs(opts, &result)?;
1622 Ok(result)
1623}
1624
1625fn write_outputs(opts: &AlignQuantOptions, res: &AlignQuantResult) -> Result<()> {
1626 let dir = &opts.output_dir;
1627 std::fs::create_dir_all(dir.join("aux_info")).context("creating output dirs")?;
1628
1629 if !opts.skip_quant {
1632 let sd = opts.sig_digits as usize;
1633 let mut w = std::io::BufWriter::new(std::fs::File::create(dir.join("quant.sf"))?);
1634 writeln!(w, "Name\tLength\tEffectiveLength\tTPM\tNumReads")?;
1635 for i in 0..res.names.len() {
1636 writeln!(
1637 w,
1638 "{}\t{}\t{:.*}\t{:.6}\t{:.*}",
1639 res.names[i], res.lengths[i], sd, res.eff_lengths[i], res.tpm[i], sd, res.counts[i]
1640 )?;
1641 }
1642 w.flush()?;
1643 }
1644
1645 #[derive(Serialize)]
1648 struct MetaInfo {
1649 salmon_version: String,
1650 samp_type: String,
1651 opt_type: String,
1652 quant_errors: Vec<String>,
1653 num_libraries: usize,
1654 library_types: Vec<String>,
1655 frag_dist_length: usize,
1656 frag_length_mean: f64,
1657 frag_length_sd: f64,
1658 seq_bias_correct: bool,
1659 gc_bias_correct: bool,
1660 num_bias_bins: usize,
1661 mapping_type: String,
1662 index_seq_hash: String,
1664 index_name_hash: String,
1665 index_seq_hash512: String,
1666 index_name_hash512: String,
1667 index_decoy_seq_hash: String,
1668 index_decoy_name_hash: String,
1669 num_valid_targets: usize,
1670 num_decoy_targets: usize,
1671 num_eq_classes: usize,
1672 serialized_eq_classes: bool,
1673 eq_class_properties: Vec<String>,
1674 length_classes: Vec<u32>,
1675 num_processed: u64,
1676 num_mapped: u64,
1677 num_dovetail_fragments: u64,
1678 num_fragments_filtered_vm: u64,
1679 num_alignments_below_threshold_for_mapped_fragments_vm: u64,
1680 percent_mapped: f64,
1681 num_decoy_fragments: u64,
1682 num_bootstraps: u32,
1683 call: String,
1684 start_time: String,
1685 end_time: String,
1686 }
1687 let pct = if res.num_processed > 0 {
1688 100.0 * res.num_mapped as f64 / res.num_processed as f64
1689 } else {
1690 0.0
1691 };
1692 let eq_class_properties = if opts.range_factorization_bins > 0 {
1693 vec!["range_factorized".to_string(), "gzipped".to_string()]
1694 } else {
1695 vec!["gzipped".to_string()]
1696 };
1697 let meta = MetaInfo {
1698 salmon_version: SALMON_VERSION.to_string(),
1699 samp_type: "none".to_string(),
1700 opt_type: if opts.em.use_vbem { "vb" } else { "em" }.to_string(),
1701 quant_errors: vec![],
1702 num_libraries: 1,
1703 library_types: vec![opts.lib_type.clone()],
1704 frag_dist_length: res.frag_len_dist.len(),
1705 frag_length_mean: res.frag_len_mean,
1706 frag_length_sd: res.frag_len_sd,
1707 seq_bias_correct: opts.seq_bias,
1708 gc_bias_correct: opts.gc_bias,
1709 num_bias_bins: 0,
1710 mapping_type: "alignment".to_string(),
1711 index_seq_hash: String::new(),
1712 index_name_hash: String::new(),
1713 index_seq_hash512: String::new(),
1714 index_name_hash512: String::new(),
1715 index_decoy_seq_hash: String::new(),
1716 index_decoy_name_hash: String::new(),
1717 num_valid_targets: res.names.len(),
1718 num_decoy_targets: 0,
1719 num_eq_classes: res.num_eq_classes,
1720 serialized_eq_classes: false,
1721 eq_class_properties,
1722 length_classes: res.length_classes.clone(),
1723 num_processed: res.num_processed,
1724 num_mapped: res.num_mapped,
1725 num_dovetail_fragments: 0,
1726 num_fragments_filtered_vm: 0,
1727 num_alignments_below_threshold_for_mapped_fragments_vm: 0,
1728 percent_mapped: pct,
1729 num_decoy_fragments: 0,
1730 num_bootstraps: 0,
1731 call: "quant".to_string(),
1732 start_time: res.start_time.clone(),
1733 end_time: asctime_now(),
1734 };
1735 std::fs::write(
1736 dir.join("aux_info").join("meta_info.json"),
1737 serde_json::to_string_pretty(&meta)?,
1738 )?;
1739
1740 std::fs::create_dir_all(dir.join("libParams")).context("creating libParams")?;
1742 salmon_model::dumps::write_flen_dist(
1743 &dir.join("libParams").join("flenDist.txt"),
1744 &res.frag_len_dist,
1745 )
1746 .context("writing flenDist.txt")?;
1747 salmon_model::dumps::write_fld_dump(&dir.join("aux_info").join("fld.gz"), &res.frag_len_dist)
1748 .context("writing fld.gz")?;
1749 salmon_model::dumps::write_aux_bias_dumps(&dir.join("aux_info"), &res.bias_dump)
1750 .context("writing aux bias dumps")?;
1751 std::fs::create_dir_all(dir.join("logs")).context("creating logs")?;
1752 let log = format!(
1753 "salmon (rust port, alignment mode) v{SALMON_VERSION}\nstart: {}\nend: {}\n\
1754 library type: {}\nobserved fragments: {}\nmapped fragments: {}\nmapping rate: {pct:.4}%\n\
1755 number of equivalence classes: {}\nfragment length mean (sd): {:.2} ({:.2})\n",
1756 res.start_time,
1757 asctime_now(),
1758 opts.lib_type,
1759 res.num_processed,
1760 res.num_mapped,
1761 res.num_eq_classes,
1762 res.frag_len_mean,
1763 res.frag_len_sd,
1764 );
1765 std::fs::write(dir.join("logs").join("salmon_quant.log"), log)
1766 .context("writing salmon_quant.log")?;
1767
1768 {
1770 let (uniq, ambig) = &res.ambig;
1771 let mut w = std::io::BufWriter::new(std::fs::File::create(
1772 dir.join("aux_info").join("ambig_info.tsv"),
1773 )?);
1774 writeln!(w, "UniqueCount\tAmbigCount")?;
1775 for i in 0..uniq.len() {
1776 writeln!(w, "{}\t{}", uniq[i], ambig[i])?;
1777 }
1778 w.flush()?;
1779 }
1780
1781 #[derive(Serialize)]
1783 struct CmdInfo {
1784 salmon_version: String,
1785 alignments: String,
1786 targets: String,
1787 #[serde(rename = "libType")]
1788 lib_type: String,
1789 output: String,
1790 #[serde(rename = "auxDir")]
1791 aux_dir: String,
1792 }
1793 let cmd = CmdInfo {
1794 salmon_version: SALMON_VERSION.to_string(),
1795 alignments: opts.bam.display().to_string(),
1796 targets: opts
1797 .transcripts
1798 .as_ref()
1799 .map(|p| p.display().to_string())
1800 .unwrap_or_default(),
1801 lib_type: opts.lib_type.clone(),
1802 output: opts.output_dir.display().to_string(),
1803 aux_dir: "aux_info".to_string(),
1804 };
1805 std::fs::write(
1806 dir.join("cmd_info.json"),
1807 serde_json::to_string_pretty(&cmd)?,
1808 )?;
1809 Ok(())
1810}
1811
1812#[cfg(test)]
1813mod tests {
1814 use super::*;
1815
1816 #[test]
1817 fn alignment_score_value_decoding() {
1818 assert_eq!(value_as_i32(&Value::Int32(-12)), Some(-12));
1819 assert_eq!(value_as_i32(&Value::Int8(0)), Some(0));
1820 assert_eq!(value_as_i32(&Value::UInt16(300)), Some(300));
1821 assert_eq!(value_as_i32(&Value::Character(b'A')), None);
1823 }
1824}